# 02 - Data Cleaning (V3)

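- Drop rows with a missing target; cast the target to int
- Impute missing values: numeric columns with the median, categorical columns with the mode
- Drop duplicate rows; treat outliers via the 1.5×IQR rule (drop the rows if they are <5% of the data, otherwise winsorize)
- Remove rows containing a negative value in any numeric column
- Save to `../v3_data/employee_promotion_clean.csv`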


In [None]:
from pathlib import Path
import numpy as np
import pandas as pd

RAW = Path('../data/employee-promotion.csv')
OUT = Path('../v3_data/employee_promotion_clean.csv')
TARGET = 'Promotion_Eligible'


def load_raw(path):
    """Read the raw CSV, falling back to ';' as the separator.

    If the default comma parse yields a single column whose header contains
    a ';', the file is re-read with ``sep=';'``.
    """
    frame = pd.read_csv(path)
    if len(frame.columns) == 1 and ';' in frame.columns[0]:
        frame = pd.read_csv(path, sep=';')
    return frame


def impute_missing(frame, num_cols, cat_cols):
    """Return a copy of *frame* with missing values filled.

    Columns in *num_cols* are filled with their median; columns in
    *cat_cols* that contain missing values are filled with their first mode.
    A categorical column with no observed value has no mode and is left
    untouched instead of raising.
    """
    out = frame.copy()
    for col in num_cols:
        out[col] = out[col].fillna(out[col].median())
    for col in cat_cols:
        if not out[col].isna().any():
            continue
        modes = out[col].mode()
        if modes.empty:
            # Entirely-missing column: nothing to impute from.
            continue
        out[col] = out[col].fillna(modes.iloc[0])
    return out


def treat_outliers(frame, num_cols, drop_below_pct=5.0, whisker=1.5):
    """Return a copy of *frame* with IQR outliers dropped or winsorized.

    Columns are processed in order, each on the rows that survived the
    previous columns. For a column, values outside
    ``[q1 - whisker*iqr, q3 + whisker*iqr]`` are outliers. If they make up
    less than *drop_below_pct* percent of the rows, those rows are dropped;
    otherwise the column is clipped to the bounds.

    Columns whose IQR is zero or undefined are skipped: with a zero IQR both
    bounds collapse onto the quartile, so every other value would be flagged
    and the column would be dropped or flattened to a constant.
    """
    out = frame.copy()
    for col in num_cols:
        q1, q3 = out[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        if not iqr > 0:  # also true for NaN (empty / all-missing column)
            continue
        lower, upper = q1 - whisker * iqr, q3 + whisker * iqr
        is_outlier = (out[col] < lower) | (out[col] > upper)
        if not is_outlier.any():
            continue
        if 100 * is_outlier.mean() < drop_below_pct:
            # Copy so later column assignments do not write into a slice.
            out = out.loc[~is_outlier].copy()
        else:
            out[col] = out[col].clip(lower=lower, upper=upper)
    return out


def drop_negative_rows(frame, num_cols):
    """Return the rows of *frame* with no negative value in *num_cols*."""
    has_negative = (frame[num_cols] < 0).any(axis=1)
    return frame.loc[~has_negative].copy()


# The pipeline runs at module level (not inside a function) so that `df`,
# `num_cols` and `cat_cols` remain globals when this is executed as a
# notebook cell or script; the guard only keeps it from running on import.
if __name__ == '__main__':
    OUT.parent.mkdir(parents=True, exist_ok=True)

    # Load with delimiter detection
    df = load_raw(RAW)

    # Drop NA on target and cast to int
    df = df.dropna(subset=[TARGET]).copy()
    df[TARGET] = df[TARGET].astype(int)

    # Numeric/Categorical split (target excluded from the numeric features)
    num_cols = df.select_dtypes(include=np.number).columns.drop([TARGET])
    cat_cols = df.select_dtypes(exclude=np.number).columns

    # Impute numeric (median) and categorical (mode)
    df = impute_missing(df, num_cols, cat_cols)

    # Duplicates
    df = df.drop_duplicates()

    # Outliers by IQR (drop if <5%, else winsorize)
    df = treat_outliers(df, num_cols)

    # Remove negative numeric rows
    df = drop_negative_rows(df, num_cols)

    # Save
    df.to_csv(OUT, index=False)
    print('Saved:', OUT, 'Shape:', df.shape)
