In [192]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder,TargetEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer


In [193]:
# Toy dataset: two categorical features and a binary target `y`.
# Empty strings stand in for missing entries at this stage.
df = pd.DataFrame(dict(
    Color=['Red', '', 'Green', 'Blue', 'Red'],
    Salary=['Low', '', '', 'Medium', 'Medium'],
    y=[0, 0, 1, 1, 0],
))

df

Unnamed: 0,Color,Salary,y
0,Red,Low,0
1,,,0
2,Green,,1
3,Blue,Medium,1
4,Red,Medium,0


In [194]:
# Count missing values per column (empty strings are not treated as NaN)
df.isnull().sum()

Color     0
Salary    0
y         0
dtype: int64

In [195]:
# Replace '' => np.nan so pandas recognises the empty strings as missing.
# np.nan is the documented NumPy constant (legacy aliases such as np.NaN were
# removed in NumPy 2.0). Rebinding instead of inplace=True keeps the cell a
# plain expression of "new frame from old frame".
df = df.replace({'': np.nan})
df

Unnamed: 0,Color,Salary,y
0,Red,Low,0
1,,,0
2,Green,,1
3,Blue,Medium,1
4,Red,Medium,0


## Deal with NaN values

1. Drop rows with NaN
2. Drop columns with NaN
3. Impute (fill) with the mean, the mode, or a placeholder such as 'Missing'

In [196]:
# Split into the target (y) and the features (X = every column except 'y')
y = df['y']
X = df.drop('y', axis=1)
X

Unnamed: 0,Color,Salary
0,Red,Low
1,,
2,Green,
3,Blue,Medium
4,Red,Medium


In [197]:
# Option 1 (not applied): drop every row that contains at least one NaN
# X.dropna(axis=0, how='any')

In [198]:
# Fill missing categories with an explicit placeholder label (column -> label)
placeholders = {'Color': 'Missing', 'Salary': 'Unknown'}
for column, label in placeholders.items():
    X[column] = X[column].fillna(label)
X

Unnamed: 0,Color,Salary
0,Red,Low
1,Missing,Unknown
2,Green,Unknown
3,Blue,Medium
4,Red,Medium


In [200]:
# Encode the ordinal feature `Salary`.

# Alternatives kept for reference (not executed):
# X['Salary_LabelEncoder'] = LabelEncoder().fit_transform(X['Salary'])
# X['Salary_OrdinalEncoder'] = OrdinalEncoder(categories=[['Unknown','Low','Medium']]).fit_transform(X[['Salary']])

# Target encoding with no smoothing (smooth=0). Calling fit() and then
# transform() separately, rather than fit_transform(), skips the internal
# cross-fitting, so each category maps to the plain mean of y for that category.
target_encoder = TargetEncoder(smooth=0)

target_encoder.fit(X[['Salary']], y)
X['Salary_TargetEncoder'] = target_encoder.transform(X[['Salary']])

# Build the combined frame as a NEW object. `df = X` followed by `df['y'] = y`
# makes df an alias of X, which silently adds the target column to the
# feature matrix; assign() returns a copy and leaves X untouched.
df = X.assign(y=y)
df

Unnamed: 0,Color,Salary,Salary_TargetEncoder,y
0,Red,Low,0.0,0
1,Missing,Unknown,0.5,0
2,Green,Unknown,0.5,1
3,Blue,Medium,0.5,1
4,Red,Medium,0.5,0


In [165]:
# Sanity check: mean of the target within each Salary category
# (these are the values the un-smoothed target encoder should reproduce)
salary_col = X['Salary']
for level in salary_col.unique():
    in_level = salary_col == level
    print(f"{level}: {y[in_level].mean()}")

Low: 0.0
Unknown: 0.5
Medium: 0.5


In [49]:
# Encode the nominal feature `Color` with one-hot encoding.

encoder = OneHotEncoder(sparse_output=False)

# Fit and transform the Color column (one indicator column per category)
color_encoded = encoder.fit_transform(df[['Color']])

# Wrap the encoded array in a DataFrame that shares df's index, so the concat
# below aligns row-for-row even when df does not have a default RangeIndex
color_encoded_df = pd.DataFrame(
    color_encoded,
    columns=encoder.get_feature_names_out(['Color']),
    index=df.index,
)

# Drop indicator columns left over from a previous run of this cell before
# concatenating, so re-running the cell does not append duplicate columns
df = pd.concat(
    [df.drop(columns=color_encoded_df.columns, errors='ignore'), color_encoded_df],
    axis=1,
)

In [50]:
# Display the frame, including the one-hot Color_* columns appended above
df

Unnamed: 0,Color,Salary,y,Salary_LabelEncoder,Salary_OrdinalEncoder,Salary_TargetEncoder,Color_Blue,Color_Green,Color_Red
0,Red,Low,0,1,0.0,0.5,0.0,0.0,1.0
1,Blue,Medium,0,2,1.0,0.5,1.0,0.0,0.0
2,Green,High,1,0,2.0,0.5,0.0,1.0,0.0
3,Blue,Medium,1,2,1.0,0.0,1.0,0.0,0.0
4,Red,Medium,0,2,1.0,0.0,0.0,0.0,1.0
