In [71]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder,TargetEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer


In [72]:
# Toy dataset, one tuple per row: (Color, Salary, y).
# Empty strings stand in for missing values; `y` is the binary target.
df = pd.DataFrame(
    [
        ('Red',   'Low',    0),
        ('',      '',       0),
        ('Green', '',       1),
        ('Blue',  'Medium', 1),
        ('Red',   'Medium', 0),
    ],
    columns=['Color', 'Salary', 'y'],
)

df

Unnamed: 0,Color,Salary,y
0,Red,Low,0
1,,,0
2,Green,,1
3,Blue,Medium,1
4,Red,Medium,0


In [73]:
# Count missing values per column. Empty strings are not NaN, so they are not counted here.
df.isnull().sum()

Color     0
Salary    0
y         0
dtype: int64

In [74]:
# Replace '' => np.nan so that pandas recognises the empty strings as missing.
# `np.nan` is the canonical spelling and works on NumPy 1.x and 2.x; the
# alternative alias spellings were dropped from the main namespace in NumPy 2.0.
# Reassigning (instead of inplace=True) keeps the cell chainable and explicit.
df = df.replace('', np.nan)
df

Unnamed: 0,Color,Salary,y
0,Red,Low,0
1,,,0
2,Green,,1
3,Blue,Medium,1
4,Red,Medium,0


## Deal with NaN values

1. Drop rows that contain NaN
2. Drop columns that contain NaN
3. Impute (fill) with the mean, the mode, or a placeholder such as 'Missing'

In [75]:
# Feature matrix: every column of df except the target `y`.
X = df.drop('y', axis='columns')
X

Unnamed: 0,Color,Salary
0,Red,Low
1,,
2,Green,
3,Blue,Medium
4,Red,Medium


In [76]:
# Option 1 (not applied here): drop every row that has at least one NaN -> X.dropna(axis=0, how='any')

In [78]:
# Impute each categorical column with its own placeholder label.
X = X.fillna({'Color': 'Missing', 'Salary': 'NA'})
X

Unnamed: 0,Color,Salary
0,Red,Low
1,Missing,
2,Green,
3,Blue,Medium
4,Red,Medium


In [48]:
# Encode the ordinal feature `Salary` three ways and store each result next to it.
#
# NOTE(review): this cell encodes `df`, not the imputed `X`, and the output shown
# below comes from a frame without missing values. Confirm that `Salary` holds no
# NaN when this cell runs: the explicit category list passed to OrdinalEncoder
# contains no missing-value marker.

# LabelEncoder numbers the classes in sorted (alphabetical) order, so the codes
# do not follow Low < Medium < High. It is meant for target labels and is shown
# here only for comparison.
df['Salary_LabelEncoder'] = LabelEncoder().fit_transform(df['Salary'])

# OrdinalEncoder with an explicit category list preserves the real order:
# Low -> 0.0, Medium -> 1.0, High -> 2.0.
df['Salary_OrdinalEncoder'] = OrdinalEncoder(categories=[['Low','Medium','High']]).fit_transform(df[['Salary']])

# TargetEncoder replaces each category with a smoothed mean of the target `y`
# for that category (shrunk toward the overall mean of `y`).
# `fit_transform` uses cross fitting: the rows are split into `cv` folds and each
# row is encoded with statistics learned from the other folds, so a row never
# sees its own target value. The folds are shuffled by default, so `random_state`
# is fixed here to make the resulting column reproducible across runs.
df['Salary_TargetEncoder'] = TargetEncoder(cv=2, random_state=42).fit_transform(df[['Salary']], df['y'])

df

Unnamed: 0,Color,Salary,y,Salary_LabelEncoder,Salary_OrdinalEncoder,Salary_TargetEncoder
0,Red,Low,0,1,0.0,0.5
1,Blue,Medium,0,2,1.0,0.5
2,Green,High,1,0,2.0,0.5
3,Blue,Medium,1,2,1.0,0.0
4,Red,Medium,0,2,1.0,0.0


In [49]:
# Encode the nominal feature `Color` as one-hot (dummy) columns.

encoder = OneHotEncoder(sparse_output=False)

# Fit and transform the Color column.
# NOTE: despite the `salary_` prefix (kept so existing references keep working),
# these variables hold the one-hot encoding of `Color`.
salary_encoded = encoder.fit_transform(df[['Color']])

# Wrap the encoded array in a DataFrame that shares df's index, so the concat
# below lines up row-for-row even when df does not have a default RangeIndex.
salary_encoded_df = pd.DataFrame(
    salary_encoded,
    columns=encoder.get_feature_names_out(['Color']),
    index=df.index,
)

# Append the one-hot columns to the original DataFrame. Columns left over from a
# previous run of this cell are dropped first, so re-running does not duplicate them.
df = pd.concat(
    [df.drop(columns=salary_encoded_df.columns, errors='ignore'), salary_encoded_df],
    axis=1,
)

In [50]:
# Show the resulting frame: original columns, the three Salary encodings, and the one-hot Color columns.
df

Unnamed: 0,Color,Salary,y,Salary_LabelEncoder,Salary_OrdinalEncoder,Salary_TargetEncoder,Color_Blue,Color_Green,Color_Red
0,Red,Low,0,1,0.0,0.5,0.0,0.0,1.0
1,Blue,Medium,0,2,1.0,0.5,1.0,0.0,0.0
2,Green,High,1,0,2.0,0.5,0.0,1.0,0.0
3,Blue,Medium,1,2,1.0,0.0,1.0,0.0,0.0
4,Red,Medium,0,2,1.0,0.0,0.0,0.0,1.0
