# Data Encoding 
1. Nominal/OHE Encoding -> Converts categorical values into multiple binary columns (0/1), creating one column per unique category.
2. Label and Ordinal Encoding -> Assigns each category an integer value; Label Encoding is arbitrary, whereas Ordinal Encoding follows a meaningful order.
3. Target Guided Ordinal Encoding -> Encodes categories based on their relationship with the target variable (e.g., mean target value per category).

In [1]:
import pandas as pd 
from sklearn.preprocessing import OneHotEncoder

In [54]:
#Create a simple dataframe
df = pd.DataFrame({
    'color' : ['red', 'blue', 'green', 'green', 'red', 'blue']
})

In [8]:
df.head()

Unnamed: 0,color
0,red
1,blue
2,green
3,green
4,red


In [15]:
#Create an instance of OneHotEncoder
encoder = OneHotEncoder() 


In [16]:
##Perform fit and transform 
encoded = encoder.fit_transform(df[['color']]).toarray()

In [17]:
import pandas as pd
encoder_df = pd.DataFrame(encoded, columns = encoder.get_feature_names_out())

In [18]:
encoder_df

Unnamed: 0,color_blue,color_green,color_red
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,0.0
4,0.0,0.0,1.0
5,1.0,0.0,0.0


In [21]:
##for new data
encoder.transform([['blue']]).toarray()



array([[1., 0., 0.]])

In [22]:
pd.concat([df,encoder_df],axis = 1)

Unnamed: 0,color,color_blue,color_green,color_red
0,red,0.0,0.0,1.0
1,blue,1.0,0.0,0.0
2,green,0.0,1.0,0.0
3,green,0.0,1.0,0.0
4,red,0.0,0.0,1.0
5,blue,1.0,0.0,0.0


In [48]:
import seaborn as sns
df1 = sns.load_dataset('tips')

In [49]:
df1.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [40]:
encoder = OneHotEncoder(sparse_output=False)

In [50]:
encoded = encoder.fit_transform(df1[['sex', 'smoker', 'day', 'time']])

In [51]:
import pandas as pd
encoder_df = pd.DataFrame(encoded, columns = encoder.get_feature_names_out(['sex','smoker','day','time']))

In [43]:
encoder_df

Unnamed: 0,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
239,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
240,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
241,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
242,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [52]:
pd.concat([df1,encoder_df],axis = 1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,16.99,1.01,Female,No,Sun,Dinner,2,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,10.34,1.66,Male,No,Sun,Dinner,3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,21.01,3.50,Male,No,Sun,Dinner,3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,23.68,3.31,Male,No,Sun,Dinner,2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,24.59,3.61,Female,No,Sun,Dinner,4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
240,27.18,2.00,Female,Yes,Sat,Dinner,2,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
241,22.67,2.00,Male,Yes,Sat,Dinner,2,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
242,17.82,1.75,Male,No,Sat,Dinner,2,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


# Label Encoding

In [55]:
df.head()

Unnamed: 0,color
0,red
1,blue
2,green
3,green
4,red


In [58]:
from sklearn.preprocessing import LabelEncoder
lbl_encoder = LabelEncoder()

In [59]:
lbl_encoder.fit_transform(df[['color']])

  y = column_or_1d(y, warn=True)


array([2, 0, 1, 1, 2, 0])

In [60]:
lbl_encoder.transform([['red']])

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


array([2])

In [61]:
lbl_encoder.transform([['blue']])

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


array([0])

# Ordinal Encoding 


In [62]:
from sklearn.preprocessing import OrdinalEncoder

In [63]:
#Create a sample dataframe with an ordinal variable 
df = pd.DataFrame({
    'size' : ['small', 'medium', 'large', 'medium', 'small', 'large']
})


In [64]:
df

Unnamed: 0,size
0,small
1,medium
2,large
3,medium
4,small
5,large


In [65]:
##Create an instance of ordinal encoder and then fit_transform 
encoder = OrdinalEncoder(categories = [['small','medium','large']])

In [66]:
encoder.fit_transform(df[['size']])

array([[0.],
       [1.],
       [2.],
       [1.],
       [0.],
       [2.]])

In [67]:
encoder.transform([['small']])



array([[0.]])

# Target Guided Ordinal Encoding 

In [90]:
import pandas as pd 

df = pd.DataFrame({
    'city' : ['New York', 'London', 'Paris', 'Tokyo', 'New York', 'Paris'],
    'price' : [200,150,300,250,180,320]
})

In [91]:
df

Unnamed: 0,city,price
0,New York,200
1,London,150
2,Paris,300
3,Tokyo,250
4,New York,180
5,Paris,320


In [92]:
mean_price = df.groupby('city')['price'].mean().to_dict()

In [93]:
mean_price

{'London': 150.0, 'New York': 190.0, 'Paris': 310.0, 'Tokyo': 250.0}

In [94]:
df['city_encoded'] = df['city'].map(mean_price)

In [97]:
df[['price','city_encoded']]

Unnamed: 0,price,city_encoded
0,200,190.0
1,150,150.0
2,300,310.0
3,250,250.0
4,180,190.0
5,320,310.0


In [101]:
import seaborn as sns 
df = sns.load_dataset('tips')

In [102]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [103]:
mean_price = df.groupby('time')['total_bill'].mean().to_dict()

  mean_price = df.groupby('time')['total_bill'].mean().to_dict()


In [104]:
mean_price

{'Lunch': 17.168676470588235, 'Dinner': 20.79715909090909}

In [105]:
df['time_encoded'] = df['time'].map(mean_price)

In [107]:
df[['total_bill','time_encoded']]

Unnamed: 0,total_bill,time_encoded
0,16.99,20.797159
1,10.34,20.797159
2,21.01,20.797159
3,23.68,20.797159
4,24.59,20.797159
...,...,...
239,29.03,20.797159
240,27.18,20.797159
241,22.67,20.797159
242,17.82,20.797159
