# Data Encoding
    1. Nominal / One-Hot Encoding
    2. Label and Ordinal Encoding
    3. Target Guided Ordinal Encoding

## 1. Nominal / One-Hot Encoding
*eg:*
        1. Red: [1,0,0]
        2. Green: [0,1,0]
        3. Blue: [0,0,1]

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Toy single-column frame of colour labels used to demonstrate one-hot encoding.
df = pd.DataFrame(
    data={
        'color': [
            'red', 'blue', 'green', 'green', 'red', 'blue',
            'green', 'red', 'blue', 'green', 'red', 'blue',
            'green', 'red', 'blue', 'green', 'red', 'blue',
        ],
    }
)

In [3]:
df

Unnamed: 0,color
0,red
1,blue
2,green
3,green
4,red
5,blue
6,green
7,red
8,blue
9,green


In [4]:
# Instantiate a OneHotEncoder with its default settings (not fitted yet).
encoder = OneHotEncoder()

In [5]:
# Fit the encoder on the `color` column (kept 2-D via the double brackets) and
# expand the transformed result into a dense NumPy array.
encoded = encoder.fit_transform(df[['color']]).toarray()

In [6]:
encoded

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [7]:
# Wrap the dense one-hot array in a DataFrame, labelling the columns with the
# names generated by the fitted encoder (color_blue, color_green, color_red).
# pandas is already imported in the notebook's first cell, so it is not
# re-imported here.
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out())

In [8]:
encoded_df

Unnamed: 0,color_blue,color_green,color_red
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,0.0
4,0.0,0.0,1.0
5,1.0,0.0,0.0
6,0.0,1.0,0.0
7,0.0,0.0,1.0
8,1.0,0.0,0.0
9,0.0,1.0,0.0


In [9]:
# Encode a single new observation. The encoder was fitted on a DataFrame with
# a `color` column, so the new value is passed as a DataFrame with the same
# column name; a bare nested list makes scikit-learn warn that X has no valid
# feature names. The encoded result is unchanged.
encoder.transform(pd.DataFrame({'color': ['red']})).toarray()



array([[0., 0., 1.]])

In [10]:
# Show the original labels next to their one-hot columns (joined column-wise on the index).
pd.concat([df, encoded_df], axis='columns')

Unnamed: 0,color,color_blue,color_green,color_red
0,red,0.0,0.0,1.0
1,blue,1.0,0.0,0.0
2,green,0.0,1.0,0.0
3,green,0.0,1.0,0.0
4,red,0.0,0.0,1.0
5,blue,1.0,0.0,0.0
6,green,0.0,1.0,0.0
7,red,0.0,0.0,1.0
8,blue,1.0,0.0,0.0
9,green,0.0,1.0,0.0


In [11]:
import seaborn as sns
# NOTE: this rebinds `df`. From this cell onward `df` is the seaborn "tips"
# dataset, no longer the colour frame created at the top of the notebook.
df=sns.load_dataset('tips')
# Preview the first rows of the dataset.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [12]:
# Re-fit the same encoder on the `sex` column (this replaces its earlier fit on
# `color`) and convert the result to a dense array.
sex_encoded = encoder.fit_transform(df[['sex']]).toarray()

In [13]:
# Dense one-hot array -> DataFrame, with column names supplied by the fitted encoder.
sex_encoded_df1 = pd.DataFrame(
    sex_encoded,
    columns=encoder.get_feature_names_out(),
)

In [14]:
sex_encoded_df1

Unnamed: 0,sex_Female,sex_Male
0,1.0,0.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,1.0,0.0
...,...,...
239,0.0,1.0
240,1.0,0.0
241,0.0,1.0
242,0.0,1.0


In [15]:
# Encode several categorical columns in a single call; the encoder is re-fitted
# on these three columns and the result is converted to a dense array.
smoker_day_time = encoder.fit_transform(df[['smoker', 'day', 'time']]).toarray()

In [16]:
smoker_day_time

array([[1., 0., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 1., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 1., 1., 0.]])

In [17]:
# Dense one-hot array -> DataFrame, with column names supplied by the fitted encoder.
smoker_day_time_df2 = pd.DataFrame(
    smoker_day_time,
    columns=encoder.get_feature_names_out(),
)

In [18]:
smoker_day_time_df2

Unnamed: 0,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...
239,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
240,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
241,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
242,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [19]:
# Append both sets of one-hot columns to the original tips frame (column-wise, joined on the index).
pd.concat(
    [df, sex_encoded_df1, smoker_day_time_df2],
    axis='columns',
)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,16.99,1.01,Female,No,Sun,Dinner,2,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,10.34,1.66,Male,No,Sun,Dinner,3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,21.01,3.50,Male,No,Sun,Dinner,3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,23.68,3.31,Male,No,Sun,Dinner,2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,24.59,3.61,Female,No,Sun,Dinner,4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
240,27.18,2.00,Female,Yes,Sat,Dinner,2,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
241,22.67,2.00,Male,Yes,Sat,Dinner,2,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
242,17.82,1.75,Male,No,Sat,Dinner,2,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


## 2.1 Label Encoding
*eg:*

        1. Red: 1
        2. Green: 2
        3. Blue: 3

In [20]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [22]:
# Colour labels used for the label-encoding demonstration.
df2 = pd.DataFrame(
    data={
        'color': [
            'red', 'blue', 'green', 'green', 'red', 'blue',
            'green', 'red', 'blue', 'green', 'red', 'blue',
            'green', 'red', 'blue', 'green', 'red', 'blue',
        ],
    }
)

In [23]:
df2

Unnamed: 0,color
0,red
1,blue
2,green
3,green
4,red
5,blue
6,green
7,red
8,blue
9,green


In [24]:
from sklearn.preprocessing import LabelEncoder

In [25]:
# Instantiate the label encoder (not fitted yet).
labelencode = LabelEncoder()

In [27]:
# LabelEncoder works on a 1-D array of labels, so select the column as a Series
# (`df2['color']`). Passing the 2-D frame `df2[['color']]` triggers a
# DataConversionWarning because scikit-learn has to flatten it first.
# The encoded values are the same either way.
label_encode = labelencode.fit_transform(df2['color'])

  y = column_or_1d(y, warn=True)


In [31]:
# Put the integer codes into a one-column DataFrame named `colors`.
label_encode_df = pd.DataFrame(label_encode, columns=['colors'])

In [32]:
label_encode_df

Unnamed: 0,colors
0,2
1,0
2,1
3,1
4,2
5,0
6,1
7,2
8,0
9,1


In [34]:
# Show each colour label next to its integer code (joined column-wise on the index).
pd.concat([df2, label_encode_df], axis='columns')

Unnamed: 0,color,colors
0,red,2
1,blue,0
2,green,1
3,green,1
4,red,2
5,blue,0
6,green,1
7,red,2
8,blue,0
9,green,1


## 2.2 Ordinal Encoding: 
- **Assigns each category an explicit rank so that the encoded values can be compared; Label Encoding assigns integers without any such meaningful order.**
  
      1. High School: 1
      2. College: 2
      3. Graduate: 3
      4. Post Graduate: 4
  

In [36]:
from sklearn.preprocessing import OrdinalEncoder

In [38]:
# Small frame of ordered size labels for the ordinal-encoding demonstration.
df3 = pd.DataFrame(
    data={
        'size': ['small', 'medium', 'large', 'medium', 'small', 'large'],
    }
)

In [39]:
df3

Unnamed: 0,size
0,small
1,medium
2,large
3,medium
4,small
5,large


In [42]:
# Spell out the category order explicitly so the codes follow
# small -> 0, medium -> 1, large -> 2.
ord_encode = OrdinalEncoder(categories=[['small', 'medium', 'large']])

In [44]:
# Fit on the `size` column (kept 2-D via the double brackets) and show the ordinal codes.
ord_encode.fit_transform(df3[['size']])

array([[0.],
       [1.],
       [2.],
       [1.],
       [0.],
       [2.]])

In [46]:
# Encode a single new observation. The encoder was fitted on a DataFrame with
# a `size` column, so the new value is passed as a DataFrame with the same
# column name; a bare nested list makes scikit-learn warn that X has no valid
# feature names. The encoded result is unchanged.
ord_encode.transform(pd.DataFrame({'size': ['medium']}))



array([[1.]])

## 3. Target Guided Ordinal Encoding

In [47]:
# City / price pairs: `city` is the categorical feature, `price` the target.
df4 = pd.DataFrame(
    data={
        'city': ['New York', 'London', 'Paris', 'Tokyo', 'New York', 'Paris'],
        'price': [200, 150, 300, 250, 180, 320],
    }
)

In [48]:
df4

Unnamed: 0,city,price
0,New York,200
1,London,150
2,Paris,300
3,Tokyo,250
4,New York,180
5,Paris,320


In [52]:
# Mean price per city, stored as a {city: mean price} lookup dictionary.
mean_price = (
    df4.groupby('city')['price']
    .mean()
    .to_dict()
)

In [53]:
mean_price

{'London': 150.0, 'New York': 190.0, 'Paris': 310.0, 'Tokyo': 250.0}

In [55]:
# Replace every city label with that city's mean price (adds a column to df4).
df4['city_encoded'] = df4['city'].map(mean_price)

In [56]:
df4

Unnamed: 0,city,price,city_encoded
0,New York,200,190.0
1,London,150,150.0
2,Paris,300,310.0
3,Tokyo,250,250.0
4,New York,180,190.0
5,Paris,320,310.0


In [59]:
# Target and encoded feature side by side: each city is now represented by its mean price.
df4[['price', 'city_encoded']]

Unnamed: 0,price,city_encoded
0,200,190.0
1,150,150.0
2,300,310.0
3,250,250.0
4,180,190.0
5,320,310.0


In [61]:
# Load the tips dataset again under its own name for the target-guided example.
# seaborn is already imported as `sns` earlier in the notebook, so it is not
# re-imported here.
df5 = sns.load_dataset('tips')

In [62]:
df5

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [65]:
# Mean total bill per meal time, stored as a {time: mean bill} lookup dictionary.
# `time` is a categorical column in the tips data, so `observed=` is passed
# explicitly: recent pandas versions warn when it is omitted because the
# default is changing. Both categories occur in the data, so the result is the
# same as before.
mean_bill = df5.groupby('time', observed=True)['total_bill'].mean().to_dict()

In [66]:
mean_bill

{'Lunch': 17.168676470588235, 'Dinner': 20.79715909090909}

In [67]:
# Replace every meal-time label with that meal time's mean total bill.
# The source column is read from `df5` itself. The original read it from `df`,
# a different frame that only worked because it held the same dataset with the
# same index.
df5['time_encoded'] = df5['time'].map(mean_bill)

In [68]:
df5

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,time_encoded
0,16.99,1.01,Female,No,Sun,Dinner,2,20.797159
1,10.34,1.66,Male,No,Sun,Dinner,3,20.797159
2,21.01,3.50,Male,No,Sun,Dinner,3,20.797159
3,23.68,3.31,Male,No,Sun,Dinner,2,20.797159
4,24.59,3.61,Female,No,Sun,Dinner,4,20.797159
...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,20.797159
240,27.18,2.00,Female,Yes,Sat,Dinner,2,20.797159
241,22.67,2.00,Male,Yes,Sat,Dinner,2,20.797159
242,17.82,1.75,Male,No,Sat,Dinner,2,20.797159


In [70]:
# Target and encoded feature side by side: each meal time is now represented by its mean total bill.
df5[['total_bill', 'time_encoded']]

Unnamed: 0,total_bill,time_encoded
0,16.99,20.797159
1,10.34,20.797159
2,21.01,20.797159
3,23.68,20.797159
4,24.59,20.797159
...,...,...
239,29.03,20.797159
240,27.18,20.797159
241,22.67,20.797159
242,17.82,20.797159
