In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# One Hot Encoding

One-hot encoding creates *m* binary features, where *m* is the number of distinct categories of the original feature.

In [2]:
# Load the dataset and list the columns shown in the preview tables below.
# "Lgnd_Label" and "Gen_Label" are created by a later cell; this preview only
# uses the first three names.
df = pd.read_csv("../../datasets/pokemon.csv")
features_to_vis = ["Name", "Generation", "Legendary", "Lgnd_Label", "Gen_Label"]

df.iloc[158:165][features_to_vis[:3]]

Unnamed: 0,Name,Generation,Legendary
158,Moltres,Gen 1,True
159,Dratini,Gen 1,False
160,Dragonair,Gen 1,False
161,Dragonite,Gen 1,False
162,Mewtwo,Gen 1,True
163,MewtwoMega Mewtwo X,Gen 1,True
164,MewtwoMega Mewtwo Y,Gen 1,True


In [66]:
# Encode each generation value as an integer label
gen_le = LabelEncoder()
gen_labels = gen_le.fit_transform(df["Generation"])

# Encode the legendary flag as an integer label
lgnd_le = LabelEncoder()
lgnd_labels = lgnd_le.fit_transform(df["Legendary"])

# Store both label columns on the frame (generation first, then legendary)
df["Gen_Label"] = gen_labels
df["Lgnd_Label"] = lgnd_labels

df.iloc[164:170][features_to_vis]

Unnamed: 0,Name,Generation,Legendary,Lgnd_Label,Gen_Label
164,MewtwoMega Mewtwo Y,Gen 1,True,1,0
165,Mew,Gen 1,False,0,0
166,Chikorita,Gen 2,False,0,1
167,Bayleef,Gen 2,False,0,1
168,Meganium,Gen 2,False,0,1
169,Cyndaquil,Gen 2,False,0,1


In [8]:
# One-hot encode the integer generation labels and densify the result so it
# can be wrapped in a DataFrame.
gen_ohe = OneHotEncoder()
gen_encoded = gen_ohe.fit_transform(df[["Gen_Label"]])
gen_feature_arr = gen_encoded.toarray()

# Name the one-hot columns after the classes learned by the label encoder
gen_feature_labels = [*gen_le.classes_]
gen_features = pd.DataFrame(data=gen_feature_arr, columns=gen_feature_labels)
gen_features.head()

Unnamed: 0,Gen 1,Gen 2,Gen 3,Gen 4,Gen 5,Gen 6
0,1.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# One-hot encode the raw "Legendary" column and densify the result
lgnd_ohe = OneHotEncoder()
lgnd_encoded = lgnd_ohe.fit_transform(df[["Legendary"]])
lgnd_feature_arr = lgnd_encoded.toarray()

# Column names are "Legendary_<class>" for each class known to the label encoder
lgnd_feature_labels = ["Legendary_" + cls_name for cls_name in map(str, lgnd_le.classes_)]
lgnd_features = pd.DataFrame(data=lgnd_feature_arr, columns=lgnd_feature_labels)
lgnd_features.head()

Unnamed: 0,Legendary_False,Legendary_True
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0


In [9]:
# Put the original columns side by side with their one-hot encodings
df_sub = df.loc[:, ["Name", "Generation", "Gen_Label", "Legendary", "Lgnd_Label"]]
df_ohe = pd.concat([df_sub, gen_features, lgnd_features], axis="columns")

# Display order: each source column group followed by its one-hot columns
columns = [
    "Name", "Generation", "Gen_Label", *gen_feature_labels,
    "Legendary", "Lgnd_Label", *lgnd_feature_labels,
]
df_ohe.iloc[164:170][columns]

Unnamed: 0,Name,Generation,Gen_Label,Gen 1,Gen 2,Gen 3,Gen 4,Gen 5,Gen 6,Legendary,Lgnd_Label,Legendary_False,Legendary_True
164,MewtwoMega Mewtwo Y,Gen 1,0,1.0,0.0,0.0,0.0,0.0,0.0,True,1,0.0,1.0
165,Mew,Gen 1,0,1.0,0.0,0.0,0.0,0.0,0.0,False,0,1.0,0.0
166,Chikorita,Gen 2,1,0.0,1.0,0.0,0.0,0.0,0.0,False,0,1.0,0.0
167,Bayleef,Gen 2,1,0.0,1.0,0.0,0.0,0.0,0.0,False,0,1.0,0.0
168,Meganium,Gen 2,1,0.0,1.0,0.0,0.0,0.0,0.0,False,0,1.0,0.0
169,Cyndaquil,Gen 2,1,0.0,1.0,0.0,0.0,0.0,0.0,False,0,1.0,0.0


In [32]:
# Same encoding as above, but for both features at once with a single encoder.
# The encoder emits the "Gen_Label" categories first and then the "Lgnd_Label"
# categories. Integer label i stands for classes_[i] of the matching label
# encoder, so the column names are built from gen_le / lgnd_le rather than a
# hard-coded list that could drift out of sync with the data.
encoder = OneHotEncoder()
both_features = encoder.fit_transform(df[["Gen_Label", "Lgnd_Label"]]).toarray()
labels_both_feat = (
    [str(cls_label) for cls_label in gen_le.classes_]
    + ["Legendary_" + str(cls_label) for cls_label in lgnd_le.classes_]
)
pd.DataFrame(both_features, columns=labels_both_feat).head()

Unnamed: 0,Gen 1,Gen 2,Gen 3,Gen 4,Gen 5,Gen 6,Legendary_False,Legendary_True
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [59]:
# Mock data (test data)
new_df = pd.DataFrame(
    {
        "Name": ["PikaZoom", "CharMyToast"],
        "Generation": ["Gen 3", "Gen 4"],
        "Legendary": [True, False],
    }
)

# Reuse the already-fitted label encoders: transform only, no refit
new_gen_labels = gen_le.transform(new_df["Generation"])
new_lgnd_labels = lgnd_le.transform(new_df["Legendary"])
new_df["Gen_Label"] = new_gen_labels
new_df["Lgnd_Label"] = new_lgnd_labels

new_df

Unnamed: 0,Name,Generation,Legendary,Gen_Label,Lgnd_Label
0,PikaZoom,Gen 3,True,2,1
1,CharMyToast,Gen 4,False,3,0


In [60]:
# One-hot encode the mock generation labels with the already-fitted encoder
new_gen_encoded = gen_ohe.transform(new_df[["Gen_Label"]])
new_gen_feature_arr = new_gen_encoded.toarray()
new_gen_features = pd.DataFrame(data=new_gen_feature_arr, columns=gen_feature_labels)
new_gen_features

Unnamed: 0,Gen 1,Gen 2,Gen 3,Gen 4,Gen 5,Gen 6
0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0


In [61]:
# One-hot encode the mock "Legendary" values with the already-fitted encoder
new_lgnd_encoded = lgnd_ohe.transform(new_df[["Legendary"]])
new_lgnd_feature_arr = new_lgnd_encoded.toarray()
new_lgnd_features = pd.DataFrame(data=new_lgnd_feature_arr, columns=lgnd_feature_labels)
new_lgnd_features

Unnamed: 0,Legendary_False,Legendary_True
0,0.0,1.0
1,1.0,0.0


In [62]:
# Attach the one-hot columns to the mock rows.
# This cell rebinds new_df, so only its base columns are selected first:
# re-running the cell then rebuilds the same frame instead of appending a
# second copy of the one-hot columns.
new_df_base_columns = ["Name", "Generation", "Legendary", "Gen_Label", "Lgnd_Label"]
new_df = pd.concat(
    [new_df[new_df_base_columns], new_lgnd_features, new_gen_features],
    axis=1,
)

In [67]:
# Show the mock rows using the same column order as the training preview
new_df.loc[:, columns]

Unnamed: 0,Name,Generation,Gen_Label,Gen 1,Gen 2,Gen 3,Gen 4,Gen 5,Gen 6,Legendary,Lgnd_Label,Legendary_False,Legendary_True
0,PikaZoom,Gen 3,2,0.0,0.0,1.0,0.0,0.0,0.0,True,1,0.0,1.0
1,CharMyToast,Gen 4,3,0.0,0.0,0.0,1.0,0.0,0.0,False,0,1.0,0.0


## Observations

We use **fit_transform** on the training data, and only **transform** on the test data, so that the test data is encoded with the categories learned from the training data.