# Encoding Categorical Data

# One Hot Encoding

In [2]:
#Using one-hot encoding on the titanic dataset

import matplotlib.pyplot as plt
import seaborn as sns


plt.rcParams["figure.figsize"] = [8,6]
sns.set_style("darkgrid")


titanic_data = sns.load_dataset("titanic")

titanic_data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
#Filtering for only the categorical columns

titanic_data = titanic_data[["sex","class","embark_town"]]
titanic_data.head()

Unnamed: 0,sex,class,embark_town
0,male,Third,Southampton
1,female,First,Cherbourg
2,female,Third,Southampton
3,female,First,Southampton
4,male,Third,Southampton


In [4]:
#Printing the unique values in the three columns


print(titanic_data["sex"].unique())
print(titanic_data["class"].unique())
print(titanic_data["embark_town"].unique())

['male' 'female']
[Third, First, Second]
Categories (3, object): [Third, First, Second]
['Southampton' 'Cherbourg' 'Queenstown' nan]


In [5]:
# Using the get_dummies method from pandas to convert to a one hot encoded column

import pandas as pd

temp = pd.get_dummies(titanic_data["sex"])

temp.head()

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [6]:
#Combining the original sex columns with the one hot encoded one

pd.concat([titanic_data["sex"],pd.get_dummies(titanic_data["sex"])],axis=1).head()

Unnamed: 0,sex,female,male
0,male,0,1
1,female,1,0
2,female,1,0
3,female,1,0
4,male,0,1


In [7]:
#Converting the embark_town column into a one hot encoded column

temp = pd.get_dummies(titanic_data["embark_town"])
temp.head()

Unnamed: 0,Cherbourg,Queenstown,Southampton
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [8]:
# Dropping the first column created by get_dummies() by passing True for drop_first

temp = pd.get_dummies(titanic_data["embark_town"],drop_first=True)
temp.head()

Unnamed: 0,Queenstown,Southampton
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1


In [9]:
# Creating a one hot encoded column for null values passing True as a value for dummy_na 

temp = pd.get_dummies(titanic_data["embark_town"],dummy_na=True,drop_first=True)

temp.head()

Unnamed: 0,Queenstown,Southampton,NaN
0,0,1,0
1,0,0,0
2,0,1,0
3,0,1,0
4,0,1,0


# Label Encoding

In [10]:
#Performing label/integer encoding on the class column of the titanic dataset

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

le.fit(titanic_data["class"])

titanic_data["le_class"] = le.transform(titanic_data["class"])

titanic_data.head()

Unnamed: 0,sex,class,embark_town,le_class
0,male,Third,Southampton,2
1,female,First,Cherbourg,0
2,female,Third,Southampton,2
3,female,First,Southampton,0
4,male,Third,Southampton,2


# Frequency Encoding

In [11]:
# Applying Frequency Encoding to the embark_town column of the titanic dataset. If the column contains some null values they will
# have to be removed first

titanic_data.dropna(inplace = True)

In [12]:
value_counts = titanic_data["embark_town"].value_counts().to_dict()
print(value_counts)

{'Southampton': 644, 'Cherbourg': 168, 'Queenstown': 77}


In [13]:
titanic_data["embark_town"] = titanic_data["embark_town"].map(value_counts)
titanic_data.head()

Unnamed: 0,sex,class,embark_town,le_class
0,male,Third,644,2
1,female,First,168,0
2,female,Third,644,2
3,female,First,644,0
4,male,Third,644,2


In [14]:
#adding percentage frequency by dividing the label count by the total number of rows follows
frequency_count = (titanic_data["embark_town"].value_counts()/len(titanic_data)).to_dict()
print(frequency_count)

{644: 0.7244094488188977, 168: 0.1889763779527559, 77: 0.08661417322834646}


In [15]:
titanic_data["embark_town"] = titanic_data["embark_town"].map(frequency_count)

titanic_data.head()

Unnamed: 0,sex,class,embark_town,le_class
0,male,Third,0.724409,2
1,female,First,0.188976,0
2,female,Third,0.724409,2
3,female,First,0.724409,0
4,male,Third,0.724409,2


# Ordinal Encoding

In [16]:
#Applying ordinal encoding on the class column of the titanic dataset

titanic_data = sns.load_dataset("titanic")

titanic_data = titanic_data[["sex","class","embark_town","survived"]]

titanic_data.groupby(["class"])["survived"].mean().sort_values()

class
Third     0.242363
Second    0.472826
First     0.629630
Name: survived, dtype: float64

In [19]:
ordered_cats = titanic_data.groupby(["class"])["survived"].mean().sort_values().index

cat_map = {k:i for i,k in enumerate(ordered_cats,0)}

titanic_data["class_ordered"] = titanic_data["class"].map(cat_map)

titanic_data.head()

Unnamed: 0,sex,class,embark_town,survived,class_ordered
0,male,Third,Southampton,0,0
1,female,First,Cherbourg,1,2
2,female,Third,Southampton,1,0
3,female,First,Southampton,1,2
4,male,Third,Southampton,0,0


# Mean Encoding

In [20]:
#Applying mean encoding to the titanic dataset


titanic_data.groupby(["class"])["survived"].mean()

class
First     0.629630
Second    0.472826
Third     0.242363
Name: survived, dtype: float64

In [21]:
mean_labels = titanic_data.groupby(["class"])["survived"].mean().to_dict()

titanic_data["class_mean"] = titanic_data["class"].map(mean_labels)

titanic_data.head()

Unnamed: 0,sex,class,embark_town,survived,class_ordered,class_mean
0,male,Third,Southampton,0,0,0.242363
1,female,First,Cherbourg,1,2,0.62963
2,female,Third,Southampton,1,0,0.242363
3,female,First,Southampton,1,2,0.62963
4,male,Third,Southampton,0,0,0.242363
