In [1]:
# Strings are incompatible with sklearn, so they need to be encoded as numbers.

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

# The machine learning models.
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# To evaluate the models.
from sklearn.metrics import roc_auc_score

# To separate data into train and test.
from sklearn.model_selection import train_test_split

In [3]:
# Load the Titanic passenger data from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="titanic.csv")

# Preview the first rows as the cell output.
data.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22,S,,,"Montreal, PQ / Chesterville, ON"


In [4]:
# Summarise column dtypes and non-null counts to see which columns have missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   object 
 8   fare       1308 non-null   float64
 9   cabin      295 non-null    object 
 10  embarked   1307 non-null   object 
 11  boat       486 non-null    object 
 12  body       121 non-null    float64
 13  home.dest  745 non-null    object 
dtypes: float64(3), int64(4), object(7)
memory usage: 143.3+ KB


In [6]:
# Report how many distinct values each categorical column holds, next to the
# total number of rows for comparison.
categorical_columns = [
    ("Name", "name"),
    ("Sex", "sex"),
    ("Ticket", "ticket"),
    ("Cabin", "cabin"),
    ("Embarked", "embarked"),
]

for label, column in categorical_columns:
    print(f"The number of categories in the variable {label} is {data[column].nunique()}")

print(f"The number of passengers is {len(data)}")

The number of categories in the variable Name is 1307
The number of categories in the variable Sex is 2
The number of categories in the variable Ticket is 929
The number of categories in the variable Cabin is 181
The number of categories in the variable Embarked is 3
The number of passengers is 1309


In [7]:
# Some of the variables have high cardinality

In [8]:
# Reduce cardinality by keeping only the first character of each cabin value
# (e.g. 'C22' -> 'C').
data['cabin_reduced'] = data['cabin'].str.get(0)

In [9]:
# Show the original and reduced cabin columns side by side for the first rows.
data.loc[:, ['cabin', 'cabin_reduced']].head()

Unnamed: 0,cabin,cabin_reduced
0,B5,B
1,C22,C
2,C22,C
3,C22,C
4,C22,C


In [19]:
# Compare the number of distinct values before and after the reduction.
cabin_levels = data['cabin'].nunique()
cabin_reduced_levels = data['cabin_reduced'].nunique()

print(f"The number of categories in the variable Cabin is {cabin_levels}")
print(f"The number of categories in the variable Cabin reduced is {cabin_reduced_levels}")

The number of categories in the variable Cabin is 181
The number of categories in the variable Cabin reduced is 8


In [11]:
# Predictors: the raw cabin, its first-character reduction, and sex.
feature_columns = ['cabin', 'cabin_reduced', 'sex']
X = data.loc[:, feature_columns]

# Target: the survived column.
y = data.loc[:, 'survived']

In [32]:
# Hold out 33% of the rows for testing; the fixed random_state keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

In [33]:
# Display (rows, columns) of the training and testing feature frames.
X_train.shape, X_test.shape

((877, 3), (432, 3))

# Uneven distribution of categories

In [34]:
# When a variable has high cardinality, some categories appear only in the training set, and others only in the testing set.
# If present only in the training set, they may cause over-fitting. If present only in the testing set,
# the machine learning model will not know how to handle them, as they were not seen during training.

In [35]:
# Labels only present in the training set

# Compute the test-set labels once instead of re-running `.unique()` for every
# training label inside the comprehension.
test_cabin_labels = X_test.cabin.unique()

# NOTE(review): `.unique()` keeps NaN (missing cabin), and NaN never compares
# equal to itself, so the missing-value marker looks to be counted as a "label"
# here -- confirm, and consider `.dropna()` on both sides before comparing.
unique_to_train_set = [x for x in X_train.cabin.unique() if x not in test_cabin_labels]

In [36]:
# Number of cabin values found in the training set but not in the testing set.
len(unique_to_train_set)

107

In [37]:
# Labels unique to the test set only

# Compute the training-set labels once instead of re-running `.unique()` for
# every test label inside the comprehension.
train_cabin_labels = X_train.cabin.unique()

# NOTE(review): `.unique()` keeps NaN (missing cabin), and NaN never compares
# equal to itself, so the missing-value marker looks to be counted as a "label"
# here -- confirm, and consider `.dropna()` on both sides before comparing.
unique_to_test_set = [x for x in X_test.cabin.unique() if x not in train_cabin_labels]

In [38]:
# Number of cabin values found in the testing set but not in the training set.
len(unique_to_test_set)

42

Variables with high cardinality have categories present either only in the training set, or only in the testing set. This will cause problems at the time of training (over-fitting) and scoring of new data (how will the model deal with unseen categories?).

This problem can be mitigated by reducing the cardinality of the variable. Let's do that.

In [39]:
# Look at the cabin_reduced cardinality: values seen in the test set but not
# in the training set.

# Compute the training-set values once instead of on every iteration.
train_reduced_labels = X_train.cabin_reduced.unique()

# NOTE(review): NaN is kept by `.unique()` and never compares equal to itself,
# so a missing value looks to be counted as a "label" here -- confirm, and
# consider `.dropna()` on both sides before comparing.
unique_reduced_test = [x for x in X_test.cabin_reduced.unique()
                       if x not in train_reduced_labels]

In [40]:
# Number of cabin_reduced values found in the testing set but not in the training set.
len(unique_reduced_test)

1

In [41]:
# cabin_reduced values seen in the training set but not in the test set.

# Compute the test-set values once instead of on every iteration.
test_reduced_labels = X_test.cabin_reduced.unique()

# NOTE(review): NaN is kept by `.unique()` and never compares equal to itself,
# so a missing value looks to be counted as a "label" here -- confirm, and
# consider `.dropna()` on both sides before comparing.
unique_reduced_train = [x for x in X_train.cabin_reduced.unique()
                        if x not in test_reduced_labels]

In [42]:
# Number of cabin_reduced values found in the training set but not in the testing set.
len(unique_reduced_train)

2

# The impact of cardinality on the performance of machine learning models

In [43]:
# df.loc[:, ['A', 'B']]  -----> select columns A and B, all rows

In [44]:
# Create a replacement dictionary: each distinct cabin_reduced value in the
# training set gets an integer code, numbered in order of first appearance.
# The lookup is learned from the training set only.
cabin_dict = {label: code for code, label in enumerate(X_train['cabin_reduced'].unique())}

# Replace labels by numbers using the dictionary.
# Build a new frame with `assign` instead of writing through `.loc[:, col] = ...`:
# X_train was derived from another frame by train_test_split, so an in-place
# write can trigger pandas' chained-assignment warning and may leave the integer
# codes stored in an object-dtype column.
X_train = X_train.assign(cabin_reduced=X_train['cabin_reduced'].map(cabin_dict))

In [47]:
# Preview the training rows after cabin_reduced has been replaced by integer codes.
X_train.head()

Unnamed: 0,cabin,cabin_reduced,sex
1048,,0,female
1034,,0,male
568,F,1,male
615,,0,male
840,,0,female


In [48]:
# Encode cabin_reduced in the test set with the lookup learned from the
# training set (cabin_dict). Any value missing from cabin_dict maps to NaN.
# Build a new frame with `assign` instead of writing through `.loc[:, col] = ...`,
# which can trigger pandas' chained-assignment warning on a frame produced by
# train_test_split and may keep the column as object dtype.
X_test = X_test.assign(cabin_reduced=X_test['cabin_reduced'].map(cabin_dict))

In [49]:
# Preview the test rows after cabin_reduced has been replaced by integer codes.
X_test.head()

Unnamed: 0,cabin,cabin_reduced,sex
1139,,0,male
533,,0,female
459,,0,male
1150,,0,male
393,,0,male


In [50]:
# Encode sex as integers. A single shared lookup guarantees both splits use the
# same codes.
sex_codes = {'male': 0, 'female': 1}

# Build new frames with `assign` instead of writing through `.loc[:, col] = ...`,
# which can trigger pandas' chained-assignment warning on frames produced by
# train_test_split and may keep the column as object dtype.
# Note: re-running this cell on already-encoded data would map 0/1 to NaN.
X_train = X_train.assign(sex=X_train['sex'].map(sex_codes))
X_test = X_test.assign(sex=X_test['sex'].map(sex_codes))

In [51]:
# Preview the training rows after sex has been encoded as 0/1.
X_train.head()

Unnamed: 0,cabin,cabin_reduced,sex
1048,,0,1
1034,,0,0
568,F,1,0
615,,0,0
840,,0,1


In [52]:
# Preview the test rows after sex has been encoded as 0/1.
X_test.head()

Unnamed: 0,cabin,cabin_reduced,sex
1139,,0,0
533,,0,1
459,,0,0
1150,,0,0
393,,0,0


In [53]:
# List the column names of the training frame.
X_train.columns

Index(['cabin', 'cabin_reduced', 'sex'], dtype='object')

In [54]:
# Count missing values per column in the training set.
checked_columns = ['cabin', 'cabin_reduced', 'sex']
X_train[checked_columns].isna().sum()

cabin            671
cabin_reduced      0
sex                0
dtype: int64

In [55]:
# Count missing values per column in the test set.
checked_columns = ['cabin', 'cabin_reduced', 'sex']
X_test[checked_columns].isna().sum()

cabin            343
cabin_reduced      0
sex                0
dtype: int64

In [56]:
# Number of distinct raw cabin values in the training set; `.unique()` keeps NaN
# as one of the values.
len(X_train['cabin'].unique())

141

In [57]:
# Number of distinct cabin_reduced codes in the training set after encoding.
X_train['cabin_reduced'].nunique()

9