In [137]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
%matplotlib inline

# Load and inspect data

In [2]:
df = pd.read_csv("data/titanic.csv")

In [3]:
df.shape

(891, 12)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
df.nunique()

PassengerId    891
Survived         2
Pclass           3
Name           891
Sex              2
Age             88
SibSp            7
Parch            7
Ticket         681
Fare           248
Cabin          147
Embarked         3
dtype: int64

- `PassengerID` and `Name` are unique columns
- `Ticket` has high cardinality 

In [8]:
df["Ticket"].value_counts().head(20)

CA. 2343        7
1601            7
347082          7
347088          6
3101295         6
CA 2144         6
S.O.C. 14879    5
382652          5
113781          4
LINE            4
W./C. 6608      4
19950           4
349909          4
4133            4
113760          4
PC 17757        4
347077          4
17421           4
2666            4
110413          3
Name: Ticket, dtype: int64

# Data Cleaning and Preprocessing

In [9]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [10]:
df["Age"].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [11]:
df[df["Age"].isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S


- interpolate (linear regression)
- try to see if there is kids fare
- try to understand age from name
- group by gender and take the mean from each gender
- take the mean

In [12]:
df["Age"].mean()

29.69911764705882

In [15]:
df.loc[df["Age"].isna(), "Age"] = 29.69

In [16]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [17]:
df["Cabin"].value_counts()

G6             4
C23 C25 C27    4
B96 B98        4
C22 C26        3
D              3
              ..
C45            1
B41            1
T              1
E63            1
E38            1
Name: Cabin, Length: 147, dtype: int64

In [19]:
df["Cabin"].isna().value_counts(normalize=True)

True     0.771044
False    0.228956
Name: Cabin, dtype: float64

In [24]:
df = df.assign(MissingCabin=df["Cabin"].isna().astype(int))

In [25]:
df = df.drop(columns="Cabin")

In [26]:
df.isna().sum()

PassengerId     0
Survived        0
Pclass          0
Name            0
Sex             0
Age             0
SibSp           0
Parch           0
Ticket          0
Fare            0
Embarked        2
MissingCabin    0
dtype: int64

In [28]:
df["Embarked"].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [30]:
df.loc[df["Embarked"].isna(), "Embarked"] = "S"

In [31]:
df.replace("", np.nan).isna().sum()

PassengerId     0
Survived        0
Pclass          0
Name            0
Sex             0
Age             0
SibSp           0
Parch           0
Ticket          0
Fare            0
Embarked        0
MissingCabin    0
dtype: int64

In [32]:
df.replace(" ", np.nan).isna().sum()

PassengerId     0
Survived        0
Pclass          0
Name            0
Sex             0
Age             0
SibSp           0
Parch           0
Ticket          0
Fare            0
Embarked        0
MissingCabin    0
dtype: int64

In [33]:
df = df.drop(columns="Ticket")

In [35]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,MissingCabin
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,1


In [39]:
df = df.assign(Female=(df["Sex"] == "female").astype(int))

In [40]:
df = df.drop(columns="Sex")

In [41]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Fare,Embarked,MissingCabin,Female
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,7.25,S,1,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,71.2833,C,0,1
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,7.925,S,1,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,53.1,S,0,1
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,8.05,S,1,0


In [44]:
df = df.merge(pd.get_dummies(df["Embarked"], drop_first=True, prefix="Embarked"), 
         left_index=True, 
         right_index=True)
         

In [45]:
df = df.drop(columns="Embarked")

In [46]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Fare,MissingCabin,Female,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,7.25,1,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,71.2833,0,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,7.925,1,1,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,53.1,0,1,0,1
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,8.05,1,0,0,1


# 2. Feature Engingeering and extraction

In [47]:
#calculate family size
df = df.assign(FamilySize=df["SibSp"] + df["Parch"] +1)

In [49]:
df["Name"].head(20)

0                               Braund, Mr. Owen Harris
1     Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                Heikkinen, Miss. Laina
3          Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                              Allen, Mr. William Henry
5                                      Moran, Mr. James
6                               McCarthy, Mr. Timothy J
7                        Palsson, Master. Gosta Leonard
8     Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)
9                   Nasser, Mrs. Nicholas (Adele Achem)
10                      Sandstrom, Miss. Marguerite Rut
11                             Bonnell, Miss. Elizabeth
12                       Saundercock, Mr. William Henry
13                          Andersson, Mr. Anders Johan
14                 Vestrom, Miss. Hulda Amanda Adolfina
15                     Hewlett, Mrs. (Mary D Kingcome) 
16                                 Rice, Master. Eugene
17                         Williams, Mr. Charles

In [52]:
#extract ttile from Name using regex
import re

title_pattern = r",\s(.*)\."
titles = df["Name"].str.extract(title_pattern)[0]
titles.value_counts()

Mr                          517
Miss                        182
Mrs                         124
Master                       40
Dr                            7
Rev                           6
Col                           2
Major                         2
Mlle                          2
Don                           1
Mme                           1
Capt                          1
Ms                            1
Mrs. Martin (Elizabeth L      1
Jonkheer                      1
the Countess                  1
Sir                           1
Lady                          1
Name: 0, dtype: int64

In [55]:
titles_to_keep = titles.value_counts()[titles.value_counts() > 100].index.to_list()

In [58]:
# ~ indicates that we want the opposite
titles[~titles.isin(titles_to_keep)] = "Other"

In [59]:
titles.value_counts()

Mr       517
Miss     182
Mrs      124
Other     68
Name: 0, dtype: int64

In [60]:
df = df.assign(Title=titles)
df = df.drop(columns="Name")

In [61]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,MissingCabin,Female,Embarked_Q,Embarked_S,FamilySize,Title
0,1,0,3,22.0,1,0,7.25,1,0,0,1,2,Mr
1,2,1,1,38.0,1,0,71.2833,0,1,0,0,2,Mrs
2,3,1,3,26.0,0,0,7.925,1,1,0,1,1,Miss
3,4,1,1,35.0,1,0,53.1,0,1,0,1,2,Mrs
4,5,0,3,35.0,0,0,8.05,1,0,0,1,1,Mr


In [63]:
df = df.merge(pd.get_dummies(df["Title"], prefix="Title", drop_first=True),
              left_index=True,
              right_index=True)

In [64]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,MissingCabin,Female,Embarked_Q,Embarked_S,FamilySize,Title,Title_Mr,Title_Mrs,Title_Other
0,1,0,3,22.0,1,0,7.25,1,0,0,1,2,Mr,1,0,0
1,2,1,1,38.0,1,0,71.2833,0,1,0,0,2,Mrs,0,1,0
2,3,1,3,26.0,0,0,7.925,1,1,0,1,1,Miss,0,0,0
3,4,1,1,35.0,1,0,53.1,0,1,0,1,2,Mrs,0,1,0
4,5,0,3,35.0,0,0,8.05,1,0,0,1,1,Mr,1,0,0


In [65]:
df = df.drop(columns="Title")

In [66]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,MissingCabin,Female,Embarked_Q,Embarked_S,FamilySize,Title_Mr,Title_Mrs,Title_Other
0,1,0,3,22.0,1,0,7.25,1,0,0,1,2,1,0,0
1,2,1,1,38.0,1,0,71.2833,0,1,0,0,2,0,1,0
2,3,1,3,26.0,0,0,7.925,1,1,0,1,1,0,0,0
3,4,1,1,35.0,1,0,53.1,0,1,0,1,2,0,1,0
4,5,0,3,35.0,0,0,8.05,1,0,0,1,1,1,0,0


# 3. Split into train and test

In [67]:
from sklearn.model_selection import train_test_split

In [68]:
train, test = train_test_split(df, test_size=.15, random_state=42)

In [69]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare',
       'MissingCabin', 'Female', 'Embarked_Q', 'Embarked_S', 'FamilySize',
       'Title_Mr', 'Title_Mrs', 'Title_Other'],
      dtype='object')

In [70]:
id_col = ["PassengerId"]
target_col = ["Survived"]
num_cols = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize']
bool_cols = ['MissingCabin', 'Female', 'Embarked_Q', 'Embarked_S', 'Title_Mr', 'Title_Mrs', 'Title_Other']

In [73]:
df["Survived"].value_counts(normalize=True)

0    0.616162
1    0.383838
Name: Survived, dtype: float64

In [74]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score

In [75]:
train_X = train[num_cols + bool_cols]
train_y = train[target_col]
test_X = test[num_cols + bool_cols]
test_y = test[target_col]

In [83]:
#exploring 1D issue
train_y["Survived"]

599    1
830    1
306    1
231    0
845    0
      ..
106    1
270    0
860    0
435    1
102    0
Name: Survived, Length: 757, dtype: int64

In [86]:
#exploring 1D issue

type(train_y.values.ravel())

numpy.ndarray

In [77]:
lr = LogisticRegression()
lr.fit(train_X, train_y.values.ravel())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [78]:
predictions = lr.predict(test_X)

In [79]:
balanced_accuracy_score(test_y, predictions)

0.7852564102564102

# 4. Cross Validation

In [80]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate

In [81]:
models = {"Logistic Regression": LogisticRegression(),
          "Decision Tree": DecisionTreeClassifier(random_state=42),
          "Random Forest": RandomForestClassifier(n_estimators=100, n_jobs=-1)}

In [104]:
cross_validate(models["Logistic Regression"],
               train_X,
               train_y.values.ravel(), 
               cv=5,
               scoring="balanced_accuracy")

{'fit_time': array([0.0509119 , 0.05768108, 0.0430572 , 0.04010177, 0.040694  ]),
 'score_time': array([0.00498796, 0.00351596, 0.00290895, 0.00279903, 0.00284696]),
 'test_score': array([0.8294204 , 0.78070175, 0.82745427, 0.77295633, 0.80113848])}

In [105]:
cross_validate(models["Logistic Regression"],
               train_X,
               train_y.values.ravel(), 
               cv=5,
               scoring="balanced_accuracy")["test_score"].mean()

0.8023342472101016

In [95]:
# wrap into af unction that takes a model and computes the avg balanced accuracy score
def validate_model(model):
    validation_results = cross_validate(model,
                                        train_X,
                                        train_y.values.ravel(), 
                                        cv=5,
                                        scoring="balanced_accuracy")
    acc = validation_results["test_score"].mean()
    print(f"Mean Balanced Accuracy Score: {acc}")

In [119]:
def validate_model(model, X):
    validation_results = cross_validate(model,
                                        X,
                                        train_y.values.ravel(), 
                                        cv=5,
                                        scoring="balanced_accuracy")
    acc = validation_results["test_score"].mean()
    print(f"Mean Balanced Accuracy Score: {acc}")

In [96]:
validate_model(models["Logistic Regression"])

Mean Balanced Accuracy Score: 0.8023342472101016


In [99]:
for key, value in models.items():
    print(f"Model: {key}")
    validate_model(value)
    print("-------------\n")

Model: Logistic Regression
Mean Balanced Accuracy Score: 0.8023342472101016
-------------

Model: Decision Tree
Mean Balanced Accuracy Score: 0.7299338404705822
-------------

Model: Random Forest
Mean Balanced Accuracy Score: 0.7918538826891146
-------------



In [120]:
def evaluate_models(models, X):
    for key, value in models.items():
        print(f"Model: {key}")
        validate_model(value, X)
        print("-------------\n")

# Feature Selection
## Recursive Feature Elimination - RFE

In [106]:
train_X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,FamilySize,MissingCabin,Female,Embarked_Q,Embarked_S,Title_Mr,Title_Mrs,Title_Other
599,1,49.0,1,0,56.9292,2,0,0,0,0,0,0,1
830,3,15.0,1,0,14.4542,2,1,1,0,0,0,1,0
306,1,29.69,0,0,110.8833,1,1,1,0,0,0,0,0
231,3,29.0,0,0,7.775,1,1,0,0,1,1,0,0
845,3,42.0,0,0,7.55,1,1,0,0,1,1,0,0


In [107]:
#includes cross validation
from sklearn.feature_selection import RFECV

In [110]:
selector_rf = RFECV(models["Random Forest"], 
                    cv=5,
                    scoring="balanced_accuracy")

In [111]:
selector_rf.fit(train_X, train_y)

RFECV(cv=5, estimator=RandomForestClassifier(n_jobs=-1),
      scoring='balanced_accuracy')

In [112]:
selector_rf.get_support()

array([ True,  True, False, False,  True,  True, False,  True, False,
       False,  True, False, False])

In [113]:
train_X.columns

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'MissingCabin',
       'Female', 'Embarked_Q', 'Embarked_S', 'Title_Mr', 'Title_Mrs',
       'Title_Other'],
      dtype='object')

In [116]:
train_X2 = train_X[train_X.columns[selector_rf.get_support()]]

In [122]:
evaluate_models(models, train_X2)

Model: Logistic Regression
Mean Balanced Accuracy Score: 0.7879561339151252
-------------

Model: Decision Tree
Mean Balanced Accuracy Score: 0.7669360672407357
-------------

Model: Random Forest
Mean Balanced Accuracy Score: 0.7888315248870524
-------------



# Hyperparameter Tuning

In [123]:
models

{'Logistic Regression': LogisticRegression(),
 'Decision Tree': DecisionTreeClassifier(random_state=42),
 'Random Forest': RandomForestClassifier(n_jobs=-1)}

In [125]:
models["Random Forest"].get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [129]:
models["Random Forest"] = RandomForestClassifier(n_estimators=500, n_jobs=-1)

In [130]:
selector_rf2 = RFECV(models["Random Forest"], 
                     cv=5,
                    scoring="balanced_accuracy")
selector_rf2.fit(train_X, train_y)
train_X3 = train_X[train_X.columns[selector_rf2.get_support()]]

In [133]:
evaluate_models(models, train_X3)

Model: Logistic Regression
Mean Balanced Accuracy Score: 0.7854473491137969
-------------

Model: Decision Tree
Mean Balanced Accuracy Score: 0.7504000463374136
-------------

Model: Random Forest
Mean Balanced Accuracy Score: 0.7880995224672098
-------------



### Identify optimal hyperparameters using GridSearch (+CV)

In [134]:
from sklearn.model_selection import GridSearchCV

In [135]:
param_grid_rf = {"n_estimators": [100, 500, 1000],
                 "criterion": ["gini", "entropy"],
                 "bootstrap": [True, False],
                 "max_depth": [3, 10, None]}
tune_rf = GridSearchCV(RandomForestClassifier(), 
                       param_grid=param_grid_rf, 
                       cv=5, 
                       scoring="balanced_accuracy")

tune_rf.fit(train_X2, train_y)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'bootstrap': [True, False],
                         'criterion': ['gini', 'entropy'],
                         'max_depth': [3, 10, None],
                         'n_estimators': [100, 500, 1000]},
             scoring='balanced_accuracy')

In [141]:
tune_rf.best_params_

{'bootstrap': True,
 'criterion': 'entropy',
 'max_depth': 3,
 'n_estimators': 1000}

In [139]:
validate_model(RandomForestClassifier(**tune_rf.best_params_), train_X2)

Mean Balanced Accuracy Score: 0.7982514062117878


# Final validation on the test set

In [143]:
rf_model = RandomForestClassifier(**tune_rf.best_params_)
rf_model.fit(train_X2, train_y)

test_X2 = test_X[test_X.columns[selector_rf.get_support()]]
rf_pred = rf_model.predict(test_X2)

In [144]:
balanced_accuracy_score(test_y, rf_pred)

0.8019688644688645