In [1]:
from kaggle_titanic_utls import *
import warnings
warnings.filterwarnings('ignore')

In [2]:
PATH_DATA = Path('.')

Let's import the data and do some preprocessing

In [3]:
# Import data
data = pd.read_csv(PATH_DATA / "train.csv")
test = pd.read_csv(PATH_DATA / 'test.csv')

In [4]:
data.shape, test.shape

((891, 12), (418, 11))

In [5]:
data.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [6]:
test.head(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [8]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [9]:
data['Survived'].sum()/len(data)

0.3838383838383838

The classes are sufficiently balanced - it makes sense to consider accuracy as performance metrics.

In [10]:
#data.dropna(subset=['Embarked'], inplace=True)

In [11]:
labels = data[['Survived']]
data.drop(['Cabin', 'Ticket', 'PassengerId', 'Survived'], axis=1, inplace=True)

test_ids = test.PassengerId
test.drop(['Cabin', 'Ticket', 'PassengerId'], axis=1, inplace=True)

In [12]:
data.corr().style.background_gradient(cmap ='cool')\
        .set_properties(**{'font-size': '15px'}) 

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
Pclass,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,-0.5495,0.096067,0.159651,0.216225,1.0


We can extract the title of each passenger from the name as the former likely bears more significance than the latter

In [13]:
title = get_title(data)
test_title = get_title(test)

In [14]:
title.value_counts()

Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Major             2
Mlle              2
Col               2
Jonkheer          1
the Countess      1
Sir               1
Ms                1
Capt              1
Mme               1
Don               1
Lady              1
Name: Name, dtype: int64

In [15]:
data['Title'] = title
data.drop('Name', axis=1, inplace=True)

test['Title'] = test_title
test.drop('Name', axis=1, inplace=True)

In [16]:
labels.shape, data.shape

((891, 1), (891, 8))

In [17]:
data.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title'], dtype='object')

## Preprocessing

Here we impute numerical and categorical features with median and mode, respectively, and then we transform categorical features using one-hot encoding

In [18]:
scaler = StandardScaler()

In [19]:
data_num, data_cat = get_features_by_type(data)

In [20]:
data_, value_num, value_cat = preprocessing(data_num, data_cat, scaler=scaler)

In [21]:
test_ = test_preprocessing(test, value_num, value_cat, scaler=scaler)

In [22]:
data_aligned, test_aligned = data_.align(test_, join='left', axis=1, fill_value=0)
test_aligned.shape

(418, 27)

## Baseline models

We look at various baseline models to check which ones give the best score in and out of sample

In [23]:
models = {'knn': KNeighborsClassifier(), 'LR': LogisticRegression(), 
         'tree': DecisionTreeClassifier(), 'GNB': GaussianNB(),
         'perceptron': Perceptron(), 'SVC': SVC()}

for i, j in models.items():
    scores = model_trial(data_, labels, j)
    print(color.BOLD + color.RED + color.UNDERLINE + f'{i} scores' + color.END + f': {scores}\n\n')

[1m[91m[4mknn scores[0m: {'train_score': 0.8772455089820359, 'test_score': 0.8071748878923767}


[1m[91m[4mLR scores[0m: {'train_score': 0.8308383233532934, 'test_score': 0.820627802690583}


[1m[91m[4mtree scores[0m: {'train_score': 0.9865269461077845, 'test_score': 0.7847533632286996}


[1m[91m[4mGNB scores[0m: {'train_score': 0.7694610778443114, 'test_score': 0.7443946188340808}


[1m[91m[4mperceptron scores[0m: {'train_score': 0.7754491017964071, 'test_score': 0.8026905829596412}


[1m[91m[4mSVC scores[0m: {'train_score': 0.844311377245509, 'test_score': 0.8295964125560538}




Perceptron and Naive Bayes have a poor accuracy, whereas Decision Trees are clearly overfitting to the training set. SVC, Logistic Regression and KNN perform reasonably well in and out of sample (random guessing would yield around $62 \%$ accuracy).

## Grid Search

We can now take the best models and preprocessing strategies and grid search the best parameters using cross validation.

In [24]:
data_num, data_cat = get_features_by_type(data)
data_, value_num, value_cat = preprocessing(data_num, data_cat)

grid search: knn

In [25]:
param_grid = {'weights': ['uniform', 'distance'], 
               'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], 
               'n_neighbors': np.linspace(1, 15, 15).astype(int)}

In [26]:
gscv = GridSearchCV(models['knn'], cv=4, n_jobs=-3, param_grid=param_grid, scoring='accuracy', verbose=1)

In [27]:
%%time
gscv.fit(data_, labels)

Fitting 4 folds for each of 120 candidates, totalling 480 fits
CPU times: user 445 ms, sys: 156 ms, total: 602 ms
Wall time: 3.4 s


GridSearchCV(cv=4, estimator=KNeighborsClassifier(), n_jobs=-3,
             param_grid={'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                         'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15]),
                         'weights': ['uniform', 'distance']},
             scoring='accuracy', verbose=1)

In [28]:
gscv.best_estimator_, gscv.best_params_

(KNeighborsClassifier(algorithm='ball_tree', n_neighbors=12),
 {'algorithm': 'ball_tree', 'n_neighbors': 12, 'weights': 'uniform'})

In [29]:
model = gscv.best_estimator_
name_model = 'knn_BE'
try_model(data_, labels, model, name_model)

model: knn_BE
train score: 0.8488
test score: 0.7982


In [30]:
create_subfile_titanic(test_aligned, test_ids, model, name_model, PATH_DATA)

0

In [None]:
!kaggle competitions submit -c titanic -f "submission_{name_model}_titanic.csv" -m 'submission of {model}'

grid search: lr

In [31]:
param_grid = [{'penalty': ['l1'], 'solver': ['liblinear', 'saga'], 
                'C': np.logspace(-2, 2, 5, 10)},
               {'penalty': ['l2'], 'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
                'C': np.logspace(-2, 2, 5, 10)},
               {'penalty': ['elasticnet'], 'solver': ['saga'], 
                'l1_ratio': np.arange(0.1, 1, 0.1), 'C': np.logspace(-2, 2, 5, 10)},
                {'penalty': [None], 'solver': ['newton-cg', 'lbfgs', 'sag', 'saga']}]

In [32]:
gscv = GridSearchCV(models['LR'], cv=4, n_jobs=-3, param_grid=param_grid, scoring='accuracy', verbose=1)

In [33]:
%%time
gscv.fit(data_, labels)

Fitting 4 folds for each of 79 candidates, totalling 316 fits
CPU times: user 322 ms, sys: 33.6 ms, total: 355 ms
Wall time: 1.99 s


GridSearchCV(cv=4, estimator=LogisticRegression(), n_jobs=-3,
             param_grid=[{'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]),
                          'penalty': ['l1'], 'solver': ['liblinear', 'saga']},
                         {'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]),
                          'penalty': ['l2'],
                          'solver': ['newton-cg', 'lbfgs', 'sag', 'saga']},
                         {'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]),
                          'l1_ratio': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                          'penalty': ['elasticnet'], 'solver': ['saga']},
                         {'penalty': [None],
                          'solver': ['newton-cg', 'lbfgs', 'sag', 'saga']}],
             scoring='accuracy', verbose=1)

In [34]:
gscv.best_estimator_, gscv.best_params_

(LogisticRegression(l1_ratio=0.4, penalty='elasticnet', solver='saga'),
 {'C': 1.0, 'l1_ratio': 0.4, 'penalty': 'elasticnet', 'solver': 'saga'})

In [35]:
model = gscv.best_estimator_
name_model = 'lr_BE'
try_model(data_, labels, model, name_model)

LogisticRegression(l1_ratio=0.4, penalty='elasticnet', solver='saga')
model: lr_BE
train score: 0.8413
test score: 0.8072


In [36]:
create_subfile_titanic(test_aligned, test_ids, model, name_model, PATH_DATA)

0

In [None]:
!kaggle competitions submit -c titanic -f "submission_{name_model}_titanic.csv" -m 'submission of {model}'

grid search: svc

In [37]:
param_grid = {'C': np.logspace(-2, 2, 5, 10),
               'kernel': ['linear', 'rbf', 'poly'],
              'degree': np.linspace(2, 5, 5).astype(int)}

In [38]:
gscv = GridSearchCV(models['SVC'], cv=4, n_jobs=-3, param_grid=param_grid, scoring='accuracy', verbose=1)

In [39]:
models['SVC'].get_params().keys()

dict_keys(['C', 'break_ties', 'cache_size', 'class_weight', 'coef0', 'decision_function_shape', 'degree', 'gamma', 'kernel', 'max_iter', 'probability', 'random_state', 'shrinking', 'tol', 'verbose'])

In [40]:
%%time
gscv.fit(data_, labels)

Fitting 4 folds for each of 75 candidates, totalling 300 fits
CPU times: user 277 ms, sys: 52.1 ms, total: 329 ms
Wall time: 4.06 s


GridSearchCV(cv=4, estimator=SVC(), n_jobs=-3,
             param_grid={'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]),
                         'degree': array([2, 2, 3, 4, 5]),
                         'kernel': ['linear', 'rbf', 'poly']},
             scoring='accuracy', verbose=1)

In [41]:
gscv.best_estimator_, gscv.best_params_

(SVC(degree=2, kernel='poly'), {'C': 1.0, 'degree': 2, 'kernel': 'poly'})

In [43]:
model = gscv.best_estimator_
name_model = 'svc_BE'
try_model(data_, labels, model, name_model)

SVC(degree=2, kernel='poly')
model: svc_BE
train score: 0.8458
test score: 0.8117


Support Vector Classifier with a quadratic polynomial kernel and strength of the regularization equal to $C=1$ yields the best accuracy after grid-searching parameters. 

In [44]:
create_subfile_titanic(test_aligned, test_ids, model, name_model, PATH_DATA)

0

In [None]:
!kaggle competitions submit -c titanic -f "submission_{name_model}_titanic.csv" -m 'submission of {model}'

## Ensemble Methods

Let's now use some ensemble methods on this dataset. We will first try with hard and soft voting classifiers, and then with Random Forest and Extra Trees

In [45]:
data_num, data_cat = get_features_by_type(data)

In [46]:
data_, value_num, value_cat = preprocessing(data_num, data_cat, scaler=None)

Random Forest

In [55]:
model = RandomForestClassifier(random_state=10)
name_model = 'rf_titanic'
save_model = 'no'
try_model(data_, labels, model, name_model, dump_model=save_model)

model: rf_titanic
train score: 0.8772
test score: 0.8206


In [50]:
create_subfile_titanic(test_aligned, test_ids, model, name_model, PATH_DATA)

0

In [51]:
!kaggle competitions submit -c titanic -f \
"submission_{name_model}_titanic.csv" -m \
'submission of {name_model}'

100%|████████████████████████████████████████| 2.77k/2.77k [00:02<00:00, 972B/s]
Successfully submitted to Titanic - Machine Learning from Disaster

Extra Trees

In [56]:
model = ExtraTreesClassifier(random_state=10)
name_model = 'ext_titanic'
save_model = 'no'
try_model(data_, labels, model, name_model, dump_model=save_model)

ExtraTreesClassifier(random_state=10)
model: ext_titanic
train score: 0.9850
test score: 0.8072


In [54]:
create_subfile_titanic(test_aligned, test_ids, model, name_model)

0

### Grid Search

## grid search parameters for ensemble methods?

## Hard and Soft Voting Classifiers

The dataset seems to be very simple, so it make sense to train several weak learners and combine them using soft or hard voting classifiers. Let's see how these two strategies perform.

In [58]:
model = VotingClassifier(estimators=[('svc', SVC(kernel='poly')), \
('lr', LogisticRegression()), ('dt', DecisionTreeClassifier()),\
('gnb', GaussianNB()), ('knn', KNeighborsClassifier())], \
voting='hard', n_jobs=-3, verbose=True)

name_model = 'hv_clf_titanic'
try_model(data_, labels, model, name_model, dump_model='no')

VotingClassifier(estimators=[('svc', SVC(kernel='poly')),
                             ('lr', LogisticRegression()),
                             ('dt', DecisionTreeClassifier()),
                             ('gnb', GaussianNB()),
                             ('knn', KNeighborsClassifier())],
                 n_jobs=-3, verbose=True)
model: hv_clf_titanic
train score: 0.8787
test score: 0.7758


In [80]:
model = VotingClassifier(estimators=[('svc', SVC(probability=True)), \
('lr', LogisticRegression()), ('dt', DecisionTreeClassifier()),\
('gnb', GaussianNB()), ('knn', KNeighborsClassifier())], \
voting='soft', n_jobs=-3, verbose=True)

name_model = 'sv_clf_titanic'
try_model(data_, labels, model, name_model, dump_model='yes')

VotingClassifier(estimators=[('svc', SVC(probability=True)),
                             ('lr', LogisticRegression()),
                             ('dt', DecisionTreeClassifier()),
                             ('gnb', GaussianNB()),
                             ('knn', KNeighborsClassifier())],
                 n_jobs=-3, verbose=True, voting='soft')
model: sv_clf_titanic
train score: 0.9042
test score: 0.8027


In [81]:
create_subfile_titanic(test_aligned, test_ids, model, name_model)

0

In [82]:
!kaggle competitions submit -c titanic -f \
"submission_{name_model}_titanic.csv" -m \
'submission of {name_model}'

100%|██████████████████████████████████████| 2.77k/2.77k [00:02<00:00, 1.02kB/s]
Successfully submitted to Titanic - Machine Learning from Disaster

# Not in the final notebook

## Grid Search

## More complex models

Let us now try some other more complex approaches, i. e., ensemble methods and dimensionality reduction.

In [6]:
data_num, data_cat = get_features_by_type(data)
data_num.fillna(data_num.median(), inplace=True)
data_cat.fillna(data_cat.mode(), inplace=True)

NameError: name 'get_features_by_type' is not defined

In [None]:
data_cat_dummy = pd.get_dummies(data_cat)

In [None]:
data_ = pd.concat([data_num, data_cat_dummy], axis=1, join='inner')

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=0)

X_train, X_test, y_train, y_test = train_test_split(data_, labels,
                        random_state=0, test_size=0.3, stratify=labels)

rf.fit(X_train, y_train)
rf.score(X_train, y_train)

### Grid Search

In [None]:
param_grid = {'n_estimators': np.linspace(5, 50, 5).astype(int), 
             'criterion': ['gini', 'entropy'],
             'max_depth': np.linspace(2, 10, 2),
             'max_features': ['auto', 'sqrt', 'log2']}

In [None]:
gscv = GridSearchCV(rf, cv=5, n_jobs=-3, param_grid=param_grid, scoring='accuracy', verbose=1)

In [None]:
%%time
gscv.fit(data_, labels)

In [None]:
gscv.best_estimator_

In [None]:
best_rf = gscv.best_estimator_
best_rf.fit(data_, labels)
best_rf.score(data_, labels)

## Submission

In [None]:
test = pd.read_csv(path / 'test.csv')
test.head()

In [None]:
test, test_ids = prepare_test_data(test)

In [None]:
#test['LogFare'] = np.log1p(test.Fare)
#test.drop('Fare', axis=1, inplace=True)

In [None]:
#test_ = test_preprocessing(test, value_num, value_cat, scaler=scaler)

In [None]:
test_num, test_cat = get_features_by_type(test)

In [None]:
test_num.fillna(value_num, inplace=True)
test_cat.fillna(value_cat, inplace=True)

In [None]:
test_cat_dummy = pd.get_dummies(test_cat)

In [None]:
test_ = pd.concat([test_num, test_cat_dummy], axis=1, join='inner')

In [None]:
data_aligned, test_aligned = data_.align(test_, join='left', axis=1, fill_value=0)

In [None]:
test_predict_rf = best_rf.predict(test_aligned)

In [None]:
submission_rf = np.c_[test_ids.values, test_predict_rf]
pd.DataFrame(submission_rf).to_csv(path / "submission_base_rf.csv", header=['PassengerId', 'Survived'], index=False)

In [None]:
!kaggle competitions submit -c titanic -f "submission_base_rf.csv" -m 'submission of base model'
#!kaggle competitions submit -c titanic -f "submission_base_LR.csv" -m 'submission of base model'
#!kaggle competitions submit -c titanic -f "submission_base_knn.csv" -m 'submission of base model'

## Various imputations

We can also look at the impact that different imputation strategies bear on the accuracy of the models. In the following we have imputed missing values with mean, median and with zeros, and have assessed the effect of these on the three best performing models we have previously selected.

In [None]:
data_num, data_cat = get_features_by_type(data)

In [None]:
imputation_dict = {'median': data_num.median(), 'mean': data_num.mean(), 'zero': 0}

models = {'LR': LogisticRegression(), 'SVC': SVC(), 'knn': KNeighborsClassifier()}

for key, value in imputation_dict.items():
    data_, value_num, value_cat = preprocessing(data_num, data_cat, imputer_num=key)
    for k, v in models.items():
        scores = model_trial(data_, labels, model=v)
        print(color.BOLD + color.RED + color.UNDERLINE + f'{k, key} scores' + color.END + f': {scores}\n\n')

The imputation strategies considered bear no effect on the accuracy of any of the models. We will however keep the median as default imputation method as it is more robust than the mean and contains more information about the data than the zeros.