In [1]:
from Kaggle_Titanic_utls import *
import warnings
warnings.filterwarnings('ignore')

In [2]:
path = Path('.')

In [3]:
# Import data
data = pd.read_csv(path / "train.csv")

In [4]:
data.shape

(891, 12)

In [5]:
data.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [7]:
data['Survived'].sum()/len(data)

0.3838383838383838

In [8]:
#Fare funzione Alessandro

The classes are sufficiently balanced - it makes sense to consider accuracy as performance metrics.

Embarked only has 2 missing values out of 891 - we will get rid of them altogether.

In [9]:
data.dropna(subset=['Embarked'], inplace=True)

In [10]:
labels = data[['Survived']]

data.drop(['Cabin', 'Ticket', 'PassengerId', 'Survived'], axis=1, inplace=True)

In [11]:
data.corr().style.background_gradient(cmap ='cool')\
        .set_properties(**{'font-size': '15px'}) 

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
Pclass,1.0,-0.365902,0.081656,0.016824,-0.548193
Age,-0.365902,1.0,-0.307351,-0.187896,0.093143
SibSp,0.081656,-0.307351,1.0,0.414542,0.160887
Parch,0.016824,-0.187896,0.414542,1.0,0.217532
Fare,-0.548193,0.093143,0.160887,0.217532,1.0


There are no highly correlated features (maybe do feature extraction?).

We can extract the title of each passenger from the name **(which is most likely not useful in the prediction. We can check this though)**.

In [12]:
title = data.Name.apply(lambda x:x.split(',')[1].split('.')[0].strip())

In [13]:
title.value_counts()

Mr              517
Miss            181
Mrs             124
Master           40
Dr                7
Rev               6
Major             2
Col               2
Mlle              2
Sir               1
Capt              1
Mme               1
the Countess      1
Jonkheer          1
Don               1
Ms                1
Lady              1
Name: Name, dtype: int64

In [14]:
data['Title'] = title
data.drop('Name', axis=1, inplace=True)

In [15]:
labels.shape, data.shape

((889, 1), (889, 8))

In [16]:
data.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title'], dtype='object')

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

## Preprocessing

In [17]:
data_num, data_cat = get_features_by_type(data)

In [19]:
data_ = preprocessing(data_num, data_cat)

data_num.fillna(data_num.median(), inplace=True)
data_cat.fillna(data_cat.mode(), inplace=True)

scaler = StandardScaler()

data_num_scaled = scaler.fit_transform(data_num)

data_cat_dummy = pd.get_dummies(data_cat)

data_ = np.c_[data_num_scaled, data_cat_dummy]

X_train, X_test, y_train, y_test = train_test_split(data_, labels,
                            random_state=1, test_size=0.25, stratify=labels)

## Baseline models

We look at various baseline models to check which ones give the best score in and out of sample

In [20]:
models = {'knn': KNeighborsClassifier(), 'LR': LogisticRegression(), 
         'tree': DecisionTreeClassifier(), 'GNB': GaussianNB(),
         'perceptron': Perceptron(), 'SVC': SVC()}

for i, j in models.items():
    scores = model_trial(data_, labels, j)
    print(color.BOLD + color.RED + color.UNDERLINE + f'{i} scores' + color.END + f': {scores}\n\n')

[1m[91m[4mknn scores[0m: {'train_score': 0.8528528528528528, 'test_score': 0.8385650224215246}


[1m[91m[4mLR scores[0m: {'train_score': 0.8273273273273273, 'test_score': 0.8385650224215246}


[1m[91m[4mtree scores[0m: {'train_score': 0.9834834834834835, 'test_score': 0.7757847533632287}


[1m[91m[4mGNB scores[0m: {'train_score': 0.6291291291291291, 'test_score': 0.6322869955156951}


[1m[91m[4mperceptron scores[0m: {'train_score': 0.7672672672672672, 'test_score': 0.7533632286995515}


[1m[91m[4mSVC scores[0m: {'train_score': 0.8378378378378378, 'test_score': 0.8430493273542601}




Perceptron and Naive Bayes have a poor accuracy, whereas Decision Trees are clearly overfitting to the training set. SVC, Logistic Regression and KNN perform reasonably well in and out of sample (random guess would yield around $62 \%$ accuracy).

## Various imputations

We can also look at the impact that different imputation strategies bear on the accuracy of the models. In the following we have imputed missing values with mean, median and with zeros, and have assessed the effect of these on the three best performing models we have previously selected.

In [21]:
data_num, data_cat = get_features_by_type(data)

In [23]:
imputation_dict = {'median': data_num.median(), 'mean': data_num.mean(), 'zero': 0}

models = {'LR': LogisticRegression(), 'SVC': SVC(), 'knn': KNeighborsClassifier()}

for key, value in imputation_dict.items():
    data_ = preprocessing(data_num, data_cat, imputer_num=key)
    for k, v in models.items():
        scores = model_trial(data_, labels, model=v)
        print(color.BOLD + color.RED + color.UNDERLINE + f'{k, key} scores' + color.END + f': {scores}\n\n')

[1m[91m[4m('LR', 'median') scores[0m: {'train_score': 0.8273273273273273, 'test_score': 0.8385650224215246}


[1m[91m[4m('SVC', 'median') scores[0m: {'train_score': 0.8378378378378378, 'test_score': 0.8430493273542601}


[1m[91m[4m('knn', 'median') scores[0m: {'train_score': 0.8528528528528528, 'test_score': 0.8385650224215246}


[1m[91m[4m('LR', 'mean') scores[0m: {'train_score': 0.8273273273273273, 'test_score': 0.8385650224215246}


[1m[91m[4m('SVC', 'mean') scores[0m: {'train_score': 0.8378378378378378, 'test_score': 0.8430493273542601}


[1m[91m[4m('knn', 'mean') scores[0m: {'train_score': 0.8528528528528528, 'test_score': 0.8385650224215246}


[1m[91m[4m('LR', 'zero') scores[0m: {'train_score': 0.8273273273273273, 'test_score': 0.8385650224215246}


[1m[91m[4m('SVC', 'zero') scores[0m: {'train_score': 0.8378378378378378, 'test_score': 0.8430493273542601}


[1m[91m[4m('knn', 'zero') scores[0m: {'train_score': 0.8528528528528528, 'test_score': 0.

The imputation strategies considered bear no effect on the accuracy of any of the models. We will however keep the median as default imputation method as it is more robust than the mean and contains more information about the data than the zeros.

## Grid Search

We can now take the best models and preprocessing strategies and grid search the best parameters using cross validation.

### K-nearest Neighbors

In [26]:
data_num, data_cat = get_features_by_type(data)
data_ = preprocessing(data_num, data_cat)

In [27]:
param_grid = {'weights': ['uniform', 'distance'], 
               'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], 
               'n_neighbors': np.linspace(1, 15, 15).astype(int)}

In [28]:
gscv = GridSearchCV(models['knn'], cv=4, n_jobs=-3, param_grid=param_grid, scoring='accuracy', verbose=1)

In [30]:
%%time
gscv.fit(data_, labels)

Fitting 4 folds for each of 120 candidates, totalling 480 fits


[Parallel(n_jobs=-3)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-3)]: Done 100 tasks      | elapsed:    0.4s


CPU times: user 318 ms, sys: 76.5 ms, total: 395 ms
Wall time: 1.5 s


[Parallel(n_jobs=-3)]: Done 480 out of 480 | elapsed:    1.5s finished


GridSearchCV(cv=4, estimator=KNeighborsClassifier(), n_jobs=-3,
             param_grid={'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                         'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15]),
                         'weights': ['uniform', 'distance']},
             scoring='accuracy', verbose=1)

In [32]:
gscv.best_estimator_, gscv.best_params_

(KNeighborsClassifier(n_neighbors=12),
 {'algorithm': 'auto', 'n_neighbors': 12, 'weights': 'uniform'})

In [34]:
best_est_knn = gscv.best_estimator_
best_est_knn.fit(data_, labels)
best_est_knn.score(data_, labels)

0.8368953880764904

### Logistic Regression

In [None]:
grid_params = {'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

In [None]:
gscv = GridSearchCV(models['LR'], cv=4, n_jobs=-3, param_grid=param_grid, scoring='accuracy', verbose=1)

# ================ Ignore the following ====================

## Imputation

In [18]:
selected_models = ['knn', 'log', 'svc'] 

imputation_methods = ['mean', 'median', 'function', 'zeros']

for i in selected_models:
    print(i)
    for j in imputation_methods:
        print('\t',j, base_pip(models[i], data, labels, imputer_s=j))
    print(' ')
        

knn
train_tr shape (755, 29)
	 mean (0.8609271523178808, 0.835820895522388)
train_tr shape (755, 29)
	 median (0.8649006622516556, 0.8582089552238806)
train_tr shape (755, 29)
	 function (0.8609271523178808, 0.8432835820895522)
train_tr shape (755, 29)
	 zeros (0.8582781456953642, 0.8432835820895522)
 
log
train_tr shape (755, 29)
	 mean (0.833112582781457, 0.8134328358208955)
train_tr shape (755, 29)
	 median (0.833112582781457, 0.8134328358208955)
train_tr shape (755, 29)
	 function (0.833112582781457, 0.8134328358208955)
train_tr shape (755, 29)
	 zeros (0.8304635761589404, 0.8134328358208955)
 
svc
train_tr shape (755, 29)
	 mean (0.8423841059602649, 0.8208955223880597)
train_tr shape (755, 29)
	 median (0.8423841059602649, 0.8208955223880597)
train_tr shape (755, 29)
	 function (0.8423841059602649, 0.8208955223880597)
train_tr shape (755, 29)
	 zeros (0.8397350993377484, 0.8208955223880597)
 


Different imputation methods have little impact on train and test for LR and SVC. KNN is slightly less stable from this standpoint, but it remains the best performing model, with the median as imputation method.

## Grid Search

Now that we have chosen our model, we can grid search parameters.

In [19]:
param_grid = {'weights': ['uniform', 'distance'], 
               'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], 
               'n_neighbors': np.linspace(1, 15, 15).astype(int)}

best_cv = base_pip(models['knn'], data, labels, 
                   imputer_s='median', option='grid_search', 
                   param_grid=param_grid, cv=3)

train_tr shape with gscv (889, 29)
Fitting 3 folds for each of 120 candidates, totalling 360 fits


[Parallel(n_jobs=-3)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-3)]: Done  40 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-3)]: Done 360 out of 360 | elapsed:    3.3s finished


In [20]:
models = {'best': best_cv['best_estimator']}


for i, j in models.items():
    print(i, base_pip(j, data, labels))

train_tr shape (755, 29)
best (0.8370860927152318, 0.8283582089552238)


## Submission

In [21]:
X_unknown = pd.read_csv(path / 'test.csv')
X_unknown.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [22]:
X_unknown.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [23]:
X_unknown.dropna(subset=['Fare'], inplace=True)

In [24]:
X_unknown.drop(['Cabin', 'Ticket', 'PassengerId'], axis=1, inplace=True)

In [25]:
title_ = X_unknown.Name.apply(lambda x:x.split(',')[1].split('.')[0].strip())

X_unknown['Title'] = title

In [26]:
X_unknown.drop(['Name'], axis=1, inplace=True)

In [27]:
X_unknown.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 417 entries, 0 to 417
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    417 non-null    int64  
 1   Sex       417 non-null    object 
 2   Age       331 non-null    float64
 3   SibSp     417 non-null    int64  
 4   Parch     417 non-null    int64  
 5   Fare      417 non-null    float64
 6   Embarked  417 non-null    object 
 7   Title     416 non-null    object 
dtypes: float64(2), int64(3), object(3)
memory usage: 29.3+ KB


In [35]:
pipeline_wparams = best_cv['pipeline'].set_params(**best_cv['pipeline_params'])
X_unknown.dropna(subset=['Title'], inplace=True)
pipeline = best_cv['pipeline']
X_unknown_tr = pipeline.fit_transform(X_unknown)

In [33]:
X_train = pipeline.fit_transform(data)
X_train.shape

(889, 29)

In [32]:
X_unknown_tr = X_train.align(X_unknown_tr, axis=1)
X_unknown_tr

TypeError: unsupported type: <class 'numpy.ndarray'>

In [33]:
predictions = best_cv['best_estimator'].predict(X_unknown_tr)

ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 20 while Y.shape[1] == 28