In [29]:
from Kaggle_Titanic_utls import *
import warnings
warnings.filterwarnings('ignore')

In [30]:
path = Path('.')

In [31]:
# Import data
data = pd.read_csv(path / "train.csv")

In [32]:
data.shape

(891, 12)

In [33]:
data.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [34]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [35]:
data['Survived'].sum()/len(data)

0.3838383838383838

In [36]:
#Fare funzione Alessandro

The classes are sufficiently balanced - it makes sense to consider accuracy as performance metrics.

Embarked only has 2 missing values out of 891 - it makes sense to get rid of them altogether.

In [37]:
data.dropna(subset=['Embarked'], inplace=True)

In [38]:
labels = data[['Survived']]

data.drop(['Cabin', 'Ticket', 'PassengerId', 'Survived'], axis=1, inplace=True)

In [39]:
data.corr().style.background_gradient(cmap ='cool')\
        .set_properties(**{'font-size': '15px'}) 

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
Pclass,1.0,-0.365902,0.081656,0.016824,-0.548193
Age,-0.365902,1.0,-0.307351,-0.187896,0.093143
SibSp,0.081656,-0.307351,1.0,0.414542,0.160887
Parch,0.016824,-0.187896,0.414542,1.0,0.217532
Fare,-0.548193,0.093143,0.160887,0.217532,1.0


There are no highly correlated features (maybe do feature extraction?).

We can extract the title of each passenger from the name **(which is most likely not useful in the prediction. We can check this though)**.

In [40]:
title = data.Name.apply(lambda x:x.split(',')[1].split('.')[0].strip())

In [41]:
title.value_counts()

Mr              517
Miss            181
Mrs             124
Master           40
Dr                7
Rev               6
Major             2
Mlle              2
Col               2
Ms                1
Don               1
the Countess      1
Mme               1
Jonkheer          1
Lady              1
Sir               1
Capt              1
Name: Name, dtype: int64

In [42]:
data['Title'] = title
data.drop('Name', axis=1, inplace=True)

In [43]:
labels.shape, data.shape

((889, 1), (889, 8))

In [44]:
data.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title'], dtype='object')

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

def base_pip(model, data, labels, imputer_s='median'):
    numeric_features = ['Age', 'SibSp', 'Fare', 'Parch']
    
    if (imputer_s == 'median'):
        imputer = SimpleImputer(strategy='median')
    elif (imputer_s == 'mean'):
        imputer = SimpleImputer(strategy='median')
    elif (imputer_s == 'zeros'):
        imputer = SimpleImputer(strategy='constant', fill_value=0)
    elif (imputer_s == 'function'):
        imputer = IterativeImputer()
        
    numeric_transformer = Pipeline(steps=[
        ('imputer', imputer),
        ('scaler', StandardScaler())])

    categorical_features = ['Embarked', 'Sex', 'Pclass', 'Title']
    categorical_transformer = OneHotEncoder(handle_unknown='ignore')

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    # Append classifier to preprocessing pipeline.
    # Now we have a full prediction pipeline.
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', model)])

    
    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.15,
                                                                random_state=0, stratify=labels)

    
    clf.fit(X_train, y_train)
    score_train, score_test = clf.score(X_train, y_train), clf.score(X_test, y_test)
    
    return score_train, score_test

In [45]:
models = {'log': LogisticRegression(), 
          'tree': DecisionTreeClassifier(), 
          'svc': SVC(), 
          'perceptron': Perceptron(), 
          'knn': KNeighborsClassifier()}


for i, j in models.items():
    print(i, base_pip(j, data, labels))
    
    
    

log (0.833112582781457, 0.8134328358208955)
tree (0.9827814569536424, 0.7686567164179104)
svc (0.8423841059602649, 0.8208955223880597)
perceptron (0.7827814569536424, 0.7761194029850746)
knn (0.866225165562914, 0.8582089552238806)


## NOTA: dobbiamo inserire anche i gaussian NB che al momento danno dei problemi perché non vogliono una matrice sparsa come input.

## To visualize the pipeline

from sklearn import set_config

set_config(display='diagram')
clf

It seems that the k-nearest neighbours does the best job among the models considered. Support Vector Classifier and Logistic Regression also score reasonably well (guessing randomly would produce a $\sim 62\%$ accuracy) Now we will:

1. Evaluate other imputation methods on selected models;
2. Choose the best imputation method;
3. Do a grid search on parameters for the best model(s) and imputation method(s).

## Imputation

In [47]:
selected_models = ['knn', 'log', 'svc'] 

imputation_methods = ['mean', 'median', 'function', 'zeros']

for i in selected_models:
    print(i)
    for j in imputation_methods:
        print('\t',j, base_pip(models[i], data, labels, imputer_s=j))
    print(' ')
        

knn
	 mean (0.8635761589403973, 0.8432835820895522)
	 median (0.866225165562914, 0.8582089552238806)
	 function (0.8596026490066225, 0.835820895522388)
	 zeros (0.8582781456953642, 0.8432835820895522)
 
log
	 mean (0.833112582781457, 0.8134328358208955)
	 median (0.833112582781457, 0.8134328358208955)
	 function (0.833112582781457, 0.8134328358208955)
	 zeros (0.8304635761589404, 0.8134328358208955)
 
svc
	 mean (0.8423841059602649, 0.8208955223880597)
	 median (0.8423841059602649, 0.8208955223880597)
	 function (0.8423841059602649, 0.8208955223880597)
	 zeros (0.8397350993377484, 0.8208955223880597)
 


Different imputation methods have little impact on train and test for LR and SVC. KNN is slightly less stable from this standpoint, but it remains the best performing model, with the median as imputation method.

## Grid Search

Now that we have chosen our model, we can grid search parameters.

In [51]:
grid_params = {'weights': ['uniform', 'distance'], 
               'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], 
               'n_neighbors': np.logspace(0, 1.8, 10)}

model = KNeighborsClassifier()
gscv = GridSearchCV(model, grid_params, scoring='accuracy', 
             cv=3, n_jobs=-3, verbose=1)

X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.15,
                                                                random_state=0, stratify=labels)

gscv.fit(X_train, y_train)

Fitting 3 folds for each of 80 candidates, totalling 240 fits


[Parallel(n_jobs=-3)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-3)]: Done  40 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-3)]: Done 240 out of 240 | elapsed:    2.5s finished


ValueError: could not convert string to float: 'female'