In [108]:
import pandas as pd
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt

In [109]:
path = Path('.')

In [110]:
# Import data
data = pd.read_csv(path / "train.csv")

In [111]:
data.shape

(891, 12)

In [112]:
data.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [113]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [114]:
data['Survived'].sum()/len(data)

0.3838383838383838

In [115]:
#Fare funzione Alessandro

The classes are sufficiently balanced -- it makes sense to consider accuracy as performance metrics

In [116]:
data.dropna(subset=['Embarked'], inplace=True)

In [117]:
labels = data[['Survived']]

data.drop(['Cabin', 'Ticket', 'PassengerId', 'Survived'], axis=1, inplace=True)

In [118]:
data.corr().style.background_gradient(cmap ='cool')\
        .set_properties(**{'font-size': '15px'}) 

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
Pclass,1.0,-0.365902,0.081656,0.016824,-0.548193
Age,-0.365902,1.0,-0.307351,-0.187896,0.093143
SibSp,0.081656,-0.307351,1.0,0.414542,0.160887
Parch,0.016824,-0.187896,0.414542,1.0,0.217532
Fare,-0.548193,0.093143,0.160887,0.217532,1.0


There are no highly correlated features -- we can use all of them (maybe do feature extraction?)

In [119]:
title = data.Name.apply(lambda x:x.split(',')[1].split('.')[0].strip())

In [120]:
title.value_counts()

Mr              517
Miss            181
Mrs             124
Master           40
Dr                7
Rev               6
Mlle              2
Col               2
Major             2
Don               1
Ms                1
the Countess      1
Mme               1
Lady              1
Jonkheer          1
Capt              1
Sir               1
Name: Name, dtype: int64

In [121]:
data['Title'] = title
data.drop('Name', axis=1, inplace=True)

In [122]:
labels.shape, data.shape

((889, 1), (889, 8))

In [123]:
data.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title'], dtype='object')

In [170]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

numeric_features = ['Age', 'SibSp', 'Fare', 'Parch']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])
    
categorical_features = ['Embarked', 'Sex', 'Pclass', 'Title']
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression())])

X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.15,
                                                            random_state=0, stratify=labels)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_train, y_train))
print("model score: %.3f" % clf.score(X_test, y_test))

model score: 0.833
model score: 0.813


  return f(**kwargs)


In [90]:
from sklearn import set_config

set_config(display='diagram')
clf

In [49]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.model_selection import cross_val_score
from sklearn.tree import plot_tree

In [21]:
scale = StandardScaler()
X_train_scaled = X_train.copy()
X_train_scaled[['Age','SibSp','Parch','Fare']]= scale.fit_transform(X_train_scaled[['Age','SibSp','Parch','Fare']])

In [35]:
perc = Perceptron(tol=1e-3, random_state=0)

cv = cross_val_score(perc, X_train_scaled, Y, cv=5, verbose=1)
print(cv)
print(cv.mean())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[0.8258427  0.81460674 0.80337079 0.58426966 0.61016949]
0.7276518758331747


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.2s finished


In [29]:
svc = SVC(probability=False)

cv = cross_val_score(svc, X_train_scaled, Y, cv=5, n_jobs=-1, verbose=1)
print(cv)
print(cv.mean())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.8s remaining:    1.1s


[0.84831461 0.8258427  0.8258427  0.80898876 0.86440678]
0.8346791087411922


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.3s finished


In [31]:
lr = LogisticRegression(max_iter=2000)

cv = cross_val_score(lr, X_train_scaled, Y, n_jobs=-1, verbose=1)
print(cv)
print(cv.mean())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


[0.85393258 0.81460674 0.80898876 0.8258427  0.8700565 ]
0.8346854567383991


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.5s remaining:    2.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.6s finished


In [33]:
rf = RandomForestClassifier(random_state = 1)

cv = cross_val_score(rf, X_train, Y, cv=10, n_jobs=-1, verbose=1)
print(cv)
print(cv.mean())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


[0.80898876 0.86516854 0.76404494 0.86516854 0.87640449 0.84269663
 0.84269663 0.79775281 0.87640449 0.82954545]
0.8368871297242084


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    1.4s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.6s finished


In [52]:
dt = tree.DecisionTreeClassifier(random_state=0, max_depth=5)

cv = cross_val_score(dt, X_train_scaled, Y, n_jobs=-1, verbose=1, cv=5)
print(cv)
print(cv.mean())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


[0.80337079 0.82022472 0.83707865 0.79213483 0.85310734]
0.8211832666793626


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.5s finished


In [53]:
tr = dt.fit(X_train_scaled, Y)