In [3]:
import pandas as pd
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import sklearn

In [4]:
# Base directory for the input CSVs. An argument-less Path() is the relative
# path '.', i.e. whatever the current working directory is at run time.
path = Path()

In [5]:
# Read the labelled and unlabelled CSVs from `path`, in that order.
training, test = (
    pd.read_csv(path / csv_name) for csv_name in ("train.csv", "test.csv")
)

In [6]:
# (rows, columns) of the training frame followed by the test frame.
tuple(frame.shape for frame in (training, test))

((891, 12), (418, 11))

In [7]:
# Peek at the first two rows to see what each column looks like.
training.iloc[:2]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [8]:
# Print per-column dtypes and non-null counts to see which columns have gaps.
training.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [9]:
# Numeric measurements in one frame ...
df_train_num = training.loc[:, ['Age', 'SibSp', 'Parch', 'Fare']]

# ... and the label plus the discrete / text-valued columns in another.
df_train_cat = training.loc[
    :, ['Survived', 'Pclass', 'Sex', 'Ticket', 'Cabin', 'Embarked']
]

In [10]:
# Pairwise correlation of the numeric columns, rendered as a colour-graded table.
(
    df_train_num
    .corr()
    .style
    .background_gradient(cmap='cool')
    .set_properties(**{'font-size': '20px'})
)

Unnamed: 0,Age,SibSp,Parch,Fare
Age,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.308247,1.0,0.414838,0.159651
Parch,-0.189119,0.414838,1.0,0.216225
Fare,0.096067,0.159651,0.216225,1.0


In [11]:
# Mean of each numeric column, split by survival outcome
# (pivot_table aggregates with the mean unless told otherwise).
training.pivot_table(
    index='Survived',
    values=['Age', 'SibSp', 'Parch', 'Fare'],
)

Unnamed: 0_level_0,Age,Fare,Parch,SibSp
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,30.626179,22.117887,0.32969,0.553734
1,28.34369,48.395408,0.464912,0.473684


In [12]:
# Passenger counts by survival outcome (rows) and sex (columns);
# 'Ticket' is only the column being counted.
training.pivot_table(
    values='Ticket',
    index='Survived',
    columns='Sex',
    aggfunc='count',
)

Sex,female,male
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,81,468
1,233,109


In [13]:
# Extract the honorific from names shaped like "Surname, Title. Given names":
# take the segment after the first comma, keep what precedes the first period,
# and trim surrounding whitespace.
# The vectorised .str accessors return NaN for a missing name or a name with
# no comma, where a per-row lambda would raise; results for well-formed names
# are unchanged.
title = (
    training['Name']
    .str.split(',').str[1]
    .str.split('.').str[0]
    .str.strip()
)

In [14]:
# Frequency of each extracted title, most common first.
title.value_counts()

Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Major             2
Mlle              2
Col               2
Don               1
Lady              1
the Countess      1
Jonkheer          1
Ms                1
Mme               1
Capt              1
Sir               1
Name: Name, dtype: int64

In [15]:
# Attach the extracted titles as a new 'Title' column (aligned on the index).
training = training.assign(Title=title)

In [16]:
# Passenger counts by survival outcome (rows) and title (columns).
training.pivot_table(
    values='Ticket',
    index='Survived',
    columns='Title',
    aggfunc='count',
)

Title,Capt,Col,Don,Dr,Jonkheer,Lady,Major,Master,Miss,Mlle,Mme,Mr,Mrs,Ms,Rev,Sir,the Countess
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,1.0,1.0,1.0,4.0,1.0,,1.0,17.0,55.0,,,436.0,26.0,,6.0,,
1,,1.0,,3.0,,1.0,1.0,23.0,127.0,2.0,1.0,81.0,99.0,1.0,,1.0,1.0


In [17]:
# Fill the gaps column by column, then discard the rows with no embarkation
# port. All fill values are computed from the frame as it is before this cell.
training = training.assign(
    # A missing cabin becomes the placeholder 0.
    Cabin=training['Cabin'].fillna(0),
    # Missing numeric values take the column median.
    Age=training['Age'].fillna(training['Age'].median()),
    Fare=training['Fare'].fillna(training['Fare'].median()),
).dropna(subset=['Embarked'])

In [18]:
# Re-check dtypes and non-null counts after the fills and the row drop above.
training.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Name         889 non-null    object 
 4   Sex          889 non-null    object 
 5   Age          889 non-null    float64
 6   SibSp        889 non-null    int64  
 7   Parch        889 non-null    int64  
 8   Ticket       889 non-null    object 
 9   Fare         889 non-null    float64
 10  Cabin        889 non-null    object 
 11  Embarked     889 non-null    object 
 12  Title        889 non-null    object 
dtypes: float64(2), int64(5), object(6)
memory usage: 97.2+ KB


In [19]:
# Build the design matrix: numeric columns pass through unchanged, while
# object-typed columns are expanded into one indicator column per value.
# NOTE(review): 'Ticket' and 'Cabin' hold near free-text values, so they
# presumably expand into a very large number of sparse columns — confirm that
# this is intended before reusing the matrix on other data.
X_train = pd.get_dummies(
    training[[
        'Pclass', 'Age', 'SibSp', 'Parch', 'Fare',
        'Title', 'Sex', 'Ticket', 'Cabin', 'Embarked',
    ]]
)

# Target vector: the survival label.
Y = training['Survived']

In [49]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.model_selection import cross_val_score
from sklearn.tree import plot_tree

In [21]:
# Standardise the four continuous columns on a copy of the design matrix;
# the indicator columns and Pclass are left untouched.
# NOTE(review): the scaler is fitted on all rows before cross-validation, so
# every fold sees statistics from its held-out rows — consider a Pipeline.
numeric_cols = ['Age', 'SibSp', 'Parch', 'Fare']

scale = StandardScaler()
X_train_scaled = X_train.copy()
X_train_scaled[numeric_cols] = scale.fit_transform(X_train_scaled[numeric_cols])

In [35]:
# Perceptron scored with 5-fold cross-validation on the scaled features.
perc = Perceptron(random_state=0, tol=1e-3)

cv = cross_val_score(
    perc,
    X_train_scaled,
    Y,
    cv=5,
    verbose=1,
)
# Per-fold scores on the first line, their mean on the second.
print(cv, cv.mean(), sep='\n')

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[0.8258427  0.81460674 0.80337079 0.58426966 0.61016949]
0.7276518758331747


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.2s finished


In [29]:
# Support-vector classifier scored with 5-fold cross-validation on the scaled
# features, folds evaluated in parallel on all available cores.
svc = SVC(probability=False)

cv = cross_val_score(
    svc,
    X_train_scaled,
    Y,
    cv=5,
    verbose=1,
    n_jobs=-1,
)
# Per-fold scores on the first line, their mean on the second.
print(cv, cv.mean(), sep='\n')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.8s remaining:    1.1s


[0.84831461 0.8258427  0.8258427  0.80898876 0.86440678]
0.8346791087411922


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.3s finished


In [31]:
# Logistic regression (iteration cap raised to 2000) scored with
# cross_val_score's default splitting on the scaled features.
lr = LogisticRegression(max_iter=2000)

cv = cross_val_score(
    lr,
    X_train_scaled,
    Y,
    verbose=1,
    n_jobs=-1,
)
# Per-fold scores on the first line, their mean on the second.
print(cv, cv.mean(), sep='\n')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


[0.85393258 0.81460674 0.80898876 0.8258427  0.8700565 ]
0.8346854567383991


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.5s remaining:    2.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.6s finished


In [33]:
# Random forest scored with 10-fold cross-validation. This one is fed the
# unscaled design matrix, unlike the other models in this notebook.
rf = RandomForestClassifier(random_state=1)

cv = cross_val_score(
    rf,
    X_train,
    Y,
    cv=10,
    verbose=1,
    n_jobs=-1,
)
# Per-fold scores first, then their mean.
print(cv, cv.mean(), sep='\n')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


[0.80898876 0.86516854 0.76404494 0.86516854 0.87640449 0.84269663
 0.84269663 0.79775281 0.87640449 0.82954545]
0.8368871297242084


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    1.4s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.6s finished


In [52]:
# Depth-limited decision tree scored with 5-fold cross-validation on the
# scaled features.
dt = tree.DecisionTreeClassifier(max_depth=5, random_state=0)

cv = cross_val_score(
    dt,
    X_train_scaled,
    Y,
    cv=5,
    verbose=1,
    n_jobs=-1,
)
# Per-fold scores on the first line, their mean on the second.
print(cv, cv.mean(), sep='\n')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


[0.80337079 0.82022472 0.83707865 0.79213483 0.85310734]
0.8211832666793626


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.5s finished


In [53]:
# Fit the decision tree on the full scaled training set and keep the value
# returned by fit() under the name `tr`.
tr = dt.fit(X=X_train_scaled, y=Y)