# Titanic: Machine Learning from Disaster

Based on https://www.kaggle.com/amitkumarjaiswal/beginner-s-tutorial-to-titanic-using-scikit-learn

In [2]:
import pandas as pd
import numpy as np

## Import Data

In [45]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [46]:
train.size

10692

In [44]:
test['Train'] = 0
train['']

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,892,3,"Kelly, Mr. James",male,34.5,0,0,7.8292,Q,Mr
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,7.0,S,Mrs
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,9.6875,Q,Mr
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,8.6625,S,Mr
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,12.2875,S,Mrs


## Analyze Data

In [8]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [64]:
train.describe

<bound method NDFrame.describe of      PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
5              6         0       3   
6              7         0       1   
7              8         0       3   
8              9         1       3   
9             10         1       2   
10            11         1       3   
11            12         1       1   
12            13         0       3   
13            14         0       3   
14            15         0       3   
15            16         1       2   
16            17         0       3   
17            18         1       2   
18            19         0       3   
19            20         1       3   
20            21         0       2   
21            22         1       2   
22            23         1       3   
23            24         1       1   
24            25

In [8]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S


In [11]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [14]:
train[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


In [16]:
train[['Sex', 'Survived']].groupby(['Sex'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Sex,Survived
0,female,0.742038
1,male,0.188908


## Prepare Data
### Feature Generation & Selection

In [9]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

train = train.drop(['Ticket', 'Cabin'], axis=1)
test = test.drop(['Ticket', 'Cabin'], axis=1)
combine = [train, test]
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S


In [10]:
for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

pd.crosstab(train['Title'], train['Sex'])

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,2
Countess,1,0
Don,0,1
Dr,1,6
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,40
Miss,182,0


In [11]:
for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Jonkheer', 'Dona'], 'Lady')
    dataset['Title'] = dataset['Title'].replace(['Capt', 'Don', 'Major', 'Sir'], 'Sir')
    
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
    
train[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

Unnamed: 0,Title,Survived
0,Col,0.5
1,Dr,0.428571
2,Lady,0.666667
3,Master,0.575
4,Miss,0.702703
5,Mr,0.156673
6,Mrs,0.793651
7,Rev,0.0
8,Sir,0.4


In [41]:
values = train.Title.unique()
train['Title Id'] = train.Title.map(lambda value: np.where(values == value)[0][0])
train.head()
#train.title_id.map?

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Title,title_id,Title Id
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,Mr,Mr,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,Mrs,Mrs,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,Miss,Miss,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,Mrs,Mrs,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,Mr,Mr,0


In [28]:
train.Title.unique().where(item == 'Sir')

AttributeError: 'numpy.ndarray' object has no attribute 'where'

In [139]:
title_mapping = {"Col": 1, "Dr": 2, "Lady": 3, "Master": 4, "Miss": 5, "Mr": 6, "Mrs": 7, "Rev": 8, "Sir": 9}

for dataset in combine:
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)

train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,6
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,7
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,5
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,7
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,6


In [140]:
train = train.drop(['Name', 'PassengerId'], axis=1)
test = test.drop(['Name'], axis=1)
combine = [train, test]

In [141]:
for dataset in combine:
    dataset['Sex'] = dataset['Sex'].map( {'female': 1, 'male': 0} ).astype(int)

In [142]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,0,22.0,1,0,7.25,S,6
1,1,1,1,38.0,1,0,71.2833,C,7
2,1,3,1,26.0,0,0,7.925,S,5
3,1,1,1,35.0,1,0,53.1,S,7
4,0,3,0,35.0,0,0,8.05,S,6


### Null-Values

In [143]:
guess_ages = np.zeros((2,3))
guess_ages

array([[0., 0., 0.],
       [0., 0., 0.]])

In [144]:
for dataset in combine:
    for i in range(0, 2):
        for j in range(0, 3):
            guess_df = dataset[(dataset['Sex'] == i) & (dataset['Pclass'] == j+1)]['Age'].dropna()            
            age_guess = guess_df.median()

            # Convert random age float to nearest .5 age
            guess_ages[i,j] = int( (age_guess/0.5 + 0.5)  * 0.5 )
            
    for i in range(0, 2):
        for j in range(0, 3):
            dataset.loc[ (dataset.Age.isnull()) & (dataset.Sex == i) & (dataset.Pclass == j+1),\
                    'Age'] = guess_ages[i,j]

    dataset['Age'] = dataset['Age'].astype(int)

train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,0,22,1,0,7.25,S,6
1,1,1,1,38,1,0,71.2833,C,7
2,1,3,1,26,0,0,7.925,S,5
3,1,1,1,35,1,0,53.1,S,7
4,0,3,0,35,0,0,8.05,S,6


In [145]:
train['AgeBand'] = pd.cut(train['Age'], 5)
train[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True)

Unnamed: 0,AgeBand,Survived
0,"(-0.08, 16.0]",0.55
1,"(16.0, 32.0]",0.337374
2,"(32.0, 48.0]",0.412037
3,"(48.0, 64.0]",0.434783
4,"(64.0, 80.0]",0.090909


In [146]:
for dataset in combine:    
    dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age']
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,AgeBand
0,0,3,0,1,1,0,7.25,S,6,"(16.0, 32.0]"
1,1,1,1,2,1,0,71.2833,C,7,"(32.0, 48.0]"
2,1,3,1,1,0,0,7.925,S,5,"(16.0, 32.0]"
3,1,1,1,2,1,0,53.1,S,7,"(32.0, 48.0]"
4,0,3,0,2,0,0,8.05,S,6,"(32.0, 48.0]"


In [147]:
train = train.drop(['AgeBand'], axis=1)
combine = [train, test]
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,0,1,1,0,7.25,S,6
1,1,1,1,2,1,0,71.2833,C,7
2,1,3,1,1,0,0,7.925,S,5
3,1,1,1,2,1,0,53.1,S,7
4,0,3,0,2,0,0,8.05,S,6


In [148]:
for dataset in combine:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

train[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,FamilySize,Survived
3,4,0.724138
2,3,0.578431
1,2,0.552795
6,7,0.333333
0,1,0.303538
4,5,0.2
5,6,0.136364
7,8,0.0
8,11,0.0


In [149]:
for dataset in combine:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1

train[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean()

Unnamed: 0,IsAlone,Survived
0,0,0.50565
1,1,0.303538


In [150]:
train = train.drop(['Parch',], axis=1)
test = test.drop(['Parch'], axis=1)
combine = [train, test]

train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare,Embarked,Title,FamilySize,IsAlone
0,0,3,0,1,1,7.25,S,6,2,0
1,1,1,1,2,1,71.2833,C,7,2,0
2,1,3,1,1,0,7.925,S,5,1,1
3,1,1,1,2,1,53.1,S,7,2,0
4,0,3,0,2,0,8.05,S,6,1,1


In [151]:
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Fare,Embarked,Title,FamilySize,IsAlone
0,892,3,0,2,0,7.8292,Q,6,1,1
1,893,3,1,2,1,7.0,S,7,2,0
2,894,2,0,3,0,9.6875,Q,6,1,1
3,895,3,0,1,0,8.6625,S,6,1,1
4,896,3,1,1,1,12.2875,S,7,3,0


In [152]:
for dataset in combine:
    dataset['Age*Class'] = dataset.Age * dataset.Pclass

train.loc[:, ['Age*Class', 'Age', 'Pclass']].head(10)

Unnamed: 0,Age*Class,Age,Pclass
0,3,1,3
1,2,2,1
2,3,1,3
3,2,2,1
4,6,2,3
5,3,1,3
6,3,3,1
7,0,0,3
8,3,1,3
9,0,0,2


In [153]:
freq_port = train.Embarked.dropna().mode()[0]
freq_port

'S'

In [154]:
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].fillna(freq_port)
    
train[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Embarked,Survived
0,C,0.553571
1,Q,0.38961
2,S,0.339009


In [155]:
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare,Embarked,Title,FamilySize,IsAlone,Age*Class
0,0,3,0,1,1,7.25,0,6,2,0,3
1,1,1,1,2,1,71.2833,1,7,2,0,2
2,1,3,1,1,0,7.925,0,5,1,1,3
3,1,1,1,2,1,53.1,0,7,2,0,2
4,0,3,0,2,0,8.05,0,6,1,1,6


In [42]:
test['Fare'].fillna(test['Fare'].dropna().median(), inplace=True)
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,892,3,"Kelly, Mr. James",male,34.5,0,0,7.8292,Q,Mr
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,7.0,S,Mrs
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,9.6875,Q,Mr
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,8.6625,S,Mr
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,12.2875,S,Mrs


In [157]:
train['FareBand'] = pd.qcut(train['Fare'], 4)
train[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean().sort_values(by='FareBand', ascending=True)

Unnamed: 0,FareBand,Survived
0,"(-0.001, 7.91]",0.197309
1,"(7.91, 14.454]",0.303571
2,"(14.454, 31.0]",0.454955
3,"(31.0, 512.329]",0.581081


In [158]:
for dataset in combine:
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)

train = train.drop(['FareBand'], axis=1)
combine = [train, test]

# Train DF head
train.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare,Embarked,Title,FamilySize,IsAlone,Age*Class
0,0,3,0,1,1,0,0,6,2,0,3
1,1,1,1,2,1,3,1,7,2,0,2
2,1,3,1,1,0,1,0,5,1,1,3
3,1,1,1,2,1,3,0,7,2,0,2
4,0,3,0,2,0,1,0,6,1,1,6
5,0,3,0,1,0,1,2,6,1,1,3
6,0,1,0,3,0,3,0,6,1,1,3
7,0,3,0,0,3,2,0,4,5,0,0
8,1,3,1,1,0,1,0,7,3,0,3
9,1,2,1,0,1,2,1,7,2,0,0


In [159]:
# Test dataset
test.head(10)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Fare,Embarked,Title,FamilySize,IsAlone,Age*Class
0,892,3,0,2,0,0,2,6,1,1,6
1,893,3,1,2,1,0,0,7,2,0,6
2,894,2,0,3,0,1,2,6,1,1,6
3,895,3,0,1,0,1,0,6,1,1,3
4,896,3,1,1,1,1,0,7,3,0,3
5,897,3,0,0,0,1,0,6,1,1,0
6,898,3,1,1,0,0,2,5,1,1,3
7,899,2,0,1,1,2,0,6,3,0,2
8,900,3,1,1,0,0,1,7,1,1,3
9,901,3,0,1,2,2,0,6,3,0,3


## Model &amp; Training

In [160]:
x_train = train.drop("Survived", axis=1)
y_train = train["Survived"]

x_test  = test.drop("PassengerId", axis=1).copy()

x_train.head(10)

Unnamed: 0,Pclass,Sex,Age,SibSp,Fare,Embarked,Title,FamilySize,IsAlone,Age*Class
0,3,0,1,1,0,0,6,2,0,3
1,1,1,2,1,3,1,7,2,0,2
2,3,1,1,0,1,0,5,1,1,3
3,1,1,2,1,3,0,7,2,0,2
4,3,0,2,0,1,0,6,1,1,6
5,3,0,1,0,1,2,6,1,1,3
6,1,0,3,0,3,0,6,1,1,3
7,3,0,0,3,2,0,4,5,0,0
8,3,1,1,0,1,0,7,3,0,3
9,2,1,0,1,2,1,7,2,0,0


In [161]:
y_train.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [162]:
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Fare,Embarked,Title,FamilySize,IsAlone,Age*Class
0,892,3,0,2,0,0,2,6,1,1,6
1,893,3,1,2,1,0,0,7,2,0,6
2,894,2,0,3,0,1,2,6,1,1,6
3,895,3,0,1,0,1,0,6,1,1,3
4,896,3,1,1,1,1,0,7,3,0,3


In [163]:
# machine learning
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score

In [165]:
# Logistic Regression
logreg = LogisticRegression(solver='lbfgs')
logreg.fit(x_train, y_train)
y_pred = logreg.predict(x_test)
acc_log = round(logreg.score(x_train, y_train) * 100, 2)
acc_log

80.13

In [166]:
# Display correllation between attributes and predictions
coeff = pd.DataFrame(train.columns.delete(0))
coeff.columns = ['Feature']
coeff["Correlation"] = pd.Series(logreg.coef_[0])

coeff.sort_values(by='Correlation', ascending=False)

Unnamed: 0,Feature,Correlation
1,Sex,2.527409
2,Age,0.349947
4,Fare,0.198187
5,Embarked,0.154449
6,Title,-0.220933
7,FamilySize,-0.223431
9,Age*Class,-0.3711
0,Pclass,-0.377935
3,SibSp,-0.428702
8,IsAlone,-0.769478


In [168]:
# Support Vector Machines
svc = SVC(gamma='auto')
svc.fit(x_train, y_train)
y_pred = svc.predict(x_test)
acc_svc = round(svc.score(x_train, y_train) * 100, 2)
acc_svc

84.06

In [169]:
# KNN
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)
acc_knn = round(knn.score(x_train, y_train) * 100, 2)
acc_knn

84.51

In [170]:
# Gaussian Naive Bayes
gaussian = GaussianNB()
gaussian.fit(x_train, y_train)
y_pred = gaussian.predict(x_test)
acc_gaussian = round(gaussian.score(x_train, y_train) * 100, 2)
acc_gaussian

72.28

In [171]:
# Decision Tree
decision_tree = DecisionTreeClassifier()
decision_tree.fit(x_train, y_train)
y_pred = decision_tree.predict(x_test)
acc_decision_tree = round(decision_tree.score(x_train, y_train) * 100, 2)
acc_decision_tree

89.23

In [172]:
# Random Forest
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(x_train, y_train)
y_pred = random_forest.predict(x_test)
random_forest.score(x_train, y_train)
acc_random_forest = round(random_forest.score(x_train, y_train) * 100, 2)
acc_random_forest

89.23

In [173]:
# Gradient Boosting
grad_boost = GradientBoostingClassifier(n_estimators = 100)
grad_boost.fit(x_train, y_train)
y_pred = grad_boost.predict(x_test)
grad_boost.score(x_train, y_train)
acc_grad_boost = round(grad_boost.score(x_train, y_train) * 100, 2)
acc_grad_boost

85.86

In [174]:
# RidgeClassifierCV
Ridge= RidgeClassifierCV()
Ridge.fit(x_train, y_train)
y_pred = Ridge.predict(x_test)
acc_Ridge= round(Ridge.score(x_train, y_train) * 100, 2)
acc_Ridge

79.91

In [176]:
# Perceptron
perceptron = Perceptron()
perceptron.fit(x_train, y_train)
y_pred = perceptron.predict(x_test)
acc_perceptron = round(perceptron.score(x_train, y_train) * 100, 2)
acc_perceptron

79.24

In [177]:
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression', 'Random Forest', 'Naive Bayes', 'Perceptron', 'Grad boost','Decision Tree'],
    'Score': [acc_svc, acc_knn, acc_log, acc_random_forest, acc_gaussian, acc_perceptron, acc_grad_boost, acc_decision_tree]})

models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
3,Random Forest,89.23
7,Decision Tree,89.23
6,Grad boost,85.86
1,KNN,84.51
0,Support Vector Machines,84.06
2,Logistic Regression,80.13
5,Perceptron,79.24
4,Naive Bayes,72.28


In [178]:
# Split the training set into a development and an evaluation sets
from sklearn.model_selection import train_test_split
x_dev, x_eval, y_dev, y_eval = train_test_split(x_train,
                                                y_train,
                                                test_size=0.2,
                                                random_state=42)

x_test.head(10)

Unnamed: 0,Pclass,Sex,Age,SibSp,Fare,Embarked,Title,FamilySize,IsAlone,Age*Class
0,3,0,2,0,0,2,6,1,1,6
1,3,1,2,1,0,0,7,2,0,6
2,2,0,3,0,1,2,6,1,1,6
3,3,0,1,0,1,0,6,1,1,3
4,3,1,1,1,1,0,7,3,0,3
5,3,0,0,0,1,0,6,1,1,0
6,3,1,1,0,0,2,5,1,1,3
7,2,0,1,1,2,0,6,3,0,2
8,3,1,1,0,0,1,7,1,1,3
9,3,0,1,2,2,0,6,3,0,3


In [179]:
# 1. Random Forest
import time
dict_clf = {}


paramgrid = {
    'n_estimators':      [100, 150, 200, 250, 300, 400, 500],
    'criterion':         ['gini', 'entropy'],
    'max_features':      ['auto', 'log2'],
    'min_samples_leaf':  list(range(2, 8))
}

GS = GridSearchCV(RandomForestClassifier(random_state=77),
                  paramgrid,
                  cv=4)

# Fit the data and record time taking to train
t0 = time.time()
GS.fit(x_dev, y_dev)
t = time.time() - t0

# Store best parameters, score and estimator
best_clf = GS.best_estimator_
best_params = GS.best_params_
best_score = GS.best_score_

name = 'RF'

In [43]:
x_test.head()

NameError: name 'x_test' is not defined

In [78]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score

# Split into x and y
train = pd.read_pickle('../data/train_prepared.pkl')
x_test = pd.read_pickle('../data/test_prepared.pkl')
x_train = train.drop(['Survived'], axis = 1)
y_train = train['Survived']

In [80]:
x_train[pd.isna(x_train.Age)]

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,FamilySize,IsAlone


In [54]:
x_train.[Pclass==1]

NameError: name 'Pclass' is not defined

In [76]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [77]:
train[pd.isna(train.Age)]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
29,30,0,3,"Todoroff, Mr. Lalio",male,,0,0,349216,7.8958,,S
31,32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,,1,0,PC 17569,146.5208,B78,C
32,33,1,3,"Glynn, Miss. Mary Agatha",female,,0,0,335677,7.7500,,Q
36,37,1,3,"Mamee, Mr. Hanna",male,,0,0,2677,7.2292,,C
42,43,0,3,"Kraeff, Mr. Theodor",male,,0,0,349253,7.8958,,C
