In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
import os
os.chdir('..')

In [100]:
train = pd.read_csv('data/titanic/train.csv')
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [8]:
test = pd.read_csv('data/titanic/test.csv')
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


# First baseline

In [9]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier()

In [98]:
X = train.drop('Survived', axis=1)
y = train.Survived
# tree.fit(X, y)

## Preprocessing

In [101]:
train.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
PassengerId,891,,,,446.0,257.354,1.0,223.5,446.0,668.5,891.0
Survived,891,,,,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
Pclass,891,,,,2.30864,0.836071,1.0,2.0,3.0,3.0,3.0
Name,891,891.0,"Laroche, Mr. Joseph Philippe Lemercier",1.0,,,,,,,
Sex,891,2.0,male,577.0,,,,,,,
Age,714,,,,29.6991,14.5265,0.42,20.125,28.0,38.0,80.0
SibSp,891,,,,0.523008,1.10274,0.0,0.0,0.0,1.0,8.0
Parch,891,,,,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
Ticket,891,681.0,347082,7.0,,,,,,,
Fare,891,,,,32.2042,49.6934,0.0,7.9104,14.4542,31.0,512.329


In [27]:
train['PassengerId'] = train.PassengerId.astype(str)

### Remove unnecessary columns

In [102]:
drop_cols = ['PassengerId', 'Cabin', 'Ticket', 'Name', 'Survived']
keep_cols = [c for c in train.columns if c not in drop_cols]

In [103]:
X = train[keep_cols].copy()
y = train.Survived

### Handle missing values in categorical columns

In [64]:
X.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Pclass,891,,,,2.30864,0.836071,1.0,2.0,3.0,3.0,3.0
Sex,891,2.0,male,577.0,,,,,,,
Age,714,,,,29.6991,14.5265,0.42,20.125,28.0,38.0,80.0
SibSp,891,,,,0.523008,1.10274,0.0,0.0,0.0,1.0,8.0
Parch,891,,,,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
Fare,891,,,,32.2042,49.6934,0.0,7.9104,14.4542,31.0,512.329
Embarked,889,3.0,S,644.0,,,,,,,


In [105]:
cat_cols = ['Sex', 'Embarked']

In [106]:
for c in cat_cols:
    X[c] = X[c].astype(str)

In [107]:
lencoders = {}
for c in cat_cols:
    lencoders[c] = LabelEncoder()
    X[c] = lencoders[c].fit_transform(X[c])

In [109]:
mean_age = X.Age.mean()
X['AgeMissing'] = X.Age.isnull()
X['Age'] = X.Age.fillna(mean_age)

In [110]:
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,AgeMissing
0,3,1,22.0,1,0,7.25,2,False
1,1,0,38.0,1,0,71.2833,0,False
2,3,0,26.0,0,0,7.925,2,False
3,1,0,35.0,1,0,53.1,2,False
4,3,1,35.0,0,0,8.05,2,False


## Training

In [88]:
tree.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [89]:
y_pred = tree.predict(X)

In [92]:
from sklearn.metrics import accuracy_score
train_acc = accuracy_score(y, y_pred)
print(f'Train accuracy: {train_acc:.2%}')

Train accuracy: 98.20%


## Test predictions

In [93]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [96]:
test.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
PassengerId,418,,,,1100.5,120.81,892.0,996.25,1100.5,1204.75,1309.0
Pclass,418,,,,2.26555,0.841838,1.0,1.0,3.0,3.0,3.0
Name,418,418.0,"Dodge, Dr. Washington",1.0,,,,,,,
Sex,418,2.0,male,266.0,,,,,,,
Age,332,,,,30.2726,14.1812,0.17,21.0,27.0,39.0,76.0
SibSp,418,,,,0.447368,0.89676,0.0,0.0,0.0,1.0,8.0
Parch,418,,,,0.392344,0.981429,0.0,0.0,0.0,0.0,9.0
Ticket,418,363.0,PC 17608,5.0,,,,,,,
Fare,417,,,,35.6272,55.9076,0.0,7.8958,14.4542,31.5,512.329
Cabin,91,76.0,B57 B59 B63 B66,3.0,,,,,,,


In [117]:
def process_data(X, keep_cols, cat_cols, lencoders, mean_age):
    X = X[keep_cols].copy()
    for c in cat_cols:
        X[c] = X[c].astype(str)
        X[c] = lencoders[c].transform(X[c])
    X['AgeMissing'] = X.Age.isnull()
    X['Age'] = X.Age.fillna(mean_age)
    return X

In [118]:
X_test = process_data(test, keep_cols, cat_cols, lencoders, mean_age)

In [119]:
X_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,AgeMissing
0,3,1,34.5,0,0,7.8292,1,False
1,3,0,47.0,1,0,7.0,2,False
2,2,1,62.0,0,0,9.6875,1,False
3,3,1,27.0,0,0,8.6625,2,False
4,3,0,22.0,1,1,12.2875,2,False


In [120]:
X_test['Fare'] = X_test.Fare.fillna(X.Fare.mean())

In [121]:
y_test = tree.predict(X_test)

In [125]:
submission = pd.read_csv('data/titanic/gender_submission.csv')
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [127]:
submission.PassengerId.tolist() == test.PassengerId.tolist()

True

In [131]:
submission['Survived'] = y_test

In [132]:
submission.to_csv('submission.csv', index=False)

# Baseline

In [133]:
train.Survived.value_counts(normalize=True)

0    0.616162
1    0.383838
Name: Survived, dtype: float64

In [135]:
submission['Survived'] = 0

In [136]:
submission.to_csv('all_zeros.csv', index=False)