In [47]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.cross_validation import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score

In [2]:
plt.style.use('ggplot')
plt.rc('figure', dpi=90)

In [16]:
data = pd.read_csv('train.csv', index_col=0)

In [17]:
data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Hot encode categoricals

In [20]:
data = pd.get_dummies(data, 
                      columns=['Pclass', 'Sex', 'Embarked'], 
                      drop_first=True)

In [21]:
data.head()

Unnamed: 0_level_0,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,0,1,1,0,1
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,0,0,0,0,0
3,1,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,0,1,0,0,1
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,0,0,0,0,1
5,0,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,0,1,1,0,1


## Join SibSp and Parch into single boolean variable: has family?

In [36]:
has_family = (data.SibSp == 1) | (data.Parch == 1)
data = pd.concat([data, has_family.rename('Family')], axis=1)

In [40]:
data.drop(['SibSp', 'Parch'], axis=1, inplace=True)

## Transform Cabin into boolean: has cabin?

In [41]:
data['Cabin'] = data.Cabin.notna()
data.head()

Unnamed: 0_level_0,Survived,Name,Age,Ticket,Fare,Cabin,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S,Family
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,"Braund, Mr. Owen Harris",22.0,A/5 21171,7.25,False,0,1,1,0,1,True
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,PC 17599,71.2833,True,0,0,0,0,0,True
3,1,"Heikkinen, Miss. Laina",26.0,STON/O2. 3101282,7.925,False,0,1,0,0,1,False
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,113803,53.1,True,0,0,0,0,1,True
5,0,"Allen, Mr. William Henry",35.0,373450,8.05,False,0,1,1,0,1,False


## Predictors and labels

In [48]:
predictors = [
    'Fare', 
    'Cabin', 
    'Pclass_2', 
    'Pclass_3', 
    'Sex_male', 
    'Embarked_S', 
    'Embarked_Q', 
    'Family'
]
X = data.loc[:, predictors]
y = data.loc[:, 'Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25)

## The baseline

In [49]:
cls = DummyClassifier(strategy='most_frequent')

In [50]:
cls.fit(X_train, y_train)

DummyClassifier(constant=None, random_state=None, strategy='most_frequent')

In [51]:
pred = cls.predict(X_test)

In [52]:
accuracy_score(y_test, pred)

0.6233183856502242

## Algorithms

### SVC

In [53]:
from sklearn.svm import SVC

In [54]:
cls = SVC()
cls.fit(X_train, y_train)
pred = cls.predict(X_test)
accuracy_score(y_test, pred)

0.757847533632287

### Random Forest

In [55]:
from sklearn.ensemble import RandomForestClassifier

In [56]:
cls = RandomForestClassifier()
cls.fit(X_train, y_train)
pred = cls.predict(X_test)
accuracy_score(y_test, pred)

0.8071748878923767

### Submission

In [78]:
from sklearn.preprocessing import Imputer

In [89]:
test_data = pd.read_csv('test.csv', index_col=0)
test_data.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [63]:
test_data.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [90]:
imputer = Imputer(strategy='mean')

In [91]:
imputer.fit(test_data.loc[:, ['Fare']])

Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)

In [92]:
test_data['Fare'] = imputer.transform(test_data.loc[:, ['Fare']])

In [93]:
test_data = pd.get_dummies(test_data,
                           columns=['Pclass', 'Sex', 'Embarked'],
                           drop_first=True)
has_family_test = (test_data.SibSp == 1) | (test_data.Parch == 1)
test_data = pd.concat([test_data, has_family_test.rename('Family')],
                      axis=1)
test_data.drop(['SibSp', 'Parch'], axis=1, inplace=True)
test_data['Cabin'] = test_data.Cabin.notna()
test_data.head()

Unnamed: 0_level_0,Name,Age,Ticket,Fare,Cabin,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S,Family
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
892,"Kelly, Mr. James",34.5,330911,7.8292,False,0,1,1,1,0,False
893,"Wilkes, Mrs. James (Ellen Needs)",47.0,363272,7.0,False,0,1,0,0,1,True
894,"Myles, Mr. Thomas Francis",62.0,240276,9.6875,False,1,0,1,1,0,False
895,"Wirz, Mr. Albert",27.0,315154,8.6625,False,0,1,1,0,1,False
896,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",22.0,3101298,12.2875,False,0,1,0,0,1,True


In [94]:
X_submission = test_data.loc[:, predictors]
X_submission.head()

Unnamed: 0_level_0,Fare,Cabin,Pclass_2,Pclass_3,Sex_male,Embarked_S,Embarked_Q,Family
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
892,7.8292,False,0,1,1,0,1,False
893,7.0,False,0,1,0,1,0,True
894,9.6875,False,1,0,1,0,1,False
895,8.6625,False,0,1,1,1,0,False
896,12.2875,False,0,1,0,1,0,True


In [95]:
cls = RandomForestClassifier()
cls.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [96]:
pred = cls.predict(X_submission)

In [97]:
pred

array([0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [99]:
submission = pd.DataFrame({'Survived': pred}, index=test_data.index)

In [100]:
submission.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,1
894,0
895,0
896,1


In [101]:
submission.to_csv('submissions/sub1.csv')