In [1]:
import pandas as pd
import sklearn as sn
import numpy as np
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('./titanic/train.csv')
data_test = pd.read_csv('./titanic/test.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [4]:
data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [5]:
X = data[['Name','Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
X_test = data_test[['Name','Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]

In [6]:
n_train = X.shape[0]

In [7]:
X_gl = pd.concat([X, X_test])

In [8]:
X_gl['is_mr'] = X_gl.Name.apply(lambda x: 'Mr.' in x.split()).astype('int64')

In [9]:
X_gl.head()

Unnamed: 0,Name,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,is_mr
0,"Braund, Mr. Owen Harris",3,male,22.0,1,0,7.25,S,1
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,71.2833,C,0
2,"Heikkinen, Miss. Laina",3,female,26.0,0,0,7.925,S,0
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female,35.0,1,0,53.1,S,0
4,"Allen, Mr. William Henry",3,male,35.0,0,0,8.05,S,1


In [10]:
del X_gl['Name']

In [11]:
X_gl = pd.get_dummies(X_gl, columns=['Sex', 'Pclass', 'Embarked'])

In [12]:
X_gl.head()

Unnamed: 0,Age,SibSp,Parch,Fare,is_mr,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S
0,22.0,1,0,7.25,1,0,1,0,0,1,0,0,1
1,38.0,1,0,71.2833,0,1,0,1,0,0,1,0,0
2,26.0,0,0,7.925,0,1,0,0,0,1,0,0,1
3,35.0,1,0,53.1,0,1,0,1,0,0,0,0,1
4,35.0,0,0,8.05,1,0,1,0,0,1,0,0,1


In [13]:
from sklearn.preprocessing import Imputer

In [14]:
imp = Imputer(missing_values='NaN', axis=0, strategy='mean', copy=False)
imp.fit(X_gl)
col = list(X_gl)
X_gl = pd.DataFrame(data=imp.transform(X_gl), columns=col)

In [15]:
X = X_gl.loc[:n_train-1]
X_test = X_gl.loc[n_train:]
print(X.shape, X_test.shape)

(891, 13) (418, 13)


In [16]:
y = data[['Survived']]

In [17]:
from sklearn.tree import DecisionTreeClassifier

In [18]:
clf = DecisionTreeClassifier()
clf.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [19]:
y_predicted = clf.predict(X_test)

In [20]:
pass_ids = pd.read_csv('titanic/test.csv')[['PassengerId']]

In [21]:
with open('submission_01.txt', 'w') as out:
    out.write('PassengerId,Survived\n')
    for passenger, y_ in zip(pass_ids['PassengerId'], y_predicted):
        out.write('%s,%s\n' % (passenger, y_.astype('int64')))

In [22]:
clf.feature_importances_

array([ 0.21458991,  0.0228779 ,  0.03064808,  0.27584113,  0.31446586,
        0.02917161,  0.        ,  0.00247379,  0.00935533,  0.07401836,
        0.00503748,  0.00700907,  0.01451147])

In [23]:
for pair in sorted(zip(list(X), clf.feature_importances_), key=lambda x: x[1], reverse=True):
    print(pair)

('is_mr', 0.31446586050529635)
('Fare', 0.27584113156305995)
('Age', 0.21458991443136965)
('Pclass_3', 0.074018358909046247)
('Parch', 0.03064807619296003)
('Sex_female', 0.02917160780330234)
('SibSp', 0.022877901729425266)
('Embarked_S', 0.014511472111307051)
('Pclass_2', 0.0093553283003367724)
('Embarked_Q', 0.0070090743149285786)
('Embarked_C', 0.0050374832042872266)
('Pclass_1', 0.0024737909346806753)
('Sex_male', 0.0)


In [44]:
from sklearn.model_selection import GridSearchCV
depths = np.arange(1,10)
features_num = np.arange(3,14)
class_weights = ['balanced', None]
min_samples_leaf_ = np.arange(1,5)
min_samples_split_ = [2, 4, 6, 8]
grid = {
    'max_depth': depths,
    'max_features': features_num,
    'class_weight': class_weights,
    'min_samples_leaf': min_samples_leaf_,
    'min_samples_split': min_samples_split_
}
gridsearch = GridSearchCV(DecisionTreeClassifier(), grid, scoring='neg_log_loss', cv=4)

In [45]:
gridsearch.fit(X, y['Survived'])

GridSearchCV(cv=4, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9]), 'max_features': array([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13]), 'class_weight': ['balanced', None], 'min_samples_leaf': array([1, 2, 3, 4]), 'min_samples_split': [2, 4, 6, 8]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_log_loss', verbose=0)

In [46]:
sorted(gridsearch.grid_scores_, key = lambda x: -x.mean_validation_score)



[mean: -0.43207, std: 0.03423, params: {'class_weight': None, 'max_depth': 3, 'max_features': 11, 'min_samples_leaf': 4, 'min_samples_split': 6},
 mean: -0.43784, std: 0.03393, params: {'class_weight': None, 'max_depth': 3, 'max_features': 10, 'min_samples_leaf': 4, 'min_samples_split': 4},
 mean: -0.44052, std: 0.03511, params: {'class_weight': None, 'max_depth': 3, 'max_features': 10, 'min_samples_leaf': 2, 'min_samples_split': 8},
 mean: -0.44083, std: 0.04254, params: {'class_weight': None, 'max_depth': 3, 'max_features': 7, 'min_samples_leaf': 3, 'min_samples_split': 6},
 mean: -0.44132, std: 0.02351, params: {'class_weight': None, 'max_depth': 3, 'max_features': 6, 'min_samples_leaf': 3, 'min_samples_split': 6},
 mean: -0.44134, std: 0.03857, params: {'class_weight': None, 'max_depth': 3, 'max_features': 11, 'min_samples_leaf': 2, 'min_samples_split': 4},
 mean: -0.44274, std: 0.04507, params: {'class_weight': None, 'max_depth': 3, 'max_features': 8, 'min_samples_leaf': 3, 'min_s

In [47]:
clf_upd = gridsearch.best_estimator_

In [48]:
clf_upd.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=11, max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=4, min_samples_split=6,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [49]:
y_pred_upd = clf_upd.predict(X_test)

In [50]:
with open('submission_02.txt', 'w') as out:
    out.write('PassengerId,Survived\n')
    for passenger, y_ in zip(pass_ids['PassengerId'], y_pred_upd):
        out.write('%s,%s\n' % (passenger, y_.astype('int64')))

In [51]:
from sklearn.tree import export_graphviz

def get_tree_dot_view(clf, feature_names=None, class_names=None):
    print(export_graphviz(clf, out_file=None, filled=True, feature_names=feature_names, class_names=class_names))

In [52]:
get_tree_dot_view(clf_upd, list(X_test.columns), ['survived', 'died'])

digraph Tree {
node [shape=box, style="filled", color="black"] ;
0 [label="is_mr <= 0.5\ngini = 0.473\nsamples = 891\nvalue = [549, 342]\nclass = survived", fillcolor="#e5813960"] ;
1 [label="Pclass_3 <= 0.5\ngini = 0.4217\nsamples = 374\nvalue = [113, 261]\nclass = died", fillcolor="#399de591"] ;
0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
2 [label="Sex_female <= 0.5\ngini = 0.2094\nsamples = 202\nvalue = [24, 178]\nclass = died", fillcolor="#399de5dd"] ;
1 -> 2 ;
3 [label="gini = 0.498\nsamples = 32\nvalue = [15, 17]\nclass = died", fillcolor="#399de51e"] ;
2 -> 3 ;
4 [label="gini = 0.1003\nsamples = 170\nvalue = [9, 161]\nclass = died", fillcolor="#399de5f1"] ;
2 -> 4 ;
5 [label="Fare <= 23.35\ngini = 0.4994\nsamples = 172\nvalue = [89, 83]\nclass = survived", fillcolor="#e5813911"] ;
1 -> 5 ;
6 [label="gini = 0.4725\nsamples = 128\nvalue = [49, 79]\nclass = died", fillcolor="#399de561"] ;
5 -> 6 ;
7 [label="gini = 0.1653\nsamples = 44\nvalue = [40, 4]\nclass = sur