In [2]:
import numpy as np
import pandas
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import classification_report, f1_score, accuracy_score
from sklearn import preprocessing
from sklearn.grid_search import GridSearchCV

In [3]:
get_ipython().magic('matplotlib inline')

Загружаем данные

In [4]:
df = pandas.read_csv('titanic.csv', index_col='id')
df.head()

Unnamed: 0_level_0,class,age,sex,survived
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1st class,adults,man,yes
2,1st class,adults,man,yes
3,1st class,adults,man,yes
4,1st class,adults,man,yes
5,1st class,adults,man,yes


Делаем данные бинарными векторами

In [5]:
classes = pandas.get_dummies(df['class'])

In [6]:
df = pandas.concat([df, classes], axis=1)

In [7]:
lb = preprocessing.LabelBinarizer()

In [12]:
df['age'] = lb.fit_transform(df['age'])
df['sex'] = lb.fit_transform(df['sex'])
df['survived'] = lb.fit_transform(df['survived'])

In [13]:
df.head()

Unnamed: 0_level_0,class,age,sex,survived,1st class,2nd class,3rd class
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1st class,0,0,1,1,0,0
2,1st class,0,0,1,1,0,0
3,1st class,0,0,1,1,0,0
4,1st class,0,0,1,1,0,0
5,1st class,0,0,1,1,0,0


Делаем векторы для обучения и тестов

In [17]:
X_all = df.loc[:,['age','sex', '1st class', '2nd class', '3rd class']]

In [18]:
y_all = df['survived']

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2)

Делаем дерево решений

In [20]:
clf = DecisionTreeClassifier(min_samples_split=10, splitter='random', max_leaf_nodes=4, max_depth=4)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)

(0.78030303030303028, 0.59154929577464788)

Попробуем с помощью грид серч поперебирать разные результаты

In [21]:
parameters = {
    'min_samples_split': tuple(range(2,11)),
    'splitter': ('random', 'best'),
    'criterion': ('gini', 'entropy'),
    'max_leaf_nodes': tuple(range(2,11)),
    'max_depth': tuple(range(2,11))
}

In [22]:
gs_clf = GridSearchCV(clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(X_train, y_train)
y_pred = gs_clf.predict(X_test)
accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)

(0.7992424242424242, 0.6394557823129251)

Мы смогли чуть улучшить результат, теперь посмотрим какие параметры были лучшими

In [24]:
best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda x: x[1])
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

criterion: 'gini'
max_depth: 3
max_leaf_nodes: 7
min_samples_split: 2
splitter: 'random'


Делаем Random Forest

In [25]:
clf = RandomForestClassifier(n_estimators = 100)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)

(0.74242424242424243, 0.62222222222222223)

Он оказался чуть хуже просто дерева решений. Попробуем снова поискать параметры

In [26]:
parameters = {
    'n_estimators': (50, 100, 500),
    'min_samples_split': tuple(range(2,11))
}

In [27]:
gs_clf = GridSearchCV(clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(X_train, y_train)
y_pred = gs_clf.predict(X_test)
accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)

(0.7992424242424242, 0.6394557823129251)

In [28]:
best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda x: x[1])
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

min_samples_split: 8
n_estimators: 50


С такими параметрами результат вышел похожим на то что мы видели с одним деревом решений