In [80]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.tree import export_graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [81]:
data = pd.read_csv('data/train.csv')

In [82]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [83]:
y = data['Survived']
features = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked']
X = data[features]

In [84]:
X.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked
0,3,male,22.0,7.25,S
1,1,female,38.0,71.2833,C
2,3,female,26.0,7.925,S
3,1,female,35.0,53.1,S
4,3,male,35.0,8.05,S


In [85]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
Pclass      891 non-null int64
Sex         891 non-null object
Age         714 non-null float64
Fare        891 non-null float64
Embarked    889 non-null object
dtypes: float64(2), int64(1), object(2)
memory usage: 34.9+ KB


In [86]:
num_features = [c for c in X.columns if not X[c].dtype == object]
cat_features = [c for c in X.columns if X[c].dtype == object]

In [87]:
X[num_features].describe()

Unnamed: 0,Pclass,Age,Fare
count,891.0,714.0,891.0
mean,2.308642,29.699118,32.204208
std,0.836071,14.526497,49.693429
min,1.0,0.42,0.0
25%,2.0,20.125,7.9104
50%,3.0,28.0,14.4542
75%,3.0,38.0,31.0
max,3.0,80.0,512.3292


In [88]:
X['Age'] = X['Age'].fillna(X['Age'].mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [89]:
X[num_features].describe()

Unnamed: 0,Pclass,Age,Fare
count,891.0,891.0,891.0
mean,2.308642,29.699118,32.204208
std,0.836071,13.002015,49.693429
min,1.0,0.42,0.0
25%,2.0,22.0,7.9104
50%,3.0,29.699118,14.4542
75%,3.0,35.0,31.0
max,3.0,80.0,512.3292


In [90]:
X[cat_features].describe()

Unnamed: 0,Sex,Embarked
count,891,889
unique,2,3
top,male,S
freq,577,644


In [91]:
X['Embarked'] = X['Embarked'].fillna(X['Embarked'].describe()['top'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [92]:
X[cat_features].describe()

Unnamed: 0,Sex,Embarked
count,891,891
unique,2,3
top,male,S
freq,577,646


In [93]:
X.at[X['Sex'] == 'male', 'Sex'] = 0
X.at[X['Sex'] == 'female', 'Sex'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [94]:
embark_data = pd.get_dummies(X['Embarked'])
embark_data.columns

Index(['C', 'Q', 'S'], dtype='object')

In [95]:
X = pd.concat((embark_data, X.drop('Embarked', axis=1)), axis=1)

In [96]:
x_train, x_test, y_train, y_test = train_test_split(X, y)

In [97]:
model = DecisionTreeClassifier()
model.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [98]:
y_pred = model.predict(x_test)

In [99]:
accuracy_score(y_test, y_pred)

0.7623318385650224

In [102]:
from sklearn.model_selection import GridSearchCV

Основные параметры класса [sklearn.tree.DecisionTreeClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html):

- `max_depth` – максимальная глубина дерева
- `max_features` - максимальное число признаков, по которым ищется лучшее разбиение в дереве (это нужно потому, что при большом количестве признаков будет "дорого" искать лучшее (по критерию типа прироста информации) разбиение среди *всех* признаков)
- `min_samples_leaf` – минимальное число объектов в листе. У этого параметра есть понятная интерпретация: скажем, если он равен 5, то дерево будет порождать только те классифицирующие правила, которые верны как мимимум для 5 объектов

Параметры дерева надо настраивать в зависимости от входных данных, и делается это обычно с помощью *кросс-валидации*,  про нее чуть ниже.


In [117]:
params = {'criterion':['gini','entropy'],'max_depth':[5, 10, 20], 'min_samples_leaf': [5, 10, 20, 25]}

In [121]:
model = DecisionTreeClassifier(random_state=241)
clf = GridSearchCV(model, params, cv=5)
clf.fit(x_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=241,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'criterion': ['gini', 'entropy'], 'min_samples_leaf': [5, 10, 20, 25], 'max_depth': [5, 10, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [122]:
y_pred = clf.predict(x_test)

In [123]:
accuracy_score(y_test, y_pred)

0.7713004484304933