In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from ipywidgets import Image
from io import StringIO
import pydotplus

In [2]:
data=pd.read_csv("../data/diam.csv")

In [3]:
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,target
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43,dewevo
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31,dewevo
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31,dewevo
3,0.29,Premium,I,VS2,62.4,58.0,4.2,4.23,2.63,dewevo
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75,dewevo


In [4]:
data['cut']=pd.Categorical(data['cut'])
data['color']=pd.Categorical(data['color'])
data['clarity']=pd.Categorical(data['clarity'])

In [5]:
dataDummies = pd.get_dummies(data['cut'], prefix = 'category')
datDummies = pd.get_dummies(data['color'], prefix = 'category')
daDummies = pd.get_dummies(data['clarity'], prefix = 'category')
data=pd.concat([data,dataDummies,datDummies,daDummies],axis=1)

In [6]:
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,target,...,category_I,category_J,category_I1,category_IF,category_SI1,category_SI2,category_VS1,category_VS2,category_VVS1,category_VVS2
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43,dewevo,...,0,0,0,0,0,1,0,0,0,0
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31,dewevo,...,0,0,0,0,1,0,0,0,0,0
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31,dewevo,...,0,0,0,0,0,0,1,0,0,0
3,0.29,Premium,I,VS2,62.4,58.0,4.2,4.23,2.63,dewevo,...,1,0,0,0,0,0,0,1,0,0
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75,dewevo,...,0,1,0,0,0,1,0,0,0,0


In [7]:
data.drop(['cut','color','clarity','table','x','y','z'],axis=1,inplace=True)

In [8]:
data.head()

Unnamed: 0,carat,depth,target,category_Fair,category_Good,category_Ideal,category_Premium,category_Very Good,category_D,category_E,...,category_I,category_J,category_I1,category_IF,category_SI1,category_SI2,category_VS1,category_VS2,category_VVS1,category_VVS2
0,0.23,61.5,dewevo,0,0,1,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
1,0.21,59.8,dewevo,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0
2,0.23,56.9,dewevo,0,1,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,0.29,62.4,dewevo,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,1,0,0
4,0.31,63.3,dewevo,0,1,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 23 columns):
carat                 53940 non-null float64
depth                 53940 non-null float64
target                53940 non-null object
category_Fair         53940 non-null uint8
category_Good         53940 non-null uint8
category_Ideal        53940 non-null uint8
category_Premium      53940 non-null uint8
category_Very Good    53940 non-null uint8
category_D            53940 non-null uint8
category_E            53940 non-null uint8
category_F            53940 non-null uint8
category_G            53940 non-null uint8
category_H            53940 non-null uint8
category_I            53940 non-null uint8
category_J            53940 non-null uint8
category_I1           53940 non-null uint8
category_IF           53940 non-null uint8
category_SI1          53940 non-null uint8
category_SI2          53940 non-null uint8
category_VS1          53940 non-null uint8
category_VS2          5394

In [10]:
data['target']=data['target'].map({'dewevo':0,'dorogo':1})

In [11]:
y=data['target'].astype('int')

In [12]:
X=data.drop('target',axis=1)

In [13]:
X.shape, y.shape

((53940, 22), (53940,))

In [14]:
from sklearn.model_selection import train_test_split, cross_val_score
import numpy as np

In [15]:
X_train,X_valid,y_train,y_valid=train_test_split(X, y, 
                                                  test_size=0.3,
                                                  random_state=17)

In [16]:
X_train.shape,y_train.shape

((37758, 22), (37758,))

In [17]:
first_tree=DecisionTreeClassifier(random_state=17)

In [18]:
np.mean(cross_val_score(first_tree,X_train,y_train,cv=5))

0.9683511336850377

In [19]:
from sklearn.model_selection import GridSearchCV

In [20]:
tree_params={'max_depth':np.arange(1,11),'max_features':([.5,.7,1])}

In [21]:
tree_grid=GridSearchCV(first_tree,tree_params,cv=5,n_jobs=-1)

In [22]:
%%time
tree_grid.fit(X_train,y_train)

Wall time: 9.97 s


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=17,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]), 'max_features': [0.5, 0.7, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [23]:
tree_grid.best_score_,tree_grid.best_params_

(0.9730123417553895, {'max_depth': 10, 'max_features': 0.7})

In [44]:
tree_valid_pred=tree_grid.predict(X_valid)

In [45]:
from sklearn.metrics import accuracy_score

In [46]:
accuracy_score(y_valid,tree_valid_pred)

0.9723767148683723

In [1]:
from sklearn.tree import export_graphviz

In [48]:
second_tree = DecisionTreeClassifier(max_depth=5).fit(X_train, y_train)
second_tree.score(X_valid, y_valid)

0.9652700531454703

In [55]:
export_graphviz(tree_grid.best_estimator_,out_file='diamond.dot',
               feature_names=X.columns, filled=True)

In [56]:
ls -l *.dot

 Том в устройстве C имеет метку Acer
 Серийный номер тома: 7A9D-E493

 Содержимое папки C:\Users\Dinara\Anaconda3\Decision Tree


 Содержимое папки C:\Users\Dinara\Anaconda3\Decision Tree

25.06.2019  19:20            67 816 diamond.dot
25.06.2019  19:19             6 461 diamond2.dot
22.06.2019  13:33             8 222 telecom_tree2.dot
22.06.2019  16:43                 0 titanic_tree.dot
22.06.2019  15:51                 0 tititanic_tree.dot
               5 файлов         82 499 байт
               0 папок  306 757 378 048 байт свободно


In [57]:
!dot -Tpng diamond.dot -o diamond.png

<img src = 'diamond.png'>