# Tree

In [1]:
import pickle
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix

### Get Data

In [2]:
df = pd.read_csv('C:/Users/Desktop/python/03 advance/overfitting.csv')
df = df.drop(['Target_Leaderboard','Target_Evaluate'],axis=1)
train = df[df['train']==1]
train = train.drop(['case_id','train'],axis=1)
test = df[df['train']==0]
test = test.drop(['case_id','train'],axis=1)
y_train = train['Target_Practice']
y_test = test['Target_Practice']
x_train = train.drop(['Target_Practice'],axis=1)
x_test = test.drop(['Target_Practice'],axis=1)

### Decision Trees

In [3]:
dtree = DecisionTreeClassifier(criterion='entropy')
paramgrid = {'criterion': ["entropy","gini"], 'splitter': ["best","random"], 
             'max_features': ['auto','log2'], 'max_leaf_nodes':[5,10,25,50,100,1000]} 
grid = GridSearchCV(dtree,param_grid=paramgrid,refit=True,verbose=4,cv=5,n_jobs=-1)
grid.fit(x_train,y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:   10.1s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'criterion': ['entropy', 'gini'], 'splitter': ['best', 'random'], 'max_features': ['auto', 'log2'], 'max_leaf_nodes': [5, 10, 25, 50, 100, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=4)

In [4]:
print(grid.best_params_)
print(grid.best_estimator_)

{'criterion': 'entropy', 'max_features': 'log2', 'max_leaf_nodes': 10, 'splitter': 'random'}
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features='log2', max_leaf_nodes=10,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='random')


In [5]:
predictions = grid.predict(x_test)
predictions

array([1, 1, 0, ..., 1, 0, 0], dtype=int64)

### Evaluation

In [6]:
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

          0       0.54      0.31      0.39      9909
          1       0.51      0.74      0.61      9841

avg / total       0.53      0.52      0.50     19750



In [7]:
print('accuracy %s' % accuracy_score(y_test, predictions))

accuracy 0.522329113924


In [8]:
print('confusion matrix\n %s' % confusion_matrix(y_test, predictions))

confusion matrix
 [[3052 6857]
 [2577 7264]]


### Random Forest

In [9]:
rfcn=RandomForestClassifier(n_estimators=1000,oob_score=True,n_jobs=-1)
paramgrid = {'criterion': ["entropy","gini"], 'max_features': ['auto','log2','sqrt'], 
             'max_leaf_nodes':[5,10,25,50,100]} 
grid = GridSearchCV(rfcn,paramgrid,refit=True,n_jobs=-1,cv=10)
grid.fit(x_train,y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=True, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'criterion': ['entropy', 'gini'], 'max_features': ['auto', 'log2', 'sqrt'], 'max_leaf_nodes': [5, 10, 25, 50, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [10]:
print(grid.best_params_)
print(grid.best_estimator_)

{'criterion': 'entropy', 'max_features': 'auto', 'max_leaf_nodes': 100}
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=100,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)


In [11]:
predictions = grid.predict(x_test)
predictions

array([1, 0, 1, ..., 1, 1, 1], dtype=int64)

### Evaluation

In [12]:
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

          0       0.74      0.57      0.64      9909
          1       0.65      0.80      0.72      9841

avg / total       0.70      0.68      0.68     19750



In [13]:
print('accuracy %s' % accuracy_score(y_test, predictions))

accuracy 0.684


In [14]:
print('confusion matrix\n %s' % confusion_matrix(y_test, predictions))

confusion matrix
 [[5613 4296]
 [1945 7896]]


### Feature Importance

In [15]:
rfc = RandomForestClassifier(n_estimators=1000,oob_score=True,n_jobs=-1,
                            max_leaf_nodes=100, criterion='gini', max_features='log2')
rfc.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='log2', max_leaf_nodes=100,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [16]:
importances = rfc.feature_importances_
std = np.std([tree.feature_importances_ for tree in rfc.estimators_],axis=0)
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
for f in range(x_train.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

Feature ranking:
1. feature 49 (0.012884)
2. feature 177 (0.010923)
3. feature 126 (0.010822)
4. feature 39 (0.010404)
5. feature 6 (0.009471)
6. feature 125 (0.009157)
7. feature 116 (0.009144)
8. feature 87 (0.008376)
9. feature 164 (0.008375)
10. feature 115 (0.007867)
11. feature 46 (0.007865)
12. feature 160 (0.007806)
13. feature 152 (0.007697)
14. feature 147 (0.007669)
15. feature 104 (0.007397)
16. feature 118 (0.007247)
17. feature 26 (0.007172)
18. feature 173 (0.007135)
19. feature 2 (0.007104)
20. feature 189 (0.006933)
21. feature 42 (0.006914)
22. feature 53 (0.006882)
23. feature 3 (0.006649)
24. feature 157 (0.006534)
25. feature 48 (0.006412)
26. feature 127 (0.006339)
27. feature 25 (0.006256)
28. feature 198 (0.006226)
29. feature 24 (0.006181)
30. feature 108 (0.006159)
31. feature 83 (0.006010)
32. feature 38 (0.005998)
33. feature 186 (0.005974)
34. feature 196 (0.005945)
35. feature 176 (0.005821)
36. feature 145 (0.005820)
37. feature 68 (0.005803)
38. feature 