<a href="https://colab.research.google.com/github/cleysonl/ML_Bootcamp_CLL/blob/master/Advanced_ML_SM_DT_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pydotplus



In [0]:
import numpy as np
import pandas as pd

# Remote CSV holding the wine-quality table used throughout this notebook.
wine_csv_url = 'https://raw.githubusercontent.com/dipanjanS/appliedml_workshop_dhs_av_2019/master/Module%2008%20-%20%20Advanced%20ML%20-%20Supervised%20Models/data/Wine_Quality_Data.csv'
data = pd.read_csv(wine_csv_url)

### **Encode Target Variable**

In [0]:
# Encode the target as integers: white -> 0, red -> 1.
# Cast with the builtin `int`: the `np.int` alias was deprecated in NumPy 1.20
# and removed in 1.24, where it raises AttributeError.
# `replace` (rather than `map`) keeps this cell safe to re-run, because values
# that are already 0/1 pass through unchanged.
data['color'] = data['color'].replace({'white': 0, 'red': 1}).astype(int)

In [4]:
# Peek at ten randomly chosen rows (no seed, so the rows differ between runs).
data.sample(n=10)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,color
1619,6.2,0.66,0.48,1.2,0.029,29.0,75.0,0.9892,3.33,0.39,12.8,8,0
2503,6.9,0.21,0.28,2.4,0.056,49.0,159.0,0.9944,3.02,0.47,8.8,8,0
4109,6.2,0.28,0.45,7.5,0.045,46.0,203.0,0.99573,3.26,0.46,9.2,6,0
5456,5.4,0.17,0.27,2.7,0.049,28.0,104.0,0.99224,3.46,0.55,10.3,6,0
1733,6.8,0.27,0.22,8.1,0.034,55.0,203.0,0.9961,3.19,0.52,8.9,5,0
2522,6.5,0.26,0.27,12.9,0.044,69.0,215.0,0.9967,3.17,0.43,10.0,6,0
4988,6.6,0.24,0.3,11.3,0.026,11.0,77.0,0.99381,3.13,0.55,12.8,7,0
4396,6.8,0.35,0.53,10.1,0.053,37.0,151.0,0.9963,3.07,0.4,9.4,5,0
123,8.0,0.71,0.0,2.6,0.08,11.0,34.0,0.9976,3.44,0.53,9.5,5,1
2704,5.5,0.32,0.45,4.9,0.028,25.0,191.0,0.9922,3.51,0.49,11.5,7,0


### **Train and Test sets**

In [0]:
# Every column except the target is a feature.
# Compare with `!=`: `x not in 'color'` is a substring test against the string
# 'color', so a column named e.g. 'col' or 'or' would be silently dropped.
feature_cols = [x for x in data.columns if x != 'color']

# use StratifiedShuffleSplit to split into train and test sets that are stratified
from sklearn.model_selection import StratifiedShuffleSplit

# Configure a single stratified split that holds out 1000 rows for testing.
# This object is the splitter; the generator comes from its .split() call below.
strat_shuff_split = StratifiedShuffleSplit(n_splits=1, test_size=1000, random_state=42)

# .split() yields (train, test) pairs of positional row indices; with
# n_splits=1 there is exactly one pair, so take it with next().
train_idx, test_idx = next(strat_shuff_split.split(data[feature_cols], data['color']))

# Create the data sets.
# The indices are positions, not labels, so select rows with .iloc; .loc would
# only be correct while `data` keeps its default 0..n-1 index.
X_train = data[feature_cols].iloc[train_idx]
y_train = data['color'].iloc[train_idx]

X_test = data[feature_cols].iloc[test_idx]
y_test = data['color'].iloc[test_idx]

In [7]:
# (rows, columns) of the training and test feature matrices.
(X_train.shape, X_test.shape)

((5497, 12), (1000, 12))

### **Check Distribution**

In [8]:
# Share of each class in the training labels, ordered by class value.
(
    y_train
    .value_counts(normalize=True)
    .sort_index()
)

0    0.753866
1    0.246134
Name: color, dtype: float64

In [9]:
# Share of each class in the test labels, ordered by class value.
(
    y_test
    .value_counts(normalize=True)
    .sort_index()
)

0    0.754
1    0.246
Name: color, dtype: float64

### **Model Evaluation**

In [0]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree with no depth or feature limits on the training split;
# fit() returns the estimator itself, so construction and fitting can be chained.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

In [11]:
# Report how large the fitted tree grew.
full_tree = dt.tree_
print("Number of nodes= {}\nMaximum Depth= {}".format(full_tree.node_count,
                                                      full_tree.max_depth))

Number of nodes= 171
Maximum Depth= 22


In [0]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def measure_score(y_true, y_pred, label):
  """Summarise classification quality as a named pandas Series.

  Parameters
  ----------
  y_true : true labels.
  y_pred : predicted labels, aligned with ``y_true``.
  label : name given to the returned Series (used as the column header when
      several results are concatenated).

  Returns
  -------
  pd.Series indexed by 'accuracy', 'precision', 'recall' and 'f1', each
  computed by the matching sklearn.metrics function with its default settings.
  """
  # Insertion order of this mapping fixes the row order of the result.
  metric_fns = {'accuracy': accuracy_score,
                'precision': precision_score,
                'recall': recall_score,
                'f1': f1_score}
  scores = {metric: fn(y_true, y_pred) for metric, fn in metric_fns.items()}
  return pd.Series(scores, name=label)

In [14]:
# Predict on both splits with the unconstrained tree, then tabulate the
# metrics side by side (one column per split).
y_train_pred, y_test_pred = dt.predict(X_train), dt.predict(X_test)

train_test_full_error = pd.concat(
    [
        measure_score(y_train, y_train_pred, 'train'),
        measure_score(y_test, y_test_pred, 'test'),
    ],
    axis=1,
)
train_test_full_error

Unnamed: 0,train,test
accuracy,0.999818,0.984
precision,0.999261,0.96371
recall,1.0,0.971545
f1,0.999631,0.967611


### **GridSearchCV**

In [15]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters: every second depth from 1 up to the depth reached
# by the unconstrained tree `dt`, and each feature count from 1 up to the
# number of entries in `dt.feature_importances_`.
param_grid = dict(
    max_depth=range(1, dt.tree_.max_depth + 1, 2),
    max_features=range(1, len(dt.feature_importances_) + 1),
)

# 3-fold cross-validated search scored on accuracy, run sequentially
# (n_jobs=1); verbose=2 logs every individual fit.
gr = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=1,
    verbose=2,
)

gr.fit(X_train, y_train)

Fitting 3 folds for each of 132 candidates, totalling 396 fits
[CV] max_depth=1, max_features=1 .....................................
[CV] ...................... max_depth=1, max_features=1, total=   0.0s
[CV] max_depth=1, max_features=1 .....................................
[CV] ...................... max_depth=1, max_features=1, total=   0.0s
[CV] max_depth=1, max_features=1 .....................................
[CV] ...................... max_depth=1, max_features=1, total=   0.0s
[CV] max_depth=1, max_features=2 .....................................
[CV] ...................... max_depth=1, max_features=2, total=   0.0s
[CV] max_depth=1, max_features=2 .....................................
[CV] ...................... max_depth=1, max_features=2, total=   0.0s
[CV] max_depth=1, max_features=2 .....................................
[CV] ...................... max_depth=1, max_features=2, total=   0.0s
[CV] max_depth=1, max_features=3 .....................................
[CV] .........

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV] ..................... max_depth=1, max_features=10, total=   0.0s
[CV] max_depth=1, max_features=10 ....................................
[CV] ..................... max_depth=1, max_features=10, total=   0.0s
[CV] max_depth=1, max_features=10 ....................................
[CV] ..................... max_depth=1, max_features=10, total=   0.0s
[CV] max_depth=1, max_features=11 ....................................
[CV] ..................... max_depth=1, max_features=11, total=   0.0s
[CV] max_depth=1, max_features=11 ....................................
[CV] ..................... max_depth=1, max_features=11, total=   0.0s
[CV] max_depth=1, max_features=11 ....................................
[CV] ..................... max_depth=1, max_features=11, total=   0.0s
[CV] max_depth=1, max_features=12 ....................................
[CV] ..................... max_depth=1, max_features=12, total=   0.0s
[CV] max_depth=1, max_features=12 ....................................
[CV] .

[Parallel(n_jobs=1)]: Done 396 out of 396 | elapsed:    6.4s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=42,
                                              splitter='best'),
             iid='deprecated', n_jobs=1,
             param_grid={'max_depth': range(1, 23, 2),
                         'ma

In [16]:
# Report the size of the tree selected by the grid search.
best_tree = gr.best_estimator_.tree_
print("Number of nodes= {}\nMaximum Depth= {}".format(best_tree.node_count,
                                                      best_tree.max_depth))

Number of nodes= 99
Maximum Depth= 7


In [0]:
# Predict on both splits with the fitted grid-search object, then tabulate the
# metrics side by side (one column per split).
y_train_pred_gr, y_test_pred_gr = gr.predict(X_train), gr.predict(X_test)

train_test_gr_error = pd.concat(
    [
        measure_score(y_train, y_train_pred_gr, 'train'),
        measure_score(y_test, y_test_pred_gr, 'test'),
    ],
    axis=1,
)

In [18]:
# Display the train vs. test metrics of the grid-searched model.
train_test_gr_error

Unnamed: 0,train,test
accuracy,0.995816,0.989
precision,0.998501,0.983539
recall,0.984479,0.971545
f1,0.99144,0.977505
