# 4. Using Sklearn and comparing results

In [1]:
from sklearn.datasets import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

min_samples_split = 10
min_samples_leaf = 5

data = load_wine()
X = data.data
y = data.target
labels = data.feature_names

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

def compare(pred1, pred2, pred_name_1=None, pred_name_2=None):
    pred1.fit(X_train, y_train)
    y_pred_1 = pred1.predict(X_test)
    acc_score_1 = accuracy_score(y_test, y_pred_1)
    
    pred2.fit(X_train, y_train)
    y_pred_2 = pred2.predict(X_test)
    acc_score_2 = accuracy_score(y_test, y_pred_2)
    
    if pred_name_1 is None: pred_name_1 = pred1.__class__.__name__
    if pred_name_2 is None: pred_name_2 = pred2.__class__.__name__
        
    print(F"{pred_name_1}: {acc_score_1}\nvs\n{pred_name_2}: {acc_score_2}")

### Decision Tree

In [2]:
from decision_tree import DecisionTree as MyDecisionTree
from sklearn.tree import DecisionTreeClassifier

mydt = MyDecisionTree(min_samples_split, min_samples_leaf, verbose=False)
dt = DecisionTreeClassifier(min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
compare(mydt, dt, "My DT ", "SKL DT")

My DT : 0.9491525423728814
vs
SKL DT: 0.9491525423728814


### Random Forest

In [4]:
from random_forest import RandomForest as MyRandomForest
from sklearn.ensemble import RandomForestClassifier

n_est = 200
myrf = MyRandomForest(min_samples_split, min_samples_leaf, n_est)
sklrf = RandomForestClassifier(min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, n_estimators=n_est)
compare(myrf, sklrf, "My RF ", "SKL RF")

My RF : 0.9661016949152542
vs
SKL RF: 0.9661016949152542


### Play around with the parameters

#### Desision Tree

In [29]:
from sklearn.model_selection import GridSearchCV

params = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_features': ['auto', 'sqrt', 'log2', None],
    'max_leaf_nodes': list(range(2, 10)), 
    'min_samples_split': list(range(2, 20)),
    'min_samples_leaf': list(range(1, 10))
}
grid_search_cv = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1, verbose=1, cv=3)

grid_search_cv.fit(X_train, y_train);
grid_search_cv.best_estimator_

Fitting 3 folds for each of 20736 candidates, totalling 62208 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 6640 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 22640 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done 45040 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done 62208 out of 62208 | elapsed:   23.4s finished


DecisionTreeClassifier(criterion='entropy', max_leaf_nodes=5,
                       min_samples_leaf=2, random_state=42)

In [30]:
y_pred_best = grid_search_cv.best_estimator_.predict(X_test)
acc_score_best = accuracy_score(y_test, y_pred_best)
acc_score_best

0.9322033898305084

#### Random Forest

In [34]:
params = {
    'n_estimators': list(range(10, 50)),
    'min_samples_split': list(range(2, 20)),
    'min_samples_leaf': list(range(1, 10))
}
grid_search_cv = GridSearchCV(RandomForestClassifier(random_state=42), params, n_jobs=-1, verbose=1, cv=3)

grid_search_cv.fit(X_train, y_train);
grid_search_cv.best_estimator_

Fitting 3 folds for each of 6480 candidates, totalling 19440 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 656 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 1656 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done 3056 tasks      | elapsed:   28.4s
[Parallel(n_jobs=-1)]: Done 4856 tasks      | elapsed:   46.0s
[Parallel(n_jobs=-1)]: Done 7056 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 9656 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 12656 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 16056 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 19440 out of 19440 | elapsed:  3.2min finished


RandomForestClassifier(min_samples_split=6, n_estimators=17, random_state=42)

In [36]:
y_pred_best = grid_search_cv.best_estimator_.predict(X_test)
acc_score_best = accuracy_score(y_test, y_pred_best)
acc_score_best

1.0

In [40]:
import numpy as np
feature_importances = grid_search_cv.best_estimator_.feature_importances_
index = np.argmax(feature_importances)
labels[index]

'flavanoids'