In [1]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split


X, y = make_classification(
    n_features=10, 
    n_samples=1000, 
    n_informative=8,
    n_redundant=2,
    n_repeated=0,
    n_classes=2, 
    random_state=42
)

### Evaluate the model using train, test split and tune parameters by trial and error

In [2]:
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
model = DecisionTreeClassifier(criterion="gini",max_depth=10)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
cr = classification_report(y_test,y_pred)
print(cr)

              precision    recall  f1-score   support

           0       0.85      0.82      0.84       130
           1       0.81      0.84      0.83       120

    accuracy                           0.83       250
   macro avg       0.83      0.83      0.83       250
weighted avg       0.83      0.83      0.83       250



In [4]:
model = DecisionTreeClassifier(criterion="entropy",max_depth=15)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
cr = classification_report(y_test,y_pred)
print(cr)

              precision    recall  f1-score   support

           0       0.82      0.76      0.79       130
           1       0.76      0.82      0.79       120

    accuracy                           0.79       250
   macro avg       0.79      0.79      0.79       250
weighted avg       0.79      0.79      0.79       250



In [5]:
from sklearn.model_selection import cross_val_score

cross_val_score(DecisionTreeClassifier(criterion="entropy",max_depth=10),X,y,cv=5)

array([0.77 , 0.79 , 0.81 , 0.765, 0.795])

In [6]:
cross_val_score(DecisionTreeClassifier(criterion="gini",max_depth=5),X,y,cv=5)

array([0.78 , 0.815, 0.745, 0.805, 0.77 ])

In [9]:
criterion = ["gini","entropy"]
max_depth = [5,10,15]

avg_scores = {}
for c in criterion:
    for d in max_depth:
        clf = DecisionTreeClassifier(criterion=c,max_depth=d)
        scores = cross_val_score(clf,X,y,cv=5)
        avg_scores[c + "_" + str(d)]=np.mean(scores)
avg_scores

{'gini_5': 0.783,
 'gini_10': 0.788,
 'gini_15': 0.7899999999999999,
 'entropy_5': 0.7799999999999999,
 'entropy_10': 0.7939999999999999,
 'entropy_15': 0.812}

### Using GridSearchCV

In [11]:
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(
    DecisionTreeClassifier(),
    {
        'criterion':["gini","entropy"],
        'max_depth':[5,10,15]
    },
    cv=5,
    return_train_score=False
)
clf.fit(X,y)
clf.cv_results_

{'mean_fit_time': array([0.00380607, 0.00364742, 0.00656843, 0.00610957, 0.01086679,
        0.00909667]),
 'std_fit_time': array([0.00133496, 0.00342585, 0.00653892, 0.00748446, 0.00764827,
        0.00034894]),
 'mean_score_time': array([0.0006249 , 0.00019999, 0.        , 0.00040097, 0.        ,
        0.00040126]),
 'std_score_time': array([0.00060534, 0.00039997, 0.        , 0.00080194, 0.        ,
        0.00049145]),
 'param_criterion': masked_array(data=['gini', 'gini', 'gini', 'entropy', 'entropy',
                    'entropy'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_max_depth': masked_array(data=[5, 10, 15, 5, 10, 15],
              mask=[False, False, False, False, False, False],
        fill_value=999999),
 'params': [{'criterion': 'gini', 'max_depth': 5},
  {'criterion': 'gini', 'max_depth': 10},
  {'criterion': 'gini', 'max_depth': 15},
  {'criterion': 'entropy', 'max_depth': 5},
  {'cr

In [12]:
import pandas as pd

df = pd.DataFrame(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003806,0.001335,0.000625,0.000605,gini,5,"{'criterion': 'gini', 'max_depth': 5}",0.78,0.79,0.755,0.805,0.775,0.781,0.016553,5
1,0.003647,0.003426,0.0002,0.0004,gini,10,"{'criterion': 'gini', 'max_depth': 10}",0.79,0.72,0.795,0.795,0.81,0.782,0.031718,4
2,0.006568,0.006539,0.0,0.0,gini,15,"{'criterion': 'gini', 'max_depth': 15}",0.81,0.695,0.82,0.815,0.83,0.794,0.04994,2
3,0.00611,0.007484,0.000401,0.000802,entropy,5,"{'criterion': 'entropy', 'max_depth': 5}",0.765,0.785,0.765,0.815,0.775,0.781,0.018547,6
4,0.010867,0.007648,0.0,0.0,entropy,10,"{'criterion': 'entropy', 'max_depth': 10}",0.765,0.79,0.835,0.755,0.8,0.789,0.028178,3
5,0.009097,0.000349,0.000401,0.000491,entropy,15,"{'criterion': 'entropy', 'max_depth': 15}",0.765,0.795,0.84,0.795,0.85,0.809,0.031528,1


In [13]:
df[["param_criterion","param_max_depth","params","mean_test_score"]]

Unnamed: 0,param_criterion,param_max_depth,params,mean_test_score
0,gini,5,"{'criterion': 'gini', 'max_depth': 5}",0.781
1,gini,10,"{'criterion': 'gini', 'max_depth': 10}",0.782
2,gini,15,"{'criterion': 'gini', 'max_depth': 15}",0.794
3,entropy,5,"{'criterion': 'entropy', 'max_depth': 5}",0.781
4,entropy,10,"{'criterion': 'entropy', 'max_depth': 10}",0.789
5,entropy,15,"{'criterion': 'entropy', 'max_depth': 15}",0.809


In [14]:
clf.best_params_

{'criterion': 'entropy', 'max_depth': 15}

In [15]:
clf.best_score_

0.8089999999999999

In [16]:
clf.best_estimator_

### Now We are going to try with Multiple Models

In [17]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

model_params = {
    'decision_tree':{
        'model':DecisionTreeClassifier(),
        'params':{
            'criterion':['gini','entropy'],
            'max_depth':[5,10,15]
        }
    },
    'svm':{
        'model':SVC(),
        'params':{
            'kernel':['rbf','linear'],
            'C':[1,10,20]
        }
    },
    'randomforest_classifier':{
        'model':RandomForestClassifier(),
        'params':{
            'n_estimators':[50,100,150],
            'criterion':["gini","entropy"]
        }
    }
}

scores = []
for key,value in model_params.items():
    clf = GridSearchCV(
        value["model"],
        value["params"],
        cv=5,
        return_train_score=False
    )
    clf.fit(X,y)
    scores.append({
        'model':key,
        'best_score':clf.best_score_,
        'best_params':clf.best_params_
    })
scores

[{'model': 'decision_tree',
  'best_score': 0.8039999999999999,
  'best_params': {'criterion': 'entropy', 'max_depth': 15}},
 {'model': 'svm',
  'best_score': 0.93,
  'best_params': {'C': 20, 'kernel': 'rbf'}},
 {'model': 'randomforest_classifier',
  'best_score': 0.899,
  'best_params': {'criterion': 'gini', 'n_estimators': 150}}]

In [18]:
df = pd.DataFrame(scores)
df

Unnamed: 0,model,best_score,best_params
0,decision_tree,0.804,"{'criterion': 'entropy', 'max_depth': 15}"
1,svm,0.93,"{'C': 20, 'kernel': 'rbf'}"
2,randomforest_classifier,0.899,"{'criterion': 'gini', 'n_estimators': 150}"
