In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
import pandas as pd
import numpy as np

In [5]:
df=pd.read_csv("diabetes.csv")

In [6]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [7]:
df.shape

(768, 9)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [9]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [10]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [13]:
df.index

RangeIndex(start=0, stop=768, step=1)

In [7]:
df["Glucose"]=np.where(df["Glucose"]==0, df["Glucose"].median(), df["Glucose"])
df["Insulin"]=np.where(df["Insulin"]==0, df["Insulin"].median(), df["Insulin"])
df["SkinThickness"]=np.where(df["SkinThickness"]==0, df["SkinThickness"].median(), df["SkinThickness"])

In [56]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72,35.0,30.5,33.6,0.627,50,1
1,1,85.0,66,29.0,30.5,26.6,0.351,31,0
2,8,183.0,64,23.0,30.5,23.3,0.672,32,1
3,1,89.0,66,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40,35.0,168.0,43.1,2.288,33,1


In [8]:
X=df.drop("Outcome", axis=1)
y=df['Outcome']

In [9]:
pd.DataFrame(X, columns=df.columns[:-1])

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148.0,72,35.0,30.5,33.6,0.627,50
1,1,85.0,66,29.0,30.5,26.6,0.351,31
2,8,183.0,64,23.0,30.5,23.3,0.672,32
3,1,89.0,66,23.0,94.0,28.1,0.167,21
4,0,137.0,40,35.0,168.0,43.1,2.288,33
5,5,116.0,74,23.0,30.5,25.6,0.201,30
6,3,78.0,50,32.0,88.0,31.0,0.248,26
7,10,115.0,0,23.0,30.5,35.3,0.134,29
8,2,197.0,70,45.0,543.0,30.5,0.158,53
9,8,125.0,96,23.0,30.5,0.0,0.232,54


In [59]:
y.value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=0)

In [12]:
from sklearn.ensemble import RandomForestClassifier

In [73]:
rf_classifier=RandomForestClassifier(n_estimators=50).fit(X_train, y_train)

In [74]:
prediction=rf_classifier.predict(X_test)

In [13]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [76]:
print(confusion_matrix(y_test, prediction))
print(accuracy_score(y_test, prediction))
print(classification_report(y_test, prediction))

[[92 15]
 [15 32]]
0.8051948051948052
              precision    recall  f1-score   support

           0       0.86      0.86      0.86       107
           1       0.68      0.68      0.68        47

    accuracy                           0.81       154
   macro avg       0.77      0.77      0.77       154
weighted avg       0.81      0.81      0.81       154



##### Manual Tuning of Hyperparameter

In [91]:
model1=RandomForestClassifier(n_estimators=300, 
                              criterion="entropy", 
                              max_features="sqrt", 
                              min_samples_leaf=10, 
                              random_state=100).fit(X_train, y_train)

In [92]:
prediction1=model1.predict(X_test)

In [93]:
print(confusion_matrix(y_test, prediction1))
print(accuracy_score(y_test, prediction1))
print(classification_report(y_test, prediction1))

[[97 10]
 [17 30]]
0.8246753246753247
              precision    recall  f1-score   support

           0       0.85      0.91      0.88       107
           1       0.75      0.64      0.69        47

    accuracy                           0.82       154
   macro avg       0.80      0.77      0.78       154
weighted avg       0.82      0.82      0.82       154



#### RandomizedSearchCV 

In [82]:
from sklearn.model_selection import RandomizedSearchCV

In [107]:
n_estimators=[int(x) for x in np.linspace(start=50, stop=2000, num=10)]
max_features=["auto", "sqrt", "log2"]
max_depth=[int(x) for x in np.linspace(10, 1000, 10)]
min_samples_split=[2,5,10,14]
min_samples_leaf=[1,2,4,6,8]
criterion=["gini", "entropy"]

In [108]:
random_grid={"n_estimators":n_estimators,
            "max_features":max_features,
            "max_depth":max_depth,
            "min_samples_split":min_samples_split,
            "min_samples_leaf":min_samples_leaf,
            "criterion":criterion}

In [109]:
print(random_grid)

{'n_estimators': [50, 266, 483, 700, 916, 1133, 1350, 1566, 1783, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [2, 5, 10, 14], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['gini', 'entropy']}


In [110]:
model2=RandomForestClassifier()

In [111]:
model2_randomcv=RandomizedSearchCV(estimator=model2, param_distributions=random_grid, 
                                   n_iter=100, cv=5, verbose=2, random_state=100, n_jobs=-1)

In [112]:
model2_randomcv.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [10, 120, 230, 340, 450,
                                                      560, 670, 780, 890,
                                                      1000],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 4, 6, 8],
                                        'min_samples_split': [2, 5, 10, 14],
                                        'n_estimators': [50, 266, 483, 700, 916,
                                                         1133, 1350, 1566, 1783,
                                                         2000]},
                   random_state=100, verbose=2)

In [113]:
model2_randomcv.best_params_

{'n_estimators': 50,
 'min_samples_split': 10,
 'min_samples_leaf': 6,
 'max_features': 'auto',
 'max_depth': 230,
 'criterion': 'gini'}

In [127]:
best_random_model2=model2_randomcv.best_estimator_

In [128]:
best_random_model2

RandomForestClassifier(max_depth=230, min_samples_leaf=6, min_samples_split=10,
                       n_estimators=50)

In [129]:
prediction2=best_random_model2.predict(X_test)

In [130]:
print(confusion_matrix(y_test, prediction2))
print(accuracy_score(y_test, prediction2))
print(classification_report(y_test, prediction2))

[[98  9]
 [15 32]]
0.8441558441558441
              precision    recall  f1-score   support

           0       0.87      0.92      0.89       107
           1       0.78      0.68      0.73        47

    accuracy                           0.84       154
   macro avg       0.82      0.80      0.81       154
weighted avg       0.84      0.84      0.84       154



#### GridSeachCV

In [118]:
from sklearn.model_selection import GridSearchCV

In [119]:
model3=RandomForestClassifier()

In [131]:
param_grid={
    'criterion': [model2_randomcv.best_params_['criterion']],
    'max_depth': [model2_randomcv.best_params_['max_depth']],
    'max_features': [model2_randomcv.best_params_['max_features']],
    'min_samples_leaf': [model2_randomcv.best_params_['min_samples_leaf'], 
                         model2_randomcv.best_params_['min_samples_leaf']+2, 
                         model2_randomcv.best_params_['min_samples_leaf'] + 4],
    'min_samples_split': [model2_randomcv.best_params_['min_samples_split'] - 2,
                          model2_randomcv.best_params_['min_samples_split'] - 1,
                          model2_randomcv.best_params_['min_samples_split'], 
                          model2_randomcv.best_params_['min_samples_split'] +1,
                          model2_randomcv.best_params_['min_samples_split'] + 2],
    'n_estimators': [model2_randomcv.best_params_['n_estimators'], model2_randomcv.best_params_['n_estimators'] + 100, 
                     model2_randomcv.best_params_['n_estimators'] + 200, 
                     model2_randomcv.best_params_['n_estimators'] + 300, model2_randomcv.best_params_['n_estimators'] + 400]
}

In [132]:
model3_grid_searchcv=GridSearchCV(estimator=model3, param_grid=param_grid, cv=10, n_jobs=-1, verbose=2)

In [133]:
model3_grid_searchcv.fit(X_train, y_train)

Fitting 10 folds for each of 75 candidates, totalling 750 fits


GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini'], 'max_depth': [230],
                         'max_features': ['auto'],
                         'min_samples_leaf': [6, 8, 10],
                         'min_samples_split': [8, 9, 10, 11, 12],
                         'n_estimators': [50, 150, 250, 350, 450]},
             verbose=2)

In [134]:
best_grid_model3=model3_grid_searchcv.best_estimator_

In [135]:
prediction3=best_grid_model3.predict(X_test)

In [136]:
print(confusion_matrix(y_test, prediction3))
print(accuracy_score(y_test, prediction3))
print(classification_report(y_test, prediction3))

[[98  9]
 [20 27]]
0.8116883116883117
              precision    recall  f1-score   support

           0       0.83      0.92      0.87       107
           1       0.75      0.57      0.65        47

    accuracy                           0.81       154
   macro avg       0.79      0.75      0.76       154
weighted avg       0.81      0.81      0.80       154



##### Automated Hyperparameter Tuning
Automated Hyperparameter Tuning can be done by using techniques such as

1. Bayesian Optimization
2. Gradient Descent
3. Evolutionary Algorithms

###### Bayesian Optimization
Bayesian optimization uses probability to find the minimum of a function. The final aim is to find the input value to a function which can gives us the lowest possible output value.It usually performs better than random,grid and manual search providing better performance in the testing phase and reduced optimization time. In Hyperopt, Bayesian Optimization can be implemented giving 3 three main parameters to the function fmin.

1. Objective Function = defines the loss function to minimize.
2. Domain Space = defines the range of input values to test (in Bayesian Optimization this space creates a probability      distribution for each of the used Hyperparameters).
3. Optimization Algorithm = defines the search algorithm to use to select the best input values to use in each new iteration.

In [14]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials

In [15]:
space = {'criterion': hp.choice('criterion', ['entropy', 'gini']),
        'max_depth': hp.quniform('max_depth', 10, 1200, 10),
        'max_features': hp.choice('max_features', ['auto', 'sqrt','log2', None]),
        'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5),
        'min_samples_split' : hp.uniform ('min_samples_split', 0, 1),
        'n_estimators' : hp.choice('n_estimators', [10, 50, 300, 750, 1200,1300,1500])
    }

In [16]:
space

{'criterion': <hyperopt.pyll.base.Apply at 0x20a88e7d0a0>,
 'max_depth': <hyperopt.pyll.base.Apply at 0x20a89214130>,
 'max_features': <hyperopt.pyll.base.Apply at 0x20a89214250>,
 'min_samples_leaf': <hyperopt.pyll.base.Apply at 0x20a89214430>,
 'min_samples_split': <hyperopt.pyll.base.Apply at 0x20a89214550>,
 'n_estimators': <hyperopt.pyll.base.Apply at 0x20a89214640>}

In [17]:
def objective(space):
    model = RandomForestClassifier(criterion = space['criterion'], max_depth = space['max_depth'],
                                 max_features = space['max_features'],
                                 min_samples_leaf = space['min_samples_leaf'],
                                 min_samples_split = space['min_samples_split'],
                                 n_estimators = space['n_estimators'], 
                                 )
    
    accuracy = cross_val_score(model, X_train, y_train, cv = 5).mean()

    # We aim to maximize accuracy, therefore we return it as a negative value
    return {'loss': -accuracy, 'status': STATUS_OK }

In [18]:
from sklearn.model_selection import cross_val_score
trials = Trials()
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = 80,
            trials= trials)

  0%|                                                                           | 0/80 [00:00<?, ?trial/s, best loss=?]

build_posterior_wrapper took 0.021582 seconds
TPE using 0 trials


  1%|▌                                               | 1/80 [00:14<18:47, 14.27s/trial, best loss: -0.6400639744102359]

build_posterior_wrapper took 0.015991 seconds
TPE using 1/1 trials with best loss -0.640064


  2%|█▏                                              | 2/80 [00:48<33:56, 26.11s/trial, best loss: -0.6400639744102359]

build_posterior_wrapper took 0.008000 seconds
TPE using 2/2 trials with best loss -0.640064


  4%|█▊                                              | 3/80 [01:24<39:17, 30.62s/trial, best loss: -0.6400639744102359]

build_posterior_wrapper took 0.014994 seconds
TPE using 3/3 trials with best loss -0.640064


  5%|██▍                                             | 4/80 [01:51<37:04, 29.27s/trial, best loss: -0.6954418232706917]

build_posterior_wrapper took 0.006001 seconds
TPE using 4/4 trials with best loss -0.695442


  6%|███                                             | 5/80 [02:09<31:10, 24.94s/trial, best loss: -0.6954418232706917]

build_posterior_wrapper took 0.005997 seconds
TPE using 5/5 trials with best loss -0.695442


  8%|███▌                                            | 6/80 [02:13<22:10, 17.99s/trial, best loss: -0.6954418232706917]

build_posterior_wrapper took 0.005996 seconds
TPE using 6/6 trials with best loss -0.695442


  9%|████▏                                           | 7/80 [02:30<21:27, 17.64s/trial, best loss: -0.6954418232706917]

build_posterior_wrapper took 0.004997 seconds
TPE using 7/7 trials with best loss -0.695442


 10%|████▊                                           | 8/80 [02:30<14:30, 12.09s/trial, best loss: -0.6954418232706917]

build_posterior_wrapper took 0.005997 seconds
TPE using 8/8 trials with best loss -0.695442


 11%|█████▍                                          | 9/80 [02:34<11:23,  9.62s/trial, best loss: -0.6954418232706917]

build_posterior_wrapper took 0.004992 seconds
TPE using 9/9 trials with best loss -0.695442


 12%|█████▉                                         | 10/80 [02:35<07:50,  6.71s/trial, best loss: -0.6954418232706917]

build_posterior_wrapper took 0.005996 seconds
TPE using 10/10 trials with best loss -0.695442


 14%|██████▍                                        | 11/80 [02:39<06:54,  6.00s/trial, best loss: -0.6954418232706917]

build_posterior_wrapper took 0.004997 seconds
TPE using 11/11 trials with best loss -0.695442


 15%|███████                                        | 12/80 [02:40<05:00,  4.42s/trial, best loss: -0.6954418232706917]

build_posterior_wrapper took 0.005997 seconds
TPE using 12/12 trials with best loss -0.695442


 16%|███████▋                                       | 13/80 [02:44<04:51,  4.35s/trial, best loss: -0.6954418232706917]

build_posterior_wrapper took 0.004997 seconds
TPE using 13/13 trials with best loss -0.695442


 18%|████████▏                                      | 14/80 [03:04<09:51,  8.96s/trial, best loss: -0.7231240836998534]

build_posterior_wrapper took 0.005997 seconds
TPE using 14/14 trials with best loss -0.723124


 19%|████████▊                                      | 15/80 [03:04<06:50,  6.32s/trial, best loss: -0.7231240836998534]

build_posterior_wrapper took 0.005999 seconds
TPE using 15/15 trials with best loss -0.723124


 20%|█████████▍                                     | 16/80 [03:27<12:00, 11.25s/trial, best loss: -0.7231374117019858]

build_posterior_wrapper took 0.005994 seconds
TPE using 16/16 trials with best loss -0.723137


 21%|█████████▉                                     | 17/80 [03:27<08:31,  8.11s/trial, best loss: -0.7231374117019858]

build_posterior_wrapper took 0.005000 seconds
TPE using 17/17 trials with best loss -0.723137


 22%|██████████▌                                    | 18/80 [03:32<07:11,  6.95s/trial, best loss: -0.7231374117019858]

build_posterior_wrapper took 0.005992 seconds
TPE using 18/18 trials with best loss -0.723137


 24%|███████████▏                                   | 19/80 [03:51<10:59, 10.81s/trial, best loss: -0.7621884579501532]

build_posterior_wrapper took 0.005998 seconds
TPE using 19/19 trials with best loss -0.762188


 25%|███████████▊                                   | 20/80 [03:52<07:49,  7.83s/trial, best loss: -0.7621884579501532]

build_posterior_wrapper took 0.005995 seconds
TPE using 20/20 trials with best loss -0.762188


 26%|████████████▎                                  | 21/80 [04:14<11:48, 12.02s/trial, best loss: -0.7621884579501532]

build_posterior_wrapper took 0.004997 seconds
TPE using 21/21 trials with best loss -0.762188


 28%|████████████▉                                  | 22/80 [04:36<14:31, 15.03s/trial, best loss: -0.7621884579501532]

build_posterior_wrapper took 0.004998 seconds
TPE using 22/22 trials with best loss -0.762188


 29%|█████████████▌                                 | 23/80 [04:56<15:36, 16.43s/trial, best loss: -0.7622151139544181]

build_posterior_wrapper took 0.004997 seconds
TPE using 23/23 trials with best loss -0.762215


 30%|██████████████                                 | 24/80 [05:15<16:14, 17.41s/trial, best loss: -0.7622151139544181]

build_posterior_wrapper took 0.004995 seconds
TPE using 24/24 trials with best loss -0.762215


 31%|██████████████▋                                | 25/80 [05:39<17:38, 19.25s/trial, best loss: -0.7622151139544181]

build_posterior_wrapper took 0.005995 seconds
TPE using 25/25 trials with best loss -0.762215


 32%|███████████████▎                               | 26/80 [05:59<17:30, 19.45s/trial, best loss: -0.7622151139544181]

build_posterior_wrapper took 0.004995 seconds
TPE using 26/26 trials with best loss -0.762215


 34%|███████████████▊                               | 27/80 [06:18<17:09, 19.43s/trial, best loss: -0.7622151139544181]

build_posterior_wrapper took 0.005998 seconds
TPE using 27/27 trials with best loss -0.762215


 35%|████████████████▍                              | 28/80 [06:37<16:44, 19.32s/trial, best loss: -0.7622151139544181]

build_posterior_wrapper took 0.005997 seconds
TPE using 28/28 trials with best loss -0.762215


 36%|█████████████████                              | 29/80 [06:49<14:22, 16.91s/trial, best loss: -0.7622151139544181]

build_posterior_wrapper took 0.004996 seconds
TPE using 29/29 trials with best loss -0.762215


 38%|█████████████████▋                             | 30/80 [06:59<12:27, 14.95s/trial, best loss: -0.7622151139544181]

build_posterior_wrapper took 0.005997 seconds
TPE using 30/30 trials with best loss -0.762215


 39%|██████████████████▏                            | 31/80 [07:20<13:42, 16.79s/trial, best loss: -0.7622151139544181]

build_posterior_wrapper took 0.005992 seconds
TPE using 31/31 trials with best loss -0.762215


 40%|██████████████████▊                            | 32/80 [07:40<14:09, 17.69s/trial, best loss: -0.7622151139544181]

build_posterior_wrapper took 0.006006 seconds
TPE using 32/32 trials with best loss -0.762215


 41%|███████████████████▍                           | 33/80 [07:52<12:28, 15.93s/trial, best loss: -0.7622151139544181]

build_posterior_wrapper took 0.005994 seconds
TPE using 33/33 trials with best loss -0.762215


 42%|███████████████████▉                           | 34/80 [08:13<13:27, 17.55s/trial, best loss: -0.7622151139544181]

build_posterior_wrapper took 0.005993 seconds
TPE using 34/34 trials with best loss -0.762215


 44%|████████████████████▌                          | 35/80 [08:32<13:24, 17.87s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.004992 seconds
TPE using 35/35 trials with best loss -0.768719


 45%|█████████████████████▏                         | 36/80 [08:50<13:12, 18.01s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.004994 seconds
TPE using 36/36 trials with best loss -0.768719


 46%|█████████████████████▋                         | 37/80 [09:11<13:35, 18.97s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.010997 seconds
TPE using 37/37 trials with best loss -0.768719


 48%|██████████████████████▎                        | 38/80 [09:53<18:08, 25.91s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.027983 seconds
TPE using 38/38 trials with best loss -0.768719


 49%|██████████████████████▉                        | 39/80 [10:25<18:51, 27.60s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.006006 seconds
TPE using 39/39 trials with best loss -0.768719


 50%|███████████████████████▌                       | 40/80 [10:43<16:27, 24.68s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.005995 seconds
TPE using 40/40 trials with best loss -0.768719


 51%|████████████████████████                       | 41/80 [10:43<11:16, 17.35s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.006001 seconds
TPE using 41/41 trials with best loss -0.768719


 52%|████████████████████████▋                      | 42/80 [11:01<11:10, 17.64s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.005998 seconds
TPE using 42/42 trials with best loss -0.768719


 54%|█████████████████████████▎                     | 43/80 [11:02<07:46, 12.61s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.004998 seconds
TPE using 43/43 trials with best loss -0.768719


 55%|█████████████████████████▊                     | 44/80 [11:21<08:45, 14.60s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.004995 seconds
TPE using 44/44 trials with best loss -0.768719


 56%|██████████████████████████▍                    | 45/80 [11:22<06:00, 10.29s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.008994 seconds
TPE using 45/45 trials with best loss -0.768719


 57%|███████████████████████████                    | 46/80 [11:45<08:00, 14.15s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.005995 seconds
TPE using 46/46 trials with best loss -0.768719


 59%|███████████████████████████▌                   | 47/80 [11:50<06:14, 11.36s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.006000 seconds
TPE using 47/47 trials with best loss -0.768719


 60%|████████████████████████████▏                  | 48/80 [12:09<07:16, 13.63s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.014737 seconds
TPE using 48/48 trials with best loss -0.768719


 61%|████████████████████████████▊                  | 49/80 [12:10<05:10, 10.01s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.012993 seconds
TPE using 49/49 trials with best loss -0.768719


 62%|█████████████████████████████▍                 | 50/80 [12:31<06:36, 13.20s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.005997 seconds
TPE using 50/50 trials with best loss -0.768719


 64%|█████████████████████████████▉                 | 51/80 [12:31<04:30,  9.32s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.005997 seconds
TPE using 51/51 trials with best loss -0.768719


 65%|██████████████████████████████▌                | 52/80 [12:36<03:39,  7.86s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.004998 seconds
TPE using 52/52 trials with best loss -0.768719


 66%|███████████████████████████████▏               | 53/80 [13:16<07:55, 17.62s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.010721 seconds
TPE using 53/53 trials with best loss -0.768719


 68%|███████████████████████████████▋               | 54/80 [14:02<11:23, 26.28s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.014991 seconds
TPE using 54/54 trials with best loss -0.768719


 69%|████████████████████████████████▎              | 55/80 [14:45<13:02, 31.31s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.006998 seconds
TPE using 55/55 trials with best loss -0.768719


 70%|████████████████████████████████▉              | 56/80 [15:22<13:06, 32.77s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.011992 seconds
TPE using 56/56 trials with best loss -0.768719


 71%|█████████████████████████████████▍             | 57/80 [15:52<12:17, 32.08s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.004994 seconds
TPE using 57/57 trials with best loss -0.768719


 72%|██████████████████████████████████             | 58/80 [16:18<11:05, 30.27s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.008994 seconds
TPE using 58/58 trials with best loss -0.768719


 74%|██████████████████████████████████▋            | 59/80 [17:00<11:51, 33.89s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.011994 seconds
TPE using 59/59 trials with best loss -0.768719


 75%|███████████████████████████████████▎           | 60/80 [17:02<08:06, 24.33s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.013991 seconds
TPE using 60/60 trials with best loss -0.768719


 76%|███████████████████████████████████▊           | 61/80 [17:11<06:11, 19.56s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.005998 seconds
TPE using 61/61 trials with best loss -0.768719


 78%|████████████████████████████████████▍          | 62/80 [17:39<06:39, 22.17s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.008985 seconds
TPE using 62/62 trials with best loss -0.768719


 79%|█████████████████████████████████████          | 63/80 [17:57<05:54, 20.84s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.004995 seconds
TPE using 63/63 trials with best loss -0.768719


 80%|█████████████████████████████████████▌         | 64/80 [18:27<06:17, 23.62s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.008997 seconds
TPE using 64/64 trials with best loss -0.768719


 81%|██████████████████████████████████████▏        | 65/80 [18:27<04:09, 16.66s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.005995 seconds
TPE using 65/65 trials with best loss -0.768719


 82%|██████████████████████████████████████▊        | 66/80 [18:53<04:29, 19.24s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.008995 seconds
TPE using 66/66 trials with best loss -0.768719


 84%|███████████████████████████████████████▎       | 67/80 [19:21<04:45, 21.97s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.004992 seconds
TPE using 67/67 trials with best loss -0.768719


 85%|███████████████████████████████████████▉       | 68/80 [19:47<04:38, 23.23s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.131916 seconds
TPE using 68/68 trials with best loss -0.768719


 86%|████████████████████████████████████████▌      | 69/80 [20:15<04:31, 24.72s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.005995 seconds
TPE using 69/69 trials with best loss -0.768719


 88%|█████████████████████████████████████████▏     | 70/80 [20:39<04:02, 24.29s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.006993 seconds
TPE using 70/70 trials with best loss -0.768719


 89%|█████████████████████████████████████████▋     | 71/80 [20:40<02:35, 17.27s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.005996 seconds
TPE using 71/71 trials with best loss -0.768719


 90%|██████████████████████████████████████████▎    | 72/80 [20:53<02:09, 16.23s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.005992 seconds
TPE using 72/72 trials with best loss -0.768719


 91%|██████████████████████████████████████████▉    | 73/80 [21:27<02:29, 21.37s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.006677 seconds
TPE using 73/73 trials with best loss -0.768719


 92%|███████████████████████████████████████████▍   | 74/80 [21:52<02:14, 22.47s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.008993 seconds
TPE using 74/74 trials with best loss -0.768719


 94%|████████████████████████████████████████████   | 75/80 [22:21<02:01, 24.37s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.004997 seconds
TPE using 75/75 trials with best loss -0.768719


 95%|████████████████████████████████████████████▋  | 76/80 [22:27<01:15, 18.99s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.007993 seconds
TPE using 76/76 trials with best loss -0.768719


 96%|█████████████████████████████████████████████▏ | 77/80 [22:28<00:40, 13.50s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.010994 seconds
TPE using 77/77 trials with best loss -0.768719


 98%|█████████████████████████████████████████████▊ | 78/80 [22:51<00:33, 16.53s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.005991 seconds
TPE using 78/78 trials with best loss -0.768719


 99%|██████████████████████████████████████████████▍| 79/80 [23:17<00:19, 19.36s/trial, best loss: -0.7687191789950686]

build_posterior_wrapper took 0.129078 seconds
TPE using 79/79 trials with best loss -0.768719


100%|███████████████████████████████████████████████| 80/80 [23:50<00:00, 17.88s/trial, best loss: -0.7687191789950686]


In [19]:
best

{'criterion': 1,
 'max_depth': 820.0,
 'max_features': 2,
 'min_samples_leaf': 0.07612276463030189,
 'min_samples_split': 0.03598242227443349,
 'n_estimators': 4}

In [20]:
crit = {0: 'entropy', 1: 'gini'}
feat = {0: 'auto', 1: 'sqrt', 2: 'log2', 3: None}
est = {0: 10, 1: 50, 2: 300, 3: 750, 4: 1200,5:1300,6:1500}


print(crit[best['criterion']])
print(feat[best['max_features']])
print(est[best['n_estimators']])

gini
log2
1200


In [21]:
best['min_samples_leaf']

0.07612276463030189

In [22]:
trainedforest = RandomForestClassifier(criterion = crit[best['criterion']], max_depth = best['max_depth'], 
                                       max_features = feat[best['max_features']], 
                                       min_samples_leaf = best['min_samples_leaf'], 
                                       min_samples_split = best['min_samples_split'], 
                                       n_estimators = est[best['n_estimators']]).fit(X_train,y_train)
predictionforest = trainedforest.predict(X_test)
print(confusion_matrix(y_test,predictionforest))
print(accuracy_score(y_test,predictionforest))
print(classification_report(y_test,predictionforest))
acc5 = accuracy_score(y_test,predictionforest)

[[98  9]
 [21 26]]
0.8051948051948052
              precision    recall  f1-score   support

           0       0.82      0.92      0.87       107
           1       0.74      0.55      0.63        47

    accuracy                           0.81       154
   macro avg       0.78      0.73      0.75       154
weighted avg       0.80      0.81      0.80       154



##### Genetic Algorithms
Genetic Algorithms tries to apply natural selection mechanisms to Machine Learning contexts.

Let's immagine we create a population of N Machine Learning models with some predifined Hyperparameters. We can then calculate the accuracy of each model and decide to keep just half of the models (the ones that performs best). We can now generate some offsprings having similar Hyperparameters to the ones of the best models so that go get again a population of N models. At this point we can again caltulate the accuracy of each model and repeate the cycle for a defined number of generations. In this way, just the best models will survive at the end of the process.

In [23]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 1000,10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10,14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,6,8]
# Create the random grid
param = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}
print(param)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [2, 5, 10, 14], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['entropy', 'gini']}


In [25]:
from tpot import TPOTClassifier

In [26]:
tpot_classifier = TPOTClassifier(generations= 5, population_size= 24, offspring_size= 12,
                                 verbosity= 2, early_stop= 12,
                                 config_dict={'sklearn.ensemble.RandomForestClassifier': param}, 
                                 cv = 4, scoring = 'accuracy')
tpot_classifier.fit(X_train,y_train)

Code block execution exceeded 10 seconds timeout
Traceback (most recent call last):
  File "C:\Users\user\anaconda3\lib\site-packages\stopit\utils.py", line 145, in wrapper
    result = func(*args, **kwargs)
  File "C:\Users\user\anaconda3\lib\site-packages\tpot\decorators.py", line 57, in time_limited_call
    func(*args)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "C:\Users\user\anaconda3\lib\site-packages\joblib\parallel.py", line 1044, in __call__
    while self.dispatch_one_batch(iterator):
  File "C:\Users\user\anaconda3\lib\site-packages\joblib\parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Users\user\anaconda3\lib\site-packages\joblib\parallel.py", line 777, in _dispatch
   

Optimization Progress:   0%|          | 0/84 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.7621912401324166

Generation 2 - Current best internal CV score: 0.7621912401324166


Code block execution exceeded 10 seconds timeout
Traceback (most recent call last):
  File "C:\Users\user\anaconda3\lib\site-packages\stopit\utils.py", line 145, in wrapper
    result = func(*args, **kwargs)
  File "C:\Users\user\anaconda3\lib\site-packages\tpot\decorators.py", line 57, in time_limited_call
    func(*args)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "C:\Users\user\anaconda3\lib\site-packages\joblib\parallel.py", line 1044, in __call__
    while self.dispatch_one_batch(iterator):
  File "C:\Users\user\anaconda3\lib\site-packages\joblib\parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Users\user\anaconda3\lib\site-packages\joblib\parallel.py", line 777, in _dispatch
   


Generation 3 - Current best internal CV score: 0.7621912401324166

Generation 4 - Current best internal CV score: 0.7621912401324166


Code block execution exceeded 10 seconds timeout
Traceback (most recent call last):
  File "C:\Users\user\anaconda3\lib\site-packages\stopit\utils.py", line 145, in wrapper
    result = func(*args, **kwargs)
  File "C:\Users\user\anaconda3\lib\site-packages\tpot\decorators.py", line 57, in time_limited_call
    func(*args)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "C:\Users\user\anaconda3\lib\site-packages\joblib\parallel.py", line 1044, in __call__
    while self.dispatch_one_batch(iterator):
  File "C:\Users\user\anaconda3\lib\site-packages\joblib\parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Users\user\anaconda3\lib\site-packages\joblib\parallel.py", line 777, in _dispatch
   


Generation 5 - Current best internal CV score: 0.7621912401324166

Best pipeline: RandomForestClassifier(CombineDFs(RandomForestClassifier(input_matrix, criterion=entropy, max_depth=890, max_features=log2, min_samples_leaf=8, min_samples_split=10, n_estimators=600), input_matrix), criterion=gini, max_depth=450, max_features=auto, min_samples_leaf=6, min_samples_split=14, n_estimators=1200)


TPOTClassifier(config_dict={'sklearn.ensemble.RandomForestClassifier': {'criterion': ['entropy',
                                                                                      'gini'],
                                                                        'max_depth': [10,
                                                                                      120,
                                                                                      230,
                                                                                      340,
                                                                                      450,
                                                                                      560,
                                                                                      670,
                                                                                      780,
                                                                                 

In [27]:
accuracy = tpot_classifier.score(X_test, y_test)
print(accuracy)

0.8636363636363636


#### Optimize hyperparameters of the model using Optuna
The hyperparameters of the above algorithm are n_estimators and max_depth for which we can try different values to see if the model accuracy can be improved. The objective function is modified to accept a trial object. This trial has several methods for sampling hyperparameters. We create a study to run the hyperparameter optimization and finally read the best hyperparameters.
--Automated search for optimal hyperparameters using Python conditionals, loops, and syntax
--Efficiently search large spaces and prune unpromising trials for faster results.
--Parallelize hyperparameter searches over multiple threads or processes without modifying code.

In [29]:
import optuna
import sklearn.svm
def objective(trial):

    classifier = trial.suggest_categorical('classifier', ['RandomForest', 'SVC'])
    
    if classifier == 'RandomForest':
        n_estimators = trial.suggest_int('n_estimators', 200, 2000,10)
        max_depth = int(trial.suggest_float('max_depth', 10, 100, log=True))

        clf = sklearn.ensemble.RandomForestClassifier(
            n_estimators=n_estimators, max_depth=max_depth)
    else:
        c = trial.suggest_float('svc_c', 1e-10, 1e10, log=True)
        
        clf = sklearn.svm.SVC(C=c, gamma='auto')

    return sklearn.model_selection.cross_val_score(
        clf,X_train,y_train, n_jobs=-1, cv=3).mean()

In [30]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

trial = study.best_trial

print('Accuracy: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

[32m[I 2022-02-07 18:19:03,058][0m A new study created in memory with name: no-name-d8a20741-f876-4372-a258-7d3856b0bf1f[0m
[32m[I 2022-02-07 18:19:22,228][0m Trial 0 finished with value: 0.7524469950581859 and parameters: {'classifier': 'RandomForest', 'n_estimators': 1870, 'max_depth': 24.485592381264663}. Best is trial 0 with value: 0.7524469950581859.[0m
[32m[I 2022-02-07 18:19:31,102][0m Trial 1 finished with value: 0.7540650406504065 and parameters: {'classifier': 'RandomForest', 'n_estimators': 1020, 'max_depth': 79.4232475506669}. Best is trial 1 with value: 0.7540650406504065.[0m
[32m[I 2022-02-07 18:19:31,487][0m Trial 2 finished with value: 0.640068547744301 and parameters: {'classifier': 'SVC', 'svc_c': 1402198.3697193945}. Best is trial 1 with value: 0.7540650406504065.[0m
[32m[I 2022-02-07 18:19:31,577][0m Trial 3 finished with value: 0.640068547744301 and parameters: {'classifier': 'SVC', 'svc_c': 4.427806115012646e-06}. Best is trial 1 with value: 0.754065

Accuracy: 0.7573091025027897
Best hyperparameters: {'classifier': 'RandomForest', 'n_estimators': 1190, 'max_depth': 43.6396672368485}


In [31]:
trial

FrozenTrial(number=91, values=[0.7573091025027897], datetime_start=datetime.datetime(2022, 2, 7, 18, 30, 58, 745489), datetime_complete=datetime.datetime(2022, 2, 7, 18, 31, 7, 764931), params={'classifier': 'RandomForest', 'n_estimators': 1190, 'max_depth': 43.6396672368485}, distributions={'classifier': CategoricalDistribution(choices=('RandomForest', 'SVC')), 'n_estimators': IntUniformDistribution(high=2000, low=200, step=10), 'max_depth': LogUniformDistribution(high=100.0, low=10.0)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=91, state=TrialState.COMPLETE, value=None)

In [32]:
study.best_params

{'classifier': 'RandomForest',
 'n_estimators': 1190,
 'max_depth': 43.6396672368485}

In [33]:
rf=RandomForestClassifier(n_estimators=330,max_depth=30)
rf.fit(X_train,y_train)

RandomForestClassifier(max_depth=30, n_estimators=330)

In [34]:
y_pred=rf.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[94 13]
 [15 32]]
0.8181818181818182
              precision    recall  f1-score   support

           0       0.86      0.88      0.87       107
           1       0.71      0.68      0.70        47

    accuracy                           0.82       154
   macro avg       0.79      0.78      0.78       154
weighted avg       0.82      0.82      0.82       154

