# **Hyperparameter Optimization**

In machine learning, hyperparameter optimization or tuning is the goal of choosing a set of optimal hyperparameters for a learning algorithm. A hyperparameter is a parameter whose value controls the learning process. By contrast, the values of other parameters (typically node weights) get learned.

In [None]:
# Import Library.
import pandas as pd
import numpy as np
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")

# Load Dataset.
data = pd.read_csv(
    "https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv"
)
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# Dataset Summary.
data.info()

# **Exploratory Data Analysis.**

In [None]:
sns.pairplot(data, hue="Outcome")

In [None]:
# Split the dataset into features and target values.
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# Feature Scaling.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X = sc.fit_transform(X)

# Split the dataset into training and test set.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# **Using Random Forest Classifier.**

[**sklearn.ensemble.RandomForestClassifier**](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier)


The main parameters used by a Random Forest Classifier are:


*   **criterion** = the function used to evaluate the quality of a split.
*   **max_depth** = maximum number of levels allowed in each tree.
*   **max_features** = maximum number of features considered when splitting a node.
*   **min_samples_leaf** = minimum number of samples which can be stored in a tree leaf.
*   **min_samples_split** = minimum number of samples necessary in a node to cause node splitting.
*   **n_estimators** = number of trees in the ensamble.

In [None]:
# Use Random Forest, with manual Hyperparameter Optimization.
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(
    n_estimators=300,
    criterion="entropy",
    max_features="sqrt",
    min_samples_leaf=10,
    random_state=42,
)
clf = clf.fit(X_train, y_train)

# Predict the test set results.
y_pred = clf.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

print("Accuracy Score is ", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy Score is  0.7552083333333334
              precision    recall  f1-score   support

           0       0.79      0.84      0.81       123
           1       0.68      0.61      0.64        69

    accuracy                           0.76       192
   macro avg       0.73      0.72      0.73       192
weighted avg       0.75      0.76      0.75       192

[[103  20]
 [ 27  42]]


# **Grid Search**

[**sklearn.model_selection.GridSearchCV**](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)

In [None]:
""" Hyperparameter Optimization. """

parameters = {
    "n_estimators": [100, 200, 300],
    "criterion": ["entropy", "gini"],
    "max_depth": [None, 1, 3, 5],
    "min_samples_split": [2, 3, 5],
    "max_features": ["auto", "sqrt", "log2"],
    "min_samples_leaf": [1, 2, 4],
}

print(parameters)

{'n_estimators': [100, 200, 300], 'criterion': ['entropy', 'gini'], 'max_depth': [None, 1, 3, 5], 'min_samples_split': [2, 3, 5], 'max_features': ['auto', 'sqrt', 'log2'], 'min_samples_leaf': [1, 2, 4]}


In [None]:
from sklearn.model_selection import GridSearchCV

clf = RandomForestClassifier()
grid_search = GridSearchCV(
    estimator=clf, param_grid=parameters, cv=10, n_jobs=-1, verbose=2
)
grid_search = grid_search.fit(X_train, y_train)

In [None]:
best_grid = grid_search.best_estimator_
print(grid_search.best_estimator_)

RandomForestClassifier(criterion='entropy', max_features='auto',
                       min_samples_leaf=2, min_samples_split=3)


In [None]:
# Predict the test set results.
y_pred = best_grid.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

print("Accuracy Score {}".format(accuracy_score(y_test, y_pred)))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy Score 0.7447916666666666
              precision    recall  f1-score   support

           0       0.80      0.80      0.80       123
           1       0.64      0.65      0.65        69

    accuracy                           0.74       192
   macro avg       0.72      0.72      0.72       192
weighted avg       0.75      0.74      0.75       192

[[98 25]
 [24 45]]


# **Random Search**

[**sklearn.model_selection.RandomizedSearchCV**](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)

In [None]:
""" Hyperparameter Optimization. """

# Number of trees in Random Forest.
n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num=10)]

# Number of features to consider at every split.
max_features = ["auto", "sqrt", "log2"]

# Maximum number of levels in the tree.
max_depth = [int(x) for x in np.linspace(10, 1000, 10)]

# Minimum number of samples required to split a node.
min_samples_split = [2, 3, 5, 7, 10, 14]

# Minimum number of samples required at each leaf node.
min_samples_leaf = [1, 2, 3, 4, 6, 7, 9]

# Create the Random Grid.
random_grid = {
    "n_estimators": n_estimators,
    "max_features": max_features,
    "max_depth": max_depth,
    "min_samples_split": min_samples_split,
    "min_samples_leaf": min_samples_leaf,
    "criterion": ["entropy", "gini"],
}

print(random_grid)

{'n_estimators': [100, 311, 522, 733, 944, 1155, 1366, 1577, 1788, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [2, 3, 5, 7, 10, 14], 'min_samples_leaf': [1, 2, 3, 4, 6, 7, 9], 'criterion': ['entropy', 'gini']}


In [None]:
from sklearn.model_selection import RandomizedSearchCV

clf = RandomForestClassifier()
random_search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=random_grid,
    n_iter=100,
    cv=10,
    verbose=2,
    random_state=100,
    n_jobs=-1,
)
random_search = random_search.fit(X_train, y_train)

In [None]:
best_random_grid = random_search.best_estimator_
print(random_search.best_estimator_)

RandomForestClassifier(criterion='entropy', max_depth=450, min_samples_leaf=4)


In [None]:
# Predict the test set results.
y_pred = best_random_grid.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

print("Accuracy Score {}".format(accuracy_score(y_test, y_pred)))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy Score 0.75
              precision    recall  f1-score   support

           0       0.80      0.81      0.81       123
           1       0.66      0.64      0.65        69

    accuracy                           0.75       192
   macro avg       0.73      0.73      0.73       192
weighted avg       0.75      0.75      0.75       192

[[100  23]
 [ 25  44]]


# **TPOT - Automated Machine Learning for Supervised Classification Tasks**

> [**TPOTClassifier**](http://epistasislab.github.io/tpot/api/)

In [None]:
!pip install tpot

In [None]:
""" Hyperparameter Optimization. """

# Number of trees in Random Forest.
n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num=10)]

# Number of features to consider at every split.
max_features = ["auto", "sqrt", "log2"]

# Maximum number of levels in the tree.
max_depth = [int(x) for x in np.linspace(10, 1000, 10)]

# Minimum number of samples required to split a node.
min_samples_split = [2, 3, 5, 7, 10, 14]

# Minimum number of samples required at each leaf node.
min_samples_leaf = [1, 2, 3, 4, 6, 7, 9]

# Create the Random Grid.
random_grid = {
    "n_estimators": n_estimators,
    "max_features": max_features,
    "max_depth": max_depth,
    "min_samples_split": min_samples_split,
    "min_samples_leaf": min_samples_leaf,
    "criterion": ["entropy", "gini"],
}

print(random_grid)

{'n_estimators': [100, 311, 522, 733, 944, 1155, 1366, 1577, 1788, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [2, 3, 5, 7, 10, 14], 'min_samples_leaf': [1, 2, 3, 4, 6, 7, 9], 'criterion': ['entropy', 'gini']}


In [None]:
from tpot import TPOTClassifier

tpot_classifier = TPOTClassifier(
    generations=5,
    population_size=24,
    offspring_size=12,
    verbosity=2,
    early_stop=10,
    config_dict={"sklearn.ensemble.RandomForestClassifier": random_grid},
    cv=10,
    scoring="accuracy",
).fit(X_train, y_train)

accuracy = tpot_classifier.score(X_test, y_test)
print("Accuracy is", accuracy)  # Accuracy is 0.7604166666666666

# **Optimize hyperparameters of the Model using Optuna**

> [**Optuna: Automate Hyperparameter Tuning**](https://optuna.org/)

In [None]:
!pip install optuna

The hyperparameters of the above algorithm are `n_estimators` and `max_depth` for which we can try different values to see if the model accuracy can be improved. The objective function is modified to accept a trial object. This trial has several methods for sampling hyperparameters. We create a study to run the hyperparameter optimization and finally read the best hyperparameters.


In [None]:
import optuna
import sklearn.svm


def objective(trial):
    classifier = trial.suggest_categorical("classifier", ["RandomForest", "SVC"])
    if classifier == "RandomForest":
        n_estimators = trial.suggest_int("n_estimators", 200, 2000, 10)
        max_depth = int(trial.suggest_float("max_depth", 10, 100, log=True))
        clf = sklearn.ensemble.RandomForestClassifier(
            n_estimators=n_estimators, max_depth=max_depth
        )
    else:
        c = trial.suggest_float("svc_c", 1e-10, 1e10, log=True)
        clf = sklearn.svm.SVC(C=c, gamma="auto")
    return sklearn.model_selection.cross_val_score(
        clf, X_train, y_train, n_jobs=-1, cv=10
    ).mean()

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

trial = study.best_trial

print("Accuracy: {}".format(trial.value))
print("Best Hyperparameters: {}".format(trial.params))

In [None]:
print(trial)
print(study.best_params)

FrozenTrial(number=85, state=TrialState.COMPLETE, values=[0.7793708408953417], datetime_start=datetime.datetime(2023, 5, 30, 17, 32, 54, 314049), datetime_complete=datetime.datetime(2023, 5, 30, 17, 33, 27, 643803), params={'classifier': 'RandomForest', 'n_estimators': 1680, 'max_depth': 16.132944824273615}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'classifier': CategoricalDistribution(choices=('RandomForest', 'SVC')), 'n_estimators': IntDistribution(high=2000, log=False, low=200, step=10), 'max_depth': FloatDistribution(high=100.0, log=True, low=10.0, step=None)}, trial_id=85, value=None)
{'classifier': 'RandomForest', 'n_estimators': 1680, 'max_depth': 16.132944824273615}


# **Lime Model Interpretation**

In [None]:
!pip install lime

In [None]:
# Import Library.
import pandas as pd
import numpy as np
import lime
from lime import lime_tabular
import warnings

warnings.filterwarnings("ignore")

# Load Dataset.
data = pd.read_csv(
    "https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv"
)
data.head()

In [None]:
# Split the dataset into features and the target variables.
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Split the dataset into the Training set and Test set.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Apply Random Forest Classification Model.
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier().fit(X_train, y_train)

In [None]:
interpretor = lime_tabular.LimeTabularExplainer(
    training_data=np.array(X_train),
    feature_names=X_train.columns,
    mode="classification",
)

In [None]:
exp = interpretor.explain_instance(
    data_row=X_test.iloc[10], predict_fn=clf.predict_proba
)
exp.show_in_notebook(show_table=True)