# Hyperparameter Optimization with GridSearch

The mushroom dataset is used to classify whether a mushroom is edible ('e') or poisonous ('p').

https://www.kaggle.com/uciml/mushroom-classification

In [None]:
import pandas as pd 
# Load the mushroom data; the path is relative to the current working directory.
df = pd.read_csv(filepath_or_buffer="mushrooms.csv")

In [None]:
# Preview the first five rows before any encoding is applied.
df.head(n=5)

Convert all categorical variables to numerical values

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode every column (features and the 'class' target alike): a fresh
# LabelEncoder is fitted per column and the column is overwritten with the
# resulting integer codes.
for column in df.columns.tolist():
    df[column] = LabelEncoder().fit_transform(df[column])

In [None]:
# Preview the first five rows again to inspect the integer-encoded columns.
df.head(n=5)

Split into dependent (y/target) and independent variables (X)

In [None]:
# The 'class' column is the target; every remaining column is a feature.
y = df["class"]
X = df.drop(columns=["class"])

Split into test and training set

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.33,
)

Define GridSearch Parameters 

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the random forest search. It is a list holding one
# dict, so every combination of the listed values is tried (2**5 = 32 candidates).
#
# 'auto' is deliberately not used for max_features: for classifiers it was only
# an alias of 'sqrt' (so each candidate would be fitted twice) and recent
# scikit-learn releases reject it. 'log2' gives a genuinely different option.
param_grid = [
    {
        'bootstrap': [True],
        'max_depth': [6, 10],
        'max_features': ['sqrt', 'log2'],
        'min_samples_leaf': [3, 5],
        'min_samples_split': [4, 6],
        'n_estimators': [100, 350],
    }
]

In [None]:
# Base estimator whose hyperparameters are tuned; seeded so that repeated runs
# of the search produce the same forests.
clf = RandomForestClassifier(random_state=42)

# Exhaustive search over param_grid with 5-fold cross-validation, ranking the
# candidates by accuracy. The estimator passed in is the `clf` defined above.
grid_search = GridSearchCV(
    clf,
    param_grid,
    cv=5,
    scoring="accuracy",
    return_train_score=True,  # also record training-fold scores in cv_results_
    verbose=True,
    n_jobs=-1,  # run the fits in parallel on all available cores
)
grid_search.fit(X_train, y_train)

Get the parameters that lead to the best score

In [None]:
# Show the hyperparameter combination that achieved the best cross-validated score.
grid_search.best_params_

Get the best score

In [None]:
# Show the best cross-validated score (accuracy, per the `scoring` setting of the search).
grid_search.best_score_

Get the full estimator that leads to the best score

In [None]:
# Keep a handle on the best estimator found by the search for later prediction.
best_clf = grid_search.best_estimator_

Evaluate the classification errors with a confusion matrix using the test data

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html

In [None]:
from sklearn.metrics import confusion_matrix

# Predict on the held-out test set and print the confusion matrix
# (rows are the true classes, columns the predicted classes).
y_pred = best_clf.predict(X_test)
print(confusion_matrix(y_true=y_test, y_pred=y_pred))