# 01: Classification Part 4

### Load variables

In [None]:
import pickle
import pandas as pd
def transform_series(x):
    """Normalise one pickled object.

    A pandas Series is wrapped in a one-column DataFrame and read back as that
    frame's first column; any other object is returned unchanged.
    """
    if isinstance(x, pd.Series):
        return pd.DataFrame(x).iloc[:, 0]
    return x


# NOTE(security): pickle.load can execute arbitrary code while deserialising.
# Only open "classification.pickle" if it comes from a source you trust.
with open("classification.pickle", "rb") as f:
    # Only the first six pickled items are used; they are unpacked in this order.
    X_train, y_train, X_val, y_val, X_test, y_test = map(transform_series, pickle.load(f)[0:6])

Import packages and related functions.

In [None]:
import numpy as np
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_val_predict, KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import cohen_kappa_score, roc_curve, roc_auc_score, accuracy_score, classification_report, make_scorer
from sklearn.ensemble import RandomForestClassifier

### Automatic Hyperparameter tuning with Random Forest

In [None]:
# Tune a random forest by grid search, selecting on Cohen's kappa coefficient.
def rfr_model(X, y, cv=10, random_state=None):
    """Grid-search a RandomForestClassifier and cross-validate the winning setting.

    Parameters
    ----------
    X, y
        Features and labels, passed unchanged to ``GridSearchCV.fit`` and
        ``cross_val_score``.
    cv : default 10
        Cross-validation setting used for both the grid search and the final
        scoring run.
    random_state : default None
        Forwarded to every ``RandomForestClassifier`` built here. Pass an
        integer to make repeated runs give the same forests.

    Returns
    -------
    scores
        Result of ``cross_val_score`` for a fresh forest built from the best
        parameters. No ``scoring`` is passed, so these come from the
        estimator's default scorer (mean accuracy for scikit-learn
        classifiers), not from Cohen's kappa.
    optimised_random_forest
        ``best_estimator_`` of the grid search, i.e. the forest GridSearchCV
        refitted with the best parameters (``refit`` is left at its default).
    """
    # Model selection is driven by Cohen's kappa, not plain accuracy.
    kappa_scorer = make_scorer(cohen_kappa_score)

    # Exhaustive search: 8 depths x 5 forest sizes x 2 feature rules x 2 criteria.
    gsc = GridSearchCV(
        estimator=RandomForestClassifier(random_state=random_state),
        param_grid={
            'max_depth': range(2, 10),
            'n_estimators': (5, 10, 50, 100, 1000),
            'max_features': ('sqrt', 'log2'),
            'criterion': ('gini', 'entropy'),
        },
        cv=cv, verbose=0, n_jobs=-1, scoring=kappa_scorer)
    gsc.fit(X, y)

    # Score an unfitted forest with the winning parameters, so the reported
    # folds are not influenced by the estimator GridSearchCV already refitted.
    rfr = RandomForestClassifier(random_state=random_state, **gsc.best_params_)
    scores = cross_val_score(rfr, X, y, cv=cv)

    print("Scores: {}".format(scores))
    return scores, gsc.best_estimator_

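Create an optimized random forest `rf` with `rfr_model`, and keep the cross-validation `scores` it returns as well (the next cell uses both).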

In [None]:
# TODO: EXERCISE
# The reporting cell below reads two names that must be defined here: `rf` and `scores`.


In [None]:
# Report the tuned forest, its cross-validation scores, and its score on the
# training and test splits.
print(f'Best parameters: {rf!s}')
print('---')
print(f'CV scores: {scores!s}')
print(f'CV scores (average): {np.mean(scores)!s}')
print(f'Training score: {rf.score(X_train, y_train)!s}')
print(f'Testing score: {rf.score(X_test, y_test)!s}')