# Day 09. Exercise 01
# Gridsearch

## 0. Imports

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import GridSearchCV, ParameterGrid
from tqdm.notebook import tqdm
from sklearn import svm

## 1. Preprocessing

1. Read the file [`day-of-week-not-scaled.csv`](https://drive.google.com/file/d/1AlGvsJDSzPT_70caausx8bFuupIEZkfh/view?usp=sharing). It is similar to the one from the previous exercise, but this time we did not scale continuous features (we are not going to use logreg anymore).
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [4]:
# Load the unscaled day-of-week dataset from the relative ../data directory.
df = pd.read_csv(filepath_or_buffer='../data/dayofweek-not-scaled.csv')
# The bare trailing expression renders the frame as the cell output.
df

Unnamed: 0,numTrials,hour,dayofweek,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,1,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,5,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,9,20,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1682,6,20,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1683,7,20,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1684,8,20,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [5]:
# The target is the `dayofweek` column; the features are every other column.
y = df['dayofweek']
X = df.drop(columns='dayofweek')

# Hold out 20% of the rows for testing, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
    stratify=y,
)

## 2. SVM gridsearch

1. Using `GridSearchCV`, try different values of `kernel` (`linear`, `rbf`, `sigmoid`), `C` (`0.01`, `0.1`, `1`, `1.5`, `5`, `10`), `gamma` (`scale`, `auto`) and `class_weight` (`balanced`, `None`), and get the best combination of them in terms of accuracy. Use `random_state=21` and `probability=True`.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`. Check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [8]:
# Exhaustive search over SVM hyperparameters, scored by accuracy.
# The seed is supplied as a single-value grid entry, so every candidate
# is fitted with random_state=21.
param_grid = dict(
    C=[0.01, 0.1, 1, 1.5, 5, 10],
    kernel=['linear', 'rbf', 'sigmoid'],
    gamma=['scale', 'auto'],
    class_weight=['balanced', None],
    random_state=[21],
)
model_svc = svm.SVC(probability=True)
gs_svm = GridSearchCV(
    estimator=model_svc,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,  # use all available cores
)
gs_svm.fit(X_train, y_train)

GridSearchCV(estimator=SVC(probability=True), n_jobs=-1,
             param_grid={'C': [0.01, 0.1, 1, 1.5, 5, 10],
                         'class_weight': ['balanced', None],
                         'gamma': ['scale', 'auto'],
                         'kernel': ['linear', 'rbf', 'sigmoid'],
                         'random_state': [21]},
             scoring='accuracy')

In [9]:
# Mean cross-validated accuracy of the best SVM parameter combination.
gs_svm.best_score_

0.8761090458488228

In [10]:
# One row per tried SVM combination, ordered by rank (rank 1 first).
results = pd.DataFrame(gs_svm.cv_results_).sort_values(by='rank_test_score')
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_class_weight,param_gamma,param_kernel,param_random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
70,0.613378,0.007596,0.050519,0.002169,10,,auto,rbf,21,"{'C': 10, 'class_weight': None, 'gamma': 'auto...",0.900000,0.848148,0.885185,0.884758,0.862454,0.876109,0.018419,1
64,0.623640,0.019038,0.048734,0.002958,10,balanced,auto,rbf,21,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.877778,0.851852,0.862963,0.873606,0.851301,0.863500,0.010870,2
58,0.584752,0.003328,0.053590,0.001347,5,,auto,rbf,21,"{'C': 5, 'class_weight': None, 'gamma': 'auto'...",0.825926,0.811111,0.818519,0.821561,0.802974,0.816018,0.008116,3
52,0.645369,0.051781,0.052683,0.003182,5,balanced,auto,rbf,21,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.844444,0.785185,0.792593,0.817844,0.802974,0.808608,0.021007,4
63,40.380071,3.841671,0.012323,0.002868,10,balanced,auto,linear,21,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.729630,0.700000,0.755556,0.754647,0.665428,0.721052,0.034438,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53,0.694997,0.015029,0.023512,0.000871,5,balanced,auto,sigmoid,21,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.144444,0.148148,0.137037,0.126394,0.092937,0.129792,0.019869,68
65,0.639987,0.012794,0.021906,0.001392,10,balanced,auto,sigmoid,21,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.122222,0.140741,0.129630,0.100372,0.085502,0.115693,0.020052,69
41,0.826463,0.088883,0.027910,0.010234,1.5,balanced,auto,sigmoid,21,"{'C': 1.5, 'class_weight': 'balanced', 'gamma'...",0.066667,0.085185,0.081481,0.078067,0.085502,0.079380,0.006913,70
17,0.831809,0.012850,0.025087,0.003729,0.1,balanced,auto,sigmoid,21,"{'C': 0.1, 'class_weight': 'balanced', 'gamma'...",0.062963,0.066667,0.062963,0.059480,0.059480,0.062310,0.002678,71


## 3. Decision tree

1. Using `GridSearchCV` try different parameters of `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use `random_state=21`.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [11]:
# Exhaustive search over decision-tree hyperparameters, scored by accuracy.
# max_depth covers 1..49 inclusive; the seed is fixed via a single-value entry.
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=np.arange(1, 50),
    class_weight=['balanced', None],
    random_state=[21],
)
model_tree = DecisionTreeClassifier()
gs_tree = GridSearchCV(
    estimator=model_tree,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,  # use all available cores
)
gs_tree.fit(X_train, y_train)

NameError: name 'DecisionTreeClassifier' is not defined

In [None]:
# Mean cross-validated accuracy of the best decision-tree parameter combination.
gs_tree.best_score_

In [None]:
# One row per tried decision-tree combination, ordered by rank (rank 1 first).
results = pd.DataFrame(gs_tree.cv_results_).sort_values(by='rank_test_score')
results

## 4. Random forest

1. Using `GridSearchCV` try different parameters of `n_estimators` (`5`, `10`, `50`, `100`), `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use `random_state=21`.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [None]:
# Exhaustive search over random-forest hyperparameters, scored by accuracy.
# max_depth covers 1..49 inclusive; the seed is fixed via a single-value entry.
param_grid = dict(
    n_estimators=[5, 10, 50, 100],
    criterion=['gini', 'entropy'],
    max_depth=np.arange(1, 50),
    class_weight=['balanced', None],
    random_state=[21],
)
model_random_forest = RandomForestClassifier()
gs_random_forest = GridSearchCV(
    estimator=model_random_forest,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,  # use all available cores
)
gs_random_forest.fit(X_train, y_train)

In [None]:
# Mean cross-validated accuracy of the best random-forest parameter combination.
gs_random_forest.best_score_


In [None]:
# One row per tried random-forest combination, ordered by rank (rank 1 first).
results = pd.DataFrame(gs_random_forest.cv_results_).sort_values(by='rank_test_score')
results

## 5. Progress bar

Gridsearch can be a quite long process and you may find yourself wondering when it will end.
1. Create a manual gridsearch for the same parameter values of the random forest, iterating through the list of possible values and calculating `cross_val_score` for each combination. Try to increase `n_jobs`. The value of `cv` for `cross_val_score` is 5.
2. Track the progress using the library `tqdm.notebook`.
3. Create a dataframe from the results of the gridsearch with the columns corresponding to the names of the parameters and `mean_accuracy` and `std_accuracy`.
4. Sort it descendingly by the `mean_accuracy`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [None]:
# Manual grid search with a progress bar.
# `param_grid` is expected to still be the random-forest grid defined in the
# random-forest section; every combination is scored with 5-fold CV.
grid = list(ParameterGrid(param_grid))
frame = []

for params in tqdm(grid, desc='Manual grid search'):
    # Build a fresh estimator per combination so the shared
    # `model_random_forest` object is not mutated by this loop.
    candidate = RandomForestClassifier(**params)
    scores = cross_val_score(candidate, X_train, y_train,
                             scoring='accuracy', cv=5, n_jobs=-1)
    # Append a new record instead of adding keys to the grid entry itself,
    # so `grid` keeps containing only hyperparameters.
    frame.append({**params,
                  'mean_accuracy': np.mean(scores),
                  'std_accuracy': np.std(scores)})


In [None]:
# Best combinations first: sort by mean accuracy in descending order.
pd.DataFrame(frame).sort_values('mean_accuracy', ascending=False)


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.

In [None]:
# Final model: a random forest with hard-coded hyperparameters
# (presumably the best combination from the searches above — TODO confirm
# against the sorted results tables).
model_random_tree = RandomForestClassifier(n_estimators=50, max_depth=32, random_state=21, criterion='gini')
# Reuse X_train / X_test / y_train / y_test from the preprocessing section:
# that split already uses test_size=0.2, random_state=21 and stratify=y,
# so re-splitting here would only reproduce the same partition.
model_random_tree.fit(X_train, y_train)
pred = model_random_tree.predict(X_test)
# Final accuracy on the held-out test set.
accuracy_score(y_test, pred)