# Day 09. Exercise 02
# Metrics

## 0. Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
import numpy as np
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix
import matplotlib.pyplot as plt
import pickle
from sklearn.model_selection import GridSearchCV, ParameterGrid
from tqdm.notebook import tqdm
import pickle

## 1. Preprocessing

1. Create the same dataframe as in the previous exercise.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [2]:
# Load the unscaled day-of-week dataset; the trailing bare expression makes the
# notebook render the frame.
# NOTE(review): the rendered output shows an `Unnamed: 0` column, which looks like
# a saved row index being read back as a feature - confirm whether that is intended.
df = pd.read_csv(filepath_or_buffer='../data/dayofweek-not-scaled.csv')
df

Unnamed: 0,numTrials,hour,dayofweek,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,1,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,5,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,9,20,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1682,6,20,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1683,7,20,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1684,8,20,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [7]:
# Target is the weekday column; every other column is used as a feature.
y = df['dayofweek']
X = df.drop(columns='dayofweek')

# 80/20 split, stratified on the target so both parts keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
    stratify=y,
)

## 2. SVM

1. Use the best parameters from the previous exercise and train an SVM model.
2. You need to calculate `accuracy`, `precision`, `recall`, `ROC AUC`.

 - `precision` and `recall` should be calculated for each class (use `average='weighted'`)
 - `ROC AUC` should be calculated for each class against any other class (all possible pairwise combinations) and then weighted average should be applied for the final metric
 - the code in the cell should display the result as below:

```
accuracy is 0.88757
precision is 0.89267
recall is 0.88757
roc_auc is 0.97878
```

In [5]:
# SVM classifier; probability=True is required so that predict_proba is available
# for the ROC AUC computation. `fit` returns the estimator itself, so construction
# and training can be chained.
model_svc = svm.SVC(
    C=10,
    gamma='auto',
    probability=True,
    random_state=21,
).fit(X_train, y_train)

# Hard labels for accuracy/precision/recall, class probabilities for ROC AUC.
score = model_svc.predict_proba(X_test)
pred = model_svc.predict(X_test)

In [6]:
# Report the four metrics for the current `pred` / `score` (the SVM, when the
# notebook is run top to bottom). Precision and recall are support-weighted;
# ROC AUC is one-vs-one over all class pairs, then weighted-averaged.
print(
    'accuracy is %.5f' % accuracy_score(y_test, pred),
    'precision is %.5f' % precision_score(y_test, pred, average='weighted'),
    'recall is %.5f' % recall_score(y_test, pred, average='weighted'),
    'roc_auc is %.5f' % roc_auc_score(y_test, score, multi_class='ovo', average='weighted'),
    sep='\n',
)

accuracy is 0.88757
precision is 0.89267
recall is 0.88757
roc_auc is 0.97878


## 3. Decision tree

1. The same task for decision tree

In [7]:
# Decision tree classifier. random_state is pinned (same seed as the train/test
# split and the SVM) because the tree permutes features at each split; without a
# seed the fitted tree and the metrics printed below can change between runs.
model_tree = DecisionTreeClassifier(
    class_weight='balanced',
    max_depth=27,
    criterion='entropy',
    random_state=21,
)
model_tree.fit(X_train, y_train)

# Hard labels for accuracy/precision/recall, class probabilities for ROC AUC.
pred = model_tree.predict(X_test)
score = model_tree.predict_proba(X_test)

In [8]:
# Report the four metrics for the current `pred` / `score` (the decision tree,
# when the notebook is run top to bottom). Precision and recall are
# support-weighted; ROC AUC is one-vs-one over all class pairs, weighted-averaged.
print(
    'accuracy is %.5f' % accuracy_score(y_test, pred),
    'precision is %.5f' % precision_score(y_test, pred, average='weighted'),
    'recall is %.5f' % recall_score(y_test, pred, average='weighted'),
    'roc_auc is %.5f' % roc_auc_score(y_test, score, multi_class='ovo', average='weighted'),
    sep='\n',
)

accuracy is 0.90828
precision is 0.90854
recall is 0.90828
roc_auc is 0.94231


## 4. Random forest

1. The same task for random forest.

In [9]:
# Random forest classifier. random_state is pinned (same seed as the train/test
# split and the SVM) because the forest draws bootstrap samples and random feature
# subsets; without a seed the metrics, the error analysis and the pickled model
# differ on every run.
model_random_forest = RandomForestClassifier(
    n_estimators=50,
    class_weight=None,
    max_depth=32,
    random_state=21,
)
model_random_forest.fit(X_train, y_train)

# Hard labels for accuracy/precision/recall, class probabilities for ROC AUC.
pred = model_random_forest.predict(X_test)
score = model_random_forest.predict_proba(X_test)

In [10]:
# Report the four metrics for the current `pred` / `score` (the random forest,
# when the notebook is run top to bottom). Precision and recall are
# support-weighted; ROC AUC is one-vs-one over all class pairs, weighted-averaged.
print(
    'accuracy is %.5f' % accuracy_score(y_test, pred),
    'precision is %.5f' % precision_score(y_test, pred, average='weighted'),
    'recall is %.5f' % recall_score(y_test, pred, average='weighted'),
    'roc_auc is %.5f' % roc_auc_score(y_test, score, multi_class='ovo', average='weighted'),
    sep='\n',
)

accuracy is 0.93195
precision is 0.93396
recall is 0.93195
roc_auc is 0.98801


## 5. Predictions

1. Choose the best model.
2. Analyze: for which `weekday` does your model make the most errors (as a % of the total number of samples of that class in your full dataset), for which `labname`, and for which `users`?
3. Save the model.

In [11]:
# Build a separate frame for inspecting errors. Plain `view_test = X_test` only
# creates a second name for the same object, so assigning a `predict` column
# through it would add that column to X_test as well and break any later
# predict()/predict_proba() call on X_test (unexpected extra feature).
# `assign` returns a new DataFrame and leaves X_test untouched.
view_test = X_test.assign(predict=pred)

# Show only the test rows whose predicted weekday differs from the true one.
view_test[y_test != pred]

Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,predict
1181,54,21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
814,29,14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6
809,5,22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5
117,2,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2
1618,1,17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3
951,84,17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
337,15,22,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,5
86,5,17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
98,8,22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,5
1073,75,16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6


In [12]:
# Persist the fitted random forest. The context manager guarantees the file is
# flushed and closed even if pickling fails; a bare open(...) passed straight to
# pickle.dump leaves the handle open until garbage collection.
filename = 'model_random_tree.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model_random_forest, model_file)

## 6. Function

1. Write a function that takes a list of different models and a corresponding list of parameters (dicts) and returns a dict that contains all the 4 metrics for each model.

In [13]:
def print_metric(models, paramets):
    """Fit every model with its own parameters and collect the four test metrics.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Parameters
    ----------
    models : list
        Estimator instances exposing ``set_params``, ``fit``, ``predict`` and
        ``predict_proba``.
    paramets : list of dict
        ``paramets[i]`` holds the keyword parameters applied to ``models[i]``
        before fitting. ``None`` or an empty dict keeps the model's current
        parameters.

    Returns
    -------
    dict
        Maps a model label (the estimator class name; suffixed with ``_<index>``
        when the same class appears more than once) to a dict with the keys
        ``'accuracy'``, ``'precision'``, ``'recall'`` and ``'roc_auc'``.

    Raises
    ------
    ValueError
        If ``models`` and ``paramets`` have different lengths.
    """
    if len(models) != len(paramets):
        raise ValueError(
            'models and paramets must have the same length, got %d and %d'
            % (len(models), len(paramets))
        )

    # Evaluate on exactly the columns the models are trained on: an earlier cell
    # may have added a `predict` column to X_test, which would otherwise be
    # treated as an extra feature.
    features_test = X_test[X_train.columns]

    results = {}
    for position, (model, params) in enumerate(zip(models, paramets)):
        # set_params accepts keyword arguments only, so the dict must be unpacked.
        model.set_params(**(params or {}))
        model.fit(X_train, y_train)

        pred = model.predict(features_test)
        score = model.predict_proba(features_test)

        label = type(model).__name__
        if label in results:
            # Keep entries distinct when several models share a class.
            label = '%s_%d' % (label, position)

        results[label] = {
            'accuracy': accuracy_score(y_test, pred),
            'precision': precision_score(y_test, pred, average='weighted'),
            'recall': recall_score(y_test, pred, average='weighted'),
            'roc_auc': roc_auc_score(y_test, score, average='weighted', multi_class='ovo'),
        }

    # Return only after every model has been evaluated.
    return results