# Day 09. Exercise 00
# Regularization

## 0. Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import joblib


## 1. Preprocessing

1. Read the file `dayofweek.csv` that you used in the previous day to a dataframe.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [2]:
df = pd.read_csv('../../datasets/dayofweek.csv')

In [3]:
X=df.drop('dayofweek', axis=1)
y = df['dayofweek']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)

## 2. Logreg regularization

### a. Default regularization

1. Train a baseline model with the only parameters `random_state=21`, `fit_intercept=False`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model


The result of the code where you trained and evaluated the baseline model should be exactly like this (use `%%time` to get the info about how long it took to run the cell):

```
train -  0.62902   |   valid -  0.59259
train -  0.64633   |   valid -  0.62963
train -  0.63479   |   valid -  0.56296
train -  0.65622   |   valid -  0.61481
train -  0.63397   |   valid -  0.57778
train -  0.64056   |   valid -  0.59259
train -  0.64138   |   valid -  0.65926
train -  0.65952   |   valid -  0.56296
train -  0.64333   |   valid -  0.59701
train -  0.63674   |   valid -  0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943
```

In [4]:
%%time
logreg = LogisticRegression(random_state=21, fit_intercept=False)

CPU times: user 11 μs, sys: 0 ns, total: 11 μs
Wall time: 13.1 μs


In [5]:
%%time
def crossval(model, n_splits, X, y):
    cv = StratifiedKFold(n_splits=n_splits, random_state=21, shuffle=True)
    score = cross_val_score(model, X,y, cv=cv)
    accuracy_train_list = []
    accuracy_test_list = []
    
    for train_index, test_index in cv.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)
        train_predict = model.predict(X_train)
        test_predict = model.predict(X_test)
        accuracy_train = accuracy_score(y_train, train_predict)
        accuracy_test = accuracy_score(y_test, test_predict)
        
        accuracy_train_list.append(accuracy_train)
        accuracy_test_list.append(accuracy_test)
        print(f"train - {accuracy_train:.5f} | valid - {accuracy_test:.5f}")

    print(f"Average accuracy on crossval is {np.mean(accuracy_test_list):.5f}")
    print(f"std is {score.std():.5f}")

crossval(logreg, 10, X, y)

train - 0.63546 | valid - 0.65089
train - 0.65326 | valid - 0.60947
train - 0.63942 | valid - 0.63314
train - 0.63283 | valid - 0.57988
train - 0.65590 | valid - 0.57988
train - 0.64535 | valid - 0.62130
train - 0.63834 | valid - 0.60714
train - 0.63702 | valid - 0.59524
train - 0.64295 | valid - 0.68452
train - 0.63900 | valid - 0.56548
Average accuracy on crossval is 0.61269
std is 0.03441
CPU times: user 549 ms, sys: 26.6 ms, total: 576 ms
Wall time: 701 ms


### b. Optimizing regularization parameters

1. In the cells below try different values of penalty: `none`, `l1`, `l2` – you can change the values of solver too.

In [6]:
%%time
logreg = LogisticRegression(random_state=21, fit_intercept=False, penalty=None, solver='saga', max_iter=10000)
crossval(logreg, 10, X, y)

train - 0.66777 | valid - 0.68639
train - 0.66051 | valid - 0.65089
train - 0.66183 | valid - 0.68639
train - 0.67106 | valid - 0.59763
train - 0.67436 | valid - 0.58580
train - 0.66051 | valid - 0.63314
train - 0.67062 | valid - 0.61905
train - 0.65547 | valid - 0.61310
train - 0.65744 | valid - 0.67857
train - 0.66469 | valid - 0.58333
Average accuracy on crossval is 0.63343
std is 0.03825
CPU times: user 1min, sys: 502 ms, total: 1min 1s
Wall time: 1min 6s


In [7]:
%%time
logreg = LogisticRegression(random_state=21, fit_intercept=False, penalty='l2', solver='saga', max_iter=10000)
crossval(logreg, 10, X, y)

train - 0.63546 | valid - 0.65089
train - 0.65326 | valid - 0.60947
train - 0.63942 | valid - 0.63905
train - 0.63283 | valid - 0.57988
train - 0.65590 | valid - 0.57988
train - 0.64469 | valid - 0.61538
train - 0.63702 | valid - 0.60714
train - 0.63636 | valid - 0.59524
train - 0.64295 | valid - 0.68452
train - 0.63900 | valid - 0.56548
Average accuracy on crossval is 0.61269
std is 0.03471
CPU times: user 2.22 s, sys: 24.8 ms, total: 2.24 s
Wall time: 2.38 s


In [8]:
%%time
logreg = LogisticRegression(random_state=21, fit_intercept=False, penalty='l1', solver='saga', max_iter=10000)
crossval(logreg, 10, X, y)

train - 0.64008 | valid - 0.63905
train - 0.64601 | valid - 0.63314
train - 0.64008 | valid - 0.65089
train - 0.63678 | valid - 0.57988
train - 0.65458 | valid - 0.56805
train - 0.64140 | valid - 0.61538
train - 0.64954 | valid - 0.60714
train - 0.62978 | valid - 0.57738
train - 0.62912 | valid - 0.67262
train - 0.64625 | valid - 0.55952
Average accuracy on crossval is 0.61031
std is 0.03645
CPU times: user 9.7 s, sys: 66.4 ms, total: 9.77 s
Wall time: 9.95 s


## 3. SVM regularization

### a. Default regularization

1. Train a baseline model with the only parameters `probability=True`, `kernel='linear'`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [9]:
%%time
svc = SVC(random_state=21, kernel='linear', probability=True)
crossval(svc, 10, X, y)

train - 0.70138 | valid - 0.71598
train - 0.69677 | valid - 0.68639
train - 0.70402 | valid - 0.71006
train - 0.69941 | valid - 0.63905
train - 0.71127 | valid - 0.62130
train - 0.70336 | valid - 0.69822
train - 0.69038 | valid - 0.67857
train - 0.70487 | valid - 0.69048
train - 0.69895 | valid - 0.71429
train - 0.70026 | valid - 0.61905
Average accuracy on crossval is 0.67734
std is 0.03553
CPU times: user 7.43 s, sys: 57.6 ms, total: 7.49 s
Wall time: 7.72 s


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `C`.

In [10]:
%%time
svc = SVC(random_state=21, kernel='linear', probability=True, C=2.0)
crossval(svc, 10, X, y)

train - 0.71457 | valid - 0.71598
train - 0.70929 | valid - 0.68047
train - 0.71589 | valid - 0.70414
train - 0.70336 | valid - 0.65089
train - 0.71852 | valid - 0.62130
train - 0.71589 | valid - 0.70414
train - 0.70224 | valid - 0.66667
train - 0.74111 | valid - 0.73810
train - 0.70487 | valid - 0.71429
train - 0.71212 | valid - 0.64286
Average accuracy on crossval is 0.68388
std is 0.03565
CPU times: user 8.84 s, sys: 107 ms, total: 8.95 s
Wall time: 9.35 s


In [11]:
%%time
svc = SVC(random_state=21, kernel='linear', probability=True, C=5.0)
crossval(svc, 10, X, y)

train - 0.74225 | valid - 0.74556
train - 0.76796 | valid - 0.73964
train - 0.76664 | valid - 0.74556
train - 0.76928 | valid - 0.71598
train - 0.77785 | valid - 0.69822
train - 0.74357 | valid - 0.72189
train - 0.76153 | valid - 0.72619
train - 0.76680 | valid - 0.75000
train - 0.76482 | valid - 0.76786
train - 0.75626 | valid - 0.69048
Average accuracy on crossval is 0.73014
std is 0.02293
CPU times: user 11.7 s, sys: 229 ms, total: 11.9 s
Wall time: 13.4 s


In [12]:
%%time
svc = SVC(random_state=21, kernel='linear', probability=True, C=10.0)
crossval(svc, 10, X, y)

train - 0.77521 | valid - 0.75740
train - 0.77587 | valid - 0.73964
train - 0.77983 | valid - 0.75740
train - 0.78049 | valid - 0.75148
train - 0.78510 | valid - 0.69822
train - 0.76664 | valid - 0.75740
train - 0.78129 | valid - 0.77381
train - 0.78195 | valid - 0.75595
train - 0.77800 | valid - 0.78571
train - 0.76680 | valid - 0.71429
Average accuracy on crossval is 0.74913
std is 0.02470
CPU times: user 13.6 s, sys: 199 ms, total: 13.8 s
Wall time: 14.6 s


## 4. Tree

### a. Default regularization

1. Train a baseline model with the only parameter `max_depth=10` and `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [13]:
%%time
tree = DecisionTreeClassifier(random_state=21, max_depth=10)
crossval(tree, 10, X, y)

train - 0.82004 | valid - 0.79290
train - 0.82663 | valid - 0.69822
train - 0.82927 | valid - 0.76923
train - 0.81806 | valid - 0.71598
train - 0.82268 | valid - 0.75148
train - 0.80554 | valid - 0.76923
train - 0.83333 | valid - 0.75000
train - 0.81555 | valid - 0.76786
train - 0.81225 | valid - 0.76190
train - 0.81752 | valid - 0.67262
Average accuracy on crossval is 0.74494
std is 0.03551
CPU times: user 257 ms, sys: 17.5 ms, total: 274 ms
Wall time: 496 ms


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `max_depth`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [14]:
%%time
tree = DecisionTreeClassifier(random_state=21, max_depth=4)
crossval(tree, 4, X, y)

train - 0.56171 | valid - 0.55213
train - 0.55538 | valid - 0.50474
train - 0.54783 | valid - 0.56770
train - 0.51779 | valid - 0.52494
Average accuracy on crossval is 0.53738
std is 0.02427
CPU times: user 60.1 ms, sys: 3.9 ms, total: 64 ms
Wall time: 65 ms


In [15]:
%%time
tree = DecisionTreeClassifier(random_state=21, max_depth=20)
crossval(tree, 10, X, y)

train - 0.98682 | valid - 0.89349
train - 0.98550 | valid - 0.85799
train - 0.99604 | valid - 0.94083
train - 0.99341 | valid - 0.89349
train - 0.98550 | valid - 0.88166
train - 0.98352 | valid - 0.86982
train - 0.98617 | valid - 0.90476
train - 0.99341 | valid - 0.89881
train - 0.98551 | valid - 0.91071
train - 0.98946 | valid - 0.83333
Average accuracy on crossval is 0.88849
std is 0.02832
CPU times: user 164 ms, sys: 5.19 ms, total: 169 ms
Wall time: 171 ms


In [16]:
%%time
tree = DecisionTreeClassifier(random_state=21, max_depth=15)
crossval(tree, 10, X, y)

train - 0.95122 | valid - 0.88757
train - 0.94924 | valid - 0.82249
train - 0.96243 | valid - 0.88166
train - 0.95188 | valid - 0.86391
train - 0.94463 | valid - 0.85799
train - 0.93408 | valid - 0.87574
train - 0.95520 | valid - 0.88690
train - 0.95850 | valid - 0.88095
train - 0.94532 | valid - 0.90476
train - 0.95784 | valid - 0.85714
Average accuracy on crossval is 0.87191
std is 0.02159
CPU times: user 157 ms, sys: 5.63 ms, total: 162 ms
Wall time: 165 ms


## 5. Random forest

### a. Default regularization

1. Train a baseline model with the only parameters `n_estimators=50`, `max_depth=14`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [17]:
%%time
RandFor = RandomForestClassifier(random_state=21, max_depth=14, n_estimators=50)
crossval(RandFor, 10, X, y)

train - 0.96506 | valid - 0.88757
train - 0.97034 | valid - 0.86391
train - 0.97100 | valid - 0.89941
train - 0.96572 | valid - 0.87574
train - 0.96506 | valid - 0.88757
train - 0.96440 | valid - 0.92308
train - 0.96574 | valid - 0.91667
train - 0.97431 | valid - 0.89881
train - 0.96640 | valid - 0.88095
train - 0.96311 | valid - 0.87500
Average accuracy on crossval is 0.89087
std is 0.01780
CPU times: user 2.16 s, sys: 43.8 ms, total: 2.2 s
Wall time: 2.3 s


### b. Optimizing regularization parameters

1. In the new cells try different values of the parameters `max_depth` and `n_estimators`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [18]:
%%time
RandFor = RandomForestClassifier(random_state=21, max_depth=20, n_estimators=50)
crossval(RandFor, 10, X, y)

train - 0.99670 | valid - 0.92308
train - 0.99670 | valid - 0.91716
train - 0.99604 | valid - 0.92899
train - 0.99604 | valid - 0.91716
train - 0.99802 | valid - 0.89941
train - 0.99670 | valid - 0.93491
train - 0.99671 | valid - 0.92857
train - 0.99736 | valid - 0.92857
train - 0.99605 | valid - 0.92262
train - 0.99473 | valid - 0.90476
Average accuracy on crossval is 0.92052
std is 0.01066
CPU times: user 2.15 s, sys: 34.5 ms, total: 2.18 s
Wall time: 2.27 s


In [19]:
%%time
RandFor = RandomForestClassifier(random_state=21, max_depth=14, n_estimators=20)
crossval(RandFor, 10, X, y)

train - 0.95452 | valid - 0.88166
train - 0.96111 | valid - 0.84615
train - 0.95649 | valid - 0.89941
train - 0.95056 | valid - 0.89941
train - 0.95254 | valid - 0.85799
train - 0.95847 | valid - 0.90533
train - 0.95784 | valid - 0.90476
train - 0.95125 | valid - 0.89881
train - 0.95125 | valid - 0.89881
train - 0.96311 | valid - 0.86310
Average accuracy on crossval is 0.88554
std is 0.02080
CPU times: user 930 ms, sys: 19 ms, total: 949 ms
Wall time: 983 ms


In [20]:
%%time
RandFor = RandomForestClassifier(random_state=21, max_depth=20, n_estimators=70)
crossval(RandFor, 10, X, y)

train - 0.99604 | valid - 0.92899
train - 0.99670 | valid - 0.91716
train - 0.99736 | valid - 0.93491
train - 0.99670 | valid - 0.91716
train - 0.99736 | valid - 0.89941
train - 0.99736 | valid - 0.94083
train - 0.99802 | valid - 0.94048
train - 0.99802 | valid - 0.92262
train - 0.99605 | valid - 0.92857
train - 0.99802 | valid - 0.91667
Average accuracy on crossval is 0.92468
std is 0.01212
CPU times: user 2.44 s, sys: 26.1 ms, total: 2.46 s
Wall time: 2.48 s


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.
3. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your test dataset).
4. Save the model.

In [21]:
%%time
RandFor = RandomForestClassifier(random_state=21, max_depth=20, n_estimators=70)
RandFor.fit(X_train, y_train)
pred = RandFor.predict(X_test)

CPU times: user 141 ms, sys: 4.46 ms, total: 145 ms
Wall time: 212 ms


In [22]:
accuracy_score(y_test, pred)

0.9171597633136095

In [23]:
df2 = pd.DataFrame({'true': y_test, 'predict': pred})

In [24]:
df2['equal'] = df2['true'] == df2['predict']
df2

Unnamed: 0,true,predict,equal
1087,1,1,True
16,5,5,True
563,6,6,True
1381,3,3,True
1199,2,2,True
...,...,...,...
1411,3,3,True
1079,1,1,True
1222,2,2,True
1064,1,1,True


In [25]:
False_weekday = df2[df2['equal'] == False].groupby(['true', 'equal'])['equal'].count().sort_values(ascending=False)
False_weekday

true  equal
1     False    8
0     False    7
4     False    3
5     False    3
6     False    3
2     False    2
3     False    2
Name: equal, dtype: int64

for Tuesday  model makes the most errors

In [26]:
joblib.dump(RandFor, 'models/00_RandFor_model')

['models/00_RandFor_model']