### parameter searching using single prediction output

In [1]:
import numpy as np
import pandas as pd
import json
import os
import matplotlib.pyplot as plt
import seaborn as sns
import time
from tqdm.auto import tqdm
from datetime import datetime
from scipy.interpolate import interp1d
from sklearn.preprocessing import RobustScaler

from projectUtils import *


## Load our saved Data
We can index to get submatrices for less previous observations

In [2]:
def np_load(filename):
    path = ''
    return np.load(path + filename + '.npy', allow_pickle=True)

X_train = np_load('X_train_14wvar')
y_train = np_load('y_train_14wvar')
X_valid = np_load('X_valid_14wvar')
y_valid = np_load('y_valid_14wvar')

In [3]:
# convert to multiclass
y_train = round_and_intify(y_train)
y_valid = round_and_intify(y_valid)

In [4]:
y_valid.dtype

dtype('int32')

## Ordinal classification with aggregated binary classifiers
https://stackoverflow.com/questions/57561189/multi-class-multi-label-ordinal-classification-with-sklearn


## Experimenting with just a single prediction

In [5]:
# set solver
SOLVER = 'saga'
ITERS = 250

training is impractically slow for anything but lbfgs, sag or saga

### Grid searching for two different prediction weeks to inform later grid search

In [6]:
#initialize base classifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error, make_scorer, f1_score

base_clf = LogisticRegression(solver=SOLVER, max_iter=ITERS)
ord_clf = OrdinalClassifier(base_clf, mean_absolute_error)

ord_clf.get_params()

{'base_estimator__C': 1.0,
 'base_estimator__class_weight': None,
 'base_estimator__dual': False,
 'base_estimator__fit_intercept': True,
 'base_estimator__intercept_scaling': 1,
 'base_estimator__l1_ratio': None,
 'base_estimator__max_iter': 250,
 'base_estimator__multi_class': 'auto',
 'base_estimator__n_jobs': None,
 'base_estimator__penalty': 'l2',
 'base_estimator__random_state': None,
 'base_estimator__solver': 'saga',
 'base_estimator__tol': 0.0001,
 'base_estimator__verbose': 0,
 'base_estimator__warm_start': False,
 'base_estimator': LogisticRegression(max_iter=250, solver='saga'),
 'loss': <function sklearn.metrics._regression.mean_absolute_error(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average')>}

In [7]:
def transform_param(grid):
    #utility to rename keys for chained estimator
    prefix = 'base_estimator__'
    return {prefix + k: grid[k] for k in grid.keys()}

In [8]:
from sklearn.utils.random import sample_without_replacement as sample

#train and validate on a subset to save time
train_num = int(y_train.shape[0] / 10)
indices = sample(y_train.shape[0], train_num)

X_t_s = X_train[indices, :]
y_t_s = y_train[indices, :]

#train on a subset to save time
valid_num = int(y_valid.shape[0] / 10)
val_indices = sample(y_valid.shape[0], valid_num)

X_v_s = X_valid[val_indices, :]
y_v_s = y_valid[val_indices, :]


In [9]:
from sklearn.model_selection import GridSearchCV, PredefinedSplit

scorer = {'mae': make_scorer(mean_absolute_error),
          'f1': make_scorer(f1_score, average='macro')}

X = np.vstack((X_t_s, X_v_s))
y = np.vstack((y_t_s, y_v_s))

indexer = np.ones(X.shape[0], dtype=np.int8)
indexer[ :X_t_s.shape[0] + 1] *= -1
ps = PredefinedSplit(indexer)

def search(params, X, y, X_v, y_v, name):
    base_clf = LogisticRegression(solver=SOLVER, max_iter=ITERS)
    ord_clf = OrdinalClassifier(base_clf, mean_absolute_error)
    
    tic = time.perf_counter()

    gridSearch = GridSearchCV(ord_clf, 
                              transform_param(params), 
                              scoring=scorer,
                              n_jobs=-2,
                              refit='mae',
                              verbose=3,
                              cv=ps) 

    gridSearch.fit(X, y)
    save_model(gridSearch, name)

    #print time
    timer(tic)
    
    #evaluate
    y_pred = gridSearch.score(X_v, y_v)
    
    return gridSearch

#### w6 preds

In [10]:
w = y[:, 5]
w_v_s = y_v_s[:, 5]

In [11]:
param_grid = {'C': [0.5,1.0,5.0],
             'penalty': ['elasticnet'],
             'l1_ratio': [0.2, 0.5, 0.8]}

grid1 = search(param_grid, X, w, X_v_s, w_v_s, 'elasticw6')

Fitting 1 folds for each of 9 candidates, totalling 9 fits
[CV 1/1] END base_estimator__C=0.5, base_estimator__l1_ratio=0.2, base_estimator__penalty=elasticnet; f1: (test=0.385) mae: (test=0.363) total time= 8.4min
[CV 1/1] END base_estimator__C=1.0, base_estimator__l1_ratio=0.5, base_estimator__penalty=elasticnet; f1: (test=0.385) mae: (test=0.363) total time= 8.5min
[CV 1/1] END base_estimator__C=1.0, base_estimator__l1_ratio=0.8, base_estimator__penalty=elasticnet; f1: (test=0.385) mae: (test=0.363) total time= 8.7min
[CV 1/1] END base_estimator__C=0.5, base_estimator__l1_ratio=0.8, base_estimator__penalty=elasticnet; f1: (test=0.385) mae: (test=0.363) total time= 8.9min
[CV 1/1] END base_estimator__C=0.5, base_estimator__l1_ratio=0.5, base_estimator__penalty=elasticnet; f1: (test=0.385) mae: (test=0.363) total time=11.3min
[CV 1/1] END base_estimator__C=5.0, base_estimator__l1_ratio=0.2, base_estimator__penalty=elasticnet; f1: (test=0.385) mae: (test=0.363) total time= 8.1min
[CV 1

In [12]:
param_grid = {'penalty': ['none']}

grid1 = search(param_grid, X, w, X_v_s, w_v_s, 'nonew6')

Fitting 1 folds for each of 1 candidates, totalling 1 fits
Elapsed time: 9 minutes, 23 seconds


In [23]:
grid1.best_score_

0.3631800843204176

## Examine results

In [13]:
import joblib
e_3 = joblib.load('elasticw3.pkl')
n_3 = joblib.load('nonew3.pkl')
e_6 = joblib.load('elasticw6.pkl')
n_6 = joblib.load('nonew6.pkl')

In [14]:
gs = [e_3, n_3, e_6, n_6]

In [15]:
for estimator in gs:
    print(estimator.best_score_)
    print(estimator.best_params_)

0.23603024827678512
{'base_estimator__C': 0.5, 'base_estimator__l1_ratio': 0.8, 'base_estimator__penalty': 'elasticnet'}
0.23569564344509134
{'base_estimator__penalty': 'none'}
0.36324700528675635
{'base_estimator__C': 0.5, 'base_estimator__l1_ratio': 0.8, 'base_estimator__penalty': 'elasticnet'}
0.3631800843204176
{'base_estimator__penalty': 'none'}
