# Set-up

## Imports

In [1]:
# Standard library
from pathlib import Path

# Standard
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()
from tqdm.notebook import tqdm, trange
import plotly.graph_objects as go

# Sci-kit learn imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, average_precision_score, f1_score, classification_report, matthews_corrcoef, PrecisionRecallDisplay
import joblib

## Parameters

In [2]:
# Dataset scope tag: it is interpolated into the input file name below
scope = 'OFF'
input_path = f'../data/3_output/ML_dataset_{scope}.pkl'

# Inclusive range of test months, written as YYYYMM integers
start_month, end_month = 201901, 202212

# Number of rows to randomly keep from the input (None = use the whole dataset)
subsample_size = None

# Name of the label column
target = 'Result'

# Categorical columns, one-hot encoded before being fed to the model
features_to_encode = ['MargTech', 'WorkDay', 'Prov', 'Tech']

# Columns passed to the model untouched
other_features = [
    'hour',
    'SC_PC1', 'SC_PC2',
    'IN_PC1', 'IN_PC2',
    'CT_PC1', 'CT_PC2',
    'PurchMGP', 'SellMGP',
    'SolarAngle', 'DeclAngle',
    'PVold', 'PVnew',
    'PriceDiff',
]

# Full list of model input columns
features = [*other_features, *features_to_encode]

In [3]:
# Load the dataset stored at `input_path`.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
input_df = pd.read_pickle(input_path)
input_df

Unnamed: 0,hour,date,year,unit,scope,SC_PC1,SC_PC2,IN_PC1,IN_PC2,CT_PC1,...,SolarAngle,DeclAngle,WorkDay,Prov,Tech,PVold,PVnew,Price,PriceDiff,Result
201801011UP_ALTOADDA_1_OFF,1,20180101,2018,UP_ALTOADDA_1,OFF,1.023687,0.433872,-3.072079,-0.745118,0.717989,...,-178.226106,-23.011637,holiday,Milano,Hydro Run-of-river and poundage,0.000,0.000,111.74,0.419,False
201801012UP_ALTOADDA_1_OFF,2,20180101,2018,UP_ALTOADDA_1,OFF,0.787333,0.214727,-3.269953,-0.389639,0.717989,...,-163.226106,-23.011637,holiday,Milano,Hydro Run-of-river and poundage,0.000,0.000,111.74,0.433,False
201801013UP_ALTOADDA_1_OFF,3,20180101,2018,UP_ALTOADDA_1,OFF,1.142582,0.517246,-3.511562,-0.682966,0.717989,...,-148.226106,-23.011637,holiday,Milano,Hydro Run-of-river and poundage,0.000,0.000,111.74,0.451,False
201801014UP_ALTOADDA_1_OFF,4,20180101,2018,UP_ALTOADDA_1,OFF,1.359091,0.692018,-3.635374,-0.274023,0.717989,...,-133.226106,-23.011637,holiday,Milano,Hydro Run-of-river and poundage,0.000,0.000,111.74,0.480,False
201801015UP_ALTOADDA_1_OFF,5,20180101,2018,UP_ALTOADDA_1,OFF,1.640151,0.729753,-3.703611,-0.167476,1.093151,...,-118.226106,-23.011637,holiday,Milano,Hydro Run-of-river and poundage,0.000,0.000,111.74,0.513,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022123117UP_VOGHERA_1_OFF,17,20221231,2022,UP_VOGHERA_1,OFF,0.237656,-2.008471,-1.806319,0.875068,-2.527147,...,61.887169,-23.085911,holiday,Pavia,Fossil Gas,0.540,0.540,320.00,0.066,False
2022123121UP_VOGHERA_1_OFF,21,20221231,2022,UP_VOGHERA_1,OFF,-0.075154,-0.954465,-1.872673,0.745683,-2.449869,...,121.887169,-23.085911,holiday,Pavia,Fossil Gas,0.573,0.573,320.00,0.016,False
2022123122UP_VOGHERA_1_OFF,22,20221231,2022,UP_VOGHERA_1,OFF,2.244316,-3.847872,-2.345759,0.833482,-2.547642,...,136.887169,-23.085911,holiday,Pavia,Fossil Gas,0.526,0.526,320.00,0.085,False
2022123123UP_VOGHERA_1_OFF,23,20221231,2022,UP_VOGHERA_1,OFF,2.277886,-3.849805,-2.698679,0.834434,-2.527069,...,151.887169,-23.085911,holiday,Pavia,Fossil Gas,0.525,0.525,320.00,0.190,False


In [4]:

# Take the one-hot categories from the full input dataset, so that a category
# absent from a training set but present in the test set is still encoded
categories = [
    input_df[column].unique()
    for column in features_to_encode
]

# One-hot encode the categorical columns; every other column is passed through unchanged
feature_transformer = make_column_transformer(
    (OneHotEncoder(categories=categories), features_to_encode),
    remainder="passthrough",
)

## Functions

In [5]:
def get_X_y(df):
    """
    Build the model inputs from a dataset slice.

    Returns a tuple ``(X, y)``: ``X`` is ``df[features]`` passed through
    ``feature_transformer.fit_transform`` (the transformer is refitted on every
    call) and ``y`` is the ``target`` column of ``df``.
    """
    return feature_transformer.fit_transform(df[features]), df[target]

In [6]:
def predict_proba_monthly_recal(model, df, n_train_months=12):
    """
    Walk-forward prediction with monthly recalibration.

    For each test month M between ``start_month`` and ``end_month`` (inclusive),
    the model is refitted on the ``n_train_months`` months that come right before M
    among the months present in ``df``, and is then used to predict the
    positive-class probability of every observation of month M.
    We hence fit a number of models equal to the number of test months.
    This tests the performance of the model in a "live" setting where, each month,
    the model is recalibrated with the new data.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict_proba``
        Refitted in place once per test month.
    df : pandas.DataFrame
        Dataset holding the feature and target columns. Its index must be made of
        strings whose first six characters are the observation month (YYYYMM).
    n_train_months : int, default 12
        Length of the rolling training window, in months. If fewer months are
        available before a test month, all the available ones are used.

    Returns
    -------
    pandas.Series
        Predicted probabilities, indexed by the index labels of the test rows.
        Rows are grouped by test month in chronological order and keep the
        order they have in ``df`` within each month.

    Raises
    ------
    ValueError
        If a test month has no earlier month in ``df`` to train on.
    """
    observation_month = df.index.str[:6].astype(int)
    months = sorted(observation_month.unique())
    test_months = [month for month in months if start_month <= month <= end_month]

    # Nothing to predict: return an empty result instead of failing on the concatenation
    if not test_months:
        return pd.Series(dtype=float, index=df.index[:0])

    y_probs_list = []

    for test_month in tqdm(test_months):
        # For every month M, we take the training period as M-n_train_months to M-1.
        # The lower bound is clamped at 0: a negative start would wrap around to the
        # end of the list and silently select an empty or wrong window.
        idx = months.index(test_month)
        train_months = months[max(0, idx - n_train_months):idx]
        if not train_months:
            raise ValueError(
                'No training data available before test month {}'.format(test_month)
            )
        X_train, y_train = get_X_y(df[observation_month.isin(train_months)])

        # And the test period as month M
        test_df = df[observation_month == test_month]
        X_test, y_test = get_X_y(test_df)

        model.fit(X_train, y_train)

        y_probs = model.predict_proba(X_test)[:, 1]

        APS = average_precision_score(y_test, y_probs)
        print('Average Precision Score over {:,} samples for month {} is: {}'.format(len(y_test), test_month, round(APS, 3)))
        print("\n")

        # Attach the labels of the rows that were actually scored, so that predictions
        # stay aligned with their observations even if df is not sorted by month
        y_probs_list.append(pd.Series(y_probs, index=test_df.index))

    return pd.concat(y_probs_list)

# Main

## Preprocessing

In [7]:
# Subsample (optional): keep `subsample_size` random rows, preserving the original row order
if subsample_size is not None:
    df = (
        input_df
        .assign(order=range(len(input_df)))        # remember the original row position
        .sample(subsample_size, random_state=42)   # fixed seed: the same rows are drawn on every run
        .sort_values('order')                      # restore the original ordering
        .drop(columns='order')
    )
    print(f'Subsampled {subsample_size} rows from the input dataset')
else:
    df = input_df.copy()

## Model

### Run test with monthly recalibration

In [8]:
%%time

# Random forest with a fixed seed; n_jobs=-1 asks scikit-learn to use all available cores
clf = RandomForestClassifier(
    random_state=42,
    n_jobs=-1
)

# Refit the classifier month by month and collect the predicted probabilities
y_probs = predict_proba_monthly_recal(clf, df)


  0%|          | 0/48 [00:00<?, ?it/s]

Average Precision Score over 73,147 samples for month 201901 is: 0.877


Average Precision Score over 66,421 samples for month 201902 is: 0.798


Average Precision Score over 67,287 samples for month 201903 is: 0.686


Average Precision Score over 67,696 samples for month 201904 is: 0.73


Average Precision Score over 68,786 samples for month 201905 is: 0.663


Average Precision Score over 65,302 samples for month 201906 is: 0.776


Average Precision Score over 66,221 samples for month 201907 is: 0.791


Average Precision Score over 59,107 samples for month 201908 is: 0.668


Average Precision Score over 55,212 samples for month 201909 is: 0.761


Average Precision Score over 60,256 samples for month 201910 is: 0.68


Average Precision Score over 59,335 samples for month 201911 is: 0.627


Average Precision Score over 67,198 samples for month 201912 is: 0.619


Average Precision Score over 64,423 samples for month 202001 is: 0.711


Average Precision Score over 62,076 samples for month

In [9]:
# Preview the predicted probabilities
y_probs

201901011UP_ACTV_1_OFF        0.00
201901012UP_ACTV_1_OFF        0.00
201901013UP_ACTV_1_OFF        0.00
201901014UP_ACTV_1_OFF        0.00
201901015UP_ACTV_1_OFF        0.00
                              ... 
2022123117UP_VOGHERA_1_OFF    0.41
2022123121UP_VOGHERA_1_OFF    0.43
2022123122UP_VOGHERA_1_OFF    0.45
2022123123UP_VOGHERA_1_OFF    0.56
2022123124UP_VOGHERA_1_OFF    0.50
Length: 2764499, dtype: float64

In [11]:
# Make sure the destination folder exists before saving: `to_pickle` does not create
# missing directories, and failing here would lose the result of the long run above
output_path = Path(f'{scope}/model_predictions/predicted_probs_monthly_recal_rolling_12m.pkl')
output_path.parent.mkdir(parents=True, exist_ok=True)
y_probs.to_pickle(output_path)

---

# TESTS