In [None]:
import time

import numpy as np
import pandas as pd
from tqdm import tqdm
from loguru import logger
from sklearn.utils.fixes import parse_version, sp_version
from IPython.display import clear_output

from source.utils.file_read import read_csv_file, join_dataframes
from source.utils.collect_results import create_df_forecaster_first_stage, create_df_forecaster_second_stage
from source.utils.generate_timestamp import generate_timestamps
from source.simulation.submission_module import submission_forecasters
from source.simulation.buyer_module import prepare_buyer_data
from source.ensemble.combination_scheme.equal_weights import calculate_equal_weights
from source.ensemble.combination_scheme.avg_weights import calculate_weighted_avg
from source.ensemble.combination_scheme.model_selection import run_model_selection
from source.plots.plot_forecasts import plot_forecasts, plot_var_forecasts
from source.ml_engine import create_ensemble_forecasts

# Solver name chosen from the version exposed as `sp_version` by sklearn.utils.fixes.
# NOTE(review): `solver` is not referenced in the cells visible here — confirm it is still needed.
solver = "highs" if sp_version >= parse_version("1.6.0") else "interior-point"

In [None]:
from config.simulation_setting import Simulation, WeightedAvg, Stack
from source.simulation.helpers_simulation import process_combination_scheme
from source.utils.session_ml_info import delete_previous_day_pickle
# Parameter sets read from the config classes; the cells below subscript them by string key.
sim_params = Simulation.testing_period  # simulation settings (seed, csv file, periods, flags, ...)
weight_avg_params = WeightedAvg.params  # weighted-average baseline settings ('window_size_valid')
ens_params = Stack.params  # ensemble settings (calibration days, conformalized QR flag, plot flags)

In [None]:
# Seed NumPy's global random generator with the configured seed.
np.random.seed(sim_params['random_seed'])

# Load the input CSV, passing the configured column list and start/end period to the reader.
df_filtered = read_csv_file(
    sim_params['csv_filename'],
    sim_params['list_columns'],
    sim_params['starting_period'],
    sim_params['ending_period'],
)

# Optionally overwrite missing values with zeros, printing the per-column NaN count first.
if sim_params['replace_nan']:
    logger.info(' ')
    logger.warning("Replacing NaN values with 0s")
    nan_count_per_column = df_filtered.isna().sum()
    print(nan_count_per_column)
    df_filtered.fillna(0, inplace=True)

# Drop the pickle left over from a previous run before starting the simulation.
logger.info(' ')
delete_previous_day_pickle()
logger.opt(colors=True).warning('previous day pickle file removed')

# Accumulator for the per-test-day result frames (filled inside the test-day loop).
list_csv_to_save = []

# Loop over the test days. For each day:
#   1. build the training / prediction windows and slice the data,
#   2. collect the forecasters' submissions and the buyer data,
#   3. run the ensemble engine,
#   4. optionally compute the baseline combination schemes,
#   5. join everything into one frame and append it to `list_csv_to_save`.
# Baseline frames only exist when sim_params['baselines_comparison'] is set, so every
# use of them in the final join is guarded by that flag.
for i in tqdm(range(sim_params['num_test_days']), desc='Testing Days'):

    # Number of trailing rows kept from the test / measurement frames.
    # NOTE(review): presumably one day of 15-minute steps (the displayed output has a 15-minute index) — confirm.
    rows_last_day = 96

    # Read once per iteration; every baseline-dependent statement below checks this flag.
    baselines_enabled = sim_params['baselines_comparison']

    # generate timestamps train and prediction
    start_training_timestamp, end_training_timestamp, start_prediction_timestamp, end_prediction_timestamp = generate_timestamps(sim_params['start_training'], i, sim_params['window_size'])

    # With conformalized QR enabled, once `day_calibration` test days have elapsed the
    # training window start is pushed back by that many days.
    if i >= ens_params['day_calibration'] and ens_params['conformalized_qr']:
        day_calibration = ens_params['day_calibration']
        start_training_timestamp = start_training_timestamp - pd.Timedelta(f'{day_calibration}day')

    logger.info(' ')
    logger.opt(colors = True).info('<blue>-------------------------------------------------------------------------------------------</blue>')
    logger.opt(colors=True).info(f'<blue>Start training: {start_training_timestamp} - End training: {end_training_timestamp}</blue>')
    logger.opt(colors = True).info('<blue>-------------------------------------------------------------------------------------------</blue>')
    logger.opt(colors = True).info(f'<blue>Start prediction: {start_prediction_timestamp} - End prediction: {end_prediction_timestamp}</blue>')

    # `between` is inclusive on both ends; `.iloc[:-1]` then drops the last selected row.
    df_train = df_filtered[df_filtered.index.to_series().between(start_training_timestamp, end_training_timestamp)].iloc[:-1,:]
    df_test = df_filtered[df_filtered.index.to_series().between(start_prediction_timestamp, end_prediction_timestamp)].iloc[:-1,:]

    logger.info(' ')
    logger.opt(colors = True).info(f'<blue> -----------------> Length of training data: {len(df_train)} </blue>')
    logger.opt(colors = True).info(f'<blue> -----------------> Length of test data: {len(df_test)} </blue>')

    logger.info(' ')
    logger.opt(colors = True).info('<blue> -----------------> Forecasters prediction submitted </blue>')

    # ----------------------------> FORECASTERS PREDICTION SUBMISSION <----------------------------

    df_market, df_train, df_test = submission_forecasters(sim_params, df_train, df_test)

    # ----------------------------> BUYERS DATA <----------------------------

    df_buyer, forecast_range = prepare_buyer_data(df_train, df_test, start_prediction_timestamp, end_prediction_timestamp)

    # ----------------------------> PREDICO PLATFORM ML ENGINE <----------------------------

    # ----------------------------> ENSEMBLE FORECASTS <----------------------------

    results_ensemble_forecasts = create_ensemble_forecasts(ens_params=ens_params,
                                                            df_buyer=df_buyer,
                                                            df_market=df_market,
                                                            end_training_timestamp=end_training_timestamp,
                                                            forecast_range = forecast_range,
                                                            challenge_usecase='simulation',
                                                            simulation=True)

    # ----------------------------> SAVE to CSV <----------------------------
    # wind power: rename the quantile / target columns in place on the engine's prediction frame
    df_pred_ensemble = results_ensemble_forecasts['wind_power']['predictions']
    df_pred_ensemble.rename(columns={'q50_' + sim_params['buyer_resource_name']: '50_predictions',
                                        'q10_' + sim_params['buyer_resource_name']: '10_predictions',
                                        'q90_' + sim_params['buyer_resource_name']: '90_predictions',
                                        'norm_' + sim_params['buyer_resource_name']: 'targets'}, inplace=True)
    df_pred_ensemble_clean = df_pred_ensemble.drop(columns=['targets'])
    df_test_clean = df_test.iloc[-rows_last_day:, :]

    # wind power variability: same in-place renaming, with the '_var_' column names
    df_pred_ensemble_var = results_ensemble_forecasts['wind_power_variability']['predictions']
    df_pred_ensemble_var.rename(columns={'q50_' + sim_params['buyer_resource_name']: '50_var_predictions',
                                            'q10_' + sim_params['buyer_resource_name']: '10_var_predictions',
                                            'q90_' + sim_params['buyer_resource_name']: '90_var_predictions',
                                            'norm_' + sim_params['buyer_resource_name']: 'targets'}, inplace=True)
    df_pred_ensemble_var_clean = df_pred_ensemble_var.drop(columns=['targets'])
    df_test_ensemble_var_clean = df_test.iloc[-rows_last_day:, :]

    if baselines_enabled:

        # ----------------------------> COMBINATION SCHEME DATA <----------------------------

        # process data for baselines combination schemes
        df_train_norm, day_previous_df_test_norm, day_previous_df_test_norm_var = process_combination_scheme(df_train, df_test, end_training_timestamp, start_prediction_timestamp)

        # Wind power: the prediction frame was already renamed in place above, so only the
        # targets are overwritten with the trailing measured values.
        df_pred_ensemble['targets'] = day_previous_df_test_norm['norm_measured'].values[-rows_last_day:]

        # Wind power variability: same frame object as `df_pred_ensemble_var` (already renamed).
        df_var_ensemble = df_pred_ensemble_var
        df_var_ensemble['targets'] = day_previous_df_test_norm_var['norm_measured'].values[-rows_last_day:]

        # single-column target frames handed to the plotting helpers
        df_test_ensemble = pd.DataFrame(df_pred_ensemble['targets'])
        df_test_ensemble_var = pd.DataFrame(df_var_ensemble['targets'])

        # ----------------------------> WIND POWER VARIABILITY - PERFORMANCE METRICS <----------------------------

        # performance best model selection
        df_best_model_var = run_model_selection(sim_params, df_train_norm , day_previous_df_test_norm, end_training_timestamp, start_prediction_timestamp , window_size_valid = weight_avg_params['window_size_valid'], var=True)
        df_best_model_clean_var = df_best_model_var.rename(columns={'mean_prediction': 'q50_best_model_var'}).drop(columns=['targets'])

        # performance weighted average
        df_weighted_avg_var, dict_weights_var = calculate_weighted_avg(sim_params, df_train_norm , day_previous_df_test_norm, end_training_timestamp, start_prediction_timestamp , window_size_valid=weight_avg_params['window_size_valid'], var=True)
        df_weighted_avg_clean_var = df_weighted_avg_var.rename(columns={'mean_prediction': 'q50_weight_avg_var'}).drop(columns=['targets'])

        # performance weighted avg soft
        df_weighted_avg_soft_var, dict_weights_soft_var = calculate_weighted_avg(sim_params, df_train_norm, day_previous_df_test_norm, end_training_timestamp, start_prediction_timestamp, window_size_valid=weight_avg_params['window_size_valid'], var=True, norm='softmax')
        df_weighted_avg_soft_clean_var = df_weighted_avg_soft_var.rename(columns={'mean_prediction': 'q50_weight_avg_soft_var'}).drop(columns=['targets'])

        # performance equal weights
        df_equal_weights_var = calculate_equal_weights(day_previous_df_test_norm_var, start_prediction_timestamp)
        df_equal_weights_clean_var = df_equal_weights_var.rename(columns={'mean_prediction': 'q50_equal_weights_var'}).drop(columns=['targets', 'Q10', 'Q90'])

        # performance day-ahead
        df_dayahead_var = create_df_forecaster_second_stage(day_previous_df_test_norm_var, 'dayahead', start_prediction_timestamp)
        df_dayahead_var_clean = df_dayahead_var.rename(columns={'norm_dayaheadforecast': 'q50_dayahead_var'}).drop(columns=['norm_measured','targets'])

        # performance day-ahead-11h
        df_dayahead_11h_var = create_df_forecaster_second_stage(day_previous_df_test_norm_var, 'dayahead11h', start_prediction_timestamp)
        df_dayahead_11h_var_clean = df_dayahead_11h_var.rename(columns={'norm_dayahead11hforecast': 'q50_dayahead_11h_var'}).drop(columns=['norm_measured','targets'])

        # performance week ahead
        df_week_ahead_var = create_df_forecaster_second_stage(day_previous_df_test_norm_var, 'weekahead', start_prediction_timestamp)
        df_week_ahead_var_clean = df_week_ahead_var.rename(columns={'norm_weekaheadforecast': 'q50_week_ahead_var'}).drop(columns=['norm_measured','targets'])

        # performance most recent
        if sim_params['most_recent']:
            df_most_recent_var = create_df_forecaster_second_stage(day_previous_df_test_norm_var, 'mostrecent', start_prediction_timestamp)
            df_most_recent_var_clean = df_most_recent_var.rename(columns={'norm_mostrecentforecast': 'q50_most_recent_var'}).drop(columns=['norm_measured','targets'])

        # performance malicious
        if sim_params['malicious']:
            df_malicious_var = create_df_forecaster_second_stage(day_previous_df_test_norm_var, 'malicious', start_prediction_timestamp)
            df_malicious_var_clean = df_malicious_var.rename(columns={'norm_maliciousforecast': 'q50_malicious_var'}).drop(columns=['norm_measured','targets'])

        # performance noisy
        if sim_params['noisy']:
            df_noisy_var = create_df_forecaster_second_stage(day_previous_df_test_norm_var, 'noisy', start_prediction_timestamp)
            df_noisy_var_clean = df_noisy_var.rename(columns={'norm_noisyforecast': 'q50_noisy_var'}).drop(columns=['norm_measured','targets'])

        # ----------------------------> WIND POWER - PERFORMANCE METRICS <----------------------------

        # performance best model selection
        df_best_model = run_model_selection(sim_params, df_train_norm, day_previous_df_test_norm, end_training_timestamp, start_prediction_timestamp, window_size_valid=weight_avg_params['window_size_valid'])
        df_best_model_clean = df_best_model.rename(columns={'Q10': 'q10_best_model',
                                                            'mean_prediction': 'q50_best_model',
                                                            'Q90': 'q90_best_model'}).drop(columns=['targets'])

        # performance weighted average
        df_weighted_avg, dict_weights = calculate_weighted_avg(sim_params, df_train_norm, day_previous_df_test_norm, end_training_timestamp, start_prediction_timestamp, window_size_valid=weight_avg_params['window_size_valid'])
        df_weighted_avg_clean = df_weighted_avg.rename(columns={'Q10': 'q10_weight_avg',
                                                                'mean_prediction': 'q50_weight_avg',
                                                                'Q90': 'q90_weight_avg'}).drop(columns=['targets'])

        # performance weighted avg soft
        df_weighted_avg_soft, dict_weights_soft = calculate_weighted_avg(sim_params, df_train_norm, day_previous_df_test_norm, end_training_timestamp, start_prediction_timestamp, window_size_valid=weight_avg_params['window_size_valid'], norm='softmax')
        df_weighted_avg_soft_clean = df_weighted_avg_soft.rename(columns={'Q10': 'q10_weight_avg_soft',
                                                                        'mean_prediction': 'q50_weight_avg_soft',
                                                                        'Q90': 'q90_weight_avg_soft'}).drop(columns=['targets'])

        # performance equal weights
        df_equal_weights = calculate_equal_weights(day_previous_df_test_norm, start_prediction_timestamp)
        df_equal_weights_clean = df_equal_weights.rename(columns={'Q10': 'q10_equal_weights',
                                                                'mean_prediction': 'q50_equal_weights',
                                                                'Q90': 'q90_equal_weights'}).drop(columns=['targets'])

        # performance malicious cheat
        if sim_params['malicious']:
            df_malicious = create_df_forecaster_first_stage(day_previous_df_test_norm, 'malicious', start_prediction_timestamp)

        # performance noisy
        if sim_params['noisy']:
            df_noisy = create_df_forecaster_first_stage(day_previous_df_test_norm, 'noisy', start_prediction_timestamp)

        # plot forecasts
        if ens_params['plt_wind_power_ensemble']:
            plot_forecasts(df_pred_ensemble, df_test_ensemble, list_wind_ramps=[], title='Wind Power Forecasting')

        # plot variability forecast results
        if ens_params['plt_wind_power_variability_ensemble']:
            plot_var_forecasts(df_var_ensemble, df_test_ensemble_var, list_wind_ramps=[], title='Wind Power Variability Forecasting')

    # ----------------------------> JOIN RESULTS FOR THIS TEST DAY <----------------------------
    # The ensemble frames always exist; baseline frames are only joined when they were
    # computed above, which avoids a NameError (or stale kernel state) when the flag is off.
    list_df_wind_power = [df_test_clean, df_pred_ensemble_clean]
    if baselines_enabled:
        list_df_wind_power += [df_best_model_clean, df_weighted_avg_clean, df_weighted_avg_soft_clean, df_equal_weights_clean]
    df_csv_wind_power = join_dataframes(*list_df_wind_power)

    if baselines_enabled:
        list_df_wind_power_variability = [df_var_ensemble, df_best_model_clean_var, df_weighted_avg_clean_var, df_weighted_avg_soft_clean_var, df_equal_weights_clean_var, df_dayahead_var_clean, df_dayahead_11h_var_clean, df_week_ahead_var_clean]
        df_csv_wind_power_variability = join_dataframes(*list_df_wind_power_variability)
        if sim_params['most_recent']:
            df_csv_wind_power_variability = df_csv_wind_power_variability.join(df_most_recent_var_clean)
        if sim_params['malicious']:
            df_csv_wind_power = df_csv_wind_power.join(df_malicious)
            df_csv_wind_power_variability = df_csv_wind_power_variability.join(df_malicious_var_clean)
        if sim_params['noisy']:
            df_csv_wind_power = df_csv_wind_power.join(df_noisy)
            df_csv_wind_power_variability = df_csv_wind_power_variability.join(df_noisy_var_clean)
    else:
        # Only the ensemble variability predictions (with the engine's own 'targets') are available.
        df_csv_wind_power_variability = df_pred_ensemble_var

    # Back-fill gaps, keep only the median variability forecast, and rename the
    # 'targets' column to 'measured_var'. `.bfill()` replaces the deprecated
    # `fillna(method='bfill')` and returns a new frame.
    df_csv_wind_power_variability_fillna = (
        df_csv_wind_power_variability
        .bfill()
        .drop(['10_var_predictions', '90_var_predictions'], axis=1)
        .rename(columns={'targets': 'measured_var'})
    )
    df_csv_test_day = join_dataframes(df_csv_wind_power, df_csv_wind_power_variability_fillna)
    list_csv_to_save.append(df_csv_test_day)

    # Clear the cell output between test days, then pause for three seconds.
    clear_output(wait=True)
    time.sleep(3)

In [6]:
# Display the variability result frame; after the loop this name holds the frame
# built in the last executed iteration.
df_csv_wind_power_variability_fillna

Unnamed: 0_level_0,50_var_predictions,measured_var,q50_best_model_var,q50_weight_avg_var,q50_weight_avg_soft_var,q10_equal_weights,q50_equal_weights,q90_equal_weights,q50_dayahead_var,q50_dayahead_11h_var,q50_week_ahead_var
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021-05-12 00:00:00+00:00,0.0,51.97,949.53,923.483185,900.750491,523.5700,1328.0125,1629.6525,23.79,38.50,13.17
2021-05-12 00:15:00+00:00,0.0,51.97,38.50,25.241530,29.177199,555.9900,1356.4900,1640.4875,23.79,38.50,13.17
2021-05-12 00:30:00+00:00,0.0,71.11,39.79,18.580855,26.138771,585.8125,1379.0325,1652.7950,12.35,39.79,2.91
2021-05-12 00:45:00+00:00,0.0,1.50,40.72,18.692830,25.929286,605.0400,1401.1550,1676.1650,14.12,40.72,0.65
2021-05-12 01:00:00+00:00,0.0,-7.91,25.15,19.708539,16.810650,621.0125,1424.2225,1695.7225,33.09,25.15,1.74
...,...,...,...,...,...,...,...,...,...,...,...
2021-05-12 22:45:00+00:00,0.0,-17.73,-16.12,-13.524704,-14.915237,288.4625,741.0800,1154.2075,-11.32,-16.12,-12.95
2021-05-12 23:00:00+00:00,0.0,-10.10,-18.65,7.100590,6.747591,281.5675,751.1500,1146.4925,-12.66,-18.65,51.57
2021-05-12 23:15:00+00:00,0.0,19.51,-20.54,-18.115051,-19.335223,274.5275,741.1925,1124.8800,-16.30,-20.54,-17.35
2021-05-12 23:30:00+00:00,0.0,10.90,-17.56,-17.974188,-18.651865,268.4175,730.6950,1105.3600,-15.54,-17.56,-20.66


In [None]:
# # ----------------------------> SAVE TO CSV <----------------------------
# Stack the per-test-day frames collected in `list_csv_to_save` into a single frame.
# (Nothing is written to disk here; chain `.to_csv(<path>)` onto `df_csv` to export it.)
df_csv = pd.concat(list_csv_to_save)

# Plot the measured series against the 10/50/90 ensemble predictions for rows 1000-1999.
plot_columns = ['measured', '10_predictions', '50_predictions', '90_predictions']
df_csv_plot_window = df_csv[plot_columns].iloc[1000:2000]
df_csv_plot_window.plot(figsize=(20,10))