## 6. Target
We compared different models. The Random Forest Regressor achieved the best results, so we use this model 
to make the predictions for the target. <br>
The predictions are saved in the file `RandomForest_Predictions.csv`, which is needed for the visualizations 
in our dashboard.

In [1]:
## load modules
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import ast
import sys
sys.path.append("..")
from modeling.features import get_feature_combinations
from sklearn.preprocessing import MinMaxScaler

In [2]:
## read the cleaned wind data; the TIMESTAMP column is parsed as datetime and used as the index
data = pd.read_csv(
    '../data/GEFCom2014Data/Wind/clean_data.csv',
    index_col='TIMESTAMP',
    parse_dates=['TIMESTAMP'],
)

In [3]:
# chronological train-test-split on the TIMESTAMP index:
# everything up to and including 2013-07-01 00:00 is used for training, the rest for testing
data_train = data.loc[:'2013-07-01 00:00:00']
data_test = data.loc['2013-07-01 01:00:00':]

In [4]:
# the feature groups we defined
feature_dict = get_feature_combinations()

# load the per-zone results of the Random Forest model selection; the rows are indexed by the
# zone label (ZONE1, ZONE2, ...) and hold the best hyperparameters (BEST_PARAMS) and the
# feature combination (FC) for that zone
model_params = pd.read_csv('../results/RandomForestRegressor.csv', index_col='ZONE')

# add column ZONEID: the numeric part of the zone label, so the id always belongs to its row
# regardless of how many rows the results file has or in which order they are stored
model_params['ZONEID'] = model_params.index.str.extract(r'(\d+)', expand=False).astype(int)

model_params

Unnamed: 0_level_0,BEST_PARAMS,CV,MODEL,FC,TESTSCORE,TRAINSCORE,ZONEID
ZONE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ZONE1,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri...",0.170892,RandomForestRegressor,no_deg_norm,0.177034,0.140094,1
ZONE2,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri...",0.132224,RandomForestRegressor,no_deg_norm,0.175856,0.109415,2
ZONE3,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri...",0.148439,RandomForestRegressor,no_comp,0.149962,0.124022,3
ZONE4,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri...",0.17002,RandomForestRegressor,no_comp_plus_100Norm,0.171536,0.128381,4
ZONE5,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri...",0.172481,RandomForestRegressor,all,0.171554,0.142998,5
ZONE6,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri...",0.175007,RandomForestRegressor,no_deg,0.181507,0.14599,6
ZONE7,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri...",0.125856,RandomForestRegressor,no_card_100Norm,0.141791,0.090274,7
ZONE8,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri...",0.152973,RandomForestRegressor,no_comp,0.192363,0.125311,8
ZONE9,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri...",0.162231,RandomForestRegressor,no_deg_norm,0.151015,0.132315,9
ZONE10,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri...",0.192226,RandomForestRegressor,no_comp,0.204616,0.16398,10


In [6]:
# make the predictions: one Random Forest per zone, trained on that zone's training data with
# the zone's best hyperparameters (BEST_PARAMS) and feature combination (FC)

# one row per test timestamp (taken from zone 1), one column per zone
df_pred = pd.DataFrame(index=data_test[data_test.ZONEID == 1].index)

for zone, fc, best_params in zip(model_params.ZONEID, model_params.FC, model_params.BEST_PARAMS):
    features = feature_dict[fc]

    data_train_zone = data_train[data_train.ZONEID == zone]
    data_test_zone = data_test[data_test.ZONEID == zone]

    # we scale the features, because the scales are very different, the target is normalised with
    # values from 0 to 1, while the other features have a lot larger scales.
    # The scaler is created per zone and fitted on the training data only.
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(data_train_zone[features])
    X_test = scaler.transform(data_test_zone[features])

    y_train = data_train_zone.TARGETVAR

    # BEST_PARAMS is stored as the string representation of a dict, so it is parsed with
    # ast.literal_eval. A fresh estimator is built for every zone so that no hyperparameter
    # set for a previous zone can carry over.
    model = RandomForestRegressor(**ast.literal_eval(best_params))
    model.fit(X_train, y_train)

    # predictions can't have values larger than 1 or smaller than 0, because the energy output
    # consists of normalized values in [0,1].
    y_pred = np.clip(model.predict(X_test), 0, 1)

    # the column is filled by position, so the zone's test timestamps have to match the
    # timestamps of df_pred exactly; otherwise values would end up in the wrong rows
    if not data_test_zone.index.equals(df_pred.index):
        raise ValueError(f'Test timestamps of zone {zone} do not match those of zone 1')

    df_pred[f'Zone {zone}'] = y_pred
    

In [7]:
# write the per-zone predictions to RandomForest_Predictions.csv
df_pred.to_csv(path_or_buf='../results/RandomForest_Predictions.csv')
