In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from mlxtend.regressor import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
df_de=pd.read_csv('./processed_data/X_train_DE.csv')
df_de_y=df_de['TARGET']
df_de=df_de.drop(columns=['TARGET'])

df_fr=pd.read_csv('./processed_data/X_train_FR.csv')
df_fr_y=df_fr['TARGET']
df_fr=df_fr.drop(columns=['TARGET'])

# Splitting data in 80% training and 20% testing
X_train_de, X_test_de, y_train_de, y_test_de = train_test_split(df_de, df_de_y, test_size=0.2, random_state=69)
X_train_fr, X_test_fr, y_train_fr, y_test_fr = train_test_split(df_fr, df_fr_y, test_size=0.2, random_state=69)

In [3]:
df_de.columns

Index(['ID', 'DAY_ID', 'COUNTRY', 'DE_GAS', 'FR_GAS', 'DE_COAL', 'FR_COAL',
       'DE_HYDRO', 'FR_HYDRO', 'DE_NUCLEAR', 'FR_NUCLEAR', 'DE_SOLAR',
       'FR_SOLAR', 'DE_WINDPOW', 'FR_WINDPOW', 'DE_LIGNITE',
       'DE_RESIDUAL_LOAD', 'FR_RESIDUAL_LOAD', 'DE_RAIN', 'FR_RAIN', 'DE_WIND',
       'FR_WIND', 'DE_TEMP', 'FR_TEMP', 'GAS_RET', 'COAL_RET', 'CARBON_RET',
       'ID.1', 'DE_COAL_COST', 'DE_GAS_COST', 'FR_COAL_COST', 'FR_GAS_COST',
       'DE_LIGNITE_COST', 'DE_COAL_CARBON', 'DE_GAS_CARBON', 'FR_COAL_CARBON',
       'FR_GAS_CARBON', 'DE_LIGNITE_CARBON', 'EXCHANGE', 'SELF_EXPORT',
       'OTHER_EXPORT', 'SELF_CONSUMPTION', 'OTHER_CONSUMPTION'],
      dtype='object')

In [4]:
df_de.head()

Unnamed: 0,ID,DAY_ID,COUNTRY,DE_GAS,FR_GAS,DE_COAL,FR_COAL,DE_HYDRO,FR_HYDRO,DE_NUCLEAR,...,DE_COAL_CARBON,DE_GAS_CARBON,FR_COAL_CARBON,FR_GAS_CARBON,DE_LIGNITE_CARBON,EXCHANGE,SELF_EXPORT,OTHER_EXPORT,SELF_CONSUMPTION,OTHER_CONSUMPTION
0,111,2,DE,1.228079,0.458302,-0.247704,-0.766904,1.785758,-0.930172,0.064726,...,-0.189585,0.774063,-0.586965,0.28887,-0.445656,-1.102015,-1.080403,0.256736,-0.068972,-0.66739
1,800,3,DE,1.588542,0.069297,-0.635452,-0.718729,1.994144,-0.38369,-2.002323,...,0.017693,-0.036424,0.020011,-0.001589,0.005824,-1.051716,-1.881881,-0.612133,-0.13467,-0.834564
2,831,5,DE,1.059828,0.528273,-0.072071,-0.766063,1.275857,-0.398178,-1.875681,...,-0.047898,0.580054,-0.509118,0.289129,-0.229615,0.144615,-1.208286,-1.811403,-0.29785,-0.470371
3,779,7,DE,0.386191,0.727314,0.25538,-0.778036,0.281094,-0.739291,-1.887303,...,-0.00752,-0.009365,0.02291,-0.017637,-0.003958,0.002239,-0.676226,-0.745182,0.057599,-0.625625
4,841,8,DE,0.9554,0.278566,-0.234473,-0.755877,0.385716,-0.580103,-1.889094,...,-0.464359,1.558207,-1.496966,0.454327,-0.599628,1.309253,0.248085,-2.22684,-0.282744,-0.700027


In [5]:
def grid_search_random_forest(X_train,y_train,X_test,y_test):
    # get categorical columns and encode them
    for col in X_train.columns:
        if X_train[col].dtype == 'object':
            le = LabelEncoder()
            X_train[col] = le.fit_transform(X_train[col])
            X_test[col] = le.transform(X_test[col])
    # parameters for grid search
    param_grid = {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'bootstrap': [True, False],
        'n_estimators': [100, 200, 300, 400],
        'min_samples_leaf': [1, 2, 4]
    }
    # defining random forest regressor and grid search
    rf = RandomForestRegressor()
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=0)
    grid_search.fit(X_train, y_train)
    # get best parameters
    params = grid_search.best_params_
    print(params)
    
    best_rf = RandomForestRegressor(**params)
    best_rf.fit(X_train, y_train)
    y_pred = best_rf.predict(X_test)

    print(f'Mean Squared Error:  {mse(y_test, y_pred)}')
    print(f'Mean Absolute Error: {mae(y_test, y_pred)}')

In [6]:
grid_search_random_forest(X_train_de,y_train_de,X_test_de,y_test_de)

{'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
Mean Squared Error:  1.1737688858387374
Mean Absolute Error:  0.6934183896891786


In [7]:
grid_search_random_forest(X_train_fr,y_train_fr,X_test_fr,y_test_fr)

{'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 200}
Mean Squared Error:  0.9022901351759498
Mean Absolute Error:  0.5208156139955012


In [8]:
def stacked_model(X_train,y_train,X_test,y_test):
    # stacked model comprising of linear regression and decision tree regressor
    for col in X_train.columns:
        if X_train[col].dtype == 'object':
            le = LabelEncoder()
            X_train[col] = le.fit_transform(X_train[col])
            X_test[col] = le.transform(X_test[col])
    
    dt = DecisionTreeRegressor()                
    lr = LinearRegression()
    
    meta_model = LinearRegression()
    
    # intercept and max depth are the parameters to be tuned
    param_grid_lr = {'fit_intercept': [True, False]}
    param_grid_dt = {'max_depth': [None, 2, 5, 10]}

    grid_dt = GridSearchCV(dt, param_grid_dt, cv=5)
    grid_lr = GridSearchCV(lr, param_grid_lr, cv=5)
    
    grid_dt.fit(X_train, y_train)
    grid_lr.fit(X_train, y_train)

    base_models = [grid_lr.best_estimator_, grid_dt.best_estimator_]

    stacking_model = StackingRegressor(regressors=base_models, meta_regressor=meta_model)
    
    stacking_model.fit(X_train.values, y_train.values)
    y_pred = stacking_model.predict(X_test)

    print(f'Mean Squared Error:  {mse(y_test, y_pred)}')
    print(f'Mean Absolute Error: {mae(y_test, y_pred)}')

In [9]:
stacked_model(X_train_de,y_train_de,X_test_de,y_test_de)

Mean Squared Error:  1.5626572485129753
Mean Absolute Error:  0.7773450021828013


In [10]:
stacked_model(X_train_fr,y_train_fr,X_test_fr,y_test_fr)

Mean Squared Error:  0.9705016080717486
Mean Absolute Error:  0.5820683329944006
