In [3]:
import pandas as pd
import numpy as np

from models import *

# Render every float pandas displays with exactly four decimal places.
pd.options.display.float_format = '{:.4f}'.format

# Loading Datasets

In [5]:
# Each dataset variant ships as a (training, validation) pair of CSV files;
# every pair is read in that order and unpacked into its train/test frames.

# Baseline pair.
train_df_baseline, test_df_baseline = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/trainingset.csv", "../data/validationset.csv")
)

# "transformed" pair (the feature columns of these frames show up with a
# `_log` suffix in the model outputs further down).
train_df_log, test_df_log = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/trainingsettransformed.csv",
        "../data/validationsettransformed.csv",
    )
)

# "with_log_target" pair (modelled below against the "cTOTEXn_log" column).
train_df_log_log, test_df_log_log = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/trainingset_with_log_target.csv",
        "../data/validationset_with_log_target.csv",
    )
)

# Lasso Regression

## Baseline

In [8]:
# Fit Lasso on the baseline training frame, then evaluate the fitted model
# against the baseline train/validation frames under the label "Lasso".
target_column = "cTOTEXn"
lasso_baseline, lasso_vip_baseline = lasso_regression(
    train_df_baseline,
    target_column,
)
# scaling=True is forwarded as-is to evaluation_metrics.
# NOTE(review): presumably this standardises the features — confirm in `models`.
lasso_results_baseline = evaluation_metrics(
    "Lasso",
    lasso_baseline,
    train_df_baseline,
    test_df_baseline,
    target_column,
    scaling=True,
)
# Metrics first, then the second object returned by lasso_regression.
print(lasso_results_baseline, lasso_vip_baseline, sep="\n")

Performing Lasso regression...
Selected features by Lasso (80): Index(['yConnections.streetlights.N7', 'yConnections.cus.N5',
       'yInjectionPoints.KWKG.N7', 'yInjectionPoints.other.N3',
       'yInjectionPoints.other.N4', 'yInjectionPoints.other.N6',
       'yInjectionPoints.other.N7', 'yMeters.unoccupied.N5',
       'yMeters.unoccupied.N7', 'yMeters.read.ext.N5', 'yMeters.read.ext.N6',
       'yMeters.cp.nonctrl.N5', 'yMeters.over10MWh.noRPM.N5',
       'yMeters.over10MWh.RPM.N7', 'yCables.circuit.N3', 'yCables.all.N13.sum',
       'yLines.excl.house.N7', 'yLines.circuit.tot', 'yLines.all.tot',
       'yNet.length.N3', 'yTransformers.power.N7',
       'yTransformers.power.own.N7', 'yTransformers.power.own.tot',
       'yInstalledPower.renewables.wind.N7',
       'yInstalledPower.renewables.biomass.N5',
       'yInstalledPower.renewables.biomass.N7',
       'yInstalledPower.renewables.hydro.N5', 'yInstalledPower.KWKG.N6',
       'yInstalledPower.KWKG.N7', 'yInstalledPower.other.N4'

## Log-Transformed

In [10]:
# Fit Lasso on the frame whose feature columns carry the `_log` suffix; the
# target column is still "cTOTEXn". Results are labelled "Lasso Log Features".
target_column = "cTOTEXn"
lasso_log, lasso_vip_log = lasso_regression(
    train_df_log,
    target_column,
)
# scaling=True is forwarded as-is to evaluation_metrics.
# NOTE(review): presumably this standardises the features — confirm in `models`.
lasso_results_log = evaluation_metrics(
    "Lasso Log Features",
    lasso_log,
    train_df_log,
    test_df_log,
    target_column,
    scaling=True,
)
# Metrics first, then the second object returned by lasso_regression.
print(lasso_results_log, lasso_vip_log, sep="\n")

Performing Lasso regression...
Selected features by Lasso (14): Index(['yInjectionPoints.renewables.N3_log',
       'yInjectionPoints.renewables.hydro.N6_log',
       'yInjectionPoints.other.N3_log', 'ySubstations.N5_log',
       'ySubstations.N7_log', 'ySubstations.own.N5_log',
       'ySubstations.own.N7_log', 'yTransformers.ront.N5_log',
       'yInjection.renewables.solar.N3_log', 'yIssues.num_log',
       'yArea.plaza.N7_log', 'zSoil.aBK26_log', 'zSoil.aAK67_log',
       'rInstalledPower.renewables.solar.per.point.N3_log'],
      dtype='object')

Variable Importance (Lasso):
                                              Feature   Coefficient
0                                  yArea.plaza.N7_log 49962856.4838
1                       yInjectionPoints.other.N3_log 25121796.0757
2                                     zSoil.aAK67_log 13451230.4269
3                  yInjectionPoints.renewables.N3_log  8989675.7587
4                  yInjection.renewables.solar.N3_log  8620474.3298
5    

## Log-Log-Transformed

In [22]:
# Fit Lasso on the log-target frame, modelling the "cTOTEXn_log" column.
# Results are labelled "Lasso Log Features and Outcome".
# NOTE(review): because the target here is "cTOTEXn_log", whatever
# evaluation_metrics reports is presumably on the log scale and not directly
# comparable with the "cTOTEXn" runs — confirm before tabulating them together.
target_column = "cTOTEXn_log"
lasso_log_log, lasso_vip_log_log = lasso_regression(
    train_df_log_log,
    target_column,
)
# scaling=True is forwarded as-is to evaluation_metrics.
lasso_results_log_log = evaluation_metrics(
    "Lasso Log Features and Outcome",
    lasso_log_log,
    train_df_log_log,
    test_df_log_log,
    target_column,
    scaling=True,
)
# Metrics first, then the second object returned by lasso_regression.
print(lasso_results_log_log, lasso_vip_log_log, sep="\n")

Performing Lasso regression...
Selected features by Lasso (40): Index(['yInjectionPoints.renewables.hydro.N5_log',
       'yInjectionPoints.other.N5_log', 'yInjectionPoints.other.N7_log',
       'yMeters.flatrate.N7_log', 'yMeters.cp.nonctrl.N5_log',
       'yMeters.over10MWh.RPM.N3_log', 'yMeters.others.N7_log',
       'yMeters.others.tot_log', 'yNet.length.all.tot_log',
       'yNet.length.excl.house.tot_log', 'ySubstations.N4_log',
       'ySwitchingstations.N3_log', 'yInstalledPower.renewables.hydro.N6_log',
       'yInstalledPower.KWKG.N4_log', 'yInstalledPower.KWKG.N7_log',
       'yInstalledPower.other.N7_log', 'yInstalledPower.other.tot_log',
       'yInstalledPower.dec.sum_log', 'yInstalledPower.KWKG.other.tot_log',
       'yInjection.renewables.solar.N6_log',
       'yInjection.renewables.hydro.N5_log',
       'yEnergy.recovered.from.lower.net.N3_log', 'yEnergy.other.N7_log',
       'yEnergy.losses.tot_log', 'yEnergy.delivered.N1357.sum_log',
       'yPeakload.2.N6_log', 'yPe

# Random Forest Regression

## Baseline

In [25]:
# Fit the random forest on the baseline training frame, then evaluate it on
# the baseline train/validation frames under the label "Random Forest".
# Unlike the Lasso cells, no `scaling` argument is passed to evaluation_metrics.
target_column = "cTOTEXn"
rf_baseline, rf_vip_baseline = random_forest_regression(
    train_df_baseline,
    target_column,
)
rf_results_baseline = evaluation_metrics(
    "Random Forest",
    rf_baseline,
    train_df_baseline,
    test_df_baseline,
    target_column,
)
# Metrics first, then the second object returned by random_forest_regression.
print(rf_results_baseline, rf_vip_baseline, sep="\n")

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best parameters found: {'rf__max_depth': 20, 'rf__min_samples_leaf': 2, 'rf__min_samples_split': 5, 'rf__n_estimators': 500}

Feature Importance (Random Forest):
                                 Feature  Importance
0                     yEnergy.losses.tot      0.1060
1                         yInjection.tot      0.1032
2                      yEnergy.losses.N6      0.0736
3            yEnergy.delivered.N1357.sum      0.0724
4                     yLines.all.N57.sum      0.0656
5   yInstalledPower.nonsimcurt.N1to4.sum      0.0601
6                     yInstalledPower.N3      0.0535
7                           yPeakload.N5      0.0495
8                           yPeakload.N3      0.0486
9                     yLines.circuit.tot      0.0432
10                          zSoil.GB0378      0.0421
11                        yPeakload.2.N5      0.0421
12                     yEnergy.losses.N3      0.0386
13                  yEnergy.deliver

## Log-Transformed

In [26]:
# Fit the random forest on the frame whose feature columns carry the `_log`
# suffix; the target column is still "cTOTEXn".
target_column = "cTOTEXn"
rf_log, rf_vip_log = random_forest_regression(
    train_df_log,
    target_column,
)
# The label used to be "Random Forest", identical to the baseline run, which
# made the two result sets indistinguishable once collected side by side.
# It now mirrors the naming of the corresponding Lasso run.
# No `scaling` argument is passed here, matching the baseline forest cell.
rf_results_log = evaluation_metrics(
    "Random Forest Log Features",
    rf_log,
    train_df_log,
    test_df_log,
    target_column,
)
# Metrics first, then the second object returned by random_forest_regression.
print(rf_results_log, rf_vip_log, sep="\n")

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best parameters found: {'rf__max_depth': 20, 'rf__min_samples_leaf': 2, 'rf__min_samples_split': 5, 'rf__n_estimators': 500}

Feature Importance (Random Forest):
                                     Feature  Importance
0                     yEnergy.losses.tot_log      0.1060
1                         yInjection.tot_log      0.1032
2                      yEnergy.losses.N6_log      0.0736
3            yEnergy.delivered.N1357.sum_log      0.0724
4                     yLines.all.N57.sum_log      0.0656
5   yInstalledPower.nonsimcurt.N1to4.sum_log      0.0601
6                     yInstalledPower.N3_log      0.0535
7                           yPeakload.N5_log      0.0495
8                           yPeakload.N3_log      0.0486
9                     yLines.circuit.tot_log      0.0432
10                          zSoil.GB0378_log      0.0421
11                        yPeakload.2.N5_log      0.0421
12                     yEnergy.losse

## Log-Log-Transformed

In [27]:
# Fit the random forest on the log-target frame, modelling "cTOTEXn_log".
# NOTE(review): because the target here is "cTOTEXn_log", whatever
# evaluation_metrics reports is presumably on the log scale and not directly
# comparable with the "cTOTEXn" runs — confirm before tabulating them together.
target_column = "cTOTEXn_log"
rf_log_log, rf_vip_log_log = random_forest_regression(
    train_df_log_log,
    target_column,
)
# The label used to be "Random Forest", identical to the other two forest
# runs, which made the result sets indistinguishable once collected side by
# side. It now mirrors the naming of the corresponding Lasso run.
# No `scaling` argument is passed here, matching the baseline forest cell.
rf_results_log_log = evaluation_metrics(
    "Random Forest Log Features and Outcome",
    rf_log_log,
    train_df_log_log,
    test_df_log_log,
    target_column,
)
# Metrics first, then the second object returned by random_forest_regression.
print(rf_results_log_log, rf_vip_log_log, sep="\n")

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best parameters found: {'rf__max_depth': 20, 'rf__min_samples_leaf': 2, 'rf__min_samples_split': 5, 'rf__n_estimators': 500}

Feature Importance (Random Forest):
                                          Feature  Importance
0                           yPeakload.corr.N6_log      0.0970
1                      yTransformers.power.N6_log      0.0939
2                      yPeakload.injection.N5_log      0.0912
3                          yEnergy.losses.tot_log      0.0807
4                              yInjection.tot_log      0.0566
5                        yEnergy.delivered.N7_log      0.0534
6                  yTransformers.power.own.N6_log      0.0530
7                               yInjection.N5_log      0.0517
8           yEnergy.delivered.to.customers.N7_log      0.0501
9                       yEnergy.delivered.tot_log      0.0484
10                               yPeakload.N5_log      0.0468
11                         yMeter

# Summary Table