# Chapter 4.1. - Supervised Machine Learning

In this JupyterNotebook, the models described in Chapter 4.1.2. are trained and evaluated, differentiated by calculation approach, classification task, machine learning technique and parameter subset.

## Parametrization-First-Aggregation-Second / Community Profit (PFAS / Profit)

#### Logistic Regression (PFAS / Profit)

##### All Features (PFAS / Profit)

In [1]:
# Import Python modules
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from matplotlib import pyplot as plt
from datetime import datetime
import time

data = pd.read_pickle('Data/Parameter_Values_PFAS_Profit_2020-09-25.pkl')

X = data.iloc[:,:-1].copy()
y = data['CommunityProfit_Class']

# Train-Test-Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

start_time = time.time()

lr_params = {
    'penalty' : ['l1', 'l2', 'elasticnet'],
    'alpha' : [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 20, 100],
    'learning_rate' : ['constant', 'optimal', 'invscaling', 'adaptive'],
    'eta0' : [10e-5, 10e-4, 10e-3, 10e-2, 1],
    'tol' : [0.00001, 0.0001, 0.001],
    'l1_ratio' : [0.15, 0.3, 0.5]
}

lr_classifier = SGDClassifier(loss='log',
                              random_state=42,
                              max_iter=10e+6)

lr_gridsearch = GridSearchCV(estimator=lr_classifier,
                             param_grid=lr_params,
                             scoring='accuracy',
                             cv=3,
                             n_jobs=3)

lr_gridsearch.fit(X_train, y_train)

lr_result = pd.DataFrame(lr_gridsearch.cv_results_)
lr_result.to_pickle('Data/GridSearchResults_PFAS_Profit_AllFeatures_LR.pkl')

print('Calculation Runtime: {:.2} Minutes'.format((time.time() - start_time)/60))

print(classification_report(y_true=y_test,
                            y_pred=lr_gridsearch.predict(X_test)))

Calculation Runtime: 7.7e+01 Minutes
              precision    recall  f1-score   support

        High       0.39      0.48      0.43       100
         Low       0.37      0.15      0.21       100
      Medium       0.38      0.52      0.44       100

    accuracy                           0.38       300
   macro avg       0.38      0.38      0.36       300
weighted avg       0.38      0.38      0.36       300



In [41]:
result = pd.read_pickle('Data/GridSearchResults_PFAS_Profit_AllFeatures_LR.pkl')
best = result[result['rank_test_score']==1]['params'].to_list()
for x in best:
    print(x)

{'alpha': 1, 'eta0': 0.01, 'l1_ratio': 0.5, 'learning_rate': 'adaptive', 'penalty': 'elasticnet', 'tol': 1e-05}


##### Selected Features (PFAS / Profit)

In [7]:
# Import Python modules
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from matplotlib import pyplot as plt
from datetime import datetime
import time

selected_features = ['median_daily_maximum_demand_winter_weekend', 'night_impact_spring/autumn_weekend', 'fft_peak_summer_workday', 'maximum_tou_winter_weekend', 'maximum_tou_winter_workday', 'maximum_tou_spring/autumn_weekend', 'variance_summer_workday', 'variance_summer_weekend', 'variance_winter_workday', 'variance_winter_weekend', 'end_of_work_impact_spring/autumn_weekend', 'minimum_tou_summer_weekend', 'morning_slope_winter_weekend', 'night_impact_winter_weekend', 'minimum_tou_winter_weekend', 'fft_peak_winter_workday', 'daily_load_factor_winter_weekend', 'fft_peak_winter_weekend', 'median_daily_maximum_demand_winter_workday', 'night_slope_spring/autumn_weekend'] 

data = pd.read_pickle('Data/Parameter_Values_PFAS_Profit_2020-09-25.pkl')

X = data[selected_features]
y = data['CommunityProfit_Class']

# for col in X.columns:
#     X[col] = pd.to_numeric(X[col])

# Train-Test-Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42, stratify=y)

start_time = time.time()

lr_params = {
    'penalty' : ['l1', 'l2', 'elasticnet'],
    'alpha' : [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 20, 100],
    'learning_rate' : ['constant', 'optimal', 'invscaling', 'adaptive'],
    'eta0' : [10e-5, 10e-4, 10e-3, 10e-2, 1],
    'tol' : [0.00001, 0.0001, 0.001],
    'l1_ratio' : [0.15, 0.3, 0.5]
}

lr_classifier = SGDClassifier(loss='log',
                              random_state=42,
                              max_iter=10e+6)

lr_gridsearch = GridSearchCV(estimator=lr_classifier,
                             param_grid=lr_params,
                             scoring='accuracy',
                             cv=3,
                             n_jobs=3)

lr_gridsearch.fit(X_train, y_train)

lr_result = pd.DataFrame(lr_gridsearch.cv_results_)
lr_result.to_pickle('Data/GridSearchResults_PFAS_Profit_SelectedFeatures_LR.pkl')

print('Calculation Runtime: {:.2} Minutes'.format((time.time() - start_time)/60))

print(classification_report(y_true=y_test,
                            y_pred=lr_gridsearch.predict(X_test)))

Calculation Runtime: 4.0 Minutes
              precision    recall  f1-score   support

        High       0.36      0.38      0.37       167
         Low       0.39      0.35      0.37       167
      Medium       0.31      0.33      0.32       166

    accuracy                           0.35       500
   macro avg       0.35      0.35      0.35       500
weighted avg       0.35      0.35      0.35       500



In [42]:
result = pd.read_pickle('Data/GridSearchResults_PFAS_Profit_SelectedFeatures_LR.pkl')
best = result[result['rank_test_score']==1]['params'].to_list()
for x in best:
    print(x)

{'alpha': 10, 'eta0': 0.0001, 'l1_ratio': 0.15, 'learning_rate': 'adaptive', 'penalty': 'l2', 'tol': 0.001}
{'alpha': 10, 'eta0': 0.0001, 'l1_ratio': 0.3, 'learning_rate': 'adaptive', 'penalty': 'l2', 'tol': 0.001}
{'alpha': 10, 'eta0': 0.0001, 'l1_ratio': 0.5, 'learning_rate': 'adaptive', 'penalty': 'l2', 'tol': 0.001}


#### Extreme Gradient Boosting (PFAS / Profit)

##### All Features (PFAS / Profit)

In [3]:
# Import Python modules
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from datetime import datetime
import time

data = pd.read_pickle('Data/Parameter_Values_PFAS_Profit_2020-09-25.pkl')

X = data.iloc[:,:-1]
y = data['CommunityProfit_Class']

for col in X.columns:
    X[col] = pd.to_numeric(X[col])

# Train-Test-Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

start_time = time.time()

xgb_params = {
    'reg_alpha' : [0, 0.001, 0.1, 1, 10, 100],
    'reg_lambda' : [0, 0.001, 0.1, 1, 10, 100],
    'learning_rate' : [0, 0.01, 0.1, 0.3, 0.5],
    'max_depth ' : [3, 5, 7, 9],
    'n_estimators' : [10,11,12,13,14,15,20,50,100,200,500,1000,2000]
}

xgb_classifier = xgb.XGBClassifier(objective='multi:softmax',
                                   eval_metric='merror',
                                   random_state=42)

xgb_gridsearch = GridSearchCV(estimator=xgb_classifier,
                              param_grid=xgb_params,
                              scoring='accuracy',
                              cv=3,
                              n_jobs=3,
                              verbose=6)

xgb_gridsearch.fit(X_train, y_train)

xgb_result = pd.DataFrame(xgb_gridsearch.cv_results_)
xgb_result.to_pickle('Data/GridSearchResults_PFAS_Profit_AllFeatures_XGB.pkl')

print('Calculation Runtime: {:.2} Minutes'.format((time.time() - start_time)/60))

print(classification_report(y_true=y_test,
                            y_pred=xgb_gridsearch.predict(X_test)))

Fitting 3 folds for each of 9360 candidates, totalling 28080 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:    0.4s
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    2.6s
[Parallel(n_jobs=3)]: Done 109 tasks      | elapsed:    5.8s
[Parallel(n_jobs=3)]: Done 283 tasks      | elapsed:   16.8s
[Parallel(n_jobs=3)]: Done 509 tasks      | elapsed:   32.3s
[Parallel(n_jobs=3)]: Done 780 tasks      | elapsed:   59.7s
[Parallel(n_jobs=3)]: Done 943 tasks      | elapsed:  2.0min
[Parallel(n_jobs=3)]: Done 1130 tasks      | elapsed:  6.0min
[Parallel(n_jobs=3)]: Done 1343 tasks      | elapsed: 25.7min
[Parallel(n_jobs=3)]: Done 1651 tasks      | elapsed: 34.9min
[Parallel(n_jobs=3)]: Done 2177 tasks      | elapsed: 35.7min
[Parallel(n_jobs=3)]: Done 2466 tasks      | elapsed: 38.6min
[Parallel(n_jobs=3)]: Done 2779 tasks      | elapsed: 66.1min
[Parallel(n_jobs=3)]: Done 3319 tasks      | elapsed: 69.7min
[Parallel(n_jobs=3)]: Done 3818 tasks      | elapsed: 72.2min
[P

Calculation Runtime: 5e+02 Minutes
              precision    recall  f1-score   support

        High       0.38      0.27      0.32       100
         Low       0.29      0.32      0.30       100
      Medium       0.36      0.42      0.39       100

    accuracy                           0.34       300
   macro avg       0.34      0.34      0.33       300
weighted avg       0.34      0.34      0.33       300



In [43]:
result = pd.read_pickle('Data/GridSearchResults_PFAS_Profit_AllFeatures_XGB.pkl')
best = result[result['rank_test_score']==1]['params'].to_list()
for x in best:
    print(x)

{'learning_rate': 0.01, 'max_depth ': 3, 'n_estimators': 100, 'reg_alpha': 0.1, 'reg_lambda': 100}
{'learning_rate': 0.01, 'max_depth ': 5, 'n_estimators': 100, 'reg_alpha': 0.1, 'reg_lambda': 100}
{'learning_rate': 0.01, 'max_depth ': 7, 'n_estimators': 100, 'reg_alpha': 0.1, 'reg_lambda': 100}
{'learning_rate': 0.01, 'max_depth ': 9, 'n_estimators': 100, 'reg_alpha': 0.1, 'reg_lambda': 100}


##### Selected Features (PFAS / Profit)

In [9]:
# Import Python modules
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from matplotlib import pyplot as plt
from datetime import datetime
import time

# Import data (and filter) data
selected_features = ['median_daily_maximum_demand_winter_weekend', 'night_impact_spring/autumn_weekend', 'fft_peak_summer_workday', 'maximum_tou_winter_weekend', 'maximum_tou_winter_workday', 'maximum_tou_spring/autumn_weekend', 'variance_summer_workday', 'variance_summer_weekend', 'variance_winter_workday', 'variance_winter_weekend', 'end_of_work_impact_spring/autumn_weekend', 'minimum_tou_summer_weekend', 'morning_slope_winter_weekend', 'night_impact_winter_weekend', 'minimum_tou_winter_weekend', 'fft_peak_winter_workday', 'daily_load_factor_winter_weekend', 'fft_peak_winter_weekend', 'median_daily_maximum_demand_winter_workday', 'night_slope_spring/autumn_weekend'] 

data = pd.read_pickle('Data/Parameter_Values_PFAS_Profit_2020-09-25.pkl')

X = data[selected_features]
y = data['CommunityProfit_Class']

for col in X.columns:
    X[col] = pd.to_numeric(X[col])

# Train-Test-Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

start_time = time.time()

xgb_params = {
    'reg_alpha' : [0, 0.001, 0.1, 1, 10, 100],
    'reg_lambda' : [0, 0.001, 0.1, 1, 10, 100],
    'learning_rate' : [0, 0.01, 0.1, 0.3, 0.5],
    'max_depth ' : [3, 5, 7, 9],
    'n_estimators' : [10,11,12,13,14,15,20,50,100,200,500,1000,2000]
}

xgb_classifier = xgb.XGBClassifier(objective='multi:softmax',
                                   eval_metric='merror',
                                   random_state=42)

xgb_gridsearch = GridSearchCV(estimator=xgb_classifier,
                              param_grid=xgb_params,
                              scoring='accuracy',
                              cv=3,
                              n_jobs=3,
                              verbose=6)

xgb_gridsearch.fit(X_train, y_train)

xgb_result = pd.DataFrame(xgb_gridsearch.cv_results_)
xgb_result.to_pickle('Data/GridSearchResults_PFAS_Profit_SelectedFeatures_XGB.pkl')

print('Calculation Runtime: {:.2} Minutes'.format((time.time() - start_time)/60))

print(classification_report(y_true=y_test,
                            y_pred=xgb_gridsearch.predict(X_test)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Fitting 3 folds for each of 9360 candidates, totalling 28080 fits


[Parallel(n_jobs=3)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 146 tasks      | elapsed:    2.3s
[Parallel(n_jobs=3)]: Done 398 tasks      | elapsed:    7.2s
[Parallel(n_jobs=3)]: Done 746 tasks      | elapsed:   14.8s
[Parallel(n_jobs=3)]: Done 979 tasks      | elapsed:   36.6s
[Parallel(n_jobs=3)]: Done 1116 tasks      | elapsed:  1.4min
[Parallel(n_jobs=3)]: Done 1279 tasks      | elapsed:  4.0min
[Parallel(n_jobs=3)]: Done 1550 tasks      | elapsed:  8.4min
[Parallel(n_jobs=3)]: Done 2333 tasks      | elapsed:  8.8min
[Parallel(n_jobs=3)]: Done 2570 tasks      | elapsed: 10.3min
[Parallel(n_jobs=3)]: Done 2833 tasks      | elapsed: 16.7min
[Parallel(n_jobs=3)]: Done 3774 tasks      | elapsed: 17.2min
[Parallel(n_jobs=3)]: Done 4087 tasks      | elapsed: 20.6min
[Parallel(n_jobs=3)]: Done 4949 tasks      | elapsed: 25.2min
[Parallel(n_jobs=3)]: Done 5441 tasks      | elapsed: 27.9min
[Parallel(n_jobs=3)]: Done 6362 tasks      | elapsed: 33.5min
[Parallel(n_j

Calculation Runtime: 1.5e+02 Minutes
              precision    recall  f1-score   support

        High       0.34      0.26      0.29       100
         Low       0.40      0.40      0.40       100
      Medium       0.35      0.43      0.38       100

    accuracy                           0.36       300
   macro avg       0.36      0.36      0.36       300
weighted avg       0.36      0.36      0.36       300



In [44]:
result = pd.read_pickle('Data/GridSearchResults_PFAS_Profit_SelectedFeatures_XGB.pkl')
best = result[result['rank_test_score']==1]['params'].to_list()
for x in best:
    print(x)

{'learning_rate': 0.1, 'max_depth ': 3, 'n_estimators': 13, 'reg_alpha': 0.001, 'reg_lambda': 1}
{'learning_rate': 0.1, 'max_depth ': 5, 'n_estimators': 13, 'reg_alpha': 0.001, 'reg_lambda': 1}
{'learning_rate': 0.1, 'max_depth ': 7, 'n_estimators': 13, 'reg_alpha': 0.001, 'reg_lambda': 1}
{'learning_rate': 0.1, 'max_depth ': 9, 'n_estimators': 13, 'reg_alpha': 0.001, 'reg_lambda': 1}


#### Multi-Layer Perceptron (PFAS / Profit)

##### All Features (PFAS / Profit)

In [5]:
# Import Python modules
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from datetime import datetime
import time

data = pd.read_pickle('Data/Parameter_Values_PFAS_Profit_2020-09-25.pkl')

X = StandardScaler().fit_transform(data.iloc[:,:-1])
y = data['CommunityProfit_Class']

# Train-Test-Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

start_time = time.time()

hls = []
for i in [10, 30, 50, 70, 100]:
    for j in [1, 2, 3, 4 , 5, 10, 30, 50, 70, 100]:
        hls.append((i,)*j)

mlp_params = {
    'hidden_layer_sizes' : hls,
    'solver' : ['adam', 'lbfgs'],
    'alpha' : [0.0001, 0.001, 0.01, 0.1, 1],
    'beta_1' : [0.8, 0.9],
    'tol' : [0.000001, 0.00001, 0.0001]
}

mlp_classifier = MLPClassifier(random_state=42,
                               max_iter=10e+8,
                               activation='relu')

mlp_gridsearch = GridSearchCV(estimator=mlp_classifier,
                              param_grid=mlp_params,
                              scoring='accuracy',
                              cv=3,
                              n_jobs=3,
                              verbose=6)

start_time = time.time()

mlp_gridsearch.fit(X_train, y_train)

print('Calculation Runtime: {:.2} Minutes'.format((time.time() - start_time)/60))

mlp_result = pd.DataFrame(mlp_gridsearch.cv_results_)
mlp_result.to_pickle('Data/GridSearchResults_PFAS_Profit_AllFeatures_MLP.pkl')

print(classification_report(y_true=y_test,
                            y_pred=mlp_gridsearch.predict(X_test)))

Fitting 3 folds for each of 3000 candidates, totalling 9000 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done  76 tasks      | elapsed:    6.6s
[Parallel(n_jobs=3)]: Done 169 tasks      | elapsed:   18.3s
[Parallel(n_jobs=3)]: Done 332 tasks      | elapsed:   31.1s
[Parallel(n_jobs=3)]: Done 493 tasks      | elapsed:   55.2s
[Parallel(n_jobs=3)]: Done 654 tasks      | elapsed:  1.8min
[Parallel(n_jobs=3)]: Done 853 tasks      | elapsed:  3.6min
[Parallel(n_jobs=3)]: Done 1060 tasks      | elapsed:  4.9min
[Parallel(n_jobs=3)]: Done 1397 tasks      | elapsed:  5.5min
[Parallel(n_jobs=3)]: Done 1660 tasks      | elapsed:  6.8min
[Parallel(n_jobs=3)]: Done 1975 tasks      | elapsed:  9.6min
[Parallel(n_jobs=3)]: Done 2331 tasks      | elapsed: 11.7min
[Parallel(n_jobs=3)]: Done 2644 tasks      | elapsed: 19.3min
[Parallel(n_jobs=3)]: Done 3044 tasks      | elapsed: 23.9min
[Parallel(n_jobs=3)]: Done 3455 tasks      | elapsed: 28.1min
[P

Calculation Runtime: 3.4e+02 Minutes
              precision    recall  f1-score   support

        High       0.38      0.35      0.37       100
         Low       0.35      0.38      0.36       100
      Medium       0.37      0.37      0.37       100

    accuracy                           0.37       300
   macro avg       0.37      0.37      0.37       300
weighted avg       0.37      0.37      0.37       300



In [20]:
result = pd.read_pickle('Data/GridSearchResults_PFAS_Profit_AllFeatures_MLP.pkl')
best = result[result['rank_test_score']==1]['params'].to_list()
for x in best:
    print(x)

{'alpha': 0.0001, 'beta_1': 0.8, 'hidden_layer_sizes': (30, 30, 30), 'solver': 'lbfgs', 'tol': 1e-06}
{'alpha': 0.0001, 'beta_1': 0.9, 'hidden_layer_sizes': (30, 30, 30), 'solver': 'lbfgs', 'tol': 1e-06}


##### Selected Features (PFAS / Profit)

In [11]:
# Import Python modules
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from datetime import datetime
import time

data = pd.read_pickle('Data/Parameter_Values_PFAS_Profit_2020-09-25.pkl')

selected_features = ['median_daily_maximum_demand_winter_weekend', 'night_impact_spring/autumn_weekend', 'fft_peak_summer_workday', 'maximum_tou_winter_weekend', 'maximum_tou_winter_workday', 'maximum_tou_spring/autumn_weekend', 'variance_summer_workday', 'variance_summer_weekend', 'variance_winter_workday', 'variance_winter_weekend', 'end_of_work_impact_spring/autumn_weekend', 'minimum_tou_summer_weekend', 'morning_slope_winter_weekend', 'night_impact_winter_weekend', 'minimum_tou_winter_weekend', 'fft_peak_winter_workday', 'daily_load_factor_winter_weekend', 'fft_peak_winter_weekend', 'median_daily_maximum_demand_winter_workday', 'night_slope_spring/autumn_weekend'] 

X = StandardScaler().fit_transform(data[selected_features])
y = data['CommunityProfit_Class']

# Train-Test-Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

start_time = time.time()

hls = []
for i in [10, 30, 50, 70, 100]:
    for j in [1, 2, 3, 4 , 5, 10, 30, 50, 70, 100]:
        hls.append((i,)*j)

mlp_params = {
    'hidden_layer_sizes' : hls,
    'solver' : ['adam', 'lbfgs'],
    'alpha' : [0.0001, 0.001, 0.01, 0.1, 1],
    'beta_1' : [0.8, 0.9],
    'tol' : [0.000001, 0.00001, 0.0001]
}

mlp_classifier = MLPClassifier(random_state=42,
                               max_iter=10e+8,
                               activation='relu')

mlp_gridsearch = GridSearchCV(estimator=mlp_classifier,
                              param_grid=mlp_params,
                              scoring='accuracy',
                              cv=3,
                              n_jobs=3,
                              verbose=6)

start_time = time.time()

mlp_gridsearch.fit(X_train, y_train)

print('Calculation Runtime: {:.2} Minutes'.format((time.time() - start_time)/60))

mlp_result = pd.DataFrame(mlp_gridsearch.cv_results_)
mlp_result.to_pickle('Data/GridSearchResults_PFAS_Profit_SelectedFeatures_MLP.pkl')

print(classification_report(y_true=y_test,
                            y_pred=mlp_gridsearch.predict(X_test)))

Fitting 3 folds for each of 3000 candidates, totalling 9000 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done  68 tasks      | elapsed:   19.2s
[Parallel(n_jobs=3)]: Done 131 tasks      | elapsed:   41.2s
[Parallel(n_jobs=3)]: Done 294 tasks      | elapsed:  1.1min
[Parallel(n_jobs=3)]: Done 440 tasks      | elapsed:  1.3min
[Parallel(n_jobs=3)]: Done 579 tasks      | elapsed:  2.4min
[Parallel(n_jobs=3)]: Done 742 tasks      | elapsed:  5.0min
[Parallel(n_jobs=3)]: Done 929 tasks      | elapsed:  7.3min
[Parallel(n_jobs=3)]: Done 1214 tasks      | elapsed:  8.3min
[Parallel(n_jobs=3)]: Done 1508 tasks      | elapsed: 10.0min
[Parallel(n_jobs=3)]: Done 1771 tasks      | elapsed: 13.9min
[Parallel(n_jobs=3)]: Done 2085 tasks      | elapsed: 16.2min
[Parallel(n_jobs=3)]: Done 2398 tasks      | elapsed: 21.0min
[Parallel(n_jobs=3)]: Done 2735 tasks      | elapsed: 37.1min
[Parallel(n_jobs=3)]: Done 3149 tasks      | elapsed: 39.6min
[Pa

Calculation Runtime: 4.1e+02 Minutes
              precision    recall  f1-score   support

        High       0.36      0.32      0.34       100
         Low       0.36      0.36      0.36       100
      Medium       0.35      0.38      0.36       100

    accuracy                           0.35       300
   macro avg       0.35      0.35      0.35       300
weighted avg       0.35      0.35      0.35       300



In [21]:
result = pd.read_pickle('Data/GridSearchResults_PFAS_Profit_SelectedFeatures_MLP.pkl')
best = result[result['rank_test_score']==1]['params'].to_list()
for x in best:
    print(x)

{'alpha': 0.1, 'beta_1': 0.8, 'hidden_layer_sizes': (50, 50, 50, 50, 50), 'solver': 'lbfgs', 'tol': 0.0001}
{'alpha': 0.1, 'beta_1': 0.9, 'hidden_layer_sizes': (50, 50, 50, 50, 50), 'solver': 'lbfgs', 'tol': 0.0001}


## Parametrization-First-Aggregation-Second / Community Gain (PFAS / Gain)

#### Logistic Regression (PFAS / Gain)

##### All Features (PFAS / Gain)

In [1]:
# Import Python modules
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from matplotlib import pyplot as plt
from datetime import datetime
import time

data = pd.read_pickle('Data/Parameter_Values_PFAS_Gain_2020-09-25.pkl')

X = data.iloc[:,:-1].copy()
y = data['CommunityGain_Class']

# Train-Test-Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

start_time = time.time()

lr_params = {
    'penalty' : ['l1', 'l2', 'elasticnet'],
    'alpha' : [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 20, 100],
    'learning_rate' : ['constant', 'optimal', 'invscaling', 'adaptive'],
    'eta0' : [10e-5, 10e-4, 10e-3, 10e-2, 1],
    'tol' : [0.00001, 0.0001, 0.001],
    'l1_ratio' : [0.15, 0.3, 0.5]
}

lr_classifier = SGDClassifier(loss='log',
                              random_state=42,
                              max_iter=10e+6)

lr_gridsearch = GridSearchCV(estimator=lr_classifier,
                             param_grid=lr_params,
                             scoring='accuracy',
                             cv=3,
                             n_jobs=3)

lr_gridsearch.fit(X_train, y_train)

lr_result = pd.DataFrame(lr_gridsearch.cv_results_)
lr_result.to_pickle('Data/GridSearchResults_PFAS_Gain_AllFeatures_LR.pkl')

print('Calculation Runtime: {:.2} Minutes'.format((time.time() - start_time)/60))

print(classification_report(y_true=y_test,
                            y_pred=lr_gridsearch.predict(X_test)))

Calculation Runtime: 3.5e+02 Minutes
              precision    recall  f1-score   support

        High       0.33      0.30      0.31       100
         Low       0.35      0.33      0.34       100
      Medium       0.38      0.44      0.41       100

    accuracy                           0.36       300
   macro avg       0.35      0.36      0.35       300
weighted avg       0.35      0.36      0.35       300



In [22]:
result = pd.read_pickle('Data/GridSearchResults_PFAS_Gain_AllFeatures_LR.pkl')
best = result[result['rank_test_score']==1]['params'].to_list()
for x in best:
    print(x)

{'alpha': 0.01, 'eta0': 0.01, 'l1_ratio': 0.15, 'learning_rate': 'adaptive', 'penalty': 'l1', 'tol': 0.001}
{'alpha': 0.01, 'eta0': 0.01, 'l1_ratio': 0.3, 'learning_rate': 'adaptive', 'penalty': 'l1', 'tol': 0.001}
{'alpha': 0.01, 'eta0': 0.01, 'l1_ratio': 0.5, 'learning_rate': 'adaptive', 'penalty': 'l1', 'tol': 0.001}


##### Selected Features (PFAS / Gain)

In [3]:
# Import Python modules
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from matplotlib import pyplot as plt
from datetime import datetime
import time

selected_features = ['maximum_tou_summer_workday', 'end_of_work_impact_summer_weekend', 'lunch_impact_spring/autumn_weekend', 'minimum_tou_spring/autumn_workday', 'kurtosis_summer_workday', 'kurtosis_summer_weekend', 'fft_peak_summer_weekend', 'morning_slope_summer_workday', 'end_of_work_impact_summer_workday', 'median_daily_maximum_demand_spring/autumn_workday', 'night_impact_winter_weekend', 'morning_slope_spring/autumn_workday', 'fft_peak_summer_workday', 'minimum_tou_spring/autumn_weekend', 'minimum_tou_winter_weekend', 'minimum_tou_winter_workday', 'median_daily_minimum_demand_summer_workday', 'variance_spring/autumn_workday', 'night_slope_winter_weekend', 'fft_peak_winter_workday'] 

data = pd.read_pickle('Data/Parameter_Values_PFAS_Gain_2020-09-25.pkl')

X = data[selected_features]
y = data['CommunityGain_Class']

# for col in X.columns:
#     X[col] = pd.to_numeric(X[col])

# Train-Test-Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42, stratify=y)

start_time = time.time()

lr_params = {
    'penalty' : ['l1', 'l2', 'elasticnet'],
    'alpha' : [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 20, 100],
    'learning_rate' : ['constant', 'optimal', 'invscaling', 'adaptive'],
    'eta0' : [10e-5, 10e-4, 10e-3, 10e-2, 1],
    'tol' : [0.00001, 0.0001, 0.001],
    'l1_ratio' : [0.15, 0.3, 0.5]
}

lr_classifier = SGDClassifier(loss='log',
                              random_state=42,
                              max_iter=10e+6)

lr_gridsearch = GridSearchCV(estimator=lr_classifier,
                             param_grid=lr_params,
                             scoring='accuracy',
                             cv=3,
                             n_jobs=3)

lr_gridsearch.fit(X_train, y_train)

lr_result = pd.DataFrame(lr_gridsearch.cv_results_)
lr_result.to_pickle('Data/GridSearchResults_PFAS_Gain_SelectedFeatures_LR.pkl')

print('Calculation Runtime: {:.2} Minutes'.format((time.time() - start_time)/60))

print(classification_report(y_true=y_test,
                            y_pred=lr_gridsearch.predict(X_test)))

Calculation Runtime: 5.4 Minutes
              precision    recall  f1-score   support

        High       0.33      0.41      0.36       167
         Low       0.39      0.29      0.33       167
      Medium       0.38      0.39      0.38       166

    accuracy                           0.36       500
   macro avg       0.37      0.36      0.36       500
weighted avg       0.37      0.36      0.36       500



In [23]:
result = pd.read_pickle('Data/GridSearchResults_PFAS_Gain_SelectedFeatures_LR.pkl')
best = result[result['rank_test_score']==1]['params'].to_list()
for x in best:
    print(x)

{'alpha': 0.0001, 'eta0': 0.1, 'l1_ratio': 0.15, 'learning_rate': 'adaptive', 'penalty': 'elasticnet', 'tol': 1e-05}
{'alpha': 0.0001, 'eta0': 0.1, 'l1_ratio': 0.15, 'learning_rate': 'adaptive', 'penalty': 'elasticnet', 'tol': 0.0001}
{'alpha': 0.0001, 'eta0': 0.1, 'l1_ratio': 0.15, 'learning_rate': 'adaptive', 'penalty': 'elasticnet', 'tol': 0.001}


#### Extreme Gradient Boosting (PFAS / Gain)

##### All Features (PFAS / Gain)

In [5]:
# Import Python modules
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from datetime import datetime
import time

data = pd.read_pickle('Data/Parameter_Values_PFAS_Gain_2020-09-25.pkl')

X = data.iloc[:,:-1]
y = data['CommunityGain_Class']

for col in X.columns:
    X[col] = pd.to_numeric(X[col])

# Train-Test-Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

start_time = time.time()

xgb_params = {
    'reg_alpha' : [0, 0.001, 0.1, 1, 10, 100],
    'reg_lambda' : [0, 0.001, 0.1, 1, 10, 100],
    'learning_rate' : [0, 0.01, 0.1, 0.3, 0.5],
    'max_depth ' : [3, 5, 7, 9],
    'n_estimators' : [10,11,12,13,14,15,20,50,100,200,500,1000,2000]
}

xgb_classifier = xgb.XGBClassifier(objective='multi:softmax',
                                   eval_metric='merror',
                                   random_state=42)

xgb_gridsearch = GridSearchCV(estimator=xgb_classifier,
                              param_grid=xgb_params,
                              scoring='accuracy',
                              cv=3,
                              n_jobs=3,
                              verbose=6)

xgb_gridsearch.fit(X_train, y_train)

xgb_result = pd.DataFrame(xgb_gridsearch.cv_results_)
xgb_result.to_pickle('Data/GridSearchResults_PFAS_Gain_AllFeatures_XGB.pkl')

print('Calculation Runtime: {:.2} Minutes'.format((time.time() - start_time)/60))

print(classification_report(y_true=y_test,
                            y_pred=xgb_gridsearch.predict(X_test)))

Fitting 3 folds for each of 9360 candidates, totalling 28080 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:    0.6s
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    2.7s
[Parallel(n_jobs=3)]: Done 109 tasks      | elapsed:    6.1s
[Parallel(n_jobs=3)]: Done 283 tasks      | elapsed:   17.0s
[Parallel(n_jobs=3)]: Done 509 tasks      | elapsed:   32.5s
[Parallel(n_jobs=3)]: Done 782 tasks      | elapsed:   59.7s
[Parallel(n_jobs=3)]: Done 946 tasks      | elapsed:  2.0min
[Parallel(n_jobs=3)]: Done 1133 tasks      | elapsed:  6.1min
[Parallel(n_jobs=3)]: Done 1346 tasks      | elapsed: 24.9min
[Parallel(n_jobs=3)]: Done 1724 tasks      | elapsed: 33.0min
[Parallel(n_jobs=3)]: Done 2215 tasks      | elapsed: 33.8min
[Parallel(n_jobs=3)]: Done 2502 tasks      | elapsed: 37.3min
[Parallel(n_jobs=3)]: Done 2815 tasks      | elapsed: 65.5min
[Parallel(n_jobs=3)]: Done 3391 tasks      | elapsed: 66.1min
[Parallel(n_jobs=3)]: Done 3854 tasks      | elapsed: 68.9min
[P

Calculation Runtime: 4.9e+02 Minutes
              precision    recall  f1-score   support

        High       0.29      0.27      0.28       100
         Low       0.32      0.39      0.35       100
      Medium       0.23      0.19      0.21       100

    accuracy                           0.28       300
   macro avg       0.28      0.28      0.28       300
weighted avg       0.28      0.28      0.28       300



In [24]:
result = pd.read_pickle('Data/GridSearchResults_PFAS_Gain_AllFeatures_XGB.pkl')
best = result[result['rank_test_score']==1]['params'].to_list()
for x in best:
    print(x)

{'learning_rate': 0.01, 'max_depth ': 3, 'n_estimators': 13, 'reg_alpha': 0.1, 'reg_lambda': 10}
{'learning_rate': 0.01, 'max_depth ': 3, 'n_estimators': 50, 'reg_alpha': 1, 'reg_lambda': 10}
{'learning_rate': 0.01, 'max_depth ': 5, 'n_estimators': 13, 'reg_alpha': 0.1, 'reg_lambda': 10}
{'learning_rate': 0.01, 'max_depth ': 5, 'n_estimators': 50, 'reg_alpha': 1, 'reg_lambda': 10}
{'learning_rate': 0.01, 'max_depth ': 7, 'n_estimators': 13, 'reg_alpha': 0.1, 'reg_lambda': 10}
{'learning_rate': 0.01, 'max_depth ': 7, 'n_estimators': 50, 'reg_alpha': 1, 'reg_lambda': 10}
{'learning_rate': 0.01, 'max_depth ': 9, 'n_estimators': 13, 'reg_alpha': 0.1, 'reg_lambda': 10}
{'learning_rate': 0.01, 'max_depth ': 9, 'n_estimators': 50, 'reg_alpha': 1, 'reg_lambda': 10}


##### Selected Features (PFAS / Gain)

In [7]:
# Import Python modules
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from matplotlib import pyplot as plt
from datetime import datetime
import time

# Import data (and filter) data
selected_features = ['maximum_tou_summer_workday', 'end_of_work_impact_summer_weekend', 'lunch_impact_spring/autumn_weekend', 'minimum_tou_spring/autumn_workday', 'kurtosis_summer_workday', 'kurtosis_summer_weekend', 'fft_peak_summer_weekend', 'morning_slope_summer_workday', 'end_of_work_impact_summer_workday', 'median_daily_maximum_demand_spring/autumn_workday', 'night_impact_winter_weekend', 'morning_slope_spring/autumn_workday', 'fft_peak_summer_workday', 'minimum_tou_spring/autumn_weekend', 'minimum_tou_winter_weekend', 'minimum_tou_winter_workday', 'median_daily_minimum_demand_summer_workday', 'variance_spring/autumn_workday', 'night_slope_winter_weekend', 'fft_peak_winter_workday'] 

data = pd.read_pickle('Data/Parameter_Values_PFAS_Gain_2020-09-25.pkl')

X = data[selected_features]
y = data['CommunityGain_Class']

for col in X.columns:
    X[col] = pd.to_numeric(X[col])

# Train-Test-Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

start_time = time.time()

xgb_params = {
    'reg_alpha' : [0, 0.001, 0.1, 1, 10, 100],
    'reg_lambda' : [0, 0.001, 0.1, 1, 10, 100],
    'learning_rate' : [0, 0.01, 0.1, 0.3, 0.5],
    'max_depth ' : [3, 5, 7, 9],
    'n_estimators' : [10,11,12,13,14,15,20,50,100,200,500,1000,2000]
}

xgb_classifier = xgb.XGBClassifier(objective='multi:softmax',
                                   eval_metric='merror',
                                   random_state=42)

xgb_gridsearch = GridSearchCV(estimator=xgb_classifier,
                              param_grid=xgb_params,
                              scoring='accuracy',
                              cv=3,
                              n_jobs=3,
                              verbose=6)

xgb_gridsearch.fit(X_train, y_train)

xgb_result = pd.DataFrame(xgb_gridsearch.cv_results_)
xgb_result.to_pickle('Data/GridSearchResults_PFAS_Gain_SelectedFeatures_XGB.pkl')

print('Calculation Runtime: {:.2} Minutes'.format((time.time() - start_time)/60))

print(classification_report(y_true=y_test,
                            y_pred=xgb_gridsearch.predict(X_test)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Fitting 3 folds for each of 9360 candidates, totalling 28080 fits


[Parallel(n_jobs=3)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 146 tasks      | elapsed:    2.3s
[Parallel(n_jobs=3)]: Done 398 tasks      | elapsed:    6.8s
[Parallel(n_jobs=3)]: Done 746 tasks      | elapsed:   15.1s
[Parallel(n_jobs=3)]: Done 979 tasks      | elapsed:   37.2s
[Parallel(n_jobs=3)]: Done 1116 tasks      | elapsed:  1.4min
[Parallel(n_jobs=3)]: Done 1279 tasks      | elapsed:  4.1min
[Parallel(n_jobs=3)]: Done 1514 tasks      | elapsed:  8.4min
[Parallel(n_jobs=3)]: Done 2324 tasks      | elapsed:  8.9min
[Parallel(n_jobs=3)]: Done 2561 tasks      | elapsed: 10.3min
[Parallel(n_jobs=3)]: Done 2824 tasks      | elapsed: 16.8min
[Parallel(n_jobs=3)]: Done 3747 tasks      | elapsed: 17.3min
[Parallel(n_jobs=3)]: Done 4060 tasks      | elapsed: 20.3min
[Parallel(n_jobs=3)]: Done 4805 tasks      | elapsed: 25.3min
[Parallel(n_jobs=3)]: Done 5417 tasks      | elapsed: 27.6min
[Parallel(n_jobs=3)]: Done 6257 tasks      | elapsed: 33.7min
[Parallel(n_j

Calculation Runtime: 1.4e+02 Minutes
              precision    recall  f1-score   support

        High       0.27      0.19      0.22       100
         Low       0.23      0.24      0.23       100
      Medium       0.23      0.29      0.26       100

    accuracy                           0.24       300
   macro avg       0.24      0.24      0.24       300
weighted avg       0.24      0.24      0.24       300



In [25]:
result = pd.read_pickle('Data/GridSearchResults_PFAS_Gain_SelectedFeatures_XGB.pkl')
best = result[result['rank_test_score']==1]['params'].to_list()
for x in best:
    print(x)

{'learning_rate': 0.01, 'max_depth ': 3, 'n_estimators': 11, 'reg_alpha': 1, 'reg_lambda': 0}
{'learning_rate': 0.01, 'max_depth ': 5, 'n_estimators': 11, 'reg_alpha': 1, 'reg_lambda': 0}
{'learning_rate': 0.01, 'max_depth ': 7, 'n_estimators': 11, 'reg_alpha': 1, 'reg_lambda': 0}
{'learning_rate': 0.01, 'max_depth ': 9, 'n_estimators': 11, 'reg_alpha': 1, 'reg_lambda': 0}


#### Multi-Layer Perceptron (PFAS / Gain)

##### All Features (PFAS / Gain)

In [9]:
# Import Python modules
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from datetime import datetime
import time

data = pd.read_pickle('Data/Parameter_Values_PFAS_Gain_2020-09-25.pkl')

X = StandardScaler().fit_transform(data.iloc[:,:-1])
y = data['CommunityGain_Class']

# Train-Test-Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

start_time = time.time()

hls = []
for i in [10, 30, 50, 70, 100]:
    for j in [1, 2, 3, 4 , 5, 10, 30, 50, 70, 100]:
        hls.append((i,)*j)

mlp_params = {
    'hidden_layer_sizes' : hls,
    'solver' : ['adam', 'lbfgs'],
    'alpha' : [0.0001, 0.001, 0.01, 0.1, 1],
    'beta_1' : [0.8, 0.9],
    'tol' : [0.000001, 0.00001, 0.0001]
}

mlp_classifier = MLPClassifier(random_state=42,
                               max_iter=10e+8,
                               activation='relu')

mlp_gridsearch = GridSearchCV(estimator=mlp_classifier,
                              param_grid=mlp_params,
                              scoring='accuracy',
                              cv=3,
                              n_jobs=3,
                              verbose=6)

start_time = time.time()

mlp_gridsearch.fit(X_train, y_train)

print('Calculation Runtime: {:.2} Minutes'.format((time.time() - start_time)/60))

mlp_result = pd.DataFrame(mlp_gridsearch.cv_results_)
mlp_result.to_pickle('Data/GridSearchResults_PFAS_Gain_AllFeatures_MLP.pkl')

print(classification_report(y_true=y_test,
                            y_pred=mlp_gridsearch.predict(X_test)))

Fitting 3 folds for each of 3000 candidates, totalling 9000 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done  95 tasks      | elapsed:   11.1s
[Parallel(n_jobs=3)]: Done 178 tasks      | elapsed:   27.4s
[Parallel(n_jobs=3)]: Done 340 tasks      | elapsed:   50.6s
[Parallel(n_jobs=3)]: Done 501 tasks      | elapsed:  1.3min
[Parallel(n_jobs=3)]: Done 678 tasks      | elapsed:  2.2min
[Parallel(n_jobs=3)]: Done 841 tasks      | elapsed:  3.7min
[Parallel(n_jobs=3)]: Done 1052 tasks      | elapsed:  4.8min
[Parallel(n_jobs=3)]: Done 1389 tasks      | elapsed:  5.4min
[Parallel(n_jobs=3)]: Done 1664 tasks      | elapsed:  7.0min
[Parallel(n_jobs=3)]: Done 1927 tasks      | elapsed:  8.5min
[Parallel(n_jobs=3)]: Done 2301 tasks      | elapsed: 10.6min
[Parallel(n_jobs=3)]: Done 2614 tasks      | elapsed: 15.0min
[Parallel(n_jobs=3)]: Done 3020 tasks      | elapsed: 21.6min
[Parallel(n_jobs=3)]: Done 3410 tasks      | elapsed: 26.1min
[P

Calculation Runtime: 3.5e+02 Minutes
              precision    recall  f1-score   support

        High       0.39      0.39      0.39       100
         Low       0.27      0.28      0.28       100
      Medium       0.27      0.26      0.26       100

    accuracy                           0.31       300
   macro avg       0.31      0.31      0.31       300
weighted avg       0.31      0.31      0.31       300



In [26]:
result = pd.read_pickle('Data/GridSearchResults_PFAS_Gain_AllFeatures_MLP.pkl')
best = result[result['rank_test_score']==1]['params'].to_list()
for x in best:
    print(x)

{'alpha': 1, 'beta_1': 0.8, 'hidden_layer_sizes': (70, 70, 70, 70), 'solver': 'lbfgs', 'tol': 1e-06}
{'alpha': 1, 'beta_1': 0.8, 'hidden_layer_sizes': (70, 70, 70, 70), 'solver': 'lbfgs', 'tol': 1e-05}
{'alpha': 1, 'beta_1': 0.8, 'hidden_layer_sizes': (70, 70, 70, 70), 'solver': 'lbfgs', 'tol': 0.0001}
{'alpha': 1, 'beta_1': 0.9, 'hidden_layer_sizes': (70, 70, 70, 70), 'solver': 'lbfgs', 'tol': 1e-06}
{'alpha': 1, 'beta_1': 0.9, 'hidden_layer_sizes': (70, 70, 70, 70), 'solver': 'lbfgs', 'tol': 1e-05}
{'alpha': 1, 'beta_1': 0.9, 'hidden_layer_sizes': (70, 70, 70, 70), 'solver': 'lbfgs', 'tol': 0.0001}


##### Selected Features (PFAS / Gain)

In [11]:
# Import Python modules
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from datetime import datetime
import time

data = pd.read_pickle('Data/Parameter_Values_PFAS_Gain_2020-09-25.pkl')

selected_features = ['maximum_tou_summer_workday', 'end_of_work_impact_summer_weekend', 'lunch_impact_spring/autumn_weekend', 'minimum_tou_spring/autumn_workday', 'kurtosis_summer_workday', 'kurtosis_summer_weekend', 'fft_peak_summer_weekend', 'morning_slope_summer_workday', 'end_of_work_impact_summer_workday', 'median_daily_maximum_demand_spring/autumn_workday', 'night_impact_winter_weekend', 'morning_slope_spring/autumn_workday', 'fft_peak_summer_workday', 'minimum_tou_spring/autumn_weekend', 'minimum_tou_winter_weekend', 'minimum_tou_winter_workday', 'median_daily_minimum_demand_summer_workday', 'variance_spring/autumn_workday', 'night_slope_winter_weekend', 'fft_peak_winter_workday'] 

X = StandardScaler().fit_transform(data[selected_features])
y = data['CommunityGain_Class']

# Train-Test-Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

start_time = time.time()

hls = []
for i in [10, 30, 50, 70, 100]:
    for j in [1, 2, 3, 4 , 5, 10, 30, 50, 70, 100]:
        hls.append((i,)*j)

mlp_params = {
    'hidden_layer_sizes' : hls,
    'solver' : ['adam', 'lbfgs'],
    'alpha' : [0.0001, 0.001, 0.01, 0.1, 1],
    'beta_1' : [0.8, 0.9],
    'tol' : [0.000001, 0.00001, 0.0001]
}

mlp_classifier = MLPClassifier(random_state=42,
                               max_iter=10e+8,
                               activation='relu')

mlp_gridsearch = GridSearchCV(estimator=mlp_classifier,
                              param_grid=mlp_params,
                              scoring='accuracy',
                              cv=3,
                              n_jobs=3,
                              verbose=6)

start_time = time.time()

mlp_gridsearch.fit(X_train, y_train)

print('Calculation Runtime: {:.2} Minutes'.format((time.time() - start_time)/60))

mlp_result = pd.DataFrame(mlp_gridsearch.cv_results_)
mlp_result.to_pickle('Data/GridSearchResults_PFAS_Gain_SelectedFeatures_MLP.pkl')

print(classification_report(y_true=y_test,
                            y_pred=mlp_gridsearch.predict(X_test)))

Fitting 3 folds for each of 3000 candidates, totalling 9000 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done  68 tasks      | elapsed:   16.1s
[Parallel(n_jobs=3)]: Done 131 tasks      | elapsed:   57.5s
[Parallel(n_jobs=3)]: Done 294 tasks      | elapsed:  1.4min
[Parallel(n_jobs=3)]: Done 478 tasks      | elapsed:  1.9min
[Parallel(n_jobs=3)]: Done 642 tasks      | elapsed:  2.6min
[Parallel(n_jobs=3)]: Done 805 tasks      | elapsed:  4.8min
[Parallel(n_jobs=3)]: Done 992 tasks      | elapsed:  9.8min
[Parallel(n_jobs=3)]: Done 1301 tasks      | elapsed: 13.0min
[Parallel(n_jobs=3)]: Done 1622 tasks      | elapsed: 17.7min
[Parallel(n_jobs=3)]: Done 1885 tasks      | elapsed: 23.1min
[Parallel(n_jobs=3)]: Done 2280 tasks      | elapsed: 31.0min
[Parallel(n_jobs=3)]: Done 2593 tasks      | elapsed: 35.2min
[Parallel(n_jobs=3)]: Done 2970 tasks      | elapsed: 45.8min
[Parallel(n_jobs=3)]: Done 3362 tasks      | elapsed: 52.1min
[Pa

Calculation Runtime: 4e+02 Minutes
              precision    recall  f1-score   support

        High       0.38      0.45      0.41       100
         Low       0.30      0.25      0.27       100
      Medium       0.29      0.29      0.29       100

    accuracy                           0.33       300
   macro avg       0.33      0.33      0.33       300
weighted avg       0.33      0.33      0.33       300



In [27]:
result = pd.read_pickle('Data/GridSearchResults_PFAS_Gain_SelectedFeatures_MLP.pkl')
best = result[result['rank_test_score']==1]['params'].to_list()
for x in best:
    print(x)

{'alpha': 0.01, 'beta_1': 0.8, 'hidden_layer_sizes': (30, 30, 30, 30, 30), 'solver': 'lbfgs', 'tol': 1e-05}
{'alpha': 0.01, 'beta_1': 0.9, 'hidden_layer_sizes': (30, 30, 30, 30, 30), 'solver': 'lbfgs', 'tol': 1e-05}


## Aggregation-First-Parametrization-Second / Community Profit (AFPS / Profit)

#### Logistic Regression (AFPS / Profit)

##### All Features (AFPS / Profit)

In [1]:
# Import Python modules
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from matplotlib import pyplot as plt
from datetime import datetime
import time

data = pd.read_pickle('Data/Parameter_Values_AFPS_Profit_2020-09-25.pkl')

X = data.iloc[:,:-1].copy()
y = data['CommunityProfit_Class']

# Train-Test-Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

start_time = time.time()

lr_params = {
    'penalty' : ['l1', 'l2', 'elasticnet'],
    'alpha' : [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 20, 100],
    'learning_rate' : ['constant', 'optimal', 'invscaling', 'adaptive'],
    'eta0' : [10e-5, 10e-4, 10e-3, 10e-2, 1],
    'tol' : [0.00001, 0.0001, 0.001],
    'l1_ratio' : [0.15, 0.3, 0.5]
}

lr_classifier = SGDClassifier(loss='log',
                              random_state=42,
                              max_iter=10e+6)

lr_gridsearch = GridSearchCV(estimator=lr_classifier,
                             param_grid=lr_params,
                             scoring='accuracy',
                             cv=3,
                             n_jobs=3)

lr_gridsearch.fit(X_train, y_train)

lr_result = pd.DataFrame(lr_gridsearch.cv_results_)
lr_result.to_pickle('Data/GridSearchResults_AFPS_Profit_AllFeatures_LR.pkl')

print('Calculation Runtime: {:.2} Minutes'.format((time.time() - start_time)/60))

print(classification_report(y_true=y_test,
                            y_pred=lr_gridsearch.predict(X_test)))

Calculation Runtime: 5.9e+01 Minutes
              precision    recall  f1-score   support

        High       0.38      0.44      0.41       100
         Low       0.30      0.21      0.25       100
      Medium       0.36      0.41      0.38       100

    accuracy                           0.35       300
   macro avg       0.35      0.35      0.35       300
weighted avg       0.35      0.35      0.35       300



In [28]:
result = pd.read_pickle('Data/GridSearchResults_AFPS_Profit_AllFeatures_LR.pkl')
best = result[result['rank_test_score']==1]['params'].to_list()
for x in best:
    print(x)

{'alpha': 0.1, 'eta0': 1, 'l1_ratio': 0.5, 'learning_rate': 'adaptive', 'penalty': 'elasticnet', 'tol': 1e-05}
{'alpha': 0.1, 'eta0': 1, 'l1_ratio': 0.5, 'learning_rate': 'adaptive', 'penalty': 'elasticnet', 'tol': 0.0001}


##### Selected Features (AFPS / Profit)

In [5]:
# Import Python modules
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from matplotlib import pyplot as plt
from datetime import datetime
import time

selected_features =  ['median_daily_maximum_demand_spring/autumn_workday', 'daily_range_factor_winter_weekend', 'daily_range_factor_winter_workday', 'daily_nonuniformity_coefficient_winter_workday', 'daily_range_factor_spring/autumn_workday', 'daily_load_factor_spring/autumn_workday', 'variance_winter_workday', 'morning_slope_summer_weekend', 'median_daily_maximum_demand_summer_workday', 'median_daily_maximum_demand_winter_workday', 'maximum_tou_spring/autumn_workday', 'morning_slope_winter_weekend', 'variance_winter_weekend', 'pv_correlation_winter_weekend', 'minimum_tou_spring/autumn_workday', 'daily_nonuniformity_coefficient_winter_weekend', 'variance_spring/autumn_weekend', 'daily_load_factor_winter_weekend', 'minimum_tou_winter_workday', 'variance_summer_weekend'] 

data = pd.read_pickle('Data/Parameter_Values_AFPS_Profit_2020-09-25.pkl')

X = data[selected_features]
y = data['CommunityProfit_Class']

# for col in X.columns:
#     X[col] = pd.to_numeric(X[col])

# Train-Test-Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42, stratify=y)

start_time = time.time()

lr_params = {
    'penalty' : ['l1', 'l2', 'elasticnet'],
    'alpha' : [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 20, 100],
    'learning_rate' : ['constant', 'optimal', 'invscaling', 'adaptive'],
    'eta0' : [10e-5, 10e-4, 10e-3, 10e-2, 1],
    'tol' : [0.00001, 0.0001, 0.001],
    'l1_ratio' : [0.15, 0.3, 0.5]
}

lr_classifier = SGDClassifier(loss='log',
                              random_state=42,
                              max_iter=10e+6)

lr_gridsearch = GridSearchCV(estimator=lr_classifier,
                             param_grid=lr_params,
                             scoring='accuracy',
                             cv=3,
                             n_jobs=3)

lr_gridsearch.fit(X_train, y_train)

lr_result = pd.DataFrame(lr_gridsearch.cv_results_)
lr_result.to_pickle('Data/GridSearchResults_AFPS_Profit_SelectedFeatures_LR.pkl')

print('Calculation Runtime: {:.2} Minutes'.format((time.time() - start_time)/60))

print(classification_report(y_true=y_test,
                            y_pred=lr_gridsearch.predict(X_test)))

Calculation Runtime: 2.0 Minutes
              precision    recall  f1-score   support

        High       0.37      0.53      0.43       167
         Low       0.38      0.16      0.23       167
      Medium       0.37      0.42      0.39       166

    accuracy                           0.37       500
   macro avg       0.37      0.37      0.35       500
weighted avg       0.37      0.37      0.35       500



In [30]:
result = pd.read_pickle('Data/GridSearchResults_AFPS_Profit_SelectedFeatures_LR.pkl')
best = result[result['rank_test_score']==1]['params'].to_list()
for x in best:
    print(x)

{'alpha': 0.1, 'eta0': 0.01, 'l1_ratio': 0.3, 'learning_rate': 'adaptive', 'penalty': 'elasticnet', 'tol': 0.001}


#### Extreme Gradient Boosting (AFPS / Profit)

##### All Features (AFPS / Profit)

In [7]:
# Import Python modules
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from datetime import datetime
import time

data = pd.read_pickle('Data/Parameter_Values_AFPS_Profit_2020-09-25.pkl')

X = data.iloc[:,:-1]
y = data['CommunityProfit_Class']

for col in X.columns:
    X[col] = pd.to_numeric(X[col])

# Train-Test-Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

start_time = time.time()

xgb_params = {
    'reg_alpha' : [0, 0.001, 0.1, 1, 10, 100],
    'reg_lambda' : [0, 0.001, 0.1, 1, 10, 100],
    'learning_rate' : [0, 0.01, 0.1, 0.3, 0.5],
    'max_depth ' : [3, 5, 7, 9],
    'n_estimators' : [10,11,12,13,14,15,20,50,100,200,500,1000,2000]
}

xgb_classifier = xgb.XGBClassifier(objective='multi:softmax',
                                   eval_metric='merror',
                                   random_state=42)

xgb_gridsearch = GridSearchCV(estimator=xgb_classifier,
                              param_grid=xgb_params,
                              scoring='accuracy',
                              cv=3,
                              n_jobs=3,
                              verbose=6)

xgb_gridsearch.fit(X_train, y_train)

xgb_result = pd.DataFrame(xgb_gridsearch.cv_results_)
xgb_result.to_pickle('Data/GridSearchResults_AFPS_Profit_AllFeatures_XGB.pkl')

print('Calculation Runtime: {:.2} Minutes'.format((time.time() - start_time)/60))

print(classification_report(y_true=y_test,
                            y_pred=xgb_gridsearch.predict(X_test)))

Fitting 3 folds for each of 9360 candidates, totalling 28080 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:    0.8s
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    3.6s
[Parallel(n_jobs=3)]: Done 109 tasks      | elapsed:    7.0s
[Parallel(n_jobs=3)]: Done 283 tasks      | elapsed:   18.3s
[Parallel(n_jobs=3)]: Done 509 tasks      | elapsed:   36.7s
[Parallel(n_jobs=3)]: Done 782 tasks      | elapsed:  1.1min
[Parallel(n_jobs=3)]: Done 946 tasks      | elapsed:  2.2min
[Parallel(n_jobs=3)]: Done 1133 tasks      | elapsed:  6.5min
[Parallel(n_jobs=3)]: Done 1346 tasks      | elapsed: 26.0min
[Parallel(n_jobs=3)]: Done 1654 tasks      | elapsed: 34.7min
[Parallel(n_jobs=3)]: Done 2180 tasks      | elapsed: 35.5min
[Parallel(n_jobs=3)]: Done 2469 tasks      | elapsed: 38.6min
[Parallel(n_jobs=3)]: Done 2782 tasks      | elapsed: 66.0min
[Parallel(n_jobs=3)]: Done 3325 tasks      | elapsed: 68.7min
[Parallel(n_jobs=3)]: Done 3821 tasks      | elapsed: 71.0min
[P

Calculation Runtime: 5.3e+02 Minutes
              precision    recall  f1-score   support

        High       0.38      0.37      0.38       100
         Low       0.35      0.31      0.33       100
      Medium       0.37      0.42      0.39       100

    accuracy                           0.37       300
   macro avg       0.37      0.37      0.37       300
weighted avg       0.37      0.37      0.37       300



In [31]:
result = pd.read_pickle('Data/GridSearchResults_AFPS_Profit_AllFeatures_XGB.pkl')
best = result[result['rank_test_score']==1]['params'].to_list()
for x in best:
    print(x)

{'learning_rate': 0.5, 'max_depth ': 3, 'n_estimators': 10, 'reg_alpha': 0, 'reg_lambda': 0}
{'learning_rate': 0.5, 'max_depth ': 3, 'n_estimators': 10, 'reg_alpha': 0, 'reg_lambda': 0.001}
{'learning_rate': 0.5, 'max_depth ': 3, 'n_estimators': 10, 'reg_alpha': 0.001, 'reg_lambda': 0}
{'learning_rate': 0.5, 'max_depth ': 5, 'n_estimators': 10, 'reg_alpha': 0, 'reg_lambda': 0}
{'learning_rate': 0.5, 'max_depth ': 5, 'n_estimators': 10, 'reg_alpha': 0, 'reg_lambda': 0.001}
{'learning_rate': 0.5, 'max_depth ': 5, 'n_estimators': 10, 'reg_alpha': 0.001, 'reg_lambda': 0}
{'learning_rate': 0.5, 'max_depth ': 7, 'n_estimators': 10, 'reg_alpha': 0, 'reg_lambda': 0}
{'learning_rate': 0.5, 'max_depth ': 7, 'n_estimators': 10, 'reg_alpha': 0, 'reg_lambda': 0.001}
{'learning_rate': 0.5, 'max_depth ': 7, 'n_estimators': 10, 'reg_alpha': 0.001, 'reg_lambda': 0}
{'learning_rate': 0.5, 'max_depth ': 9, 'n_estimators': 10, 'reg_alpha': 0, 'reg_lambda': 0}
{'learning_rate': 0.5, 'max_depth ': 9, 'n_est

##### Selected Features (AFPS / Profit)

In [9]:
# Import Python modules
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from matplotlib import pyplot as plt
from datetime import datetime
import time

# Import data (and filter) data
selected_features =  ['median_daily_maximum_demand_spring/autumn_workday', 'daily_range_factor_winter_weekend', 'daily_range_factor_winter_workday', 'daily_nonuniformity_coefficient_winter_workday', 'daily_range_factor_spring/autumn_workday', 'daily_load_factor_spring/autumn_workday', 'variance_winter_workday', 'morning_slope_summer_weekend', 'median_daily_maximum_demand_summer_workday', 'median_daily_maximum_demand_winter_workday', 'maximum_tou_spring/autumn_workday', 'morning_slope_winter_weekend', 'variance_winter_weekend', 'pv_correlation_winter_weekend', 'minimum_tou_spring/autumn_workday', 'daily_nonuniformity_coefficient_winter_weekend', 'variance_spring/autumn_weekend', 'daily_load_factor_winter_weekend', 'minimum_tou_winter_workday', 'variance_summer_weekend'] 

data = pd.read_pickle('Data/Parameter_Values_AFPS_Profit_2020-09-25.pkl')

X = data[selected_features]
y = data['CommunityProfit_Class']

for col in X.columns:
    X[col] = pd.to_numeric(X[col])

# Train-Test-Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

start_time = time.time()

xgb_params = {
    'reg_alpha' : [0, 0.001, 0.1, 1, 10, 100],
    'reg_lambda' : [0, 0.001, 0.1, 1, 10, 100],
    'learning_rate' : [0, 0.01, 0.1, 0.3, 0.5],
    'max_depth ' : [3, 5, 7, 9],
    'n_estimators' : [10,11,12,13,14,15,20,50,100,200,500,1000,2000]
}

xgb_classifier = xgb.XGBClassifier(objective='multi:softmax',
                                   eval_metric='merror',
                                   random_state=42)

xgb_gridsearch = GridSearchCV(estimator=xgb_classifier,
                              param_grid=xgb_params,
                              scoring='accuracy',
                              cv=3,
                              n_jobs=3,
                              verbose=6)

xgb_gridsearch.fit(X_train, y_train)

xgb_result = pd.DataFrame(xgb_gridsearch.cv_results_)
xgb_result.to_pickle('Data/GridSearchResults_AFPS_Profit_SelectedFeatures_XGB.pkl')

print('Calculation Runtime: {:.2} Minutes'.format((time.time() - start_time)/60))

print(classification_report(y_true=y_test,
                            y_pred=xgb_gridsearch.predict(X_test)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Fitting 3 folds for each of 9360 candidates, totalling 28080 fits


[Parallel(n_jobs=3)]: Done   8 tasks      | elapsed:    0.2s
[Parallel(n_jobs=3)]: Done  86 tasks      | elapsed:    1.6s
[Parallel(n_jobs=3)]: Done 338 tasks      | elapsed:    5.6s
[Parallel(n_jobs=3)]: Done 686 tasks      | elapsed:   12.8s
[Parallel(n_jobs=3)]: Done 964 tasks      | elapsed:   34.2s
[Parallel(n_jobs=3)]: Done 1101 tasks      | elapsed:  1.2min
[Parallel(n_jobs=3)]: Done 1264 tasks      | elapsed:  3.7min
[Parallel(n_jobs=3)]: Done 1499 tasks      | elapsed:  8.4min
[Parallel(n_jobs=3)]: Done 2327 tasks      | elapsed:  8.8min
[Parallel(n_jobs=3)]: Done 2564 tasks      | elapsed: 10.2min
[Parallel(n_jobs=3)]: Done 2827 tasks      | elapsed: 16.7min
[Parallel(n_jobs=3)]: Done 3768 tasks      | elapsed: 17.3min
[Parallel(n_jobs=3)]: Done 4081 tasks      | elapsed: 20.6min
[Parallel(n_jobs=3)]: Done 5102 tasks      | elapsed: 25.5min
[Parallel(n_jobs=3)]: Done 5465 tasks      | elapsed: 28.6min
[Parallel(n_jobs=3)]: Done 6467 tasks      | elapsed: 33.8min
[Parallel(n_j

Calculation Runtime: 1.4e+02 Minutes
              precision    recall  f1-score   support

        High       0.29      0.27      0.28       100
         Low       0.28      0.14      0.19       100
      Medium       0.34      0.53      0.41       100

    accuracy                           0.31       300
   macro avg       0.30      0.31      0.29       300
weighted avg       0.30      0.31      0.29       300



In [32]:
result = pd.read_pickle('Data/GridSearchResults_AFPS_Profit_SelectedFeatures_XGB.pkl')
best = result[result['rank_test_score']==1]['params'].to_list()
for x in best:
    print(x)

{'learning_rate': 0.01, 'max_depth ': 3, 'n_estimators': 50, 'reg_alpha': 0.1, 'reg_lambda': 10}
{'learning_rate': 0.01, 'max_depth ': 5, 'n_estimators': 50, 'reg_alpha': 0.1, 'reg_lambda': 10}
{'learning_rate': 0.01, 'max_depth ': 7, 'n_estimators': 50, 'reg_alpha': 0.1, 'reg_lambda': 10}
{'learning_rate': 0.01, 'max_depth ': 9, 'n_estimators': 50, 'reg_alpha': 0.1, 'reg_lambda': 10}


#### Multi-Layer Perceptron (AFPS / Profit)

##### All Features (AFPS / Profit)

In [11]:
# Import Python modules
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from datetime import datetime
import time

data = pd.read_pickle('Data/Parameter_Values_AFPS_Profit_2020-09-25.pkl')

X = StandardScaler().fit_transform(data.iloc[:,:-1])
y = data['CommunityProfit_Class']

# Train-Test-Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

start_time = time.time()

hls = []
for i in [10, 30, 50, 70, 100]:
    for j in [1, 2, 3, 4 , 5, 10, 30, 50, 70, 100]:
        hls.append((i,)*j)

mlp_params = {
    'hidden_layer_sizes' : hls,
    'solver' : ['adam', 'lbfgs'],
    'alpha' : [0.0001, 0.001, 0.01, 0.1, 1],
    'beta_1' : [0.8, 0.9],
    'tol' : [0.000001, 0.00001, 0.0001]
}

mlp_classifier = MLPClassifier(random_state=42,
                               max_iter=10e+8,
                               activation='relu')

mlp_gridsearch = GridSearchCV(estimator=mlp_classifier,
                              param_grid=mlp_params,
                              scoring='accuracy',
                              cv=3,
                              n_jobs=3,
                              verbose=6)

start_time = time.time()

mlp_gridsearch.fit(X_train, y_train)

print('Calculation Runtime: {:.2} Minutes'.format((time.time() - start_time)/60))

mlp_result = pd.DataFrame(mlp_gridsearch.cv_results_)
mlp_result.to_pickle('Data/GridSearchResults_AFPS_Profit_AllFeatures_MLP.pkl')

print(classification_report(y_true=y_test,
                            y_pred=mlp_gridsearch.predict(X_test)))

Fitting 3 folds for each of 3000 candidates, totalling 9000 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=3)]: Done 104 tasks      | elapsed:   19.2s
[Parallel(n_jobs=3)]: Done 196 tasks      | elapsed:   25.4s
[Parallel(n_jobs=3)]: Done 343 tasks      | elapsed:   43.4s
[Parallel(n_jobs=3)]: Done 520 tasks      | elapsed:  1.4min
[Parallel(n_jobs=3)]: Done 689 tasks      | elapsed:  2.5min
[Parallel(n_jobs=3)]: Done 880 tasks      | elapsed:  4.0min
[Parallel(n_jobs=3)]: Done 1144 tasks      | elapsed:  4.7min
[Parallel(n_jobs=3)]: Done 1441 tasks      | elapsed:  5.9min
[Parallel(n_jobs=3)]: Done 1721 tasks      | elapsed:  7.5min
[Parallel(n_jobs=3)]: Done 2042 tasks      | elapsed:  9.4min
[Parallel(n_jobs=3)]: Done 2370 tasks      | elapsed: 11.7min
[Parallel(n_jobs=3)]: Done 2683 tasks      | elapsed: 22.2min
[Parallel(n_jobs=3)]: Done 3029 tasks      | elapsed: 26.6min
[Parallel(n_jobs=3)]: Done 3413 tasks      | elapsed: 31.8min
[P

Calculation Runtime: 4e+02 Minutes
              precision    recall  f1-score   support

        High       0.34      0.30      0.32       100
         Low       0.32      0.37      0.35       100
      Medium       0.37      0.37      0.37       100

    accuracy                           0.35       300
   macro avg       0.35      0.35      0.35       300
weighted avg       0.35      0.35      0.35       300



In [33]:
result = pd.read_pickle('Data/GridSearchResults_AFPS_Profit_AllFeatures_MLP.pkl')
best = result[result['rank_test_score']==1]['params'].to_list()
for x in best:
    print(x)

{'alpha': 0.01, 'beta_1': 0.8, 'hidden_layer_sizes': (30, 30, 30, 30, 30, 30, 30, 30, 30, 30), 'solver': 'lbfgs', 'tol': 0.0001}
{'alpha': 0.01, 'beta_1': 0.9, 'hidden_layer_sizes': (30, 30, 30, 30, 30, 30, 30, 30, 30, 30), 'solver': 'lbfgs', 'tol': 0.0001}


In [None]:
# # Import Python modules
# import pandas as pd
# import numpy as np
# from sklearn.neural_network import MLPClassifier
# from sklearn.model_selection import train_test_split
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import classification_report
# from sklearn.preprocessing import StandardScaler
# from datetime import datetime
# import time

# data = pd.read_pickle('Data/Parameter_Values_Aggregated_Profit_2020-09-15.pkl')

# X = X_train = StandardScaler().fit_transform(data.iloc[:,:94])
# y = data['CommunityProfit_Class']

# # Train-Test-Split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# mlp_classifier = MLPClassifier(hidden_layer_sizes=(50,),
#                                activation='relu',
#                                solver='adam',
#                                random_state=42,
#                                max_iter=1000000,
#                                beta_1=0.8,
#                                verbose=0,
#                                tol=0.00001)

# mlp_classifier.fit(X_train, y_train)

# print(classification_report(y_true=y_test,
#                             y_pred=mlp_classifier.predict(X_test)))

##### Selected Features (AFPS / Profit)

In [13]:
# Import Python modules
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from datetime import datetime
import time

data = pd.read_pickle('Data/Parameter_Values_AFPS_Profit_2020-09-25.pkl')

selected_features =  ['median_daily_maximum_demand_spring/autumn_workday', 'daily_range_factor_winter_weekend', 'daily_range_factor_winter_workday', 'daily_nonuniformity_coefficient_winter_workday', 'daily_range_factor_spring/autumn_workday', 'daily_load_factor_spring/autumn_workday', 'variance_winter_workday', 'morning_slope_summer_weekend', 'median_daily_maximum_demand_summer_workday', 'median_daily_maximum_demand_winter_workday', 'maximum_tou_spring/autumn_workday', 'morning_slope_winter_weekend', 'variance_winter_weekend', 'pv_correlation_winter_weekend', 'minimum_tou_spring/autumn_workday', 'daily_nonuniformity_coefficient_winter_weekend', 'variance_spring/autumn_weekend', 'daily_load_factor_winter_weekend', 'minimum_tou_winter_workday', 'variance_summer_weekend'] 

X = StandardScaler().fit_transform(data[selected_features])
y = data['CommunityProfit_Class']

# Train-Test-Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

start_time = time.time()

hls = []
for i in [10, 30, 50, 70, 100]:
    for j in [1, 2, 3, 4 , 5, 10, 30, 50, 70, 100]:
        hls.append((i,)*j)

mlp_params = {
    'hidden_layer_sizes' : hls,
    'solver' : ['adam', 'lbfgs'],
    'alpha' : [0.0001, 0.001, 0.01, 0.1, 1],
    'beta_1' : [0.8, 0.9],
    'tol' : [0.000001, 0.00001, 0.0001]
}

mlp_classifier = MLPClassifier(random_state=42,
                               max_iter=10e+8,
                               activation='relu')

mlp_gridsearch = GridSearchCV(estimator=mlp_classifier,
                              param_grid=mlp_params,
                              scoring='accuracy',
                              cv=3,
                              n_jobs=3,
                              verbose=6)

start_time = time.time()

mlp_gridsearch.fit(X_train, y_train)

print('Calculation Runtime: {:.2} Minutes'.format((time.time() - start_time)/60))

mlp_result = pd.DataFrame(mlp_gridsearch.cv_results_)
mlp_result.to_pickle('Data/GridSearchResults_AFPS_Profit_SelectedFeatures_MLP.pkl')

print(classification_report(y_true=y_test,
                            y_pred=mlp_gridsearch.predict(X_test)))

Fitting 3 folds for each of 3000 candidates, totalling 9000 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done  68 tasks      | elapsed:   34.2s
[Parallel(n_jobs=3)]: Done 131 tasks      | elapsed:  1.1min
[Parallel(n_jobs=3)]: Done 290 tasks      | elapsed:  1.6min
[Parallel(n_jobs=3)]: Done 416 tasks      | elapsed:  1.8min
[Parallel(n_jobs=3)]: Done 567 tasks      | elapsed:  2.3min
[Parallel(n_jobs=3)]: Done 745 tasks      | elapsed:  3.3min
[Parallel(n_jobs=3)]: Done 932 tasks      | elapsed:  9.5min
[Parallel(n_jobs=3)]: Done 1175 tasks      | elapsed: 13.2min
[Parallel(n_jobs=3)]: Done 1412 tasks      | elapsed: 15.7min
[Parallel(n_jobs=3)]: Done 1675 tasks      | elapsed: 19.8min
[Parallel(n_jobs=3)]: Done 1968 tasks      | elapsed: 40.9min
[Parallel(n_jobs=3)]: Done 2434 tasks      | elapsed: 55.7min
[Parallel(n_jobs=3)]: Done 2771 tasks      | elapsed: 67.1min
[Parallel(n_jobs=3)]: Done 3170 tasks      | elapsed: 71.3min
[Pa

Calculation Runtime: 4.5e+02 Minutes
              precision    recall  f1-score   support

        High       0.42      0.49      0.45       100
         Low       0.34      0.33      0.34       100
      Medium       0.38      0.33      0.35       100

    accuracy                           0.38       300
   macro avg       0.38      0.38      0.38       300
weighted avg       0.38      0.38      0.38       300



In [34]:
result = pd.read_pickle('Data/GridSearchResults_AFPS_Profit_SelectedFeatures_MLP.pkl')
best = result[result['rank_test_score']==1]['params'].to_list()
for x in best:
    print(x)

{'alpha': 0.01, 'beta_1': 0.8, 'hidden_layer_sizes': (10, 10), 'solver': 'lbfgs', 'tol': 1e-06}
{'alpha': 0.01, 'beta_1': 0.8, 'hidden_layer_sizes': (10, 10), 'solver': 'lbfgs', 'tol': 1e-05}
{'alpha': 0.01, 'beta_1': 0.8, 'hidden_layer_sizes': (10, 10), 'solver': 'lbfgs', 'tol': 0.0001}
{'alpha': 0.01, 'beta_1': 0.9, 'hidden_layer_sizes': (10, 10), 'solver': 'lbfgs', 'tol': 1e-06}
{'alpha': 0.01, 'beta_1': 0.9, 'hidden_layer_sizes': (10, 10), 'solver': 'lbfgs', 'tol': 1e-05}
{'alpha': 0.01, 'beta_1': 0.9, 'hidden_layer_sizes': (10, 10), 'solver': 'lbfgs', 'tol': 0.0001}


## Aggregation-First-Parametrization-Second / Community Gain (AFPS / Gain)

#### Logistic Regression (AFPS / Gain)

##### All Features (AFPS / Gain)

In [15]:
# Import Python modules
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from matplotlib import pyplot as plt
from datetime import datetime
import time

data = pd.read_pickle('Data/Parameter_Values_AFPS_Gain_2020-09-25.pkl')

X = data.iloc[:,:-1].copy()
y = data['CommunityGain_Class']

# Train-Test-Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

start_time = time.time()

lr_params = {
    'penalty' : ['l1', 'l2', 'elasticnet'],
    'alpha' : [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 20, 100],
    'learning_rate' : ['constant', 'optimal', 'invscaling', 'adaptive'],
    'eta0' : [10e-5, 10e-4, 10e-3, 10e-2, 1],
    'tol' : [0.00001, 0.0001, 0.001],
    'l1_ratio' : [0.15, 0.3, 0.5]
}

lr_classifier = SGDClassifier(loss='log',
                              random_state=42,
                              max_iter=10e+6)

lr_gridsearch = GridSearchCV(estimator=lr_classifier,
                             param_grid=lr_params,
                             scoring='accuracy',
                             cv=3,
                             n_jobs=3)

lr_gridsearch.fit(X_train, y_train)

lr_result = pd.DataFrame(lr_gridsearch.cv_results_)
lr_result.to_pickle('Data/GridSearchResults_AFPS_Gain_AllFeatures_LR.pkl')

print('Calculation Runtime: {:.2} Minutes'.format((time.time() - start_time)/60))

print(classification_report(y_true=y_test,
                            y_pred=lr_gridsearch.predict(X_test)))

Calculation Runtime: 2e+02 Minutes
              precision    recall  f1-score   support

        High       0.33      0.31      0.32       100
         Low       0.33      0.60      0.43       100
      Medium       0.27      0.07      0.11       100

    accuracy                           0.33       300
   macro avg       0.31      0.33      0.29       300
weighted avg       0.31      0.33      0.29       300



In [35]:
result = pd.read_pickle('Data/GridSearchResults_AFPS_Gain_AllFeatures_LR.pkl')
best = result[result['rank_test_score']==1]['params'].to_list()
for x in best:
    print(x)

{'alpha': 20, 'eta0': 0.0001, 'l1_ratio': 0.15, 'learning_rate': 'optimal', 'penalty': 'l2', 'tol': 1e-05}
{'alpha': 20, 'eta0': 0.0001, 'l1_ratio': 0.15, 'learning_rate': 'optimal', 'penalty': 'l2', 'tol': 0.0001}
{'alpha': 20, 'eta0': 0.0001, 'l1_ratio': 0.3, 'learning_rate': 'optimal', 'penalty': 'l2', 'tol': 1e-05}
{'alpha': 20, 'eta0': 0.0001, 'l1_ratio': 0.3, 'learning_rate': 'optimal', 'penalty': 'l2', 'tol': 0.0001}
{'alpha': 20, 'eta0': 0.0001, 'l1_ratio': 0.5, 'learning_rate': 'optimal', 'penalty': 'l2', 'tol': 1e-05}
{'alpha': 20, 'eta0': 0.0001, 'l1_ratio': 0.5, 'learning_rate': 'optimal', 'penalty': 'l2', 'tol': 0.0001}
{'alpha': 20, 'eta0': 0.001, 'l1_ratio': 0.15, 'learning_rate': 'optimal', 'penalty': 'l2', 'tol': 1e-05}
{'alpha': 20, 'eta0': 0.001, 'l1_ratio': 0.15, 'learning_rate': 'optimal', 'penalty': 'l2', 'tol': 0.0001}
{'alpha': 20, 'eta0': 0.001, 'l1_ratio': 0.3, 'learning_rate': 'optimal', 'penalty': 'l2', 'tol': 1e-05}
{'alpha': 20, 'eta0': 0.001, 'l1_ratio': 

##### Selected Features (AFPS / Gain)

In [17]:
# Import Python modules
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from matplotlib import pyplot as plt
from datetime import datetime
import time

selected_features = ['variance_summer_weekend', 'kurtosis_summer_weekend', 'median_daily_minimum_demand_summer_weekend', 'median_daily_minimum_demand_summer_workday', 'end_of_work_impact_summer_workday', 'daily_nonuniformity_coefficient_summer_weekend', 'pv_correlation_spring/autumn_weekend', 'skewness_summer_weekend', 'lunch_impact_spring/autumn_weekend', 'kurtosis_summer_workday', 'morning_slope_spring/autumn_weekend', 'skewness_winter_workday', 'summer_winter_ratio_weekend', 'skewness_summer_workday', 'variance_summer_workday', 'maximum_tou_winter_workday', 'minimum_tou_spring/autumn_workday', 'minimum_tou_spring/autumn_weekend', 'minimum_tou_winter_weekend', 'kurtosis_winter_workday']

data = pd.read_pickle('Data/Parameter_Values_AFPS_Gain_2020-09-25.pkl')

X = data[selected_features]
y = data['CommunityGain_Class']

# for col in X.columns:
#     X[col] = pd.to_numeric(X[col])

# Train-Test-Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42, stratify=y)

start_time = time.time()

lr_params = {
    'penalty' : ['l1', 'l2', 'elasticnet'],
    'alpha' : [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 20, 100],
    'learning_rate' : ['constant', 'optimal', 'invscaling', 'adaptive'],
    'eta0' : [10e-5, 10e-4, 10e-3, 10e-2, 1],
    'tol' : [0.00001, 0.0001, 0.001],
    'l1_ratio' : [0.15, 0.3, 0.5]
}

lr_classifier = SGDClassifier(loss='log',
                              random_state=42,
                              max_iter=10e+6)

lr_gridsearch = GridSearchCV(estimator=lr_classifier,
                             param_grid=lr_params,
                             scoring='accuracy',
                             cv=3,
                             n_jobs=3)

lr_gridsearch.fit(X_train, y_train)

lr_result = pd.DataFrame(lr_gridsearch.cv_results_)
lr_result.to_pickle('Data/GridSearchResults_AFPS_Gain_SelectedFeatures_LR.pkl')

print('Calculation Runtime: {:.2} Minutes'.format((time.time() - start_time)/60))

print(classification_report(y_true=y_test,
                            y_pred=lr_gridsearch.predict(X_test)))

Calculation Runtime: 1.7e+01 Minutes
              precision    recall  f1-score   support

        High       0.35      0.22      0.27       167
         Low       0.35      0.50      0.41       167
      Medium       0.34      0.32      0.33       166

    accuracy                           0.34       500
   macro avg       0.34      0.34      0.33       500
weighted avg       0.34      0.34      0.33       500



In [36]:
result = pd.read_pickle('Data/GridSearchResults_AFPS_Gain_SelectedFeatures_LR.pkl')
best = result[result['rank_test_score']==1]['params'].to_list()
for x in best:
    print(x)

{'alpha': 0.1, 'eta0': 0.1, 'l1_ratio': 0.15, 'learning_rate': 'adaptive', 'penalty': 'l2', 'tol': 1e-05}
{'alpha': 0.1, 'eta0': 0.1, 'l1_ratio': 0.3, 'learning_rate': 'adaptive', 'penalty': 'l2', 'tol': 1e-05}
{'alpha': 0.1, 'eta0': 0.1, 'l1_ratio': 0.5, 'learning_rate': 'adaptive', 'penalty': 'l2', 'tol': 1e-05}


#### Extreme Gradient Boosting (AFPS / Gain)

##### All Features (AFPS / Gain)

In [1]:
# Import Python modules
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from datetime import datetime
import time

data = pd.read_pickle('Data/Parameter_Values_AFPS_Gain_2020-09-25.pkl')

X = data.iloc[:,:-1]
y = data['CommunityGain_Class']

for col in X.columns:
    X[col] = pd.to_numeric(X[col])

# Train-Test-Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

start_time = time.time()

xgb_params = {
    'reg_alpha' : [0, 0.001, 0.1, 1, 10, 100],
    'reg_lambda' : [0, 0.001, 0.1, 1, 10, 100],
    'learning_rate' : [0, 0.01, 0.1, 0.3, 0.5],
    'max_depth ' : [3, 5, 7, 9],
    'n_estimators' : [10,11,12,13,14,15,20,50,100,200,500,1000,2000]
}

xgb_classifier = xgb.XGBClassifier(objective='multi:softmax',
                                   eval_metric='merror',
                                   random_state=42)

xgb_gridsearch = GridSearchCV(estimator=xgb_classifier,
                              param_grid=xgb_params,
                              scoring='accuracy',
                              cv=3,
                              n_jobs=-1,
                              verbose=6)

xgb_gridsearch.fit(X_train, y_train)

xgb_result = pd.DataFrame(xgb_gridsearch.cv_results_)
xgb_result.to_pickle('Data/GridSearchResults_AFPS_Gain_AllFeatures_XGB.pkl')

print('Calculation Runtime: {:.2} Minutes'.format((time.time() - start_time)/60))

print(classification_report(y_true=y_test,
                            y_pred=xgb_gridsearch.predict(X_test)))

Fitting 3 folds for each of 9360 candidates, totalling 28080 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   24.9s
[Parallel(n_jobs=-1)]: Done 305 tasks      | elapsed:   36.7s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   53.0s
[Parallel(n_jobs=-1)]: Done 605 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 16.9min
[Parallel(n_jobs=-1)]: Done 1505 tasks      | elapsed: 47.9min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 48.3min
[Parallel(n_jobs=-1)]: Done 2105 tasks      | elapsed: 48.7min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed: 51.3min
[Parallel(n_jobs=-1)]: Done 2805 tasks      | elap

Calculation Runtime: 5.3e+02 Minutes
              precision    recall  f1-score   support

        High       0.31      0.32      0.32       100
         Low       0.26      0.24      0.25       100
      Medium       0.31      0.33      0.32       100

    accuracy                           0.30       300
   macro avg       0.30      0.30      0.30       300
weighted avg       0.30      0.30      0.30       300



In [37]:
result = pd.read_pickle('Data/GridSearchResults_AFPS_Gain_AllFeatures_XGB.pkl')
best = result[result['rank_test_score']==1]['params'].to_list()
for x in best:
    print(x)

{'learning_rate': 0.5, 'max_depth ': 3, 'n_estimators': 10, 'reg_alpha': 10, 'reg_lambda': 0.1}
{'learning_rate': 0.5, 'max_depth ': 5, 'n_estimators': 10, 'reg_alpha': 10, 'reg_lambda': 0.1}
{'learning_rate': 0.5, 'max_depth ': 7, 'n_estimators': 10, 'reg_alpha': 10, 'reg_lambda': 0.1}
{'learning_rate': 0.5, 'max_depth ': 9, 'n_estimators': 10, 'reg_alpha': 10, 'reg_lambda': 0.1}


##### Selected Features (AFPS / Gain)

In [1]:
# Import Python modules
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from matplotlib import pyplot as plt
from datetime import datetime
import time

# Import data (and filter) data
selected_features = ['variance_summer_weekend', 'kurtosis_summer_weekend', 'median_daily_minimum_demand_summer_weekend', 'median_daily_minimum_demand_summer_workday', 'end_of_work_impact_summer_workday', 'daily_nonuniformity_coefficient_summer_weekend', 'pv_correlation_spring/autumn_weekend', 'skewness_summer_weekend', 'lunch_impact_spring/autumn_weekend', 'kurtosis_summer_workday', 'morning_slope_spring/autumn_weekend', 'skewness_winter_workday', 'summer_winter_ratio_weekend', 'skewness_summer_workday', 'variance_summer_workday', 'maximum_tou_winter_workday', 'minimum_tou_spring/autumn_workday', 'minimum_tou_spring/autumn_weekend', 'minimum_tou_winter_weekend', 'kurtosis_winter_workday']

data = pd.read_pickle('Data/Parameter_Values_AFPS_Gain_2020-09-25.pkl')

X = data[selected_features].copy()
y = data['CommunityGain_Class']

for col in X.columns:
    X[col] = pd.to_numeric(X[col])

# Train-Test-Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

start_time = time.time()

xgb_params = {
    'reg_alpha' : [0, 0.001, 0.1, 1, 10, 100],
    'reg_lambda' : [0, 0.001, 0.1, 1, 10, 100],
    'learning_rate' : [0, 0.01, 0.1, 0.3, 0.5],
    'max_depth ' : [3, 5, 7, 9],
    'n_estimators' : [10,11,12,13,14,15,20,50,100,200,500,1000,2000]
}

xgb_classifier = xgb.XGBClassifier(objective='multi:softmax',
                                   eval_metric='merror',
                                   random_state=42)

xgb_gridsearch = GridSearchCV(estimator=xgb_classifier,
                              param_grid=xgb_params,
                              scoring='accuracy',
                              cv=3,
                              n_jobs=-1,
                              verbose=6)

xgb_gridsearch.fit(X_train, y_train)

xgb_result = pd.DataFrame(xgb_gridsearch.cv_results_)
xgb_result.to_pickle('Data/GridSearchResults_AFPS_Gain_SelectedFeatures_XGB.pkl')

print('Calculation Runtime: {:.2} Minutes'.format((time.time() - start_time)/60))

print(classification_report(y_true=y_test,
                            y_pred=xgb_gridsearch.predict(X_test)))

Fitting 3 folds for each of 9360 candidates, totalling 28080 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 170 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 344 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed:   25.2s
[Parallel(n_jobs=-1)]: Done 954 tasks      | elapsed:   54.3s
[Parallel(n_jobs=-1)]: Done 1117 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 1304 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 1668 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-1)]: Done 2394 tasks      | elapsed:  9.8min
[Parallel(n_jobs=-1)]: Done 2657 tasks      | elapsed: 12.4min
[Parallel(n_jobs=-1)]: Done 3152 tasks      | elapsed: 16.9min
[Parallel(n_jobs=-1)]: Done 3885 tasks      | elapsed: 17.7min
[Parallel(n_jobs=-1)]: Done 4222 tasks      | elapsed: 24.3min
[Parallel(n_jobs=-1)]: Done 5129 tasks      | el

Calculation Runtime: 1.2e+02 Minutes
              precision    recall  f1-score   support

        High       0.35      0.35      0.35       100
         Low       0.36      0.34      0.35       100
      Medium       0.32      0.33      0.32       100

    accuracy                           0.34       300
   macro avg       0.34      0.34      0.34       300
weighted avg       0.34      0.34      0.34       300



In [38]:
result = pd.read_pickle('Data/GridSearchResults_AFPS_Gain_SelectedFeatures_XGB.pkl')
best = result[result['rank_test_score']==1]['params'].to_list()
for x in best:
    print(x)

{'learning_rate': 0.5, 'max_depth ': 3, 'n_estimators': 20, 'reg_alpha': 1, 'reg_lambda': 100}
{'learning_rate': 0.5, 'max_depth ': 5, 'n_estimators': 20, 'reg_alpha': 1, 'reg_lambda': 100}
{'learning_rate': 0.5, 'max_depth ': 7, 'n_estimators': 20, 'reg_alpha': 1, 'reg_lambda': 100}
{'learning_rate': 0.5, 'max_depth ': 9, 'n_estimators': 20, 'reg_alpha': 1, 'reg_lambda': 100}


#### Multi-Layer Perceptron (AFPS / Gain)

##### All Features (AFPS / Gain)

In [3]:
# Import Python modules
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from datetime import datetime
import time

data = pd.read_pickle('Data/Parameter_Values_AFPS_Gain_2020-09-25.pkl')

X = StandardScaler().fit_transform(data.iloc[:,:-1])
y = data['CommunityGain_Class']

# Train-Test-Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

start_time = time.time()

hls = []
for i in [10, 30, 50, 70, 100]:
    for j in [1, 2, 3, 4 , 5, 10, 30, 50, 70, 100]:
        hls.append((i,)*j)

mlp_params = {
    'hidden_layer_sizes' : hls,
    'solver' : ['adam', 'lbfgs'],
    'alpha' : [0.0001, 0.001, 0.01, 0.1, 1],
    'beta_1' : [0.8, 0.9],
    'tol' : [0.000001, 0.00001, 0.0001]
}

mlp_classifier = MLPClassifier(random_state=42,
                               max_iter=10e+8,
                               activation='relu')

mlp_gridsearch = GridSearchCV(estimator=mlp_classifier,
                              param_grid=mlp_params,
                              scoring='accuracy',
                              cv=3,
                              n_jobs=-1,
                              verbose=6)

start_time = time.time()

mlp_gridsearch.fit(X_train, y_train)

print('Calculation Runtime: {:.2} Minutes'.format((time.time() - start_time)/60))

mlp_result = pd.DataFrame(mlp_gridsearch.cv_results_)
mlp_result.to_pickle('Data/GridSearchResults_AFPS_Gain_AllFeatures_MLP.pkl')

print(classification_report(y_true=y_test,
                            y_pred=mlp_gridsearch.predict(X_test)))

Fitting 3 folds for each of 3000 candidates, totalling 9000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 149 tasks      | elapsed:   14.9s
[Parallel(n_jobs=-1)]: Done 306 tasks      | elapsed:   25.8s
[Parallel(n_jobs=-1)]: Done 450 tasks      | elapsed:   36.8s
[Parallel(n_jobs=-1)]: Done 601 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 764 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 952 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 1232 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 1510 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 1773 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 2084 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 2397 tasks      | elapsed:  9.9min
[Parallel(n_jobs=-1)]: Done 2734 tasks      | elapsed: 17.9min
[Parallel(n_jobs=-1)]: Done 3097 tasks      | elap

Calculation Runtime: 3e+02 Minutes
              precision    recall  f1-score   support

        High       0.28      0.28      0.28       100
         Low       0.22      0.20      0.21       100
      Medium       0.31      0.33      0.32       100

    accuracy                           0.27       300
   macro avg       0.27      0.27      0.27       300
weighted avg       0.27      0.27      0.27       300



In [39]:
result = pd.read_pickle('Data/GridSearchResults_AFPS_Gain_AllFeatures_MLP.pkl')
best = result[result['rank_test_score']==1]['params'].to_list()
for x in best:
    print(x)

{'alpha': 0.001, 'beta_1': 0.8, 'hidden_layer_sizes': (100, 100, 100, 100, 100, 100, 100, 100, 100, 100), 'solver': 'lbfgs', 'tol': 1e-05}
{'alpha': 0.001, 'beta_1': 0.9, 'hidden_layer_sizes': (100, 100, 100, 100, 100, 100, 100, 100, 100, 100), 'solver': 'lbfgs', 'tol': 1e-05}


In [None]:
# # Import Python modules
# import pandas as pd
# import numpy as np
# from sklearn.neural_network import MLPClassifier
# from sklearn.model_selection import train_test_split
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import classification_report
# from sklearn.preprocessing import StandardScaler
# from datetime import datetime
# import time

# data = pd.read_pickle('Data/Parameter_Values_Aggregated_Profit_2020-09-15.pkl')

# X = X_train = StandardScaler().fit_transform(data.iloc[:,:94])
# y = data['CommunityProfit_Class']

# # Train-Test-Split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# mlp_classifier = MLPClassifier(hidden_layer_sizes=(50,),
#                                activation='relu',
#                                solver='adam',
#                                random_state=42,
#                                max_iter=1000000,
#                                beta_1=0.8,
#                                verbose=0,
#                                tol=0.00001)

# mlp_classifier.fit(X_train, y_train)

# print(classification_report(y_true=y_test,
#                             y_pred=mlp_classifier.predict(X_test)))

##### Selected Features (AFPS / Gain)

In [None]:
# Import Python modules
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from datetime import datetime
import time

selected_features = ['variance_summer_weekend', 'kurtosis_summer_weekend', 'median_daily_minimum_demand_summer_weekend', 'median_daily_minimum_demand_summer_workday', 'end_of_work_impact_summer_workday', 'daily_nonuniformity_coefficient_summer_weekend', 'pv_correlation_spring/autumn_weekend', 'skewness_summer_weekend', 'lunch_impact_spring/autumn_weekend', 'kurtosis_summer_workday', 'morning_slope_spring/autumn_weekend', 'skewness_winter_workday', 'summer_winter_ratio_weekend', 'skewness_summer_workday', 'variance_summer_workday', 'maximum_tou_winter_workday', 'minimum_tou_spring/autumn_workday', 'minimum_tou_spring/autumn_weekend', 'minimum_tou_winter_weekend', 'kurtosis_winter_workday']

data = pd.read_pickle('Data/Parameter_Values_AFPS_Gain_2020-09-25.pkl')

X = StandardScaler().fit_transform(data[selected_features])
y = data['CommunityGain_Class']

# Train-Test-Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
n
hls = []
for i in [10, 30, 50, 70, 100]:
    for j in [1, 2, 3, 4 , 5, 10, 30, 50, 70, 100]:
        hls.append((i,)*j)

mlp_params = {
    'hidden_layer_sizes' : hls,
    'solver' : ['adam', 'lbfgs'],
    'alpha' : [0.0001, 0.001, 0.01, 0.1, 1],
    'beta_1' : [0.8, 0.9],
    'tol' : [0.000001, 0.00001, 0.0001]
}

mlp_classifier = MLPClassifier(random_state=42,
                               max_iter=10e+8,
                               activation='relu')

mlp_gridsearch = GridSearchCV(estimator=mlp_classifier,
                              param_grid=mlp_params,
                              scoring='accuracy',
                              cv=3,
                              n_jobs=-1,
                              verbose=6)

start_time = time.time()

mlp_gridsearch.fit(X_train, y_train)

print('Calculation Runtime: {:.2} Minutes'.format((time.time() - start_time)/60))

mlp_result = pd.DataFrame(mlp_gridsearch.cv_results_)
mlp_result.to_pickle('Data/GridSearchResults_AFPS_Gain_SelectedFeatures_MLP.pkl')

In [7]:
print(classification_report(y_true=y_test,
                            y_pred=mlp_gridsearch.predict(X_test)))

              precision    recall  f1-score   support

        High       0.29      0.14      0.19       100
         Low       0.29      0.31      0.30       100
      Medium       0.31      0.44      0.36       100

    accuracy                           0.30       300
   macro avg       0.29      0.30      0.28       300
weighted avg       0.29      0.30      0.28       300



In [40]:
result = pd.read_pickle('Data/GridSearchResults_AFPS_Gain_SelectedFeatures_MLP.pkl')
best = result[result['rank_test_score']==1]['params'].to_list()
for x in best:
    print(x)

{'alpha': 0.01, 'beta_1': 0.8, 'hidden_layer_sizes': (10, 10, 10, 10, 10), 'solver': 'lbfgs', 'tol': 1e-06}
{'alpha': 0.01, 'beta_1': 0.8, 'hidden_layer_sizes': (10, 10, 10, 10, 10), 'solver': 'lbfgs', 'tol': 1e-05}
{'alpha': 0.01, 'beta_1': 0.8, 'hidden_layer_sizes': (10, 10, 10, 10, 10), 'solver': 'lbfgs', 'tol': 0.0001}
{'alpha': 0.01, 'beta_1': 0.9, 'hidden_layer_sizes': (10, 10, 10, 10, 10), 'solver': 'lbfgs', 'tol': 1e-06}
{'alpha': 0.01, 'beta_1': 0.9, 'hidden_layer_sizes': (10, 10, 10, 10, 10), 'solver': 'lbfgs', 'tol': 1e-05}
{'alpha': 0.01, 'beta_1': 0.9, 'hidden_layer_sizes': (10, 10, 10, 10, 10), 'solver': 'lbfgs', 'tol': 0.0001}


------------------------------------
#### Calculation of the Number of Fitted Models

In [45]:
import pandas as pd

lr = pd.read_pickle('Data/GridSearchResults_PFAS_Profit_AllFeatures_LR.pkl')
xgb = pd.read_pickle('Data/GridSearchResults_PFAS_Profit_AllFeatures_XGB.pkl')
mlp = pd.read_pickle('Data/GridSearchResults_PFAS_Profit_AllFeatures_MLP.pkl')

n_classification_tasks = 2
n_parameter_subsets = 2
n_calculation_approches = 2
n_models_lr = len(lr.index)
n_models_xgb = len(xgb.index)
n_models_mlp = len(mlp.index)

print('Total Number of Evaluated Models: ', n_classification_tasks*
                                            n_parameter_subsets*
                                            n_calculation_approches*
                                            (n_models_lr + n_models_xgb + n_models_mlp))

Total Number of Evaluated Models:  137760
