In [3]:
import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd 

In [104]:
# Setze die globale Einstellung für die Anzeige von numerischen Werten
pd.set_option('display.float_format', '{:.4f}'.format)

# Pfad zur Excel-Datei
path_to_csv = "../data/trainingset.csv"

# Lade die Excel-Datei
df = pd.read_csv(path_to_csv)

# Pfad zur Excel-Datei
path_to_csv = "../data/trainingset_with_log_target.csv"

# Lade die Excel-Datei
df2 = pd.read_csv(path_to_csv)

# Pfad zur Excel-Datei
path_to_csv = "../data/trainingsettransformed.csv"

# Lade die Excel-Datei
df3 = pd.read_csv(path_to_csv)

In [62]:
df['cTOTEXn'] = np.sqrt(df['cTOTEXn'])

In [90]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def lasso_regression(train_df, target_column, test_size=0.1, random_state=42):
    # Split data into features and target
    X = train_df.drop(columns=[target_column])
    y = train_df[target_column]
    
    # Split into train and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    
    # Standardize the features using only the training data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    print("Performing Lasso regression...")
    
    # Lasso regression with cross-validation for hyperparameter tuning of regularization parameter alpha
    lasso = LassoCV(cv=5, random_state=random_state, max_iter=10000).fit(X_train_scaled, y_train)
    
    selected_features_lasso = np.where(lasso.coef_ != 0)[0]
    selected_feature_names_lasso = X.columns[selected_features_lasso]

    print(f"Selected features by Lasso ({len(selected_feature_names_lasso)}): {selected_feature_names_lasso}")

    print("Predicting on the training and test data (Lasso)...")

    y_train_pred_lasso = lasso.predict(X_train_scaled)
    y_test_pred_lasso = lasso.predict(X_test_scaled)

    print("Evaluating the model (Lasso)...")
    
    # Evaluating on train data
    train_rmse_lasso = np.sqrt(mean_squared_error(y_train, y_train_pred_lasso))
    train_mae_lasso = mean_absolute_error(y_train, y_train_pred_lasso)
    train_mape_lasso = mean_absolute_percentage_error(y_train, y_train_pred_lasso)

    # Evaluating on test data
    test_rmse_lasso = np.sqrt(mean_squared_error(y_test, y_test_pred_lasso))
    test_mae_lasso = mean_absolute_error(y_test, y_test_pred_lasso)
    test_mape_lasso = mean_absolute_percentage_error(y_test, y_test_pred_lasso)

    # Collect metrics
    results_lasso_dict = {
        "Model": ["Lasso"] * 6,
        "Metric": ["Training RMSE", "Training MAE", "Training MAPE",
                   "Testing RMSE", "Testing MAE", "Testing MAPE"],
        "Value": [f"{train_rmse_lasso:.2f}", f"{train_mae_lasso:.2f}", f"{train_mape_lasso:.2f}",
                  f"{test_rmse_lasso:.2f}", f"{test_mae_lasso:.2f}", f"{test_mape_lasso:.2f}"]
    }

    results_lasso_df = pd.DataFrame(results_lasso_dict)
    print(results_lasso_df)
    
    # Collect variable importance
    variable_importance_dict = {
        "Feature": selected_feature_names_lasso,
        "Coefficient": lasso.coef_[selected_features_lasso]
    }
    variable_importance_df = pd.DataFrame(variable_importance_dict)
    variable_importance_df = variable_importance_df.sort_values(by="Coefficient", ascending=False).reset_index(drop=True)

    print("\nVariable Importance (Lasso):")
    print(variable_importance_df)
    
    return lasso


In [120]:
df_filtered = df.filter(regex='(tot|sum)$|^cTOTEXn$')
lasso_regression(df_filtered, 'cTOTEXn', random_state = 43)

Performing Lasso regression...
Selected features by Lasso (15): Index(['yConnections.other.dso.same.tot', 'yMeters.cp.nonctrl.tot',
       'yMeters.read.tot', 'yMeters.read.noncp.sum',
       'yMeters.over10MWh.noRPM.tot', 'yMeters.over10MWh.RPM.tot',
       'yMeters.noncp.ctrl.tot', 'yLines.all.N13.sum', 'yLines.circuit.tot',
       'yTransformers.power.own.tot', 'yInstalledPower.renewables.hydro.tot',
       'yEnergy.losses.tot', 'yEnergy.delivered.N1357.sum',
       'yEnergy.delivered.net.N23.sum', 'zPowered.nets.level.tot'],
      dtype='object')
Predicting on the training and test data (Lasso)...
Evaluating the model (Lasso)...
   Model         Metric        Value
0  Lasso  Training RMSE   5206406.68
1  Lasso   Training MAE   3470564.77
2  Lasso  Training MAPE         0.16
3  Lasso   Testing RMSE  16715711.79
4  Lasso    Testing MAE  10248885.54
5  Lasso   Testing MAPE         0.22

Variable Importance (Lasso):
                                 Feature   Coefficient
0              

In [100]:
lasso_regression(df, 'cTOTEXn', random_state = 32)

Performing Lasso regression...
Selected features by Lasso (73): Index(['yConnections.N5', 'yConnections.streetlights.N7',
       'yConnections.cus.N5', 'yConnections.other.dso.same.tot',
       'yInjectionPoints.KWKG.N7', 'yInjectionPoints.other.N7',
       'yConnections.incl.inj.N5', 'yMeters.read.ext.N3',
       'yMeters.read.ext.N5', 'yMeters.read.ext.N6', 'yMeters.cp.nonctrl.N5',
       'yMeters.over10MWh.RPM.N3', 'yMeters.over10MWh.RPM.N5',
       'yMeters.cp.tot', 'yCables.circuit.N3', 'yCables.streetlights.N7',
       'yCables.all.N13.sum', 'yLines.circuit.tot', 'yNet.length.N3',
       'yTransformers.power.ront.N4', 'ySwitchingstations.N3',
       'yInstalledPower.renewables.hydro.N5', 'yInstalledPower.KWKG.N7',
       'yInjection.generation.N3', 'yInjection.renewables.hydro.N5',
       'yInjection.KWKG.N6', 'yInjection.other.N3', 'yInjection.N5',
       'yInjection.other.dso.N3', 'yInjection.other.dso.N4',
       'yInjection.other.dso.N5', 'yInjection.upper.nets.N6',
       'y

In [94]:
lasso = lasso_regression(df2, 'cTOTEXn_log', random_state = 44)

Performing Lasso regression...
Selected features by Lasso (23): Index(['yInjectionPoints.renewables.hydro.N5_log',
       'yInjectionPoints.other.N7_log', 'yMeters.flatrate.N7_log',
       'yMeters.over10MWh.RPM.N5_log', 'yNet.length.excl.house.tot_log',
       'ySubstations.N4_log', 'ySubstations.own.N4_log',
       'ySwitchingstations.N3_log', 'yInstalledPower.KWKG.N7_log',
       'yInstalledPower.other.N7_log', 'yInstalledPower.other.tot_log',
       'yInstalledPower.dec.sum_log', 'yInstalledPower.KWKG.other.tot_log',
       'yEnergy.losses.tot_log', 'yEnergy.delivered.N1357.sum_log',
       'yPeakload.2.N6_log', 'yPeakload.injection.N6_log', 'yPeakload.max_log',
       'yArea.other.N7_log', 'zResources.excl.transformer.N4_log',
       'zSoil.BK0267_log', 'zSoil.GB0378_log', 'rMeters.active.share.N7_poly'],
      dtype='object')
Predicting on the training and test data (Lasso)...
Evaluating the model (Lasso)...
   Model         Metric Value
0  Lasso  Training RMSE  0.14
1  Lasso   T

In [31]:
print("Optimal alpha: ", lasso.alpha_)

# Check coefficients
print("Coefficients: ", lasso.coef_)

Optimal alpha:  39028.82941938902
Coefficients:  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000

In [37]:
df2.describe()

Unnamed: 0,yConnections.N3_log,yConnections.N4_log,yConnections.N5_log,yConnections.N6_log,yConnections.N7_log,yConnections.tot_log,yConnections.streetlights.dir.N7_log,yConnections.streetlights.N7_log,yConnections.streetlights.N6_log,yConnections.streetlights.sum_log,...,rUsageHours.injection.N5,rUsageHours.injection.N6,rUsageHours.injection.N7,rUsageHours.delivered.N3_log,rUsageHours.delivered.N4_log,rUsageHours.delivered.N5_poly,rUsageHours.delivered.N6,rUsageHours.delivered.N7_poly,rUsageHours.delivered.tot_log,cTOTEXn_log
count,174.0,174.0,174.0,174.0,174.0,174.0,174.0,174.0,174.0,174.0,...,174.0,174.0,174.0,174.0,174.0,174.0,174.0,174.0,174.0,174.0
mean,1.4058,3.6467,6.8036,8.5325,10.3275,10.5981,2.531,4.7394,2.1056,5.3197,...,5291.5595,4741.0001,4940.3942,4.2693,5.7175,62022906.1227,4858.7045,62022906.1227,9.6948,62022907.1227
std,1.7576,2.4604,1.4707,1.4508,1.8031,1.3863,3.6592,2.5242,2.6932,2.2947,...,1046.9922,765.9807,798.073,4.2873,4.0677,37634324.5109,745.3133,31427401.9267,0.2733,154397591.8399
min,0.0,0.0,0.0,0.0,0.0,3.0445,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-15342854.9377,0.0,-3492497.6788,8.8101,1432415.0264
25%,0.0,1.1705,6.0828,7.8406,9.7686,9.9268,0.0,4.3372,0.0,5.0321,...,5062.2213,4490.4592,4859.1365,0.0,0.0,45605366.8201,4613.9814,46815961.9807,9.4941,9386663.3752
50%,0.0,4.1584,6.5978,8.3175,10.1227,10.332,0.0,5.3469,0.0,5.6936,...,5383.1066,4837.1555,5105.8928,3.7623,8.453,54684818.6875,4922.048,61694823.6483,9.7281,15578564.3705
75%,2.6908,5.4424,7.4305,9.1935,11.053,11.223,5.4323,6.3306,4.734,6.5265,...,5716.606,5104.8187,5266.4616,8.5558,8.6038,65450190.2215,5190.1096,75657996.5316,9.895,40194247.8085
max,7.0094,8.7263,10.9474,12.5876,14.4851,14.6228,12.6916,10.2509,9.1944,10.2794,...,8222.0332,7647.0297,7111.7842,9.6423,10.0623,359532845.3226,7832.0494,261704977.0451,10.7328,1280552773.1402


In [58]:
lasso_regression(df, 'cTOTEXn', random_state = 45)

Performing Lasso regression...
Selected features by Lasso (4): Index(['rPeakload.injection.per.Peakload.N6_poly',
       'rMeters.active.share.all_poly', 'rUsageHours.delivered.N5_poly',
       'rUsageHours.delivered.N7_poly'],
      dtype='object')
Predicting on the training and test data (Lasso)...
Evaluating the model (Lasso)...
   Model         Metric         Value
0  Lasso  Training RMSE  143361603.02
1  Lasso   Training MAE   64500609.36
2  Lasso  Training MAPE          3.31
3  Lasso   Testing RMSE   66808362.19
4  Lasso    Testing MAE   48208302.56
5  Lasso   Testing MAPE          3.14

Variable Importance (Lasso):
                                    Feature  Coefficient
0  rPeakload.injection.per.Peakload.N6_poly       0.7551
1             rMeters.active.share.all_poly       0.7357
2             rUsageHours.delivered.N5_poly       0.7301
3             rUsageHours.delivered.N7_poly       0.3364


In [48]:
lasso_regression(df, 'cTOTEXn', random_state = 45)

Performing Lasso regression...
Selected features by Lasso (5): Index(['yInjection.generation.tot', 'yInjection.dec.sum',
       'yEnergy.delivered.N1357.sum', 'yEnergy.delivered.net.N2to4.sum',
       'yEnergy.delivered.net.N5to7.sum'],
      dtype='object')
Predicting on the training and test data (Lasso)...
Evaluating the model (Lasso)...
   Model         Metric        Value
0  Lasso  Training RMSE  13979682.62
1  Lasso   Training MAE   6867632.27
2  Lasso  Training MAPE         0.21
3  Lasso   Testing RMSE  53658764.30
4  Lasso    Testing MAE  17172598.66
5  Lasso   Testing MAPE         0.21

Variable Importance (Lasso):
                           Feature  Coefficient
0      yEnergy.delivered.N1357.sum       0.0099
1  yEnergy.delivered.net.N5to7.sum       0.0078
2        yInjection.generation.tot       0.0028
3               yInjection.dec.sum       0.0023
4  yEnergy.delivered.net.N2to4.sum      -0.0027


In [None]:
66808362.19
53658764.30

62022907.12