# ENEXIS Graduation Project

####  Feature selection using lasso

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso

<a id='readpv'></a>
#### Reading of PV installed capacity & demographics dataset

In [2]:
c_path = "../Data/"
v_file = "PV installed capacity & demographics"

In [3]:
df = pd.read_csv(filepath_or_buffer = c_path + v_file + ".csv", encoding= 'UTF-8')

Setup the pipeline

In [4]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

pipeline = Pipeline([
    ('impute', imputer),
    ('scaler',StandardScaler()), 
    ('model',Lasso())])

Optimize the model on the alpha hyperparameter in the specified range

In [5]:
search = GridSearchCV(pipeline,
                      {'model__alpha':np.arange(0.5,10,0.01)},
                      cv = 5, 
                      scoring="neg_mean_squared_error",
                      verbose=3, error_score="raise")

Use only the 2019 data, since that is the most complete

In [6]:
year = 'Year'
df = df[df[year]=='2019-01-01']

And since only one year has no variance, it can be removed from the data set

In [7]:
df = df.drop(columns=year)

Further removal of string type columns

In [8]:
string_types = df.dtypes[df.dtypes == 'object']
string_types

BU_2021            object
BU_NAAM            object
WK_2021            object
WK_NAAM            object
GM_2021            object
GM_NAAM            object
ProvinciecodePV    object
Provincienaam      object
dtype: object

In [9]:
string_type_columns = [
    'BU_2021','BU_NAAM','WK_2021',
    'WK_NAAM','GM_2021','GM_NAAM',
    'ProvinciecodePV','Provincienaam']
df = df.drop(columns=string_type_columns)

Remove columns that have derived columns

In [10]:
redundant_columns = [
    'Aantal aansluitingen met opwekinstallatie', 
    'OV_per_installatie',
    'PVinstallaties_per100houshoudens']
df = df.drop(columns=redundant_columns)

In [11]:
opgesteld_vermogen = 'Opgesteld_vermogen_per100houshoudens'

df_y = df[opgesteld_vermogen]
df_features = df.drop(columns=opgesteld_vermogen)
df_features = df_features.drop(columns='Opgesteld vermogen')

X_train, X_test, y_train, y_test = train_test_split(df_features, df_y, test_size=0.33, random_state=123)

Do a grid search for the best features

In [12]:
search.fit(X_train,y_train)

Fitting 5 folds for each of 9990 candidates, totalling 49950 fits
[CV 1/5] END .............model__alpha=0.01;, score=-1265.134 total time=   0.0s
[CV 2/5] END .............model__alpha=0.01;, score=-1394.542 total time=   0.0s
[CV 3/5] END .............model__alpha=0.01;, score=-1801.389 total time=   0.0s
[CV 4/5] END .............model__alpha=0.01;, score=-1112.522 total time=   0.0s
[CV 5/5] END .............model__alpha=0.01;, score=-1097.916 total time=   0.0s


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 1/5] END ............model__alpha=0.011;, score=-1265.088 total time=   0.0s
[CV 2/5] END ............model__alpha=0.011;, score=-1394.988 total time=   0.0s
[CV 3/5] END ............model__alpha=0.011;, score=-1801.277 total time=   0.0s
[CV 4/5] END ............model__alpha=0.011;, score=-1112.558 total time=   0.0s


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 5/5] END ............model__alpha=0.011;, score=-1097.854 total time=   0.0s
[CV 1/5] END model__alpha=0.011999999999999999;, score=-1265.042 total time=   0.0s
[CV 2/5] END model__alpha=0.011999999999999999;, score=-1395.472 total time=   0.0s
[CV 3/5] END model__alpha=0.011999999999999999;, score=-1801.165 total time=   0.0s
[CV 4/5] END model__alpha=0.011999999999999999;, score=-1112.596 total time=   0.0s
[CV 5/5] END model__alpha=0.011999999999999999;, score=-1097.792 total time=   0.0s


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 1/5] END model__alpha=0.012999999999999998;, score=-1264.997 total time=   0.0s
[CV 2/5] END model__alpha=0.012999999999999998;, score=-1395.993 total time=   0.0s
[CV 3/5] END model__alpha=0.012999999999999998;, score=-1801.054 total time=   0.0s
[CV 4/5] END model__alpha=0.012999999999999998;, score=-1112.634 total time=   0.0s
[CV 5/5] END model__alpha=0.012999999999999998;, score=-1097.732 total time=   0.0s


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 1/5] END model__alpha=0.013999999999999997;, score=-1264.953 total time=   0.0s
[CV 2/5] END model__alpha=0.013999999999999997;, score=-1396.401 total time=   0.0s
[CV 3/5] END model__alpha=0.013999999999999997;, score=-1800.942 total time=   0.0s
[CV 4/5] END model__alpha=0.013999999999999997;, score=-1112.673 total time=   0.0s
[CV 5/5] END model__alpha=0.013999999999999997;, score=-1097.672 total time=   0.0s


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 1/5] END model__alpha=0.014999999999999996;, score=-1264.910 total time=   0.0s
[CV 2/5] END model__alpha=0.014999999999999996;, score=-1396.368 total time=   0.0s
[CV 3/5] END model__alpha=0.014999999999999996;, score=-1800.832 total time=   0.0s
[CV 4/5] END model__alpha=0.014999999999999996;, score=-1112.713 total time=   0.0s
[CV 5/5] END model__alpha=0.014999999999999996;, score=-1097.612 total time=   0.0s
[CV 1/5] END model__alpha=0.015999999999999993;, score=-1264.867 total time=   0.0s
[CV 2/5] END model__alpha=0.015999999999999993;, score=-1396.335 total time=   0.0s
[CV 3/5] END model__alpha=0.015999999999999993;, score=-1800.721 total time=   0.0s
[CV 4/5] END model__alpha=0.015999999999999993;, score=-1112.754 total time=   0.0s


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 5/5] END model__alpha=0.015999999999999993;, score=-1097.554 total time=   0.0s
[CV 1/5] END model__alpha=0.016999999999999994;, score=-1264.826 total time=   0.0s
[CV 2/5] END model__alpha=0.016999999999999994;, score=-1396.302 total time=   0.0s
[CV 3/5] END model__alpha=0.016999999999999994;, score=-1800.611 total time=   0.0s
[CV 4/5] END model__alpha=0.016999999999999994;, score=-1112.797 total time=   0.0s
[CV 5/5] END model__alpha=0.016999999999999994;, score=-1097.495 total time=   0.0s
[CV 1/5] END model__alpha=0.017999999999999995;, score=-1264.785 total time=   0.0s
[CV 2/5] END model__alpha=0.017999999999999995;, score=-1396.269 total time=   0.0s
[CV 3/5] END model__alpha=0.017999999999999995;, score=-1800.501 total time=   0.0s
[CV 4/5] END model__alpha=0.017999999999999995;, score=-1112.840 total time=   0.0s
[CV 5/5] END model__alpha=0.017999999999999995;, score=-1097.438 total time=   0.0s
[CV 1/5] END model__alpha=0.018999999999999993;, score=-1264.745 total time=

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 5/5] END model__alpha=0.018999999999999993;, score=-1097.381 total time=   0.0s
[CV 1/5] END model__alpha=0.01999999999999999;, score=-1264.708 total time=   0.0s
[CV 2/5] END model__alpha=0.01999999999999999;, score=-1396.203 total time=   0.0s
[CV 3/5] END model__alpha=0.01999999999999999;, score=-1800.283 total time=   0.0s
[CV 4/5] END model__alpha=0.01999999999999999;, score=-1112.988 total time=   0.0s
[CV 5/5] END model__alpha=0.01999999999999999;, score=-1097.325 total time=   0.0s
[CV 1/5] END model__alpha=0.02099999999999999;, score=-1264.671 total time=   0.0s
[CV 2/5] END model__alpha=0.02099999999999999;, score=-1396.170 total time=   0.0s
[CV 3/5] END model__alpha=0.02099999999999999;, score=-1800.174 total time=   0.0s


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 4/5] END model__alpha=0.02099999999999999;, score=-1113.079 total time=   0.0s
[CV 5/5] END model__alpha=0.02099999999999999;, score=-1097.269 total time=   0.0s
[CV 1/5] END model__alpha=0.021999999999999992;, score=-1264.635 total time=   0.0s
[CV 2/5] END model__alpha=0.021999999999999992;, score=-1396.138 total time=   0.0s
[CV 3/5] END model__alpha=0.021999999999999992;, score=-1800.066 total time=   0.0s
[CV 4/5] END model__alpha=0.021999999999999992;, score=-1113.169 total time=   0.0s
[CV 5/5] END model__alpha=0.021999999999999992;, score=-1097.215 total time=   0.0s
[CV 1/5] END model__alpha=0.02299999999999999;, score=-1264.599 total time=   0.0s
[CV 2/5] END model__alpha=0.02299999999999999;, score=-1396.105 total time=   0.0s
[CV 3/5] END model__alpha=0.02299999999999999;, score=-1799.958 total time=   0.0s
[CV 4/5] END model__alpha=0.02299999999999999;, score=-1113.261 total time=   0.0s
[CV 5/5] END model__alpha=0.02299999999999999;, score=-1097.160 total time=   0.0s

Check the outcome of the grid search and evaluate the coefficients at the point of the best result

In [None]:
search.best_params_

In [None]:
coefficients = search.best_estimator_.named_steps['model'].coef_

What coefficients are NOT relevant:

In [None]:
np.array(df_features.columns)[coefficients == 0]

What coefficients are relevant and should be taken into account:

In [None]:
np.array(df_features.columns)[coefficients != 0]