# ENEXIS Graduation Project

####  Feature selection using lasso

In [159]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso

<a id='readpv'></a>
#### Reading of PV installed capacity & demographics dataset

In [160]:
c_path = "../Data/"
v_file = "PV installed capacity & demographics"

In [161]:
df = pd.read_csv(filepath_or_buffer = c_path + v_file + ".csv", encoding= 'UTF-8')

Setup the pipeline

In [162]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

pipeline = Pipeline([
    ('impute', imputer),
    ('scaler',StandardScaler()), 
    ('model',Lasso())])

Optimize the model on the alpha hyperparameter in the specified range

In [163]:
search = GridSearchCV(pipeline,
                      {'model__alpha':np.arange(0.01,3,0.005)},
                      cv = 5, 
                      scoring="neg_mean_squared_error",
                      verbose=3, error_score="raise")

Use only the 2019 data, since that is the most complete

In [164]:
year = 'Year'
df = df[df[year]=='2019-01-01']

And since only one year has no variance, it can be removed from the data set

In [165]:
df = df.drop(columns=year)

Further removal of string type columns

In [166]:
string_types = df.dtypes[df.dtypes == 'object']
string_types

BU_2021            object
BU_NAAM            object
WK_2021            object
WK_NAAM            object
GM_2021            object
GM_NAAM            object
ProvinciecodePV    object
Provincienaam      object
dtype: object

In [167]:
string_type_columns = [
    'BU_2021','BU_NAAM','WK_2021',
    'WK_NAAM','GM_2021','GM_NAAM',
    'ProvinciecodePV','Provincienaam']
df = df.drop(columns=string_type_columns)

In [168]:
opgesteld_vermogen = 'Opgesteld_vermogen_per100houshoudens'

df_y = df[opgesteld_vermogen]
df_features = df.drop(columns=opgesteld_vermogen)

# feature_columns = df.loc[:, df.columns != opgesteld_vermogen]

X_train, X_test, y_train, y_test = train_test_split(df_features, df_y, test_size=0.33, random_state=55)


Do a grid search for the best features

In [169]:
search.fit(X_train,y_train)

Fitting 5 folds for each of 598 candidates, totalling 2990 fits
[CV 1/5] END ..............model__alpha=0.01;, score=-138.855 total time=   0.0s
[CV 2/5] END ..............model__alpha=0.01;, score=-117.008 total time=   0.0s
[CV 3/5] END ..............model__alpha=0.01;, score=-256.487 total time=   0.0s
[CV 4/5] END ..............model__alpha=0.01;, score=-172.838 total time=   0.0s
[CV 5/5] END ..............model__alpha=0.01;, score=-309.051 total time=   0.0s
[CV 1/5] END .............model__alpha=0.015;, score=-138.857 total time=   0.0s
[CV 2/5] END .............model__alpha=0.015;, score=-116.881 total time=   0.0s
[CV 3/5] END .............model__alpha=0.015;, score=-256.482 total time=   0.0s
[CV 4/5] END .............model__alpha=0.015;, score=-172.592 total time=   0.0s
[CV 5/5] END .............model__alpha=0.015;, score=-308.995 total time=   0.0s
[CV 1/5] END model__alpha=0.019999999999999997;, score=-138.865 total time=   0.0s
[CV 2/5] END model__alpha=0.019999999999999

GridSearchCV(cv=5, error_score='raise',
             estimator=Pipeline(steps=[('impute', SimpleImputer()),
                                       ('scaler', StandardScaler()),
                                       ('model', Lasso())]),
             param_grid={'model__alpha': array([0.01 , 0.015, 0.02 , 0.025, 0.03 , 0.035, 0.04 , 0.045, 0.05 ,
       0.055, 0.06 , 0.065, 0.07 , 0.075, 0.08 , 0.085, 0.09 , 0.095,
       0.1  , 0.105, 0.11 , 0.115, 0.12 , 0.125, 0.13 , 0.135, 0.14 ,
       0.145, 0.15 , 0.155, 0.16 , 0.165, 0....
       2.71 , 2.715, 2.72 , 2.725, 2.73 , 2.735, 2.74 , 2.745, 2.75 ,
       2.755, 2.76 , 2.765, 2.77 , 2.775, 2.78 , 2.785, 2.79 , 2.795,
       2.8  , 2.805, 2.81 , 2.815, 2.82 , 2.825, 2.83 , 2.835, 2.84 ,
       2.845, 2.85 , 2.855, 2.86 , 2.865, 2.87 , 2.875, 2.88 , 2.885,
       2.89 , 2.895, 2.9  , 2.905, 2.91 , 2.915, 2.92 , 2.925, 2.93 ,
       2.935, 2.94 , 2.945, 2.95 , 2.955, 2.96 , 2.965, 2.97 , 2.975,
       2.98 , 2.985, 2.99 , 2.995])},
     

Check the outcome of the grid search and evaluate the coefficients at the point of the best result

In [170]:
# sorted(search.cv_results_.keys())

In [171]:
search.best_params_

{'model__alpha': 0.05499999999999999}

In [172]:
coefficients = search.best_estimator_.named_steps['model'].coef_

What coe

In [173]:
importance = np.abs(coefficients)
importance

array([1.96991121e+01, 1.44100330e+01, 1.47699519e+01, 6.67250975e-03,
       3.52736588e+00, 1.07697434e+00, 1.27716450e+00, 0.00000000e+00,
       0.00000000e+00, 1.83355523e+00, 3.49171963e+00, 0.00000000e+00,
       0.00000000e+00, 1.77917676e-01, 3.42214812e+00, 1.67213140e+00,
       2.00688927e-01, 6.27295595e-01, 3.93187317e-01, 2.42078980e+00,
       1.67193892e+00, 8.57156805e-02, 3.34496155e+01])

In [174]:
np.array(df_features.columns)[importance > 0]

array(['Aantal aansluitingen met opwekinstallatie', 'Opgesteld vermogen',
       'OV_per_installatie', 'MeestVoorkomendePostcode_113',
       'HuishoudensTotaal_28', 'GemiddeldeHuishoudensgrootte_32',
       'Bevolkingsdichtheid_33', 'PercentageEengezinswoning_36',
       'Koopwoningen_40', 'BouwjaarVanaf2000_46',
       'GemiddeldElektriciteitsverbruikTotaal_47',
       'GemiddeldAardgasverbruikTotaal_55', 'Gemiddeld_opleidingsniveau',
       'Percent_inkomensontvangers',
       'Bedrijfsvestigingen_per_huishuidens',
       'PersonenautoSPerHuishouden_102', 'AfstandTotSchool_108',
       'MateVanStedelijkheid_115', 'PVinstallaties_per100houshoudens'],
      dtype=object)