# ENEXIS Graduation Project

####  Feature selection using lasso

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso

<a id='readpv'></a>
#### Reading of PV installed capacity & demographics dataset

In [None]:
# Relative directory prefix and base file name (without the ".csv"
# extension) of the input data set; the two are concatenated into the
# full path when the CSV is read.
c_path = "../Data/"
v_file = "PV installed capacity & demographics"

In [None]:
# Load the data set from "<c_path><v_file>.csv", decoding the file as UTF-8.
df = pd.read_csv(f"{c_path}{v_file}.csv", encoding="UTF-8")

Set up the pipeline

In [None]:
# Replace NaN entries in each feature column by that column's mean.
imputer = SimpleImputer(strategy="mean", missing_values=np.nan)

# Three-stage estimator: impute -> standardize -> Lasso regression.
# The step names are the prefixes used to address step parameters
# (for example "model__alpha" for the Lasso's alpha).
pipeline = Pipeline(
    steps=[
        ("impute", imputer),
        ("scaler", StandardScaler()),
        ("model", Lasso()),
    ]
)

Optimize the model on the alpha hyperparameter in the specified range

In [None]:
# Exhaustive 5-fold cross-validated search over the Lasso regularisation
# strength, scored by negative mean squared error. "model__alpha" addresses
# the `alpha` parameter of the pipeline step named "model".
# The alpha grid is unchanged, so the selected alpha is the same as before;
# only the execution and logging settings differ.
search = GridSearchCV(
    pipeline,
    param_grid={"model__alpha": np.arange(0.01, 10, 0.001)},
    cv=5,
    scoring="neg_mean_squared_error",
    # Roughly 10,000 alphas x 5 folds is about 50,000 fits: spread them
    # over all available cores instead of running them one by one.
    n_jobs=-1,
    # verbose=3 prints a line for every single fit, which floods the
    # notebook output at this grid size; 1 keeps a progress summary only.
    verbose=1,
    # Fail loudly if any fit errors instead of silently scoring it as NaN.
    error_score="raise",
)

Use only the 2019 data, since that is the most complete

In [None]:
# Name of the column holding the observation year.
year = 'Year'

# Keep only the rows whose year value equals the string '2019-01-01'.
df = df.loc[df[year].eq('2019-01-01')]

Since only a single year remains, the year column no longer has any variance and can be removed from the data set

In [None]:
# The year column is constant after the filter above, so drop it.
df = df.drop(year, axis="columns")

Further removal of string type columns

In [None]:
# Per-column dtypes, restricted to the columns stored with dtype 'object'.
string_types = df.dtypes.loc[df.dtypes.eq('object')]
string_types

In [None]:
# Columns to discard before modelling (the mean imputer and the Lasso
# only work on numeric data).
string_type_columns = [
    'BU_2021',
    'BU_NAAM',
    'WK_2021',
    'WK_NAAM',
    'GM_2021',
    'GM_NAAM',
    'ProvinciecodePV',
    'Provincienaam',
]
df = df.drop(string_type_columns, axis="columns")

Remove columns that have derived columns

In [None]:
# Columns treated as redundant and excluded from the feature set.
redundant_columns = [
    'Aantal aansluitingen met opwekinstallatie',
    'OV_per_installatie',
    'PVinstallaties_per100houshoudens',
]
df = df.drop(redundant_columns, axis="columns")

In [None]:
# Name of the target column.
opgesteld_vermogen = 'Opgesteld_vermogen_per100houshoudens'

# Target vector.
df_y = df[opgesteld_vermogen]

# Feature matrix: everything except the target and the
# 'Opgesteld vermogen' column.
# NOTE(review): presumably 'Opgesteld vermogen' is excluded because the
# target is derived from it - confirm.
df_features = df.drop(columns=[opgesteld_vermogen, 'Opgesteld vermogen'])

# Hold out 33% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_features,
    df_y,
    test_size=0.33,
    random_state=123,
)

Run the grid search on the training set to find the best alpha, and with it the selected features

In [None]:
# Cross-validate every alpha in the grid on the training data only.
search.fit(X=X_train, y=y_train)

Check the outcome of the grid search and evaluate the coefficients at the point of the best result

In [None]:
# Parameter setting (the alpha value) that achieved the best
# cross-validated score in the grid search.
search.best_params_

In [None]:
# Coefficients of the Lasso step of the best pipeline found by the search,
# one per feature column.
coefficients = search.best_estimator_.named_steps.model.coef_

Which features are NOT relevant (their coefficient was shrunk to exactly zero):

In [None]:
# Names of the features whose Lasso coefficient is exactly zero.
df_features.columns.to_numpy()[coefficients == 0]

Here we see that the average value of the property is NOT a good feature. 

Which features are relevant (non-zero coefficient) and should be taken into account:

In [None]:
# Names of the features that kept a non-zero Lasso coefficient.
df_features.columns.to_numpy()[coefficients != 0]