# ENEXIS Graduation Project

####  Feature selection using lasso

In [48]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso

<a id='readpv'></a>
#### Reading of PV installed capacity & demographics dataset

In [49]:
c_path = "../Data/"
v_file = "PV installed capacity & demographics"

In [50]:
df = pd.read_csv(filepath_or_buffer = c_path + v_file + ".csv", encoding= 'UTF-8')

Setup the pipeline

In [51]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

pipeline_lasso = Pipeline([
    ('impute', imputer),
    ('scaler',StandardScaler()), 
    ('model',Lasso())])

Optimize the model on the alpha hyperparameter in the specified range

In [52]:
search = GridSearchCV(pipeline_lasso,
                      {'model__alpha':np.arange(0.5,10,0.01)},
                      cv = 5, 
                      scoring="neg_mean_squared_error",
                      verbose=3, error_score="raise")

Use only the 2019 data, since that is the most complete

In [53]:
year = 'Year'
df = df[df[year]=='2019-01-01']

And since only one year has no variance, it can be removed from the data set

In [54]:
df = df.drop(columns=year)

Further removal of string type columns

In [55]:
string_types = df.dtypes[df.dtypes == 'object']
string_types

BU_2021            object
BU_NAAM            object
WK_2021            object
WK_NAAM            object
GM_2021            object
GM_NAAM            object
ProvinciecodePV    object
Provincienaam      object
dtype: object

In [56]:
string_type_columns = [
    'BU_2021','BU_NAAM','WK_2021',
    'WK_NAAM','GM_2021','GM_NAAM',
    'ProvinciecodePV','Provincienaam']
df = df.drop(columns=string_type_columns)

Remove columns that have derived columns

In [57]:
redundant_columns = [
    'Aantal aansluitingen met opwekinstallatie', 
    'OV_per_installatie',
    'PVinstallaties_per100houshoudens']
df = df.drop(columns=redundant_columns)

In [58]:
opgesteld_vermogen = 'Opgesteld_vermogen_per100houshoudens'

df_y = df[opgesteld_vermogen]
df_features = df.drop(columns=opgesteld_vermogen)
df_features = df_features.drop(columns='Opgesteld vermogen')

X_train, X_test, y_train, y_test = train_test_split(df_features, df_y, test_size=0.33, random_state=123)

Do a grid search for the best features

In [59]:
search.fit(X_train,y_train)

Fitting 5 folds for each of 950 candidates, totalling 4750 fits
[CV 1/5] END ..............model__alpha=0.5;, score=-1251.922 total time=   0.0s
[CV 2/5] END ..............model__alpha=0.5;, score=-1404.708 total time=   0.0s
[CV 3/5] END ..............model__alpha=0.5;, score=-1770.119 total time=   0.0s
[CV 4/5] END ..............model__alpha=0.5;, score=-1118.602 total time=   0.0s
[CV 5/5] END ..............model__alpha=0.5;, score=-1087.312 total time=   0.0s
[CV 1/5] END .............model__alpha=0.51;, score=-1251.795 total time=   0.0s
[CV 2/5] END .............model__alpha=0.51;, score=-1404.980 total time=   0.0s
[CV 3/5] END .............model__alpha=0.51;, score=-1769.790 total time=   0.0s
[CV 4/5] END .............model__alpha=0.51;, score=-1118.682 total time=   0.0s
[CV 5/5] END .............model__alpha=0.51;, score=-1087.219 total time=   0.0s
[CV 1/5] END .............model__alpha=0.52;, score=-1251.668 total time=   0.0s
[CV 2/5] END .............model__alpha=0.52;,

GridSearchCV(cv=5, error_score='raise',
             estimator=Pipeline(steps=[('impute', SimpleImputer()),
                                       ('scaler', StandardScaler()),
                                       ('model', Lasso())]),
             param_grid={'model__alpha': array([0.5 , 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6 ,
       0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.7 , 0.71,
       0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8 , 0.81, 0.82,
       0.83, 0.84, 0.85, 0...
       9.3 , 9.31, 9.32, 9.33, 9.34, 9.35, 9.36, 9.37, 9.38, 9.39, 9.4 ,
       9.41, 9.42, 9.43, 9.44, 9.45, 9.46, 9.47, 9.48, 9.49, 9.5 , 9.51,
       9.52, 9.53, 9.54, 9.55, 9.56, 9.57, 9.58, 9.59, 9.6 , 9.61, 9.62,
       9.63, 9.64, 9.65, 9.66, 9.67, 9.68, 9.69, 9.7 , 9.71, 9.72, 9.73,
       9.74, 9.75, 9.76, 9.77, 9.78, 9.79, 9.8 , 9.81, 9.82, 9.83, 9.84,
       9.85, 9.86, 9.87, 9.88, 9.89, 9.9 , 9.91, 9.92, 9.93, 9.94, 9.95,
       9.96, 9.97, 9.98, 9.99])},


Check the outcome of the grid search and evaluate the coefficients at the point of the best result

In [60]:
search.best_params_

{'model__alpha': 0.7000000000000002}

In [61]:
coefficients = search.best_estimator_.named_steps['model'].coef_

What coefficients are NOT relevant:

In [62]:
np.array(df_features.columns)[coefficients == 0]

array(['Woningvoorraad_34', 'InBezitWoningcorporatie_42',
       'GemiddeldElektriciteitsverbruikTotaal_47',
       'Percent_inkomensontvangers'], dtype=object)

What coefficients are relevant and should be taken into account:

In [63]:
np.array(df_features.columns)[coefficients != 0]

array(['MeestVoorkomendePostcode_113', 'HuishoudensTotaal_28',
       'GemiddeldeHuishoudensgrootte_32', 'Bevolkingsdichtheid_33',
       'GemiddeldeWoningwaarde_35', 'PercentageEengezinswoning_36',
       'Koopwoningen_40', 'InBezitOverigeVerhuurders_43',
       'BouwjaarVanaf2000_46', 'GemiddeldAardgasverbruikTotaal_55',
       'Gemiddeld_opleidingsniveau',
       'Bedrijfsvestigingen_per_huishuidens',
       'PersonenautoSPerHuishouden_102', 'AfstandTotSchool_108',
       'MateVanStedelijkheid_115'], dtype=object)

Use the lasso model with the optimized values

In [64]:
grid_search_selected_features = ['MeestVoorkomendePostcode_113', 'HuishoudensTotaal_28',
       'GemiddeldeHuishoudensgrootte_32', 'Bevolkingsdichtheid_33',
       'GemiddeldeWoningwaarde_35', 'PercentageEengezinswoning_36',
       'Koopwoningen_40', 'InBezitOverigeVerhuurders_43',
       'BouwjaarVanaf2000_46', 'GemiddeldAardgasverbruikTotaal_55',
       'Gemiddeld_opleidingsniveau',
       'Bedrijfsvestigingen_per_huishuidens',
       'PersonenautoSPerHuishouden_102', 'AfstandTotSchool_108',
       'MateVanStedelijkheid_115']


## Model using: Linear regression

In [65]:
from sklearn.linear_model import LinearRegression

In [66]:
pipeline_linear_regression = Pipeline([
    ('impute', imputer),
    ('scaler',StandardScaler()), 
    ('model',LinearRegression())])

pipeline_linear_regression.fit(X_train, y_train)
pipeline_linear_regression.score(X_test, y_test)

0.42734011525501703

## Model using: Support Vector machine

In [67]:
from sklearn.svm import SVR

In [68]:
svm_regression = SVR(gamma='auto')

In [69]:
pipeline_svm = Pipeline([
    ('impute', imputer),
    ('scaler',StandardScaler()), 
    ('model',svm_regression)])

pipeline_svm.fit(X_train, y_train, model__sample_weight=None)
pipeline_svm.score(X_test, y_test)

0.37010206079399455

## Model using: Lasso (the one used for feature selection)

In [70]:
pipeline_lasso.fit(X_train, y_train)
pipeline_lasso.score(X_test, y_test)

0.42585802602490486

## Model using: Decision tree

In [71]:
from sklearn.tree import DecisionTreeRegressor

In [72]:
max_range = 10
for i in range(1,max_range):

    decision_tree_regressor = DecisionTreeRegressor(max_depth=i)

    pipeline_decision_tree = Pipeline([
        ('impute', imputer),
        ('scaler',StandardScaler()), 
        ('model', decision_tree_regressor)])

    pipeline_decision_tree.fit(X_train, y_train, model__sample_weight=None)
    score = pipeline_decision_tree.score(X_test, y_test)

    print("Depth  %2.f" % i, ":Score: %.2f" % score)


Depth   1 :Score: 0.25
Depth   2 :Score: 0.37
Depth   3 :Score: 0.38
Depth   4 :Score: 0.37
Depth   5 :Score: 0.35
Depth   6 :Score: 0.29
Depth   7 :Score: 0.29
Depth   8 :Score: 0.15
Depth   9 :Score: 0.16


## Model using: K nearest neighbours (KNN)

In [73]:
from sklearn.neighbors import KNeighborsRegressor

In [74]:
max_range = 10
for i in range(1,max_range):

    nearest_neighbour_regressor = KNeighborsRegressor(n_neighbors=i)

    pipeline_nearest_neighbour = Pipeline([
        ('impute', imputer),
        ('scaler',StandardScaler()), 
        ('model', nearest_neighbour_regressor)])

    pipeline_nearest_neighbour.fit(X_train, y_train)
    score = pipeline_nearest_neighbour.score(X_test, y_test)

    print("Neighbours  %2.f" % i, ":Score: %.2f" % score)

Neighbours   1 :Score: 0.03
Neighbours   2 :Score: 0.29
Neighbours   3 :Score: 0.36
Neighbours   4 :Score: 0.38
Neighbours   5 :Score: 0.39
Neighbours   6 :Score: 0.41
Neighbours   7 :Score: 0.41
Neighbours   8 :Score: 0.40
Neighbours   9 :Score: 0.40
