# ENEXIS Graduation Project

- Feature selection using lasso
- Try different models using these features

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso

<a id='readpv'></a>
#### Reading of PV installed capacity & demographics dataset

In [None]:
c_path = "../data/"
v_file = "PV installed capacity & demographics"

In [None]:
df = pd.read_csv(filepath_or_buffer = c_path + v_file + ".csv", encoding= 'UTF-8')
len(df)

18686

Setup the pipeline

In [None]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

pipeline_lasso = Pipeline([
    ('impute', imputer),
    ('scaler',StandardScaler()), 
    ('model',Lasso())])

Optimize the model on the alpha hyperparameter in the specified range

In [None]:
search = GridSearchCV(pipeline_lasso,
                      {'model__alpha':np.arange(0.5,10,0.01)},
                      cv = 5, 
                      scoring="neg_mean_squared_error",
                      verbose=3, error_score="raise")

Use only the 2019 data, since that is the most complete

In [None]:
year = 'Year'
df = df[df[year]=='2019-01-01']

And since only one year has no variance, it can be removed from the data set

In [None]:
df = df.drop(columns=year)

Further removal of string type columns

In [None]:
string_types = df.dtypes[df.dtypes == 'object']
string_types

BU_2021            object
BU_NAAM            object
WK_2021            object
WK_NAAM            object
GM_2021            object
GM_NAAM            object
ProvinciecodePV    object
Provincienaam      object
dtype: object

In [None]:
string_type_columns = [
    'BU_2021','BU_NAAM','WK_2021',
    'WK_NAAM','GM_2021','GM_NAAM',
    'ProvinciecodePV','Provincienaam']
df = df.drop(columns=string_type_columns)

Remove columns that have derived columns

In [None]:
redundant_columns = [
    'Aantal aansluitingen met opwekinstallatie', 
    'OV_per_installatie',
    'PVinstallaties_per100houshoudens']
df = df.drop(columns=redundant_columns)

In [None]:
opgesteld_vermogen = 'Opgesteld_vermogen_per100houshoudens'

df_y = df[opgesteld_vermogen]
df_features = df.drop(columns=opgesteld_vermogen)
df_features = df_features.drop(columns='Opgesteld vermogen')

X_train, X_test, y_train, y_test = train_test_split(df_features, df_y, test_size=0.33, random_state=123)

Do a grid search for the best features

In [None]:
search.fit(X_train,y_train)

[CV 2/5] END model__alpha=8.000000000000007;, score=-1614.102 total time=   0.1s
[CV 3/5] END model__alpha=8.000000000000007;, score=-1882.173 total time=   0.1s
[CV 4/5] END model__alpha=8.000000000000007;, score=-1236.433 total time=   0.1s
[CV 5/5] END model__alpha=8.000000000000007;, score=-1210.499 total time=   0.1s
[CV 1/5] END model__alpha=8.010000000000007;, score=-1274.730 total time=   0.1s
[CV 2/5] END model__alpha=8.010000000000007;, score=-1614.356 total time=   0.1s
[CV 3/5] END model__alpha=8.010000000000007;, score=-1882.404 total time=   0.1s
[CV 4/5] END model__alpha=8.010000000000007;, score=-1236.600 total time=   0.1s
[CV 5/5] END model__alpha=8.010000000000007;, score=-1210.706 total time=   0.1s
[CV 1/5] END model__alpha=8.020000000000007;, score=-1274.829 total time=   0.1s
[CV 2/5] END model__alpha=8.020000000000007;, score=-1614.610 total time=   0.1s
[CV 3/5] END model__alpha=8.020000000000007;, score=-1882.636 total time=   0.1s
[CV 4/5] END model__alpha=8.

Check the outcome of the grid search and evaluate the coefficients at the point of the best result

In [None]:
search.best_params_

{'model__alpha': 0.7000000000000002}

In [None]:
coefficients = search.best_estimator_.named_steps['model'].coef_

What coefficients are NOT relevant:

In [None]:
np.array(df_features.columns)[coefficients == 0]

array(['Woningvoorraad_34', 'InBezitWoningcorporatie_42',
       'GemiddeldElektriciteitsverbruikTotaal_47',
       'Percent_inkomensontvangers'], dtype=object)

What coefficients are relevant and should be taken into account:

In [None]:
np.array(df_features.columns)[coefficients != 0]

array(['MeestVoorkomendePostcode_113', 'HuishoudensTotaal_28',
       'GemiddeldeHuishoudensgrootte_32', 'Bevolkingsdichtheid_33',
       'GemiddeldeWoningwaarde_35', 'PercentageEengezinswoning_36',
       'Koopwoningen_40', 'InBezitOverigeVerhuurders_43',
       'BouwjaarVanaf2000_46', 'GemiddeldAardgasverbruikTotaal_55',
       'Gemiddeld_opleidingsniveau',
       'Bedrijfsvestigingen_per_huishuidens',
       'PersonenautoSPerHuishouden_102', 'AfstandTotSchool_108',
       'MateVanStedelijkheid_115'], dtype=object)

Use the lasso model with the optimized values

In [None]:
grid_search_selected_features = ['MeestVoorkomendePostcode_113', 'HuishoudensTotaal_28',
       'GemiddeldeHuishoudensgrootte_32', 'Bevolkingsdichtheid_33',
       'GemiddeldeWoningwaarde_35', 'PercentageEengezinswoning_36',
       'Koopwoningen_40', 'InBezitOverigeVerhuurders_43',
       'BouwjaarVanaf2000_46', 'GemiddeldAardgasverbruikTotaal_55',
       'Gemiddeld_opleidingsniveau',
       'Bedrijfsvestigingen_per_huishuidens',
       'PersonenautoSPerHuishouden_102', 'AfstandTotSchool_108',
       'MateVanStedelijkheid_115']


## Model using: Linear regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
pipeline_linear_regression = Pipeline([
    ('impute', imputer),
    ('scaler',StandardScaler()), 
    ('model',LinearRegression())])

pipeline_linear_regression.fit(X_train, y_train)
pipeline_linear_regression.score(X_test, y_test)

0.42734011525501703

## Model using: Support Vector machine

In [None]:
from sklearn.svm import SVR

In [None]:
svm_regression = SVR(gamma='auto')

In [None]:
pipeline_svm = Pipeline([
    ('impute', imputer),
    ('scaler',StandardScaler()), 
    ('model',svm_regression)])

pipeline_svm.fit(X_train, y_train, model__sample_weight=None)
pipeline_svm.score(X_test, y_test)

0.37010206079399444

## Model using: Lasso (the one used for feature selection)

In [None]:
pipeline_lasso.fit(X_train, y_train)
pipeline_lasso.score(X_test, y_test)

0.42585802602490463

## Model using: Decision tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
max_range = 10
for i in range(1,max_range):

    decision_tree_regressor = DecisionTreeRegressor(max_depth=i)

    pipeline_decision_tree = Pipeline([
        ('impute', imputer),
        ('scaler',StandardScaler()), 
        ('model', decision_tree_regressor)])

    pipeline_decision_tree.fit(X_train, y_train, model__sample_weight=None)
    score = pipeline_decision_tree.score(X_test, y_test)

    print("Depth  %2.f" % i, ":Score: %.2f" % score)


Depth   1 :Score: 0.25
Depth   2 :Score: 0.37
Depth   3 :Score: 0.38
Depth   4 :Score: 0.37
Depth   5 :Score: 0.34
Depth   6 :Score: 0.33
Depth   7 :Score: 0.22
Depth   8 :Score: 0.21
Depth   9 :Score: 0.19


## Model using: K nearest neighbours (KNN)

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
max_range = 10
for i in range(1,max_range):

    nearest_neighbour_regressor = KNeighborsRegressor(n_neighbors=i)

    pipeline_nearest_neighbour = Pipeline([
        ('impute', imputer),
        ('scaler',StandardScaler()), 
        ('model', nearest_neighbour_regressor)])

    pipeline_nearest_neighbour.fit(X_train, y_train)
    score = pipeline_nearest_neighbour.score(X_test, y_test)

    print("Neighbours  %2.f" % i, ":Score: %.2f" % score)

Neighbours   1 :Score: 0.03
Neighbours   2 :Score: 0.29
Neighbours   3 :Score: 0.36
Neighbours   4 :Score: 0.38
Neighbours   5 :Score: 0.39
Neighbours   6 :Score: 0.41
Neighbours   7 :Score: 0.41
Neighbours   8 :Score: 0.40
Neighbours   9 :Score: 0.40


## Conclusion

The best models for the data and the selected features are:
- linear model
- lasso model
- KNN model (6 or 7 neighbours)

The decision tree and SVM model are not as good as the above mentioned ones.

The highest score is 0,43 for the linear model.

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=d0604020-40e6-4d7d-a2ba-74ef2b385723' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>