# Advanced programming: assignment 3

### Daniel A.
### UID: 100444499

#### Importing Libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler


#### Importing and Manipulating the data

In [3]:
# importing the data
data = pd.read_pickle("./data/wind_pickle.pickle")

# dropping cols
data.drop(['steps', 'month', 'day', 'hour'], axis=1, inplace=True)

# set seed
my_NIA = 34291182
np.random.seed(my_NIA)

# selecting 10% of cols except year and energy
cols = list(set(data.columns) - {'year', 'energy'})
cols_selected = [np.random.choice(cols) for x in range(int(len(cols)*0.1))]

# adding 5% missing values at random places
for col in cols_selected:
    selected_indexes = [np.random.choice(
        data.index) for x in range(int(len(data)*0.05))]
    for idx in selected_indexes:
        data.loc[idx,col] = np.nan

# saving the dataset
data.to_pickle('./data/data.pickle')

#### Further data preprocessing for modelling

In [4]:
# train partition
train = data[data['year'].isin([2005,2006])].drop('year',axis=1)
X_train = train[[x for x in train.columns if x != 'energy']].values
y_train = train['energy'].values
# validation partition
validation = data[data['year'].isin([2007,2008])].drop('year',axis=1)
X_validation = validation[[x for x in train.columns if x != 'energy']].values
y_validation = validation['energy'].values
# test partition
test = data[data['year'].isin([2009,2010])].drop('year',axis=1)
X_test = test[[x for x in train.columns if x != 'energy']].values
y_test = test['energy'].values

### 1 - Model selection and hyper-parameter tuning

#### Training and evaluating KNN, Reg trees, SVMs with default hyper-parameters

In [5]:
# creating a dataframe to keep track of scores and results
scores = {'knn':[],'svm':[],'dtr':[]}

Here we define 3 pipelines, one for KNN, another one for SVM and another one for Decision Trees. 

For KNN and SVM we use imputation (with simple imputer using the mean strategy) and a min max scaler.

For Decision trees we only perform imputation and then the model fit.

We create a dictionary with each model name as key and each pipeline as value.

We then create a loop where we print the name of the model (to know how far the loop has gone) and then we append to the scores dataframe the MAE of each prediction.

In [6]:
# knn pipeline
knn = Pipeline([
    ('impute', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
    ('model', KNeighborsRegressor())])

# svm
svm = Pipeline([
    ('impute', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
    ('model', SVR())])

# tree
dtr = Pipeline([
    ('impute', SimpleImputer(strategy='mean')),
    ('model', DecisionTreeRegressor())])

# models
models = {'knn':knn, 'svm':svm, 'dtr':dtr}
for name, model in models.items():
    print(name)
    models[name].fit(X_train, y_train)
    scores[name].append(mean_absolute_error(y_test, models[name].predict(X_test)))

knn
svm
dtr


#### Training and evaluating KNN, Reg trees, SVMs with imputation and hyper-parameter tuning using RandomizedSearch

In this section we define pipelines with the intention to use RandomizedSearch to perform hyperparameter tuning.

For KNN we first define the parameters for steps in the pipeline:

For the model we test:
 - n_neighbors for all values between 2 and 30 with steps of 1
 - leaf_size for all values between 28 and 40 with steps of 1
 - algorithm for auto, ball_tree, kd_tree and brute
 - weights for uniform and distance

For the imputer we test:
 - strategy for mean and median

After defining the hyper-parameters to test we define the pipeline with the same steps as before, first imputer, then the scaler and then the model. Then we define the RandomizedSearchCV hyper-parameter tuning with 10 iterations and using MAE for scoring.

In [7]:
# knn
knn_params = {
    'model__n_neighbors':np.random.random_integers(2,30,1),
    'model__leaf_size':np.random.random_integers(28,40,1),
    'model__algorithm':['auto','ball_tree','kd_tree','brute'],
    'model__weights':['uniform','distance'],
    'impute__strategy':['mean','median']
}

knn = Pipeline([
    ('impute', SimpleImputer()),
    ('scaler', MinMaxScaler()),
    ('model', KNeighborsRegressor())])

knn_RS = RandomizedSearchCV(estimator=knn,
                            param_distributions=knn_params,
                            n_iter=10,
                            scoring='neg_mean_absolute_error')

For SVM we first define the parameters for steps in the pipeline:

For the model we test:
 - degree for all values between 2 and 7 with steps of 1
 - gamma for all values between 0.0001 and 0.1 with steps of 0.005 along with the scale and auto options
 - shrinking with True and False
 - C for all values between 0 and 5 with steps of 0.5

For the imputer we test:
 - strategy for mean and median

After defining the hyper-parameters to test we define the pipeline with the same steps as before, first imputer, then the scaler and then the model. Then we define the RandomizedSearchCV hyper-parameter tuning with 10 iterations and using MAE for scoring.

In [None]:
# svm
svm_params = {
    'model__degree':np.random.random_integers(2,7,1),
    'model__gamma':list({'scale','auto'}.union(set(np.arange(0.0001,0.1,0.005)))),
    'model__shrinking':[True, False],
    'model__C':np.arange(0,5,0.5),
    'impute__strategy':['mean','median']
}

svm = Pipeline([
    ('impute', SimpleImputer()),
    ('scaler', MinMaxScaler()),
    ('model', SVR())])


svm_RS = RandomizedSearchCV(estimator=svm,
                            param_distributions=svm_params,
                            n_iter=10,
                            scoring='neg_mean_absolute_error')

For Decision trees we first define the parameters for steps in the pipeline:

For the model we test:
 - criterion for mse, friedman_mse, mae and poisson
 - splitter for best and random
 - max_dept for all values between 2 and 14 with steps of 2
 - min_samples_split for all values between 2 and 10 with steps of 2
 - min_samples_leaf for all values between 1 and 10 with steps of 1
 - max_features for auto, sqrt and log2

For the imputer we test:
 - strategy for mean and median

After defining the hyper-parameters to test we define the pipeline with the same steps as before, first imputer, then the scaler and then the model. Then we define the RandomizedSearchCV hyper-parameter tuning with 10 iterations and using MAE for scoring.

In [None]:
# tree
dtr_params = {
    'model__criterion':['mse', 'friedman_mse', 'mae', 'poisson'],
    'model__splitter':['best','random'],
    'model__max_depth':np.arange(2,14,2),
    'model__min_samples_split':np.arange(2,10,2),
    'model__min_samples_leaf':np.arange(1,10,1),
    'model__max_features':['auto','sqrt','log2'],
    'impute__strategy':['mean','median'],
}

dtr = Pipeline([
    ('impute', SimpleImputer()),
    ('model', DecisionTreeRegressor())])

dtr_RS = RandomizedSearchCV(estimator=dtr,
                            param_distributions=dtr_params,
                            n_iter=10,
                            scoring='neg_mean_absolute_error')

Like before, we loop over the model names (keys of the models dictionary) and the RandomizedSearchCV corresponding to each model's pipeline (values of the models dictionary). We fit using the RandomizedSearchCV and add the mean absolute error to the scores dictionary.

In [8]:
# models
models = {'knn':knn_RS, 'svm':svm_RS, 'dtr':dtr_RS}

# running fit
for name, model in models.items():
    print(name)
    models[name].fit(X_train, y_train)
    scores[name].append(mean_absolute_error(y_test, models[name].predict(X_test)))

knn
svm
dtr


#### Checking scores for the 6 models

Row 0 are the models with default params and Row 1 are the models with hyperparameter tuning done

In [13]:
pd.DataFrame(scores)

Unnamed: 0,knn,svm,dtr
0,365.651736,517.55669,396.987431
1,363.807967,482.167105,324.342324


According to the MAE,0 the best model is the decision trees with hyperparameter tuning done

### 2 - Attribute selection