# Advanced programming: assignment 3

### Daniel A.
### UID: 100444499

#### Importing Libraries

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler


#### Importing and Manipulating the data

In [7]:
# importing the data
data = pd.read_pickle("./data/wind_pickle.pickle")

# dropping cols
data.drop(['steps', 'month', 'day', 'hour'], axis=1, inplace=True)

# set seed
my_NIA = 34291182
np.random.seed(my_NIA)

# selecting 10% of cols except year and energy
cols = list(set(data.columns) - {'year', 'energy'})
cols_selected = [np.random.choice(cols) for x in range(int(len(cols)*0.1))]

# adding 5% missing values at random places
for col in cols_selected:
    selected_indexes = [np.random.choice(
        data.index) for x in range(int(len(data)*0.05))]
    for idx in selected_indexes:
        data.loc[idx,col] = np.nan

# saving the dataset
data.to_pickle('./data/data.pickle')

#### Further data preprocessing for modelling

In [8]:
# train partition
train = data[data['year'].isin([2005,2006])].drop('year',axis=1)
X_train = train[[x for x in train.columns if x != 'energy']].values
y_train = train['energy'].values
# validation partition
validation = data[data['year'].isin([2007,2008])].drop('year',axis=1)
X_validation = validation[[x for x in train.columns if x != 'energy']].values
y_validation = validation['energy'].values
# test partition
test = data[data['year'].isin([2009,2010])].drop('year',axis=1)
X_test = test[[x for x in train.columns if x != 'energy']].values
y_test = test['energy'].values

### 1 - Model selection and hyper-parameter tuning

#### Training and evaluating KNN, Reg trees, SVMs with default hyper-parameters

In [9]:
# creating a dataframe to keep track of scores and results
scores = {'knn':[],'svm':[],'dtr':[]}

In [10]:
# knn pipeline
knn = Pipeline([
    ('impute', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
    ('model', KNeighborsRegressor())])

# svm
svm = Pipeline([
    ('impute', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
    ('model', SVR())])

# tree
dtr = Pipeline([
    ('impute', SimpleImputer(strategy='mean')),
    ('model', DecisionTreeRegressor())])

# models
models = {'knn':knn, 'svm':svm, 'dtr':dtr}
for name, model in models.items():
    print(name)
    models[name].fit(X_train, y_train)
    scores[name].append(mean_absolute_error(y_test, models[name].predict(X_test)))

knn
svm
dtr


#### Training and evaluating KNN, Reg trees, SVMs with imputation and hyper-parameter tuning using RandomizedSearch

In [17]:
# knn
knn_params = {
    'model__n_neighbors':np.random.random_integers(2,30,1),
    'model__leaf_size':np.random.random_integers(28,40,1),
    'model__algorithm':['auto','ball_tree','kd_tree','brute'],
    'model__weights':['uniform','distance'],
    'impute__strategy':['mean','median']
}

knn = Pipeline([
    ('impute', SimpleImputer()),
    ('scaler', MinMaxScaler()),
    ('model', KNeighborsRegressor())])

knn_RS = RandomizedSearchCV(estimator=knn,
                            param_distributions=knn_params,
                            n_iter=10,
                            scoring='neg_mean_absolute_error')

# svm
svm_params = {
    'model__degree':np.random.random_integers(2,7,1),
    'model__gamma':list({'scale','auto'}.union(set(np.arange(0.0001,0.1,0.005)))),
    'model__shrinking':[True, False],
    'model__C':np.arange(0,5,0.5),
    'impute__strategy':['mean','median']
}

svm = Pipeline([
    ('impute', SimpleImputer()),
    ('scaler', MinMaxScaler()),
    ('model', SVR())])


svm_RS = RandomizedSearchCV(estimator=svm,
                            param_distributions=svm_params,
                            n_iter=10,
                            scoring='neg_mean_absolute_error')

# tree
dtr_params = {
    'model__criterion':['mse', 'friedman_mse', 'mae', 'poisson'],
    'model__splitter':['best','random'],
    'model__max_depth':[True, False],
    'impute__strategy':['mean','median']
}

dtr = Pipeline([
    ('impute', SimpleImputer()),
    ('model', DecisionTreeRegressor())])

dtr_RS = RandomizedSearchCV(estimator=dtr,
                            param_distributions=dtr_params,
                            n_iter=10,
                            scoring='neg_mean_absolute_error')


In [18]:
# models
models = {'knn':knn_RS, 'svm':svm_RS, 'dtr':dtr_RS}

# running fit
for name, model in models.items():
    print(name)
    models[name].fit(X_train, y_train)
    scores[name].append(mean_absolute_error(y_test, models[name].predict(X_test)))

knn
svm
dtr


In [19]:
scores

{'knn': [364.0584113744075, 360.3873194993698, 360.3873194993698],
 'svm': [517.4896965513028, 465.95557670137885, 467.3369759546066],
 'dtr': [397.0562796208531, 455.2118199052133]}