In [17]:
# Libraries:
import copy
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

Just using the dataset from the data preparation file

In [19]:
# Load the prepared dataset. The default is the original author's
# machine-specific location; on any other machine set the FINAL_DATASET_PATH
# environment variable to the location of FINALdataset.csv instead of editing
# this cell.
dataset = pd.read_csv(os.environ.get(
    'FINAL_DATASET_PATH',
    '/Users/alexander/Documents/MSc Data Science/Python/Coursework/Copia de PythonCoursework/data/FINALdataset.csv',
))

Used a manual split instead of `train_test_split` to keep the groups (tasks) intact

In [21]:
# Hold out whole tasks (groups) for testing rather than random rows, so that
# every row of a given task lands on the same side of the split.
Groups = dataset['Task ID']
y = dataset['Cost']
X = dataset.drop(columns=['Task ID', 'Supplier ID', 'Cost'])

# Draw 20 distinct task IDs for the test set; the fixed seed keeps the draw repeatable.
all_tasks = Groups.unique()
np.random.seed(42)
TestGroup = np.random.choice(all_tasks, size=20, replace=False)

# Boolean row mask: True for rows whose task was drawn into the test set.
test_loc = Groups.isin(TestGroup)

# Per-row task IDs on each side of the split, used to slice results task by task.
test_ids = Groups[test_loc].to_numpy()
train_ids = Groups[~test_loc].to_numpy()

# Kept as a pandas Series; supplied as the `groups=` argument of the group-wise CV.
train_tasks = Groups[~test_loc]

# Plain numpy arrays for the estimators.
X_train, y_train = X[~test_loc].to_numpy(), y[~test_loc].to_numpy()
X_test, y_test = X[test_loc].to_numpy(), y[test_loc].to_numpy()

Fitting a simple Lasso regression model on the training data. R² is not very impressive, but it's not negative, so it's something!

In [23]:
# 3.3 Baseline Lasso regression with a hand-picked alpha
from sklearn.linear_model import Lasso

lasso = Lasso(alpha=0.001).fit(X_train, y_train)  # fit() returns the estimator itself

# Held-out predictions (consumed by the per-task error cell) and overall R^2.
y_pred = lasso.predict(X_test)
print(lasso.score(X_test, y_test))

0.5444117174345442


Manuel suggested we should be using numpy instead of pandas for the error functions; apart from that, I've tried to keep it simple.
`rmse_calc` is just a helper to compute our RMSE scores; it doesn't go into the models at all.

In [10]:
def error_function(real_costs, pred_costs):
    """Selection error for one task: best real cost minus the real cost of the pick.

    The entry with the lowest *predicted* cost is treated as the chosen
    supplier, and its *real* cost is compared with the lowest real cost.

    Parameters
    ----------
    real_costs : 1-D array-like
        True costs for a single task (numpy array, pandas Series or list).
    pred_costs : 1-D array-like
        Predicted costs, aligned by position with ``real_costs``.

    Returns
    -------
    scalar
        ``min(real_costs) - real_costs[argmin(pred_costs)]``. This is never
        positive, and is exactly 0 when the predicted-cheapest entry is also
        the truly cheapest one.
    """
    # Coerce to plain numpy arrays so the lookup below is positional. With a
    # pandas Series, `real_costs[k]` is a *label* lookup and would raise or
    # silently pick the wrong row whenever the index is not 0..n-1.
    real_costs = np.asarray(real_costs)
    pred_costs = np.asarray(pred_costs)
    pred_supplier = pred_costs.argmin()
    return real_costs.min() - real_costs[pred_supplier]

def rmse_calc(error_list):
    """Return the root-mean-square of a collection of errors.

    Parameters
    ----------
    error_list : sequence of numbers
        Errors to aggregate; converted to a numpy array internally.

    Returns
    -------
    numpy scalar
        ``sqrt(sum(e**2) / n)`` over the ``n`` entries of ``error_list``.
    """
    errors = np.array(error_list)
    mean_square = np.square(errors).sum() / len(errors)
    return np.sqrt(mean_square)

Error calculation is slightly more involved for the held-out part: the rows of all the test tasks are stored together, so we have to split them up and compute the error for each task separately

In [11]:
# Score every held-out task on its own: pick out that task's rows, then compare
# the real cost of the predicted-cheapest entry with the task's true minimum.
held_out_errors = []
for task in TestGroup:
    rows = test_ids == task  # boolean mask over the test rows of this task
    task_error = error_function(y_test[rows], y_pred[rows])
    held_out_errors.append(task_error)
    print(task_error)

-0.039830654283993294
-0.020898605623685618
-0.03073932084081804
-0.019454643228496993
-0.03314564353405608
-0.03275120473185139
-0.02001518678335512
0.0
-0.013177836730864811
-0.019557632960402582
-0.029904123741192667
-0.0713267179265189
-0.03677777223392242
-0.04054769482851289
-0.021350166193281728
-0.032698786210205666
-0.0338678548501507
-0.013985429080132616
-0.0212231504009443
-0.029694603392962704


This won't be the final error, since Alex has some changes he wants to make to feature selection, but the code should all work regardless of the features

In [12]:
# RMSE over the per-task errors collected in held_out_errors.
print(rmse_calc(held_out_errors))

0.031347243838081504


Here we make a scorer out of our error function. Since the validation set in each fold is exactly one task, it is fairly simple to apply

In [13]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score

# One fold per training task: each task in turn is the validation set.
logo = LeaveOneGroupOut()
# Wrap the per-task error so scikit-learn can call it as a scoring function.
error_scorer = make_scorer(error_function)

lasso_cv = Lasso(alpha=0.001)
cv_scores = cross_val_score(
    estimator=lasso_cv,
    X=X_train,
    y=y_train,
    groups=train_tasks,
    scoring=error_scorer,
    cv=logo,
)

I think the error goes up because we're only using part of the training data in each fold, hopefully it's nothing to worry about

In [14]:
# RMSE over the per-fold scores currently held in cv_scores.
print(rmse_calc(cv_scores))

0.032439356868181835


We only see predictive accuracy of any kind with tiny alpha values, but it does form a ranking which is the main thing
Takes about 5 minutes to run for me, so bear that in mind

In [15]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# Candidate alphas: 100 evenly spaced values running from 0.001 down to 0.0001.
params = dict(alpha=np.linspace(0.001, 0.0001, 100))
lasso_hp = Lasso()

# Reuses the `logo` splitter and `error_scorer` defined for the plain CV run.
grid_search = GridSearchCV(
    estimator=lasso_hp,
    param_grid=params,
    scoring=error_scorer,
    cv=logo,
    n_jobs=-1,  # run the candidate fits in parallel on all available cores
)
grid_search.fit(X_train, y_train, groups=train_tasks)

# Full per-candidate CV table as a DataFrame for later inspection.
lasso_hp_results = pd.DataFrame(grid_search.cv_results_)

Picks out a very very small alpha value, but at least it picks one

In [16]:
# Parameter combination with the best mean CV score in the Lasso grid search.
print(grid_search.best_params_)

{'alpha': 0.00037272727272727273}


Next job is to fit the model with the chosen hyperparameters, report the RMSE, copy-paste the code for a new regression model (maybe MLP?), and get some simple viz of the results

In [17]:
# Display just the selected alpha (bare expression, shown as the cell output).
grid_search.best_params_['alpha']

0.00037272727272727273

In [18]:
# Refit Lasso on the whole training set with the alpha chosen by the grid search.
lasso_hp = Lasso(alpha=grid_search.best_params_['alpha']).fit(X_train, y_train)

# Held-out predictions (consumed by the per-task error cell) and overall R^2.
y_pred = lasso_hp.predict(X_test)
print(lasso_hp.score(X_test, y_test))

0.5342701190399681


In [19]:
# Per-task selection error on the held-out tasks, using the current y_pred.
hp_errors = []
for task in TestGroup:
    rows = test_ids == task  # boolean mask over the test rows of this task
    task_error = error_function(y_test[rows], y_pred[rows])
    hp_errors.append(task_error)
    print(task_error)

-0.023997590206996378
-0.01886495931830251
-0.018176695386359243
-0.006408969222048588
-0.011245950423476758
-0.0104726223705211
-0.02762685539882359
-0.033259263796944605
0.0
-0.007138839421870102
-0.025453823409779752
-0.0345263004921727
-0.010408508583130804
-0.019862313243964813
-0.0333538663567397
0.0
-0.010170413261649969
-0.0006968233468558904
-0.05661013824413952
-0.019587667953720522


In [20]:
# RMSE over the per-task errors collected in hp_errors.
print(rmse_calc(hp_errors))

0.02302972973273373


We see a pretty big improvement on our initial alpha choice, thank god!

Now we're repeating the process for MLP, the code is completely symmetrical to the Lasso case except for the parameters

In [24]:
# 3.3 Fitting an MLP regression model
from sklearn.neural_network import MLPRegressor

# MLP training is stochastic (weight initialisation, batch shuffling). Pin
# random_state so the score and predictions are the same on every run and do
# not depend on which cells have already consumed numpy's global RNG.
mlp = MLPRegressor(hidden_layer_sizes=(100, 100), random_state=42)
mlp.fit(X_train, y_train)
print(mlp.score(X_test, y_test))  # R^2 on the held-out tasks
y_pred = mlp.predict(X_test)      # consumed by the per-task error cell

0.4319143939178349


In [25]:
# Per-task selection error on the held-out tasks, using the current y_pred,
# followed by the RMSE across those tasks.
mlp_h_o_errors = []
for task in TestGroup:
    rows = test_ids == task  # boolean mask over the test rows of this task
    task_error = error_function(y_test[rows], y_pred[rows])
    mlp_h_o_errors.append(task_error)
    print(task_error)

print(rmse_calc(mlp_h_o_errors))

-0.023997590206996378
-0.032108510945115076
-0.026527474473635015
-0.027460291488479394
-0.01806416970631919
-0.06152818851905051
-0.039079552845717014
0.0
-0.01019831373805774
-0.026192222521110253
0.0
-0.06585817143254957
-0.012128412404481315
-0.019862313243964813
-0.04338942144588592
-0.033403086676106974
-0.031787106479094385
-0.015918466458471114
-0.017629114162710013
-0.008146861657950089
0.030831954650411227


In [26]:
# Leave-one-task-out CV for the MLP, scored with the per-task selection error.
logo = LeaveOneGroupOut()
error_scorer = make_scorer(error_function)

# Fixed random_state: every fold's clone starts from the same initialisation,
# so the CV scores are reproducible from run to run.
mlp_cv = MLPRegressor(hidden_layer_sizes=(100, 100), random_state=42)
# NOTE: this rebinds cv_scores, replacing any scores stored under that name earlier.
cv_scores = cross_val_score(mlp_cv, X_train, y_train, cv=logo, groups=train_tasks, scoring=error_scorer)

In [27]:
# RMSE over the per-fold scores currently held in cv_scores.
print(rmse_calc(cv_scores))

0.03164548117240795


Works, but we may want to change the hidden layer sizes we offer; we don't get a very big performance improvement

In [28]:
# Fixed random_state so every candidate/fold fit is reproducible, including
# inside the parallel workers started by n_jobs=-1.
mlp_hp = MLPRegressor(random_state=42)
# Each architecture is written as a tuple; note that `(200)` is just the
# integer 200, so the single-hidden-layer case is spelled `(200,)`.
params = {'hidden_layer_sizes': [(100, 100), (50, 50, 50), (200,)],
          'solver': ['lbfgs', 'sgd', 'adam']}

# NOTE(review): the recorded output of this cell shows lbfgs stopping at its
# iteration limit; scaling the features or raising max_iter (as the warning
# itself suggests) is worth trying before trusting the solver ranking.
grid_search_mlp = GridSearchCV(mlp_hp, param_grid=params, scoring=error_scorer, cv=logo, n_jobs=-1)
grid_search_mlp.fit(X_train, y_train, groups=train_tasks)
# Full per-candidate CV table as a DataFrame for later inspection.
mlp_hp_results = pd.DataFrame(grid_search_mlp.cv_results_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [29]:
# Parameter combination with the best mean CV score in the MLP grid search.
print(grid_search_mlp.best_params_)

{'hidden_layer_sizes': (100, 100), 'solver': 'lbfgs'}


In [30]:
# Refit the MLP on the whole training set with the grid search winner.
# set_params(**best_params_) applies every tuned parameter, so nothing is
# silently dropped if the search grid gains more entries later. random_state
# is pinned so the refit (and the scores derived from it) is reproducible.
mlp_hp = MLPRegressor(random_state=42).set_params(**grid_search_mlp.best_params_)
mlp_hp.fit(X_train, y_train)
print(mlp_hp.score(X_test, y_test))  # R^2 on the held-out tasks
y_pred = mlp_hp.predict(X_test)      # consumed by the per-task error cell

0.529573703011494


In [31]:
# Per-task selection error on the held-out tasks, using the current y_pred.
mlp_hp_errors = []
for task in TestGroup:
    rows = test_ids == task  # boolean mask over the test rows of this task
    task_error = error_function(y_test[rows], y_pred[rows])
    mlp_hp_errors.append(task_error)
    print(task_error)

-0.039830654283993294
-0.020898605623685618
-0.02598985301106721
-0.027940593795965307
-0.024294500817306774
-0.0104726223705211
-0.013258418247657333
-0.05818814519281579
-0.013177836730864811
-0.014421008823887471
-0.03870326692015297
-0.0688667467632374
0.0
-0.019862313243964813
-0.002957159628935224
-0.04692017345022437
-0.01276878248146518
-0.015918466458471114
-0.03536559377749171
-0.014904701216639171


In [32]:
# RMSE over the per-task errors collected in mlp_hp_errors.
print(rmse_calc(mlp_hp_errors))

0.030706019523182495
