In [27]:
import time

import pandas as pd # used to load the data
import numpy as np # optimized numerical library

from sklearn import preprocessing, metrics, utils, decomposition, model_selection, linear_model, discriminant_analysis, svm, tree, ensemble # library providing several ML algorithms and related utility
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.model_selection import cross_val_score
from imblearn import over_sampling # provides several resampling techniques to cope with unbalanced datasets (https://github.com/scikit-learn-contrib/imbalanced-learn) compatible with sklearn

from collections import Counter

import matplotlib.pyplot as plt # used for plotting

# Start by defining three helper functions:
# - one to plot the sample distribution  acorss the class labels (to see how un-/balanced the dataset is)
# - one to compute and plot the confusion matrix
# - one to plot data in 2D with different colors per class label

def plot_pie(y, labels, title=""):
    target_stats = Counter(y)
    sizes = list(target_stats.values())
    explode = tuple([0.1] * len(target_stats))

    fig, ax = plt.subplots()
    ax.set_title(title + " (size: %d)" % len(y))
    ax.pie(sizes, explode=explode, labels=target_stats.keys(), shadow=True, autopct='%1.1f%%')
    ax.axis('equal')


def compute_and_plot_cm(ytest, ypred, labels, title=""):
    global nfigure
    # Compute confusion matrix
    cm = metrics.confusion_matrix(ytest, ypred)
    
    accuracy = metrics.accuracy_score(ytest, ypred, normalize=True)

    # Normalize the matrix
    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    print(cm)

    # Plot the confusion matrix

    nfigure = nfigure + 1
    plt.figure(nfigure) # new numbered figure
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues) # plot the confusionmatrix using blue shaded colors
    plt.title("Confusion Matrix Normalized (%s) Accuracy: %.1f%%" % (title, accuracy*100)) # add title
    plt.colorbar() # plot the color bar as legend

    # Plot the x and y ticks using the class label names
    tick_marks = np.arange(len(labels))
    plt.xticks(tick_marks, labels, rotation=45)
    plt.yticks(tick_marks, labels)


def plot_2d(xpred, ypred, labels, title=""):
    global nfigure
    # define the colors to use for each class label
    colors = ['red', 'blue', 'green', 'yellow', 'black']
    len_colors = len(colors)
    if len_colors < len(labels):
        print("WARNING: we have less colors than classes: some classes will reuse the same color")

    nfigure = nfigure + 1
    plt.figure(nfigure) # new numbered figure
    plt.title("Feature Space (%s)" % title) # add title


    # plot each class label with a separate color 
    for c in range(len(labels)):
        cur_class = (ypred == c) # get all points belonging to class c
        plt.plot(xpred[cur_class, 0], xpred[cur_class, 1], 'o', color=colors[c % len_colors]) # plot class c


nfigure = 0 #used to number the figures

# 1. Loading the data

In [49]:
################ Load data ####################
# Get the dataset loaded 
file = open("../complete_results.csv","r")
train_x = np.loadtxt(file, delimiter=',')
train_y = train_x[:, 5]
train_x = np.delete(train_x, 5, 1)
print(train_x)
print(train_y)
from sklearn.metrics import mean_squared_error
def do_cross_validation(estimator, x, y):
    return cross_val_score(estimator, x, y, cv=5)

[[  2.   2.   0.  64.   1.]
 [  2.   2.   0.  64.  16.]
 [  2.   2.   0.  64.  32.]
 [  2.   2.   0. 256.   1.]
 [  2.   2.   1.  64.   1.]
 [  2.   2.   1.  64.  16.]
 [  2.   2.   1.  64.  32.]
 [  2.   2.   1. 256.   1.]
 [  2.   2.   1. 256.  16.]
 [  2.   2.   1. 256.  32.]
 [  2.   4.   0.  64.   1.]
 [  2.   4.   0.  64.  16.]
 [  2.   4.   0.  64.  32.]
 [  2.   4.   0. 256.   1.]
 [  2.   4.   0. 256.  16.]
 [  2.   4.   1.  64.   1.]
 [  2.   4.   1.  64.  16.]
 [  2.   4.   1.  64.  32.]
 [  2.   4.   1. 256.   1.]
 [  2.   4.   1. 256.  16.]
 [  2.   4.   1. 256.  32.]
 [  4.   2.   0.  64.   1.]
 [  4.   2.   0.  64.  16.]
 [  4.   2.   0.  64.  32.]
 [  4.   2.   0. 256.   1.]
 [  4.   2.   1.  64.   1.]
 [  4.   2.   1.  64.  16.]
 [  4.   2.   1.  64.  32.]
 [  4.   2.   1. 256.   1.]
 [  4.   2.   1. 256.  16.]
 [  4.   2.   1. 256.  32.]
 [  4.   4.   0.  64.   1.]
 [  4.   4.   0.  64.  16.]
 [  4.   4.   0.  64.  32.]
 [  4.   4.   0. 256.   1.]
 [  4.   4.   0. 256

# 2. Preparing the data

### Standardize the data

In [78]:
################ Scale data ####################
# Train a scaler to standardize the features (zero mean and unit variance)
scaler = preprocessing.StandardScaler().fit(train_x)

# ... and scale the features
X_train_scaled = scaler.transform(train_x)
X_test_scaled = scaler.transform(test_x)

### Finding the 2 principle components (Bonus)

In [79]:
################ PCA ####################
# Train a PCA with 2 dimensions
pca = decomposition.PCA(n_components=2).fit(X_train_scaled)

# ... and apply it to the features
X_train_scaled_pca = pca.transform(X_train_scaled)
X_test_scaled_pca = pca.transform(X_test_scaled)

# 3. Linear Regression

In [88]:
################ Linear Regression ##################
# # Train a Logit model on the original features
best_result = None
best_score = 0
for x_train in [[X_train_scaled, 'normal'], [X_train_scaled_pca, 'pca']]:
    for fit_intercept in [False, True]:
        for normalize in [False, True]:
            lr = linear_model.LinearRegression(fit_intercept=fit_intercept, normalize=normalize)
            test_scores = do_cross_validation(lr, x_train[0] ,train_y)
            test_score_mean = np.average(test_scores)
            if test_score_mean > best_score:
                best_score = test_score_mean
                best_result = [x_train[1], fit_intercept, normalize]
print(best_score)
print(best_result)

0.5503801324894482
['normal', True, False]


### Apply LR on PCA components (Bonus)

In [89]:
# Train a Logit model on pca extracted features
best_result = None
best_score = 0
for x_train in [[X_train_scaled, 'normal'], [X_train_scaled_pca, 'pca']]:
    for fit_intercept in [False, True]:
        for normalize in [False, True]:
            lr = linear_model.LinearRegression(fit_intercept=fit_intercept, normalize=normalize)
            test_scores = do_cross_validation(lr, x_train[0],train_y)
            test_score_mean = np.average(test_scores)
            if test_score_mean > best_score:
                best_score = test_score_mean
                best_result = [x_train[1], fit_intercept, normalize]
print(best_score)
print(best_result)

0.5503801324894482
['normal', True, False]


# 4. Apply Support Vector Machine

In [100]:
################ SVM ##################
# Train a SVM model on the original features
# for kernel in ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']:
# for C in range(1,41,8):
#     for gamma in ['scale', 'auto']:
# realC = C / 10
best_result = None
best_score = 0
for x_train in [[X_train_scaled, 'normal'], [X_train_scaled_pca, 'pca']]:
    for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
        for gamma in ['scale', 'auto']:
            for C in range(490,500):
                realC = C / 100
                lr = svm.SVR(kernel=kernel, gamma=gamma, C=realC)
                test_scores = do_cross_validation(lr, x_train[0],train_y)
                test_score_mean = np.average(test_scores)
                if test_score_mean > best_score:
                    best_score = test_score_mean
                    best_result = [x_train[1], kernel, gamma, realC]
print(best_score)
print(best_result)

0.139430670446686
['normal', 'linear', 'scale', 4.99]


# 5. Apply Decision Tree

In [91]:
################ DecisionTree ##################
# Train a DT model on the original features
best_result = None
best_score = 0
for x_train in [[X_train_scaled, 'normal'], [X_train_scaled_pca, 'pca']]:
    for criterion in ['mse', 'friedman_mse', 'mae']:
        for splitter in ['best', 'random']:
            lr = tree.DecisionTreeRegressor(criterion=criterion, splitter=splitter)
            test_scores = do_cross_validation(lr, x_train[0],train_y)
            test_score_mean = np.average(test_scores)
            if test_score_mean > best_score:
                best_score = test_score_mean
                best_result = [x_train[1], criterion, splitter]
print(best_score)
print(best_result)

0.970328368352732
['normal', 'mae', 'best']


# 6. Apply Random Forest

In [102]:
################ RandomForest ##################
# Train a RF model on the original features
# for criterion in ['gini', 'entropy']:
best_result = None
best_score = 0
for x_train in [[X_train_scaled, 'normal'], [X_train_scaled_pca, 'pca']]:
    for criterion in ['mse', 'friedman_mse', 'mae']:
        lr = ensemble.RandomForestRegressor(criterion=criterion)
        test_scores = do_cross_validation(lr, x_train[0],train_y)
        test_score_mean = np.average(test_scores)
        if test_score_mean > best_score:
            best_score = test_score_mean
            best_result = [x_train[1], criterion]
print(best_score)
print(best_result)

0.9747012761449909
['normal', 'mse']


# 7. Multi-Layer Perceptron (MLP)

In [105]:
################ Multi-Layer Perceptron ##################
# Train an MLP model on the original features
best_result = None
best_score = 0
for x_train in [[X_train_scaled, 'normal'], [X_train_scaled_pca, 'pca']]:
    for activation in ['identity', 'logistic', 'tanh', 'relu']:
        for solver in ['lbfgs', 'sgd', 'adam']:
            for learning_rate in ['constant', 'invscaling', 'adaptive']:
                try:
                    lr = MLPRegressor(activation=activation, solver=solver, learning_rate=learning_rate, max_iter=2000)
                    test_scores = do_cross_validation(lr, x_train[0],train_y)
                except:
                    continue
                test_score_mean = np.average(test_scores)
                if test_score_mean > best_score:
                    best_score = test_score_mean
                    best_result = [x_train[1], activation, solver, learning_rate]
print(best_score)
print(best_result)

  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,
  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  ret = a @ b
  ret = umr_sum(arr, axis, dtype, out, keepdims)
  ret = a @ b
  ret = a @ b
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  ret = a @ b
  ret = umr_sum(arr, axis, dtype, out, keepdims)


  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  ret = a @ b
  ret = umr_sum(arr, axis, dtype, out, keepdims)




  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  ret = a @ b
  activations[i + 1] += self.intercepts_[i]




0.9708769643447734
['normal', 'relu', 'sgd', 'invscaling']




In [98]:
# 7. K nearest neighbour

In [99]:
from sklearn import neighbors

best_result = None
best_score = 0
for x_train in [[X_train_scaled, 'normal'], [X_train_scaled_pca, 'pca']]:
    for n_neighbors in range(1, 10):
        for weights in ['uniform', 'distance']:
            for algorithm in ['auto', 'ball_tree', 'kd_tree', 'brute']:
                for p in [1 ,2, 3]:
                    lr = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights, algorithm=algorithm, p=p)
                    test_scores = do_cross_validation(lr, x_train[0], train_y)
                    test_score_mean = np.average(test_scores)
                    if test_score_mean > best_score:
                        best_score = test_score_mean
                        best_result = [x_train[1], n_neighbors, weights, algorithm, p]
print(best_score)
print(best_result)

Traceback (most recent call last):
  File "c:\users\sande\documents\courses\2020-2021\qpec\qpecs-project\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\sande\documents\courses\2020-2021\qpec\qpecs-project\venv\lib\site-packages\sklearn\neighbors\_base.py", line 1110, in fit
    return self._fit(X)
  File "c:\users\sande\documents\courses\2020-2021\qpec\qpecs-project\venv\lib\site-packages\sklearn\neighbors\_base.py", line 467, in _fit
    raise ValueError(
ValueError: Expected n_neighbors > 0. Got 0

Traceback (most recent call last):
  File "c:\users\sande\documents\courses\2020-2021\qpec\qpecs-project\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\sande\documents\courses\2020-2021\qpec\qpecs-project\venv\lib\site-packages\sklearn\neighbors\_base.py", line 1110, in

Traceback (most recent call last):
  File "c:\users\sande\documents\courses\2020-2021\qpec\qpecs-project\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\sande\documents\courses\2020-2021\qpec\qpecs-project\venv\lib\site-packages\sklearn\neighbors\_base.py", line 1110, in fit
    return self._fit(X)
  File "c:\users\sande\documents\courses\2020-2021\qpec\qpecs-project\venv\lib\site-packages\sklearn\neighbors\_base.py", line 467, in _fit
    raise ValueError(
ValueError: Expected n_neighbors > 0. Got 0

Traceback (most recent call last):
  File "c:\users\sande\documents\courses\2020-2021\qpec\qpecs-project\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\sande\documents\courses\2020-2021\qpec\qpecs-project\venv\lib\site-packages\sklearn\neighbors\_base.py", line 1110, in

Traceback (most recent call last):
  File "c:\users\sande\documents\courses\2020-2021\qpec\qpecs-project\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\sande\documents\courses\2020-2021\qpec\qpecs-project\venv\lib\site-packages\sklearn\neighbors\_base.py", line 1110, in fit
    return self._fit(X)
  File "c:\users\sande\documents\courses\2020-2021\qpec\qpecs-project\venv\lib\site-packages\sklearn\neighbors\_base.py", line 467, in _fit
    raise ValueError(
ValueError: Expected n_neighbors > 0. Got 0

Traceback (most recent call last):
  File "c:\users\sande\documents\courses\2020-2021\qpec\qpecs-project\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\sande\documents\courses\2020-2021\qpec\qpecs-project\venv\lib\site-packages\sklearn\neighbors\_base.py", line 1110, in

Traceback (most recent call last):
  File "c:\users\sande\documents\courses\2020-2021\qpec\qpecs-project\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\sande\documents\courses\2020-2021\qpec\qpecs-project\venv\lib\site-packages\sklearn\neighbors\_base.py", line 1110, in fit
    return self._fit(X)
  File "c:\users\sande\documents\courses\2020-2021\qpec\qpecs-project\venv\lib\site-packages\sklearn\neighbors\_base.py", line 467, in _fit
    raise ValueError(
ValueError: Expected n_neighbors > 0. Got 0

Traceback (most recent call last):
  File "c:\users\sande\documents\courses\2020-2021\qpec\qpecs-project\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\sande\documents\courses\2020-2021\qpec\qpecs-project\venv\lib\site-packages\sklearn\neighbors\_base.py", line 1110, in

0.9659515256383395
['normal', 2, 'uniform', 'auto', 1]
