# Regression

This is a template notebook for regression modelling.

## How to use the notebook

The following cells:
- specify objective, variables, and variable types,
- set up the regression models,
- read dataset,
- present results from the model,
- provide the model with the best performance.

To just see how it works for a toy example, simply run the whole notebook as is.

For your own project, adjust the details about objectives, variables, dataset etc. and then execute all cells in order.
The board "regression.board" will help you with detailed instructions.

# Imports and general setup

In [0]:
import os
import shutil
from distutils.dir_util import copy_tree

import time
from datetime import datetime

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.cm as cm

from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.neural_network import MLPRegressor

from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

from joblib import dump, load

# Project

In [0]:
# Project-level settings; adjust these before running the notebook on your own data.
experiment_name = "project"  # Name of the regression experiment
data_dir = "./"           # Directory for saving the data for the experiment

# Dataset

In [0]:
# Toy dataset used when the notebook is run as is.
example_df = datasets.load_diabetes(return_X_y=False, as_frame=True)['frame']

df = example_df # You may import your own data here eg pd.read_csv("File.csv")
test_size = 0.25 # You may adjust this value to change the train/test split ratio

# Output directory for everything the notebook saves. It is emptied at the
# start of every run so that stale artefacts never mix with new results.
path = './out'
isExist = os.path.exists(path)
if isExist:
    # Only the top level is listed: removing each entry as a whole avoids
    # deleting directories out from under a tree walk that is still running.
    for entry in os.listdir(path):
        entry_path = os.path.join(path, entry)
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            shutil.rmtree(entry_path)
        else:
            os.unlink(entry_path)
else:
    os.makedirs(path)

# Keep a copy of the dataset next to the other outputs.
df.to_csv(path + '/dataset.csv')

## Visualising the dataset

In [0]:
# Show the dataset as the cell output
df

In [0]:
# Pairwise correlation between all columns of the dataset, shown as a colour-graded table
corr = df.corr()
corr.style.background_gradient(cmap='coolwarm')

# Variables

In [0]:
# Specify exactly the name(s) of the dependent variable(s).
# Keep this a list: the following cells drop and select these columns from df as a group.
target = ['target']

Separating the independent variables (`X`) from the target (`y`)

In [0]:
# Targets are the columns named in `target`; every other column is a feature
y = df[target]
X = df.drop(columns=target)

# Column names and counts, reused by the model cells when printing and plotting
labels, target_labels = list(X.columns), list(y.columns)
num_labels, num_target_labels = len(labels), len(target_labels)
print(labels, target_labels)

Splitting the data into test and train

In [0]:
# Fixed seed so that the split, and every score derived from it, is identical on
# each Restart & Run All. Set to None to draw a fresh random split per run.
split_seed = 42

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, random_state = split_seed)
# Half of the training data, kept to compare against models trained on all of it
X_train_half, X_train_discard, y_train_half, y_train_discard = train_test_split(X_train, y_train, test_size = 0.5, random_state = split_seed)

# Normalising the data

In [0]:
# Scaling applied to the features and the target(s): 'standard' uses StandardScaler,
# 'minmax' uses MinMaxScaler, and any other value leaves the data unscaled.
normalisation_method = 'standard' # 'none', 'standard', 'minmax'

# Dimensionality Reduction

In [0]:
reduction_method = 'pca' # 'none', 'manual', or 'pca'

# Manual reduction: columns dropped when reduction_method is 'manual'
variables_to_remove = ['age']

# Principal component reduction: 'auto', or a number in the range
# 1 to the number of independent variables
n_pca_components = 'auto'
pca_model_data = {}

# An explicit component count has to fit the number of available features
if n_pca_components != 'auto':
    component_count_invalid = n_pca_components < 1 or n_pca_components > len(labels)
    if component_count_invalid:
        raise ValueError('n_pca_components must be in range 1 to no independent variables')

## Manual Reduction

In [0]:
# Absolute correlation of every column with the target(s)
corr_target = corr[target].abs()
print(corr_target)

In [0]:
# Manual reduction: drop the chosen columns from every feature set and keep the
# label bookkeeping (names and count) in step with the remaining columns.
if reduction_method == 'manual':
    X_train = X_train.drop(columns=variables_to_remove)
    X_test = X_test.drop(columns=variables_to_remove)
    X_train_half = X_train_half.drop(columns=variables_to_remove)
    labels = [name for name in labels if name not in variables_to_remove]
    # num_labels decides whether the model cells can draw a 2D plot
    num_labels = len(labels)

In [0]:
if normalisation_method in ['standard', 'minmax']:
    scaler_x = StandardScaler() if normalisation_method == 'standard' else MinMaxScaler()
    scaler_y = StandardScaler() if normalisation_method == 'standard' else MinMaxScaler()

    # The scalers are fitted on the full training split only; the test split and
    # the half-size training split are transformed with those same statistics so
    # that all three stay on one scale.
    scaler_x.fit(X_train)
    X_train_half = pd.DataFrame(scaler_x.transform(X_train_half), columns = X_train_half.columns)
    X_test = pd.DataFrame(scaler_x.transform(X_test), columns = X_test.columns)
    X_train = pd.DataFrame(scaler_x.transform(X_train), columns = X_train.columns)

    scaler_y.fit(y_train)
    y_train_half = pd.DataFrame(scaler_y.transform(y_train_half)).squeeze()
    y_test = pd.DataFrame(scaler_y.transform(y_test)).squeeze()
    y_train = pd.DataFrame(scaler_y.transform(y_train)).squeeze()

    # Saved so that predictions can later be mapped with the same scaling
    dump(scaler_x, path + '/scaler_x.joblib')
    dump(scaler_y, path + '/scaler_y.joblib')

## Principal Component Analysis

In [0]:
def run_pca(X_train_data, X_train_data_half, X_test_data, pca_components, show_plot=False):
    """Fit a PCA on the training features and project all three feature sets.

    Parameters
    ----------
    X_train_data : array-like
        Training features; the PCA is fitted on these and they are transformed.
    X_train_data_half : array-like
        Half-size training features; only transformed.
    X_test_data : array-like
        Test features; only transformed.
    pca_components
        Passed unchanged to ``PCA(n_components=...)``.
    show_plot : bool, default False
        Draw a scree plot and, when at least two components were kept, a
        scatter of the training data on the first two components.

    Returns
    -------
    list
        ``[X_train_pca, X_train_half_pca, X_test_pca, pca_labels, X_concat, pca]``
        where ``pca_labels`` is ``['PC1', 'PC2', ...]`` and ``X_concat`` is a
        DataFrame of the transformed train rows followed by the test rows.
    """
    pca = PCA(n_components = pca_components)
    pca.fit(X_train_data)

    X_train_pca = pca.transform(X_train_data)
    X_train_half_pca = pca.transform(X_train_data_half)
    X_test_pca = pca.transform(X_test_data)

    per_var = pca.explained_variance_ratio_ * 100
    pca_labels = ['PC' + str(x) for x in range (1, len(per_var) + 1)]

    X_concat = pd.concat([pd.DataFrame(X_train_pca), pd.DataFrame(X_test_pca)])
    X_concat.columns = pca_labels

    if show_plot:
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.suptitle('PCA Visualisation')
        ax1.bar(range(1, len(per_var)+1), per_var, tick_label=pca_labels)
        ax1.set_title('Scree Plot')
        ax1.set_ylabel('% of Explained Variance')
        ax1.set_xlabel('Principle Compenent')

        # Decide on the number of components actually fitted, not on the
        # notebook-level setting, which may be the string 'auto'.
        if len(pca_labels) >= 2:
            projected = np.asarray(X_train_pca)
            ax2.scatter(projected[:, 0], projected[:, 1])
            ax2.set_title('PCA Graph')
            ax2.set_xlabel('PC1 - {0}%'.format(per_var[0]))
            ax2.set_ylabel('PC2 - {0}%'.format(per_var[1]))
        else:
            # Nothing to scatter with a single component; hide the empty axes
            ax2.set_visible(False)
        plt.show()

    return [X_train_pca, X_train_half_pca, X_test_pca, pca_labels, X_concat, pca]

# Regularisation
You may set your own regularisation constant or let the notebook search for the best one.

In [0]:
# With 'auto', the regularised models call auto_regularise to search for the constant;
# any other value is passed to the models as their alpha.
regularisation_constant = 'auto' # Float or 'auto' for automatic search for optimal constant (may be slow on large dataset)

In [0]:
# Code for auto regularisation
# You may call auto_regularise with show_graph=True to see the graph of regularisation against training and test scores
def auto_regularise(X_train_data, X_test_data, y_train_data, y_test_data, method, show_graph=False):
    """Search for the regularisation constant with the best testing score.

    The search runs in two passes: a coarse pass over the powers of ten from
    0.001 to 100, followed by a fine pass over 100 evenly spaced values between
    one tenth of and ten times the best coarse value. A final model is then
    fitted with the best value found.

    NOTE: the constant is selected by its score on the test data, so the
    reported testing score is not an independent estimate.

    Parameters
    ----------
    X_train_data, y_train_data
        Training features and targets passed to ``fit``.
    X_test_data, y_test_data
        Test features and targets used for scoring and for the printed MSE.
    method : callable
        Called as ``method(alpha=value)``; must return an object providing
        ``fit``, ``predict`` and ``score``.
    show_graph : bool, default False
        Plot the fine-pass training and testing scores against the constant.

    Returns
    -------
    dict
        ``'model'`` (the final fitted model), ``'best alpha'``, ``'training'``
        and ``'testing'`` (fine-pass scores at the best constant) and ``'time'``
        (seconds taken by the final fit).
    """
    # Coarse pass: one candidate per order of magnitude
    coarse_test = {}
    for exponent in range(-3, 3):
        alpha = 10.0 ** exponent
        model = method(alpha = alpha)
        model.fit(X_train_data, y_train_data)
        coarse_test[alpha] = model.score(X_test_data, y_test_data)
    coarse_best = max(coarse_test, key=coarse_test.get)

    # Fine pass: multiples of the step are computed directly so that rounding
    # errors do not accumulate and the last grid point is never skipped
    step = coarse_best / 10
    model_train = {}
    model_test = {}
    for multiple in range(1, 101):
        alpha = step * multiple
        model = method(alpha = alpha)
        model.fit(X_train_data, y_train_data)
        model_train[alpha] = model.score(X_train_data, y_train_data)
        model_test[alpha] = model.score(X_test_data, y_test_data)

    best_alpha = max(model_test, key=model_test.get)

    # Final model; its fit time is the one that is reported and returned
    model = method(alpha = best_alpha)
    start = time.time()
    model.fit(X_train_data, y_train_data)
    end = time.time()
    time_taken = end - start
    y_pred = model.predict(X_test_data)

    print("Best regularisation constant: ", best_alpha)
    print("Mean squared error: %.2f" % mean_squared_error(y_test_data, y_pred))
    print("Training Score: ", model_train[best_alpha])
    print("Testing Score: ", model_test[best_alpha])
    print("Time Taken: ", time_taken)

    if show_graph:
        plt.plot(list(model_train.keys()), list(model_train.values()), label = 'Training score')
        plt.plot(list(model_test.keys()), list(model_test.values()), label = 'Testing score')
        plt.title("Score vs Regularisation constant")
        plt.xlabel("Regularisation")
        plt.ylabel("Score")
        plt.legend()
        plt.show()

    return {
        'model': model,
        'best alpha': best_alpha,
        'training': model_train[best_alpha],
        'testing': model_test[best_alpha],
        'time': time_taken,
    }


## Model Parameters
You may set your own model parameters or let the notebook search for the best ones.

In [0]:
# For polynomial regression
poly_degree = 'auto' # Put 'auto' to run multiple degrees

# For decision tree regression
tree_depth = 'auto' # Put 'auto' to run multiple depths

# For MLP Regression
hidden_layer_size = 'auto' # Put 'auto' to run default layer size

## Limits

In [0]:
# Upper bounds for the parameter searches of the model cells
poly_limit = 4 # Limit for polynomial degree
max_depth = 5 # Limit for decision tree depth
max_layer_size = 200 # Limit for MLP layer size

# Regression Models

In [0]:
run_models = ['linear', 'l1_linear', 'l2_linear', 'poly', 'tree'] # Specify the models you would like to run in the list
# Possible models: ['linear', 'l1_linear', 'l2_linear', 'poly', 'tree', 'forest', 'mlp']
available_models = ['linear', 'l1_linear', 'l2_linear', 'poly', 'tree', 'forest', 'mlp']

if not run_models:
    raise ValueError('You must pick at least 1 model to run')

# A misspelled name would otherwise be skipped without any message
unknown_models = [name for name in run_models if name not in available_models]
if unknown_models:
    raise ValueError('Unknown model name(s) in run_models: ' + ', '.join(str(name) for name in unknown_models))

In [0]:
# Row labels for the per-model result tables; the order matches the order in
# which the model cells append their values.
linear_index = ["Training Score", "Testing Score", "Regularisation"]
poly_index = ["Training Score", "Testing Score", "Regularisation"]
tree_index = ["Training Score", "Testing Score"]
forest_index = ["Training Score", "Testing Score"]
mlp_index = ["Training Score", "Testing Score"]

# Printed after each model's report to separate it from the next one
divider = '____________________________________________________________________'

## Linear Regression
Ordinary Least Squares Regression

In [0]:
def run_linear(X_train_data, y_train_data, X_test_data, y_test_data, show_graph=False, show_params=True):
    """Fit an ordinary least squares model and report how it performs.

    Parameters
    ----------
    X_train_data, y_train_data
        Training features and targets passed to ``fit``.
    X_test_data, y_test_data
        Test features and targets used for the MSE and the testing score.
    show_graph : bool, default False
        Plot the fit over the test points; only drawn when the notebook-level
        ``num_labels`` is 1.
    show_params : bool, default True
        Print the fitted coefficients and the intercept.

    Returns
    -------
    list
        ``[model, training_score, testing_score, time_taken]``; the scores come
        from ``model.score`` and ``time_taken`` is the fit time in seconds.
    """
    print("Linear Regression")
    linear = LinearRegression()
    start = time.time()
    linear.fit(X_train_data, y_train_data) # Train the model
    end = time.time()
    y_pred = linear.predict(X_test_data)

    if show_params:
        # The coefficients and intercept. Feature names are taken from the data
        # when it carries column names, otherwise from the notebook's labels.
        coef = np.asarray(linear.coef_)
        feature_names = list(getattr(X_train_data, 'columns', labels))
        if num_target_labels == 1:
            # coef_ is 2-D with a single row when the target is a one-column frame
            linear_coef = pd.DataFrame(coef.ravel(), index=feature_names)
        else:
            linear_coef = pd.DataFrame(coef, index=target_labels, columns=feature_names)
        print("Coefficients: \n", linear_coef)
        print("Intercept: ", linear.intercept_)

    # The mean squared error
    print("Mean squared error: %.2f" % mean_squared_error(y_test_data, y_pred))
    # Training and testing scores
    training = linear.score(X_train_data, y_train_data)
    testing = linear.score(X_test_data, y_test_data)
    print("Training score: %.2f" % training)
    print("Testing score: %.2f" % testing)

    # Time taken
    time_taken = end - start
    print("Time Taken: ", time_taken)

    # Show the plot for the model if in 2D
    if show_graph:
        if (num_labels == 1):
            target_name = ", ".join(str(name) for name in target_labels)
            plt.scatter(X_test_data, y_test_data, color="red")
            plt.plot(X_test_data, y_pred, color="blue", linewidth=3)

            plt.title("Linear Regression of " + str(labels[0]) + " and " + target_name)
            plt.xlabel(labels[0])
            plt.ylabel(target_name)
            plt.show()
        else:
            print("No visualisation available for models in higher dimensions")

    print(divider)

    return [linear, training, testing, time_taken]

## L1 Regularised Linear Regression
Linear Model trained with L1 prior as regulariser (aka the Lasso).

In [0]:
def run_l1_linear(X_train_data, y_train_data, X_test_data, y_test_data, show_graph=False, show_params=True):
    """Fit a Lasso (L1 regularised) linear model and report how it performs.

    When the notebook-level ``regularisation_constant`` is ``'auto'`` the
    constant is searched for with ``auto_regularise`` (which prints the metrics
    itself); otherwise the given constant is used as ``alpha``.

    Parameters
    ----------
    X_train_data, y_train_data
        Training features and targets passed to ``fit``.
    X_test_data, y_test_data
        Test features and targets used for the MSE and the testing score.
    show_graph : bool, default False
        Plot the fit over the test points; only drawn when the notebook-level
        ``num_labels`` is 1.
    show_params : bool, default True
        Print the fitted coefficients and the intercept.

    Returns
    -------
    list
        ``[model, training_score, testing_score, regularisation, train_time]``.
    """
    print("L1 (Lasso) Linear Regression")
    use_fixed_constant = regularisation_constant != 'auto'

    if use_fixed_constant:
        regularisation = regularisation_constant
        l1_linear = Lasso(alpha = regularisation)
        start = time.time()
        l1_linear.fit(X_train_data, y_train_data) # Train the model
        end = time.time()
        train_time = end - start
    else:
        regularise_output = auto_regularise(X_train_data, X_test_data, y_train_data, y_test_data, Lasso)
        l1_linear = regularise_output['model']
        regularisation = regularise_output['best alpha']
        train_time = regularise_output['time']

    # Needed by the metrics and by the plot, whichever way the model was obtained
    y_pred = l1_linear.predict(X_test_data)

    if show_params:
        # The coefficients and intercept. Feature names are taken from the data
        # when it carries column names, otherwise from the notebook's labels.
        coef = np.asarray(l1_linear.coef_)
        feature_names = list(getattr(X_train_data, 'columns', labels))
        if num_target_labels == 1:
            l1_linear_coef = pd.DataFrame(coef.ravel(), index=feature_names)
        else:
            l1_linear_coef = pd.DataFrame(coef, index=target_labels, columns=feature_names)
        print("Coefficients: \n", l1_linear_coef)
        print("Intercept: ", l1_linear.intercept_)

    # Scores are taken on the data passed in, not on the notebook-level splits
    training = l1_linear.score(X_train_data, y_train_data)
    testing = l1_linear.score(X_test_data, y_test_data)

    if use_fixed_constant:
        # The mean squared error
        print("Mean squared error: %.2f" % mean_squared_error(y_test_data, y_pred))
        # Training and testing scores
        print("Training score: %.2f" % training)
        print("Testing score: %.2f" % testing)
        # Time taken
        print("Time Taken: ", train_time)

    if show_graph:
        if (num_labels == 1):
            target_name = ", ".join(str(name) for name in target_labels)
            plt.scatter(X_test_data, y_test_data, color="red")
            plt.plot(X_test_data, y_pred, color="blue", linewidth=3)

            plt.title("L1 Regularised Linear Regression of " + str(labels[0]) + " and " + target_name)
            plt.xlabel(labels[0])
            plt.ylabel(target_name)
            plt.show()
        else:
            print("No visualisation available for models in higher dimensions")

    print(divider)

    return [l1_linear, training, testing, regularisation, train_time]

## L2 Regularised Linear Regression
Linear Model trained with L2 prior as regulariser (aka the Ridge).

In [0]:
def run_l2_linear(X_train_data, y_train_data, X_test_data, y_test_data, show_graph=False, show_params=True):
    """Fit a Ridge (L2 regularised) linear model and report how it performs.

    When the notebook-level ``regularisation_constant`` is ``'auto'`` the
    constant is searched for with ``auto_regularise`` (which prints the metrics
    itself); otherwise the given constant is used as ``alpha``.

    Parameters
    ----------
    X_train_data, y_train_data
        Training features and targets passed to ``fit``.
    X_test_data, y_test_data
        Test features and targets used for the MSE and the testing score.
    show_graph : bool, default False
        Plot the fit over the test points; only drawn when the notebook-level
        ``num_labels`` is 1.
    show_params : bool, default True
        Print the fitted coefficients and the intercept.

    Returns
    -------
    list
        ``[model, training_score, testing_score, regularisation, train_time]``.
    """
    print('L2 (Ridge) Linear Regression')
    use_fixed_constant = regularisation_constant != 'auto'

    if use_fixed_constant:
        regularisation = regularisation_constant
        l2_linear = Ridge(alpha = regularisation)
        start = time.time()
        l2_linear.fit(X_train_data, y_train_data) # Train the model
        end = time.time()
        train_time = end - start
    else:
        regularise_output = auto_regularise(X_train_data, X_test_data, y_train_data, y_test_data, Ridge)
        l2_linear = regularise_output['model']
        regularisation = regularise_output['best alpha']
        train_time = regularise_output['time']

    # Needed by the metrics and by the plot, whichever way the model was obtained
    y_pred = l2_linear.predict(X_test_data)

    if show_params:
        # The coefficients and intercept. Feature names are taken from the data
        # when it carries column names, otherwise from the notebook's labels.
        coef = np.asarray(l2_linear.coef_)
        feature_names = list(getattr(X_train_data, 'columns', labels))
        if num_target_labels == 1:
            # coef_ is 2-D with a single row when the target is a one-column frame
            l2_linear_coef = pd.DataFrame(coef.ravel(), index=feature_names)
        else:
            l2_linear_coef = pd.DataFrame(coef, index=target_labels, columns=feature_names)
        print("Coefficients: \n", l2_linear_coef)
        print("Intercept: ", l2_linear.intercept_)

    training = l2_linear.score(X_train_data, y_train_data)
    testing = l2_linear.score(X_test_data, y_test_data)

    if use_fixed_constant:
        # The mean squared error
        print("Mean squared error: %.2f" % mean_squared_error(y_test_data, y_pred))
        # Training and testing scores
        print("Training score: %.2f" % training)
        print("Testing score: %.2f" % testing)
        # Time taken
        print("Time Taken: ", train_time)

    if show_graph:
        if (num_labels == 1):
            target_name = ", ".join(str(name) for name in target_labels)
            plt.scatter(X_test_data, y_test_data, color="red")
            plt.plot(X_test_data, y_pred, color="blue", linewidth=3)

            plt.title("L2 Regularised Linear Regression of " + str(labels[0]) + " and " + target_name)
            plt.xlabel(labels[0])
            plt.ylabel(target_name)
            plt.show()
        else:
            print("No visualisation available for models in higher dimensions")

    print(divider)

    return [l2_linear, training, testing, regularisation, train_time]

## Polynomial Regression

In [0]:
def run_poly(X_train_data, y_train_data, X_test_data, y_test_data, regularisation_method, poly_data, show_params=True):
    """Fit regularised polynomial regressions and return the best-scoring one.

    With the notebook-level ``poly_degree`` set to ``'auto'`` every degree from
    2 up to ``poly_limit`` is tried; otherwise only the given degree is run
    (provided it does not exceed ``poly_limit``). The degree with the highest
    testing score is fitted once more and returned.

    Parameters
    ----------
    X_train_data, y_train_data
        Training features and targets.
    X_test_data, y_test_data
        Test features and targets used for scoring.
    regularisation_method : callable
        Called as ``regularisation_method(alpha=value)`` to build every model.
    poly_data : dict
        Filled in place: ``poly_data[degree] = [training, testing, regularisation]``.
    show_params : bool, default True
        Print the fitted coefficients and the intercept of each degree.

    Returns
    -------
    list
        ``[model, testing_score, time_taken, poly]`` for the best degree, where
        ``poly`` is the fitted feature transformer.

    Raises
    ------
    ValueError
        If no degree could be run, leaving ``poly_data`` empty.
    """
    print('Polynomial Regression')
    if poly_degree == 'auto':
        degrees = list(range(2, poly_limit + 1))
    else:
        degrees = [poly_degree] if poly_degree <= poly_limit else []

    for degree in degrees:
        poly = PolynomialFeatures(degree=degree, include_bias=False)
        poly_X_train = poly.fit_transform(X_train_data)
        # The transformer is fitted on the training data only
        poly_X_test = poly.transform(X_test_data)

        if regularisation_constant != 'auto':
            alpha = regularisation_constant
            # Same estimator as the model that is returned at the end
            poly_reg = regularisation_method(alpha = alpha)
            start = time.time()
            poly_reg.fit(poly_X_train, y_train_data) # Train the model
            end = time.time()
            y_pred = poly_reg.predict(poly_X_test)

            if show_params:
                # The coefficients and intercept
                poly_coef = pd.DataFrame(poly_reg.coef_)
                print("Coefficients: \n", poly_coef)
                print("Intercept: ", poly_reg.intercept_)

            # The mean squared error
            print("Mean squared error: %.2f" % mean_squared_error(y_test_data, y_pred))
            # Training and testing scores
            print("Training score: %.2f" % poly_reg.score(poly_X_train, y_train_data))
            print("Testing score: %.2f" % poly_reg.score(poly_X_test, y_test_data))

            #Time taken
            print("Time Taken: ", end - start)
        else:
            regularise_output = auto_regularise(poly_X_train, poly_X_test, y_train_data, y_test_data, regularisation_method)
            poly_reg = regularise_output['model']
            alpha = regularise_output['best alpha']

            if show_params:
                # The coefficients and intercept
                poly_coef = pd.DataFrame(poly_reg.coef_)
                print("Coefficients: \n", poly_coef)
                print("Intercept: ", poly_reg.intercept_)

        poly_data[degree] = [
            poly_reg.score(poly_X_train, y_train_data),
            poly_reg.score(poly_X_test, y_test_data),
            alpha,
        ]

    if not poly_data:
        raise ValueError('No polynomial model was trained: poly_degree must not exceed poly_limit')

    poly_df = pd.DataFrame(poly_data)
    poly_df.index = poly_index
    # Label-based lookups: the rows are indexed by name, not by position
    best_degree = poly_df.loc['Testing Score'].idxmax()
    best_alpha = float(poly_df.loc['Regularisation', best_degree])

    poly = PolynomialFeatures(degree=int(best_degree), include_bias=False)
    poly_X_train = poly.fit_transform(X_train_data)
    poly_X_test = poly.transform(X_test_data)
    poly_reg = regularisation_method(alpha = best_alpha)
    start = time.time()
    poly_reg.fit(poly_X_train, y_train_data) # Train the best model
    end = time.time()

    testing = poly_df.loc['Testing Score', best_degree]
    time_taken = end - start

    print(divider)

    return [poly_reg, testing, time_taken, poly]


## Decision Tree Regression

In [0]:
def run_tree(X_train_data, y_train_data, X_test_data, y_test_data, tree_data):
    """Fit decision tree regressors and return the best-scoring one.

    With the notebook-level ``tree_depth`` set to ``'auto'`` every depth from 1
    up to ``max_depth`` is tried; otherwise only the given depth is run
    (provided it does not exceed ``max_depth``). A tree with the depth that
    scored highest on the test data is fitted once more and returned.

    Parameters
    ----------
    X_train_data, y_train_data
        Training features and targets.
    X_test_data, y_test_data
        Test features and targets used for scoring.
    tree_data : dict
        Filled in place: ``tree_data[depth] = [training, testing]``.

    Returns
    -------
    list
        ``[model, testing_score, time_taken]`` for the best depth.

    Raises
    ------
    ValueError
        If no depth could be run, leaving ``tree_data`` empty.
    """
    print("Decision Tree Regression")
    if tree_depth == 'auto':
        depths = list(range(1, max_depth + 1))
    else:
        depths = [tree_depth] if tree_depth <= max_depth else []

    for depth in depths:
        tree = DecisionTreeRegressor(max_depth=depth)
        start = time.time()
        tree.fit(X_train_data, y_train_data)
        end = time.time()
        y_pred = tree.predict(X_test_data)

        # The depth and leaves
        print("Depth: \n", tree.get_depth())
        print("Leaves: ", tree.get_n_leaves())

        # The mean squared error
        print("Mean squared error: %.2f" % mean_squared_error(y_test_data, y_pred))
        # Training and testing scores
        training = tree.score(X_train_data, y_train_data)
        testing = tree.score(X_test_data, y_test_data)
        print("Training score: %.2f" % training)
        print("Testing score: %.2f" % testing)

        #Time taken
        print("Time Taken: ", end - start)

        tree_data[depth] = [training, testing]

    if not tree_data:
        raise ValueError('No decision tree was trained: tree_depth must not exceed max_depth')

    tree_df = pd.DataFrame(tree_data)
    tree_df.index = tree_index

    # Label-based lookups: the rows are indexed by name, not by position
    best_depth = tree_df.loc['Testing Score'].idxmax()
    tree = DecisionTreeRegressor(max_depth=int(best_depth))
    start = time.time()
    tree.fit(X_train_data, y_train_data) # Train the best model
    end = time.time()

    testing = tree_df.loc['Testing Score', best_depth]
    time_taken = end - start

    print(divider)

    return [tree, testing, time_taken]

## Random Forests Regression

In [0]:
def run_forest(X_train_data, y_train_data, X_test_data, y_test_data, forest_data):
    """Fit random forest regressors and return the best-scoring one.

    With the notebook-level ``tree_depth`` set to ``'auto'`` every depth from 1
    up to ``max_depth`` is tried; otherwise only the given depth is run
    (provided it does not exceed ``max_depth``). A forest with the depth that
    scored highest on the test data is fitted once more on the training data
    passed in and returned.

    Parameters
    ----------
    X_train_data, y_train_data
        Training features and targets.
    X_test_data, y_test_data
        Test features and targets used for scoring.
    forest_data : dict
        Filled in place: ``forest_data[depth] = [training, testing]``.

    Returns
    -------
    list
        ``[model, testing_score, time_taken]`` for the best depth.

    Raises
    ------
    ValueError
        If no depth could be run, leaving ``forest_data`` empty.
    """
    print("Random Forests Regression")
    if tree_depth == 'auto':
        depths = list(range(1, max_depth + 1))
    else:
        depths = [tree_depth] if tree_depth <= max_depth else []

    for depth in depths:
        forest = RandomForestRegressor(max_depth=depth)
        start = time.time()
        forest.fit(X_train_data, y_train_data)
        end = time.time()
        y_pred = forest.predict(X_test_data)

        # The mean squared error
        print("Mean squared error: %.2f" % mean_squared_error(y_test_data, y_pred))
        # Training and testing scores
        training = forest.score(X_train_data, y_train_data)
        testing = forest.score(X_test_data, y_test_data)
        print("Training score: %.2f" % training)
        print("Testing score: %.2f" % testing)

        #Time taken
        print("Time Taken: ", end - start)

        forest_data[depth] = [training, testing]

    if not forest_data:
        raise ValueError('No random forest was trained: tree_depth must not exceed max_depth')

    forest_df = pd.DataFrame(forest_data)
    forest_df.index = forest_index

    # Label-based lookups: the rows are indexed by name, not by position
    best_depth = forest_df.loc['Testing Score'].idxmax()
    forest = RandomForestRegressor(max_depth=int(best_depth))
    start = time.time()
    # Fit on the data passed in, so that a half-data run really uses half the data
    forest.fit(X_train_data, y_train_data) # Train the best model
    end = time.time()

    testing = forest_df.loc['Testing Score', best_depth]
    time_taken = end - start

    print(divider)

    return [forest, testing, time_taken]

## Neural Networks/Multilayer Perceptron

In [0]:
def run_mlp(X_train_data, y_train_data, X_test_data, y_test_data, mlp_data):
    """Fit a multilayer perceptron regressor and report how it performs.

    The hidden layer size comes from the notebook-level ``hidden_layer_size``
    (100 when it is ``'auto'``). When ``regularisation_constant`` is ``'auto'``
    the constant is searched for with ``auto_regularise`` (which prints the
    metrics itself); otherwise the given constant is used as ``alpha``.

    Parameters
    ----------
    X_train_data, y_train_data
        Training features and targets.
    X_test_data, y_test_data
        Test features and targets used for scoring.
    mlp_data : dict
        Filled in place: ``mlp_data[layer_size] = [training, testing]``.

    Returns
    -------
    list
        ``[model, training_score, testing_score, train_time]``.
    """
    print("MLP Regression")
    layer_size = 100 if hidden_layer_size == 'auto' else hidden_layer_size
    # A single number describes one hidden layer of that size
    layer_sizes = (layer_size,) if np.isscalar(layer_size) else tuple(layer_size)

    def build_mlp(alpha):
        # Every model, searched or not, is built with the configured layer size
        return MLPRegressor(alpha = alpha, hidden_layer_sizes = layer_sizes)

    if regularisation_constant != 'auto':
        mlp = build_mlp(regularisation_constant)
        start = time.time()
        mlp.fit(X_train_data, y_train_data) # Train the model
        end = time.time()
        y_pred = mlp.predict(X_test_data)
        train_time = end - start

        print("Hidden Layer Size: ", layer_size)
        print("Mean squared error: %.2f" % mean_squared_error(y_test_data, y_pred))
        print("Training Score: ", mlp.score(X_train_data, y_train_data))
        print("Testing Score: ", mlp.score(X_test_data, y_test_data))
        print("Time Taken: ", train_time)
    else:
        print("Hidden Layer Size: ", layer_size)
        regularise_output = auto_regularise(X_train_data, X_test_data, y_train_data, y_test_data, build_mlp)
        mlp = regularise_output['model']
        train_time = regularise_output['time']

    training = mlp.score(X_train_data, y_train_data)
    testing = mlp.score(X_test_data, y_test_data)

    mlp_data[layer_size] = [training, testing]

    print(divider)

    return [mlp, training, testing, train_time]

## Running the models

In [0]:
def run_regression(X_train, y_train, X_test, y_test, X_train_half, y_train_half):
    """Run every model family listed in ``run_models`` on full and half data.

    Each selected family is trained once on the full training set and once
    on the half-sized training set; both runs are scored on the same test set.

    Returns:
        A nine-element list:
        ``[models, model_score, model_time, model_score_half, linear_data,
        poly_data, tree_data, forest_data, mlp_data]``.
        ``model_score[name]`` is ``[testing_score, time_taken]`` (with the
        polynomial transform appended for 'poly'). ``model_time`` is never
        filled here and is returned empty. The per-family dicts collected for
        the half-data runs are discarded.
    """
    half = 'Training with half the data'

    models, model_score, model_time, model_score_half = {}, {}, {}, {}
    linear_data = {"No Regularisation": [], "L1": [], "L2": []}
    poly_data, tree_data, forest_data, mlp_data = {}, {}, {}, {}
    # Scratch dicts for the half-data runs; they are not returned.
    half_poly_data, half_tree_data, half_forest_data, half_mlp_data = {}, {}, {}, {}

    if 'linear' in run_models:
        fitted, train_score, test_score, elapsed = run_linear(X_train, y_train, X_test, y_test)
        # Unregularised fit: the regularisation slot is recorded as 0.
        linear_data["No Regularisation"].extend([train_score, test_score, 0])
        models['linear'] = fitted
        model_score['linear'] = [test_score, elapsed]

        print(half)
        _, _, test_score, elapsed = run_linear(X_train_half, y_train_half, X_test, y_test, show_params=False)
        model_score_half['linear'] = [test_score, elapsed]

    if 'l1_linear' in run_models:
        fitted, train_score, test_score, strength, elapsed = run_l1_linear(X_train, y_train, X_test, y_test)
        linear_data["L1"].extend([train_score, test_score, strength])
        models['l1_linear'] = fitted
        model_score['l1_linear'] = [test_score, elapsed]

        print(half)
        _, _, test_score, _, elapsed = run_l1_linear(X_train_half, y_train_half, X_test, y_test, show_params=False)
        model_score_half['l1_linear'] = [test_score, elapsed]

    if 'l2_linear' in run_models:
        fitted, train_score, test_score, strength, elapsed = run_l2_linear(X_train, y_train, X_test, y_test)
        linear_data["L2"].extend([train_score, test_score, strength])
        models['l2_linear'] = fitted
        model_score['l2_linear'] = [test_score, elapsed]

        print(half)
        _, _, test_score, _, elapsed = run_l2_linear(X_train_half, y_train_half, X_test, y_test, show_params=False)
        model_score_half['l2_linear'] = [test_score, elapsed]

    if 'poly' in run_models:
        fitted, test_score, elapsed, transform = run_poly(X_train, y_train, X_test, y_test, Ridge, poly_data)
        models['poly'] = fitted
        # The third element carries the polynomial transform returned by run_poly.
        model_score['poly'] = [test_score, elapsed, transform]

        print(half)
        _, test_score, elapsed, _ = run_poly(X_train_half, y_train_half, X_test, y_test, Ridge, half_poly_data, show_params=False)
        model_score_half['poly'] = [test_score, elapsed]

    if 'tree' in run_models:
        fitted, test_score, elapsed = run_tree(X_train, y_train, X_test, y_test, tree_data)
        models['tree'] = fitted
        model_score['tree'] = [test_score, elapsed]

        print(half)
        _, test_score, elapsed = run_tree(X_train_half, y_train_half, X_test, y_test, half_tree_data)
        model_score_half['tree'] = [test_score, elapsed]

    if 'forest' in run_models:
        fitted, test_score, elapsed = run_forest(X_train, y_train, X_test, y_test, forest_data)
        models['forest'] = fitted
        model_score['forest'] = [test_score, elapsed]

        print(half)
        _, test_score, elapsed = run_forest(X_train_half, y_train_half, X_test, y_test, half_forest_data)
        model_score_half['forest'] = [test_score, elapsed]

    if 'mlp' in run_models:
        fitted, _, test_score, elapsed = run_mlp(X_train, y_train, X_test, y_test, mlp_data)
        models['mlp'] = fitted
        model_score['mlp'] = [test_score, elapsed]

        print(half)
        _, _, test_score, elapsed = run_mlp(X_train_half, y_train_half, X_test, y_test, half_mlp_data)
        model_score_half['mlp'] = [test_score, elapsed]

    return [models, model_score, model_time, model_score_half, linear_data, poly_data, tree_data, forest_data, mlp_data]


In [0]:
# Train every selected model, either once on the original features or once per
# PCA component count, and expose the results under the names the later cells use.
if reduction_method == 'pca':
    # 'auto' sweeps from len(labels) components down to 1; a fixed number runs once.
    pca_components = n_pca_components if n_pca_components != 'auto' else len(labels)
    while pca_components >= 1:
        print('---------- Training with ' + str(pca_components) + ' components ----------')
        X_train_data, X_train_data_half, X_test_data, labels, X_concat, pca_model = run_pca(X_train, X_train_half, X_test, pca_components)
        pca_model_data[pca_components] = run_regression(X_train_data, y_train, X_test_data, y_test, X_train_data_half, y_train_half) + [X_concat, pca_model]
        
        if n_pca_components != 'auto':
            break
        pca_components -= 1
    # Per component count: [score entry of the top model, name of the top model].
    best_component_scores = {k:[v[1][max(v[1],key=v[1].get)][0], max(v[1],key=v[1].get)] for k,v in pca_model_data.items()}
    print(best_component_scores)
    best_n_components = max(best_component_scores, key=best_component_scores.get)
    models, model_score, model_time, model_score_half, linear_data, poly_data, tree_data, forest_data, mlp_data, X_concat_pca, pca_model = pca_model_data[best_n_components]
    
else:
    # The target names must match those used by the results cells below
    # (model_score_half, poly_data); they were previously misspelled here.
    models, model_score, model_time, model_score_half, linear_data, poly_data, tree_data, forest_data, mlp_data = run_regression(X_train, y_train, X_test, y_test, X_train_half, y_train_half)
# Full (train + test) feature and target tables, built from the un-reduced data.
X_concat = pd.concat([pd.DataFrame(X_train), pd.DataFrame(X_test)])
y_concat = pd.concat([pd.DataFrame(y_train), pd.DataFrame(y_test)])
y_concat.columns = target_labels

## Linear Regression Results
When PCA is enabled, the results shown below are for the number of principal components that achieved the highest score across all models.

In [0]:
# Keep only the linear variants that were actually run (unused ones stay empty).
linear_data = {k:v for k,v in linear_data.items() if v != []}
if linear_data:
    linear_df = pd.DataFrame(linear_data)
    linear_df.index = linear_index
    # A bare expression inside an `if` is not rendered by Jupyter, so the
    # table has to be displayed explicitly.
    display(linear_df)
else:
    print('No linear models run')

In [0]:
# Grouped bar chart of training/testing scores, one bar group member per linear variant.
if not linear_data:
    print('No linear models run')
else:
    plt.title("Linear Regression Results")
    for index, reg in enumerate(linear_data):
        # The last entry of each list is the regularisation value, not a score.
        bar_heights = linear_data[reg][:-1]
        if index == 0:
            # The first series fixes the categorical axis labels.
            plt.bar(["Training", "Testing"], bar_heights, width = 0.1, label = reg)
        else:
            # Later series are shifted right by one bar width each.
            bar_positions = np.arange(len(linear_data[reg])-1) + 0.1 * index
            plt.bar(bar_positions, bar_heights, width = 0.1, label = reg)

    plt.legend(title='Regularisation')
    plt.show()

## Polynomial Regression Results

In [0]:
# Table of polynomial regression results, one column per entry in poly_data.
if poly_data:
    poly_df = pd.DataFrame(poly_data)
    poly_df.index = poly_index
    # A bare expression inside an `if` is not rendered by Jupyter, so the
    # table has to be displayed explicitly.
    display(poly_df)
else:
    print('No polynomial models run')

In [0]:
# Grouped bar chart of training/testing scores, one series per key in poly_data.
if not poly_data:
    print('No polynomial models run')
else:
    plt.title("Polynomial Regression Results")
    for index, deg in enumerate(poly_data):
        # Only the leading entries are plotted; the last entry of each list is left out.
        bar_heights = poly_data[deg][:-1]
        if index == 0:
            # The first series fixes the categorical axis labels.
            plt.bar(["Training", "Testing"], bar_heights, width = 0.1, label = deg)
        else:
            # Later series are shifted right by one bar width each.
            bar_positions = np.arange(len(poly_data[deg])-1) + 0.1 * index
            plt.bar(bar_positions, bar_heights, width = 0.1, label = deg)

    plt.legend(title='Polynomial Degree')
    plt.show()

## Decision Tree Results

In [0]:
# Table of decision tree results, one column per entry in tree_data.
if tree_data:
    tree_df = pd.DataFrame(tree_data)
    tree_df.index = tree_index
    # A bare expression inside an `if` is not rendered by Jupyter, so the
    # table has to be displayed explicitly.
    display(tree_df)
else:
    print('No decision tree models run')

In [0]:
# Grouped bar chart of the scores stored in tree_data, one series per key.
if not tree_data:
    print('No decision tree models run')
else:
    plt.title("Decision Tree Regression Results")
    for index, depth in enumerate(tree_data):
        bar_heights = tree_data[depth]
        if index == 0:
            # The first series fixes the categorical axis labels.
            plt.bar(["Training", "Testing"], bar_heights, width = 0.1, label = depth)
        else:
            # Later series are shifted right by one bar width each.
            bar_positions = np.arange(len(bar_heights)) + 0.1 * index
            plt.bar(bar_positions, bar_heights, width = 0.1, label = depth)

    plt.legend(title='Tree Depth')
    plt.show()

## Random Forest Results

In [0]:
# Table of random forest results, one column per entry in forest_data.
if forest_data:
    forest_df = pd.DataFrame(forest_data)
    forest_df.index = forest_index
    # A bare expression inside an `if` is not rendered by Jupyter, so the
    # table has to be displayed explicitly.
    display(forest_df)
else:
    print('No random forest models run')

In [0]:
# Grouped bar chart of the scores stored in forest_data, one series per key.
if not forest_data:
    print('No random forest models run')
else:
    plt.title("Random Forests Regression Results")
    for index, depth in enumerate(forest_data):
        bar_heights = forest_data[depth]
        if index == 0:
            # The first series fixes the categorical axis labels.
            plt.bar(["Training", "Testing"], bar_heights, width = 0.1, label = depth)
        else:
            # Later series are shifted right by one bar width each.
            bar_positions = np.arange(len(bar_heights)) + 0.1 * index
            plt.bar(bar_positions, bar_heights, width = 0.1, label = depth)

    plt.legend(title='Tree Depth')
    plt.show()

## MLP Results

In [0]:
# Table of MLP results, one column per hidden layer size in mlp_data.
if mlp_data:
    mlp_df = pd.DataFrame(mlp_data)
    mlp_df.index = mlp_index
    # A bare expression inside an `if` is not rendered by Jupyter, so the
    # table has to be displayed explicitly.
    display(mlp_df)
else:
    print('No MLP models run')

In [0]:
# Grouped bar chart of the scores stored in mlp_data, one series per hidden layer size.
if mlp_data:
    plt.title("MLP Regression Results")
    for index, size in enumerate(mlp_data):
        if index == 0:
            # The first series fixes the categorical axis labels.
            plt.bar(["Training", "Testing"], mlp_data[size], width = 0.1, label = size)
            continue
        # Index by this loop's own key; the previous code used `depth`, a
        # variable left over from the tree/forest plotting cells.
        plt.bar(np.arange(len(mlp_data[size])) + 0.1 * index, mlp_data[size], width = 0.1, label = size)

    plt.legend(title='Hidden Layer Size')
    plt.show()
else:
    print('No MLP models run')

# Final Results

In [0]:
# When PCA was used, persist the per-component results, the reduced feature
# table and the fitted PCA model, then show the summary table.
if reduction_method == 'pca':
    pca_df = pd.DataFrame(pca_model_data)
    # Row labels follow the order of the list stored per component count:
    # the nine run_regression outputs followed by X_concat and the PCA model.
    pca_df.index = ['models', 'model_score', 'model_time', 'model_score_half', 'linear_data', 'poly_data', 'tree_data', 'forest_data', 'mlp_data', 'X_concat', 'pca_model']
    pca_df.to_csv(path + '/pcaresults.csv')
    X_concat_pca.to_csv(path + '/X_concat_pca.csv')
    dump(pca_model, path + '/pca_model.joblib')
    # A bare expression inside an `if` is not rendered by Jupyter, so the
    # table has to be displayed explicitly.
    display(pca_df)

In [0]:
# Build and save the score/time table for every model that was run.
cleaned_model_score = model_score.copy()
# The 'poly' entry carries a third element (the polynomial transform) that does
# not belong in the table. Use .get() so the cell also works when 'poly' was
# not among the models run (plain indexing raised KeyError).
if cleaned_model_score.get('poly'):
    cleaned_model_score['poly'] = cleaned_model_score['poly'][:2]
score_df = pd.DataFrame(cleaned_model_score)
score_df.index = ['Score', 'Time']
score_df.to_csv(path + '/results.csv')
X_concat.to_csv(path + '/X_concat.csv')
y_concat.to_csv(path + '/y_concat.csv')
score_df

In [0]:
# Bar chart of the testing score of each model that was run.
plt.title("Final Regression Results")
model = list(model_score)
score = [entry[0] for entry in model_score.values()]
plt.bar(model, score)
# Shows baseline performance from linear regression if used
if 'linear' in run_models:
    # The first score is drawn as a horizontal line across every bar.
    base_score = len(score) * [score[0]]
    plt.plot(base_score, color='red')
    print('Red line shows baseline performance from linear model')
plt.show()

In [0]:
# Bar chart of the recorded training time of each model that was run.
plt.title("Final Regression Times")
model = list(model_score)
final_time = [entry[1] for entry in model_score.values()]
plt.bar(model, final_time)
plt.show()

## Best Model

In [0]:
# Pick the model with the highest testing score. Ranking on the score alone
# avoids comparing the whole [score, time, ...] lists, which broke ties in
# favour of the model that took longer to train.
best_model = max(model_score, key=lambda name: model_score[name][0])
best_model_object = models[best_model] # This is the trained best model

print("Best Model: ", best_model)
print("Testing Score: ", model_score[best_model][0])
print("Training Time: ", model_score[best_model][1])
print("Parameters: ", best_model_object.get_params())
print("Normalisation: ", normalisation_method)
print("Principal Components: ", best_n_components if reduction_method == 'pca' else 'No PCR')

dump(best_model_object, path + '/model.joblib')

# The polynomial model is saved together with the transform stored as the
# third element of its score entry.
if best_model == 'poly':
    dump(model_score[best_model][2], path + '/poly_transform.joblib')

## Results with half the training data
Compare how each model performs when it is trained on only half of the training data.

In [0]:
# Score/time table for the models trained on half of the training data.
half_score_rows = ['Half Score', 'Time']
score_half_df = pd.DataFrame(data=model_score_half)
score_half_df.index = half_score_rows
score_half_df

In [0]:
# Side-by-side bars comparing each model's score with full vs half training data.
barWidth = 0.25
plt.title("Full vs Half Training Data Scores")

# Bar positions: full-data bars sit on the integers, half-data bars one width to the right
br1 = np.arange(len(run_models))
br2 = [barWidth + x for x in br1]

# Draw both series
full_scores = score_df.loc['Score'].values
half_scores = score_half_df.loc['Half Score'].values
plt.bar(br1, full_scores, width = barWidth, label ='Full Training')
plt.bar(br2, half_scores, width = barWidth, label ='Half Training')

# Axis labels and one tick per model
plt.xlabel('Training Data')
plt.ylabel('Scores')
tick_positions = [barWidth + r for r in range(len(run_models))]
plt.xticks(tick_positions, run_models)

plt.legend(title='Training Data')
plt.show()

In [0]:
def save_model(dirname, source_dir='./out'):
    """Copy the output directory to ``./<dirname>`` so it can be kept.

    Args:
        dirname: name of the new directory, created relative to the
            current working directory.
        source_dir: directory to copy from; defaults to ``'./out'``.

    Raises:
        ValueError: if ``./<dirname>`` already exists.
    """
    destination = './' + dirname
    if os.path.exists(destination):
        raise ValueError("Directory with name already exists")

    # shutil.copytree replaces distutils' copy_tree, which remembers the
    # directories it created and therefore fails when the same target is
    # deleted and saved again within one kernel session.
    shutil.copytree(source_dir, destination)

# Uncomment this line and run this cell to save the most recent model
# save_model('saved_model')