# Regression

This is a template notebook for regression modelling.

Author: {{ cookiecutter.author_name }}
Created: {{ cookiecutter.timestamp }}


In [0]:
# Link to project experiments folder hypothesis_experiment_learnings.board (refresh and hit enter on this line to see the link)

## How to use the notebook

The following cells:
- specify objective, variables, and variable types,
- set up the regression models,
- read dataset,
- present results from the model,
- provide the model with the best performance.

By default, the notebook is set up to run with an example (sklearn diabetes). To see how it works, run the notebook without changing the code.

For your project, adjust the code in the linked cells with your objectives, variables, dataset etc. and then execute all cells in order.

Please refer to regression.board for detailed instructions. The headers in this notebook follow the cards on the board.

In [0]:
# <halerium id="39b22147-b458-47ca-b7a6-6e198f0d285f">
# Link to regression.board
# </halerium id="39b22147-b458-47ca-b7a6-6e198f0d285f">


## Imports

In [0]:
import os
import shutil
from distutils.dir_util import copy_tree

import time
from datetime import datetime

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

from joblib import dump, load

## Parameter Setup
The following parameters and variables should be adjusted based on your data and experimental objectives.

### 2. Import the Dataset

In [0]:
# <halerium id="8595d578-e365-46d6-9b08-16e0569992e4">
# Dataset location. The sentinel 'default example' selects the bundled sklearn
# diabetes frame; any other value is handed to pd.read_csv in the import cell.
# NOTE: `path` is re-bound to the artifact folder 'out' in a later cell, so
# re-run this cell before re-running the dataset import cell.
path = 'default example' # Specify the filepath here
# Forwarded as `test_size` to train_test_split when the data is split.
test_size = 0.25 # You may adjust this value to change the train/test split ratio
# </halerium id="8595d578-e365-46d6-9b08-16e0569992e4">


Importing the dataset

In [0]:
# The bundled sklearn diabetes frame is always materialised as `example_df`
# (the template's default example); it only becomes `df` while `path` still
# holds the 'default example' sentinel. Any other `path` is read as a CSV file.
example_df = datasets.load_diabetes(as_frame=True, return_X_y=False)['frame']
df = example_df if path == 'default example' else pd.read_csv(path)

Visualising the dataset

In [0]:
# Bare expression: the notebook renders the loaded frame as this cell's output.
df

Creating the `out` folder for artifacts (any existing contents of the folder are deleted)

In [0]:
# Prepare a clean artifact folder and store a copy of the dataset in it.
# NOTE: from here on `path` means the artifact folder, not the dataset
# location chosen in the import section.
path = 'out'
isExist = os.path.exists(path)
if isExist:
    # Empty the folder but keep the folder itself. Only the top level is
    # listed: real sub-directories are removed recursively in one call, while
    # files and symlinks (including symlinks to directories, which
    # shutil.rmtree refuses to delete) are simply unlinked.
    for entry in os.listdir(path):
        entry_path = os.path.join(path, entry)
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            shutil.rmtree(entry_path)
        else:
            os.unlink(entry_path)
else:
    os.makedirs(path)

# Snapshot of the data this run was trained on (written with the index column).
df.to_csv(path + '/dataset.csv')

### 3. Specify the Variables

In [0]:
# Specify exactly the names of the dependent variables (aka variables you wish to predict)
# These columns are dropped from the input frame X and used as y when the
# inputs and outputs are extracted further down.
# <halerium id="5c3a0d9f-05df-4ed7-8dd9-2f011bf9bb59">
target = ['target'] # eg. ['variable1', 'variable2']
# </halerium id="5c3a0d9f-05df-4ed7-8dd9-2f011bf9bb59">


### 4. Normalise the data

In [0]:
# <halerium id="4f4ac754-14f7-4fd1-88cc-df724631601b">
# Forwarded to the project's scale() helper together with the train/test
# splits in the scaling cell.
normalisation_method = 'standard' # 'none', 'standard', 'minmax'
# </halerium id="4f4ac754-14f7-4fd1-88cc-df724631601b">


### 5. Reduce Dimensionality

Correlation Matrix

In [0]:
# Absolute pairwise correlation of every column with the target column(s);
# values near 0 point to candidates for manual removal below.
corr_target = df.corr()[target].abs()
print(corr_target)

Reduction Method

In [0]:
# For manual reduction
# <halerium id="9211d717-e939-4f69-b530-94c9d9a522d5">
variables_to_remove = ['sex'] # eg. ['variable1'] or empty list [] if none to remove
# </halerium id="9211d717-e939-4f69-b530-94c9d9a522d5">


### 6. Regularise the Model

In [0]:
# <halerium id="72c52810-7bf8-4e6d-8143-76772ad199a1">
# 50 log-spaced values from 1e-4 to 1. Reused as the search grid for
# `model__alpha` (lasso, ridge, poly, mlp) and `model__ccp_alpha` (tree, forest).
alpha_param = np.logspace(-4, 0, num=50) # default = np.logspace(-4, 0, num=50)
# </halerium id="72c52810-7bf8-4e6d-8143-76772ad199a1">


### 7. Limit Iterations

In [0]:
# <halerium id="9ffe4933-c590-4333-8b90-397004481121">
# Inclusive upper bound of the `poly__degree` grid, which starts at 2; a value
# below 2 therefore produces an empty degree grid.
poly_limit = 4 # Limit for polynomial degree, default = 4
# Passed as `max_depth` to DecisionTreeRegressor and RandomForestRegressor.
max_depth = None # Limit for decision tree/random forest depth, default = None
# </halerium id="9ffe4933-c590-4333-8b90-397004481121">


### 8. Run the Models

In [0]:
# <halerium id="cf674127-a6f1-4624-aa9b-b89a768166ba">
# Names must match the keys of the `models` and `models_param_grid` dicts
# defined further down.
run_models = ['linear', 'l1_linear', 'l2_linear', 'poly', 'tree'] # Specify the models you would like to run in the list
# </halerium id="cf674127-a6f1-4624-aa9b-b89a768166ba">
# Possible models: ['linear', 'l1_linear', 'l2_linear', 'poly', 'tree', 'forest', 'mlp']

# Fail early on an empty (or None) selection instead of running nothing.
if not run_models:
    raise ValueError('You must pick at least 1 model to run')

There are no required user inputs beyond this point (the only later setting is the optional `half` flag in section 10).

You may skip to the results section to view the results of the models.

Extracting the X (inputs) and y (outputs)

In [0]:
# Outputs are the user-selected target column(s); inputs are everything else.
y = df[target]
X = df.drop(columns=target)

# Column-name bookkeeping used by the later cells.
labels = X.columns.tolist()
target_labels = y.columns.tolist()
num_labels, num_target_labels = len(labels), len(target_labels)
print(labels, target_labels)

Splitting the training and testing data

In [0]:
# Hold out a `test_size` share of the rows for evaluation. No random_state is
# passed, so the split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

# Optional experiment: when a truthy `half` flag exists in the notebook
# namespace (it is set in the last section), keep only a random half of the
# training rows; the other half is parked in X_discard / y_discard.
if globals().get('half'):
    X_train, X_discard, y_train, y_discard = train_test_split(
        X_train, y_train, test_size=0.5
    )

Manual Dimensionality Reduction

In [0]:
# Manual dimensionality reduction: drop the user-selected columns from both
# splits. A name that is not a column raises KeyError from DataFrame.drop,
# which surfaces typos in `variables_to_remove` right here.
if variables_to_remove:
    X_train = X_train.drop(columns=variables_to_remove)
    X_test = X_test.drop(columns=variables_to_remove)

    # Rebuild the feature bookkeeping from the reduced frame instead of
    # removing names one by one: the column order is preserved, a name listed
    # twice no longer trips list.remove, and `num_labels` (computed before the
    # reduction) no longer goes stale.
    labels = list(X_train.columns)
    num_labels = len(labels)

Scaling

In [0]:
# Project-local helper (its implementation is not visible from this notebook).
from functions.scale import scale

# NOTE(review): the documented 'none' option is a non-empty string and is
# therefore truthy, so scale() is still called for it - presumably scale()
# treats 'none' as a no-op; confirm. Only an empty or None setting skips the call.
# `path` is the artifact folder 'out' at this point; what scale() writes
# there, if anything, cannot be told from here.
if normalisation_method:
    X_train, X_test, y_train, y_test = scale(X_train, X_test, y_train, y_test, normalisation_method, path)

Setting up the models

In [0]:
# One unfitted estimator per selectable model name. 'poly' is also a Ridge
# estimator; its parameter grid additionally tunes `poly__*` settings, so the
# polynomial step is presumably added by the regression pipeline (confirm in
# run_regression).
models = dict(
    linear=LinearRegression(),
    l1_linear=Lasso(),
    l2_linear=Ridge(),
    poly=Ridge(),
    tree=DecisionTreeRegressor(max_depth=max_depth),
    forest=RandomForestRegressor(max_depth=max_depth),
    mlp=MLPRegressor(),
)

In [0]:
# Parameter grids for each model
linear_param_grid = {
    "pca__n_components": np.linspace(1, len(labels), min(len(labels), 10), dtype=int),
    "model__fit_intercept": [True, False]
}

lasso_param_grid = {
    "pca__n_components": np.linspace(1, len(labels), min(len(labels), 10), dtype=int),
    "model__alpha": alpha_param,
    "model__fit_intercept": [True, False]
}

ridge_param_grid = {
    "pca__n_components": np.linspace(1, len(labels), min(len(labels), 10), dtype=int),
    "model__alpha": alpha_param,
    "model__fit_intercept": [True, False]
}

poly_param_grid = {
    "pca__n_components": np.linspace(1, len(labels), min(len(labels), 10), dtype=int),
    "model__alpha": alpha_param,
    "model__fit_intercept": [True, False],
    "poly__degree": np.arange(2, poly_limit + 1, 1),
    "poly__include_bias": [True, False]
}

tree_param_grid = {
    "pca__n_components": np.linspace(1, len(labels), min(len(labels), 10), dtype=int),
    "model__criterion": ['squared_error', 'friedman_mse', 'absolute_error'],
    "model__ccp_alpha": alpha_param
}

forest_param_grid = {
    "pca__n_components": np.linspace(1, len(labels), min(len(labels), 10), dtype=int),
    "model__criterion": ['squared_error', 'absolute_error'],
    "model__ccp_alpha": alpha_param,
    "model__n_estimators": np.linspace(1, 1000, 21, dtype=int)
}

mlp_param_grid = {
    "pca__n_components": np.linspace(1, len(labels), min(len(labels), 10), dtype=int),
    "model__activation": ['identity', 'logistic', 'tanh', 'relu'],
    "model__solver": ['lbfgs', 'sgd', 'adam'],
    "model__alpha": alpha_param,
    "model__learning_rate": ['constant', 'invscaling', 'adaptive']
}

In [0]:
# Lookup from model name (same keys as `models`) to its search grid.
models_param_grid = dict(
    linear=linear_param_grid,
    l1_linear=lasso_param_grid,
    l2_linear=ridge_param_grid,
    poly=poly_param_grid,
    tree=tree_param_grid,
    forest=forest_param_grid,
    mlp=mlp_param_grid,
)

Running the regression

In [0]:
# Project-local helper (its implementation is not visible from this notebook).
from functions.run_regression_pipeline import run_regression

# Runs the selected models. Judging by how the result is consumed below,
# `model_results` maps each model name to a record indexed as
# [0] testing score, [1] time taken, [2] extra detail printed for the best model.
model_results = run_regression(X_train, X_test, y_train, y_test, run_models, models, models_param_grid, labels)

### 9. Get the Results

In [0]:
# One column per model; keep only the first two entries of every result
# record (score and time) and label the rows accordingly.
results = pd.DataFrame(model_results).iloc[:2]
results.index = ['Testing Score', 'Time Taken']
results

In [0]:
# Bar chart of entry [0] (the testing score) of every model's result record.
# Despite its name, `final_time` holds scores in this cell; the name is kept
# unchanged so the notebook namespace stays the same.
model = list(model_results)
final_time = [record[0] for record in model_results.values()]
plt.bar(model, final_time)
plt.title("Testing Score")
# <halerium id="2b2d88c0-4964-484f-aa20-12cd6974301f">
plt.show()
# </halerium id="2b2d88c0-4964-484f-aa20-12cd6974301f">


In [0]:
# Bar chart of entry [1] (the time taken) of every model's result record.
model = list(model_results)
final_time = [record[1] for record in model_results.values()]
plt.bar(model, final_time)
plt.title("Training Time")
# <halerium id="2b2d88c0-4964-484f-aa20-12cd6974301f">
plt.show()
# </halerium id="2b2d88c0-4964-484f-aa20-12cd6974301f">


In [0]:
# Pick the model with the highest testing score. Each record in
# `model_results` is indexed as [0] testing score, [1] time taken and
# [2] the detail printed below.
# Ranking on the score alone avoids comparing whole records, where a tie on
# the score would be decided by the longer time and then by comparing
# element [2], which is not guaranteed to be orderable.
best_model = max(model_results, key=lambda name: model_results[name][0])
# <halerium id="2b2d88c0-4964-484f-aa20-12cd6974301f">
print("Best model:", best_model)
print(model_results[best_model][2])
# </halerium id="2b2d88c0-4964-484f-aa20-12cd6974301f">


In [0]:
# Persist the winning model's name and result record together with the feature
# and target names. `path` is the artifact folder at this point. joblib files
# are pickle-based, so only load them from trusted sources.
model_artifact = [best_model, model_results[best_model], labels, target]
dump(model_artifact, path + '/model.joblib')

### 10. Interpret and Improve

In [0]:
# <halerium id="4f82ef15-a13e-49e5-8119-af8ecd3afd3a">
# Flag read by the train/test split cell through globals(): when it is truthy,
# only a random half of the training rows is kept. It is defined at the end of
# the notebook, so it has no effect on the first top-to-bottom run; it only
# applies when the split cell and the cells after it are re-run afterwards.
half=True
# </halerium id="4f82ef15-a13e-49e5-8119-af8ecd3afd3a">
