# Linear model

### Serves as a baseline for more complicated models 

Importing required packages and functions

In [1]:
import sys
sys.path.append('../../')
from simCRN.multivariate_reg import read_eq_data_file, convert_np2df, get_stats, Z_normalize_data, min_max_normalize, prep_data, plot_data, plot_predict, plot_true_and_pred, plot_error_hist, plot_true_v_error, plot_residuals, subset
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling
import matplotlib.pyplot as plt
from tensorflow.keras.layers import LeakyReLU
import numpy as np
import math

ModuleNotFoundError: No module named 'matplotlib'

Reading in the data

In [2]:
Ci_all_array, Am_array, Cmin, Cmax, Ai = read_eq_data_file('../4-4-2-asym-AB-AC.txt')

In [5]:
Ci_all_array.shape

(2000, 2)

Preparing the data

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Splitting into train and test
X_train, X_test, y_train, y_test = train_test_split(Am_array, Ci_all_array, test_size=0.2, random_state=0)

# Z normalizing
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

Linear regression

In [4]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

In [5]:
clf = Ridge(alpha=1.0) # The linear regression model

clf.fit(X_train_scaled, y_train)

y_hat_train = clf.predict(X_train_scaled)
y_hat_test = clf.predict(X_test_scaled)

Evaluating the performance of the linear model

In [6]:
from sklearn.metrics import mean_absolute_error as mae

In [7]:
train_mae = mae(y_train, y_hat_train, multioutput='raw_values')
test_mae = mae(y_test, y_hat_test, multioutput='raw_values')

print(f'The MAE on the training data for C₁ is {train_mae[0]:.3}') # 3 significant figures
print(f'The MAE on the training data for C₂ is {train_mae[1]:.3}')
print(f'The MAE on the test data for C₁ is {test_mae[0]:.3}')
print(f'The MAE on the test data for C₂ is {test_mae[1]:.3}')

print() # new line

# Contextualizing with the mean of C₁ and C₂
means = np.mean(Ci_all_array, axis=0)
print(f'The average value of C₁ is {means[0]:.3}')
print(f'The average value of C₂ is {means[1]:.3}')

print() # new line

print(f'For the test data, MAE/mean for C₁ is {test_mae[0]/means[0]:.3}')
print(f'For the test data, MAE/mean for C₂ is {test_mae[1]/means[1]:.3}')

The MAE on the training data for C₁ is 7.71e-08
The MAE on the training data for C₂ is 2.6e-08
The MAE on the test data for C₁ is 7.5e-08
The MAE on the test data for C₂ is 2.74e-08

The average value of C₁ is 7.61e-07
The average value of C₂ is 7.5e-07

For the test data, MAE/mean for C₁ is 0.0986
For the test data, MAE/mean for C₂ is 0.0365


In [8]:
from sklearn.metrics import mean_squared_error as mse

In [9]:
train_mse = mse(y_train, y_hat_train, multioutput='raw_values')
test_mse = mse(y_test, y_hat_test, multioutput='raw_values')

print(f'The MSE on the training data for C₁ is {train_mse[0]:.3}') # 3 significant figures
print(f'The MSE on the training data for C₂ is {train_mse[1]:.3}')
print(f'The MSE on the test data for C₁ is {test_mse[0]:.3}')
print(f'The MSE on the test data for C₂ is {test_mse[1]:.3}')

The MSE on the training data for C₁ is 9.09e-15
The MSE on the training data for C₂ is 1.06e-15
The MSE on the test data for C₁ is 8.67e-15
The MSE on the test data for C₂ is 1.16e-15


In [10]:
from sklearn.metrics import r2_score

In [11]:
train_r2 = r2_score(y_train, y_hat_train, multioutput='raw_values')
test_r2 = r2_score(y_test, y_hat_test, multioutput='raw_values')

print(f'The R² on the training data for C₁ is {train_r2[0]:.3}') # 3 significant figures
print(f'The R² on the training data for C₂ is {train_r2[1]:.3}')
print(f'The R² on the test data for C₁ is {test_r2[0]:.3}')
print(f'The R² on the test data for C₂ is {test_r2[1]:.3}')

The R² on the training data for C₁ is 0.951
The R² on the training data for C₂ is 0.994
The R² on the test data for C₁ is 0.955
The R² on the test data for C₂ is 0.994


### Checking to see if hyperoptimization helps

In [14]:
from hyperopt import hp
from hyperopt import fmin, tpe, space_eval, Trials
from sklearn.model_selection import cross_val_score

parameter_space =  {"reg_type": hp.choice("reg_type", ["Ridge", "Lasso"]), # L2 vs L1 regularization type
                    "reg": hp.uniform("reg", 0, 10), # try values between zero (no regularization) and 10 (high regularization)
                    }

# Evaluation function 
# args should be a dict, with keys for reg_type and reg
def model_eval(args):

    '''Take suggested arguments and perform model evaluation'''
    
    if args["reg_type"] == "Ridge":
        model = Ridge(alpha=args["reg"])
    elif args["reg_type"] == "Lasso":
        model = Lasso(alpha=args["reg"])
    else:
        raise Exception("Something has gone wrong!")
    
    scores = cross_val_score(model, X_train_scaled, y=y_train, scoring='neg_mean_squared_error')
        # I believe that neg_mean_squared_error uses a uniform_average of the two outputs
        # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html#sklearn.metrics.mean_squared_error

    cv_score = np.mean(scores)

    # return the negative of the CV score to ensure we maximize the negative MSE by minimizing the loss
    return -cv_score


print("Start trials") 

trials = Trials()
best = fmin(model_eval, parameter_space, algo=tpe.suggest, max_evals=300, trials=trials)

print("Best parameter set: {}".format(best))
print("Best loss from CV: {:.2}".format(trials.best_trial['result']['loss']))

########### Code #############

Start trials
100%|████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [00:02<00:00, 117.10trial/s, best loss: 3.008871294061469e-15]
Best parameter set: {'reg': 0.0019465726744386876, 'reg_type': 0}
Best loss from CV: 3e-15
