In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# Illness dataset

In [2]:
def read_illness_file(filename: str) -> pd.DataFrame:
    """Load the illness CSV and parse its ``date`` column.

    Parameters
    ----------
    filename : str
        Location of the CSV file; handed unchanged to ``pandas.read_csv``.

    Returns
    -------
    pd.DataFrame
        The file's contents, with the ``date`` column converted through
        ``pandas.to_datetime``. All other columns are left as read.

    Raises
    ------
    KeyError
        If the file has no ``date`` column.
    """
    raw = pd.read_csv(filename)
    # Replacing an existing column via ``assign`` keeps its position in the frame.
    return raw.assign(date=pd.to_datetime(raw["date"]))

In [3]:
# Load the national illness series. The path is relative, so it is resolved
# against the directory the notebook kernel is running in.
illness = read_illness_file("data/illness/national_illness.csv")

### Train/Test split (80/20)

In [4]:
# Work on a plain NumPy matrix of the frame (column 0 is the date column).
X = np.array(illness)

# No shuffling: the leading 80% of the rows become the training set and the
# trailing 20% the test set, so row order is preserved in both parts.
split_index = int(len(X) * 0.8)
X_test = X[split_index:]
X = X[:split_index]

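### Normalisation
The mean and standard deviation are computed from the first 87.5% of the training data only (i.e. the first 70% of the full series) and are then applied to both the training and the test set.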


In [5]:
# Column 0 holds the dates: set it aside, scale the remaining columns, then
# re-attach it so X / X_test keep their original column layout.
# NOTE: this cell overwrites X and X_test. Re-run the train/test split cell
# before re-running this one, otherwise already-normalised data is normalised
# a second time.
datetime_column_train = X[:, 0]
datetime_column_test = X_test[:, 0]

features_train = X[:, 1:].astype(float)
features_test = X_test[:, 1:].astype(float)

# The statistics are estimated on the first 87.5% of the training rows only;
# the same mean/std are then applied to all training rows and to the test rows.
split_index = int(0.875 * len(X))
mean = features_train[:split_index].mean(axis=0)
std = features_train[:split_index].std(axis=0)

# A feature that is constant over the fitting rows has std == 0, and dividing
# by it would produce NaN/inf. Such a feature is only centred (divided by 1).
std = np.where(std == 0, 1.0, std)

features_train_normalized = (features_train - mean) / std
features_test_normalized = (features_test - mean) / std

X = np.column_stack((datetime_column_train, features_train_normalized))
X_test = np.column_stack((datetime_column_test, features_test_normalized))

### Align data with sliding window

In [6]:
def create_sliding_windows_X(matrix, n, h):
    """Cut every non-date column into overlapping input windows of length ``n``.

    Parameters
    ----------
    matrix : 2-D array
        Data matrix; column 0 is skipped (date column), every other column is
        windowed separately.
    n : int
        Number of consecutive rows in one input window.
    h : int
        Forecast horizon. It only limits how many windows are produced, so
        that ``h`` rows remain after the last window.

    Returns
    -------
    list of np.ndarray
        One array per non-date column. Each array stacks the windows
        ``matrix[i:i+n, col]`` for ``i = 0 .. num_rows - n - h``; if no window
        fits, the array is the empty result of ``np.array([])``.
    """
    total_rows, total_cols = matrix.shape
    window_starts = range(total_rows - n - h + 1)

    return [
        np.array([matrix[start:start + n, feature] for start in window_starts])
        for feature in range(1, total_cols)   # column 0 is the date column
    ]

def create_sliding_windows_Y(matrix, n, h):
    """Cut every non-date column into overlapping target windows of length ``h``.

    Parameters
    ----------
    matrix : 2-D array
        Data matrix; column 0 is skipped (date column), every other column is
        windowed separately.
    n : int
        Length of the input windows; the first target window starts at row
        ``n``, i.e. right after the first ``n`` rows.
    h : int
        Number of consecutive rows in one target window.

    Returns
    -------
    list of np.ndarray
        One array per non-date column. Each array stacks the windows
        ``matrix[i:i+h, col]`` for ``i = n .. num_rows - h``; if no window
        fits, the array is the empty result of ``np.array([])``.
    """
    total_rows, total_cols = matrix.shape
    target_starts = range(n, total_rows - h + 1)

    return [
        np.array([matrix[start:start + h, feature] for start in target_starts])
        for feature in range(1, total_cols)   # column 0 is the date column
    ]

In [7]:
h = [24, 36, 48, 60]      # predicted horizon
n = 96                    # length of sliding window (Xi)

# Build the training windows for the first horizon in the list, h[0] = 24.
# (A bare `len(X)` used to sit here; not being the cell's last expression it
# was never displayed and had no effect, so it was dropped.)
X_list = create_sliding_windows_X(X, n, h[0])
Y_list = create_sliding_windows_Y(X, n, h[0])

### Train, Predict and Error Functions

In [8]:
def linear_regression(X_list, Y_list):
    """Fit one independent ``LinearRegression`` per (inputs, targets) pair.

    Parameters
    ----------
    X_list : list
        Input matrices, one per variable.
    Y_list : list
        Target matrices, paired with ``X_list`` by position. ``zip`` stops at
        the shorter of the two lists.

    Returns
    -------
    list of LinearRegression
        The fitted models, in the same order as the input pairs.
    """
    # ``fit`` returns the estimator itself, so creating and fitting can be
    # written as a single expression.
    return [
        LinearRegression().fit(inputs, targets)
        for inputs, targets in zip(X_list, Y_list)
    ]

def predict(models, X_list):
    """Run each model on the input matrix at the same position.

    Parameters
    ----------
    models : list
        Objects exposing a ``predict(X)`` method.
    X_list : list
        Input matrices, paired with ``models`` by position. ``zip`` stops at
        the shorter of the two lists.

    Returns
    -------
    list
        Whatever each ``model.predict(X)`` call returned, in pairing order.
    """
    return [fitted.predict(inputs) for fitted, inputs in zip(models, X_list)]

def mse_error(prediction_list, Y_list_test):
    """Average the per-variable mean squared error over all variables.

    Parameters
    ----------
    prediction_list : list
        Predicted values, one entry per variable.
    Y_list_test : list
        Ground-truth values, paired with ``prediction_list`` by position.

    Returns
    -------
    float
        Mean of the individual ``mean_squared_error`` scores.
    """
    # scikit-learn's signature is (y_true, y_pred): ground truth goes first.
    mse_list = [
        mean_squared_error(y, prediction)
        for prediction, y in zip(prediction_list, Y_list_test)
    ]

    return np.mean(mse_list)

def mae_error(prediction_list, Y_list_test):
    """Average the per-variable mean absolute error over all variables.

    Parameters
    ----------
    prediction_list : list
        Predicted values, one entry per variable.
    Y_list_test : list
        Ground-truth values, paired with ``prediction_list`` by position.

    Returns
    -------
    float
        Mean of the individual ``mean_absolute_error`` scores.
    """
    # scikit-learn's signature is (y_true, y_pred): ground truth goes first.
    mae_list = [
        mean_absolute_error(y, prediction)
        for prediction, y in zip(prediction_list, Y_list_test)
    ]

    return np.mean(mae_list)


## Run Test

In [9]:
h = [24, 36, 48, 60]      # forecast horizons under consideration
n = 96                    # number of past steps in each input window (Xi)

# Only the last horizon in the list, h[3] = 60, is evaluated in this cell.
X_list, Y_list = (
    create_sliding_windows_X(X, n, h[3]),
    create_sliding_windows_Y(X, n, h[3]),
)
X_list_test, Y_list_test = (
    create_sliding_windows_X(X_test, n, h[3]),
    create_sliding_windows_Y(X_test, n, h[3]),
)

# One model per variable, fitted on the training windows and then applied to
# the test windows.
models = linear_regression(X_list, Y_list)
prediction_list = predict(models, X_list_test)

# Errors are averaged over all variables.
mse, mae = (
    mse_error(prediction_list, Y_list_test),
    mae_error(prediction_list, Y_list_test),
)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")


Mean Squared Error: 3.546701111670855
Mean Absolute Error: 1.369070390047721


## Try Multivariate Approach

Example for 2 variables:

X = [Var_1, Var_2]

Var_1 = [[1,2],[3,4]]

Var_2 = [[5,6],[7,8]]

Combined column-wise for the multivariate analysis:

X = [[1,2,5,6],[3,4,7,8]]

In [10]:
h = [24, 36, 48, 60]      # predicted horizon
n = 96                    # length of sliding window (Xi)

# Windows for the last horizon in the list, h[3] = 60.
X_list = create_sliding_windows_X(X, n, h[3])
Y_list = create_sliding_windows_Y(X, n, h[3])

X_list_test = create_sliding_windows_X(X_test, n, h[3])
Y_list_test = create_sliding_windows_Y(X_test, n, h[3])

# Combine all matrices into a single matrix: the per-variable windows are
# placed side by side, so each row holds the windows of every variable.
X_combined = np.concatenate(X_list, axis=1)
Y_combined = np.concatenate(Y_list, axis=1)
X_combined_test = np.concatenate(X_list_test, axis=1)
Y_combined_test = np.concatenate(Y_list_test, axis=1)

# A single model maps the combined inputs to the combined targets.
model = LinearRegression()
model.fit(X_combined, Y_combined)

prediction = model.predict(X_combined_test)

# scikit-learn metrics take (y_true, y_pred). The second line reports the
# mean absolute error and is labelled accordingly.
print(f"MSE of Multivariate linear Regression: {mean_squared_error(Y_combined_test, prediction)}")
print(f"MAE of Multivariate linear Regression: {mean_absolute_error(Y_combined_test, prediction)}")

MSE of Multivariate linear Regression: 25.882299160200798
MAE of Multivariate linear Regression: 3.97296911423748


## How does the Baseline y=0 perform?

In [11]:
def mse_error_to_baseline(Y_list_test):
    """Mean squared error of an all-zero forecast, averaged over variables.

    Parameters
    ----------
    Y_list_test : list
        Ground-truth arrays, one per variable.

    Returns
    -------
    float
        Mean of the per-variable ``mean_squared_error`` scores obtained when
        every target value is predicted as 0.
    """
    per_variable_mse = [
        mean_squared_error(targets, np.zeros_like(targets))
        for targets in Y_list_test
    ]

    return np.mean(per_variable_mse)

def mae_error_to_baseline(Y_list_test):
    """Mean absolute error of an all-zero forecast, averaged over variables.

    Parameters
    ----------
    Y_list_test : list
        Ground-truth arrays, one per variable.

    Returns
    -------
    float
        Mean of the per-variable ``mean_absolute_error`` scores obtained when
        every target value is predicted as 0.
    """
    per_variable_mae = [
        mean_absolute_error(targets, np.zeros_like(targets))
        for targets in Y_list_test
    ]

    return np.mean(per_variable_mae)


# Score the trivial all-zero forecast on the current test targets.
mse, mae = (
    mse_error_to_baseline(Y_list_test),
    mae_error_to_baseline(Y_list_test),
)

print("Error of the Baseline y=0")
print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")

Error of the Baseline y=0
Mean Squared Error: 10.369358994187028
Mean Absolute Error: 2.2950808527574647
