In [None]:
# import libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Load the raw variables table from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='variables.csv')

In [None]:
# Drop the 'Epic_week' column (presumably a week label rather than a model
# variable - confirm against the CSV). The membership check keeps this cell
# idempotent: re-running it after the column is already gone no longer raises
# a KeyError.
if 'Epic_week' in df.columns:
    df = df.drop(columns=['Epic_week'])

In [None]:
# create lagged features for multivariate time series data
def create_lagged_features(data, n_in=1, n_out=1, dropna=True):
    """Turn a multivariate series into a supervised-learning matrix.

    Each output row corresponds to one time step ``t`` and is laid out as
    ``[all variables at t-n_in, ..., all variables at t-1,
       first column at t, first column at t+1, ..., first column at t+n_out-1]``.

    Parameters
    ----------
    data : DataFrame or anything ``pd.DataFrame`` accepts (2-D array, list of rows)
        Observations in time order, one row per time step. The first column
        is the variable to be forecast.
    n_in : int, default 1
        Number of lagged time steps of every variable to use as inputs.
    n_out : int, default 1
        Number of target columns to generate from the first column.
    dropna : bool, default True
        Drop every row that contains a NaN. This removes the incomplete rows
        created by shifting at both ends of the series, and also any row
        whose window touches a NaN already present in ``data``.

    Returns
    -------
    numpy.ndarray
        Array with ``n_in * n_variables + n_out`` columns.
    """
    # A distinct local name avoids shadowing the notebook-level ``df``.
    frame = pd.DataFrame(data)

    # Inputs: the whole frame shifted by n_in, ..., 1 rows (oldest lag first).
    lagged_inputs = [frame.shift(lag) for lag in range(n_in, 0, -1)]

    # Targets: only the first column, at t, t+1, ..., t+n_out-1.
    target = frame.iloc[:, 0]
    future_targets = [target.shift(-step) for step in range(n_out)]

    agg = pd.concat(lagged_inputs + future_targets, axis=1)

    if dropna:
        agg = agg.dropna()

    return agg.values


# Window configuration: how many lagged time steps feed the model and how
# many target columns are generated.
n_lag = 4
n_steps = 8

# Build the supervised-learning matrix from the loaded data frame
data_lagged = create_lagged_features(df, n_in=n_lag, n_out=n_steps)

# The trailing n_steps columns are targets and everything before them is
# input. Only the last target column (the furthest step ahead) is modelled.
X, y = data_lagged[:, :-n_steps], data_lagged[:, -1]


### Random forest and gradient boosted machines

In [None]:
# Walk-forward (expanding-window) evaluation: the last 30% of the samples are
# forecast one at a time, each with a model refit on earlier samples only.
train_size = int(len(X) * 0.7)
test_ind = range(train_size, len(X))

# The target of a row lies n_steps time steps after its newest lagged input,
# so the (n_steps - 1) rows just before the forecast row have targets that
# are not yet observed when the forecast is made. They are left out of the
# training window to avoid look-ahead leakage. This assumes consecutive rows
# of X are consecutive time steps (no interior rows dropped for NaNs).
target_gap = n_steps - 1
if len(test_ind) > 0 and train_size - target_gap < 1:
    raise ValueError('Not enough samples to train before the first forecast.')

# NaN marks the rows that are never forecast; a float array (instead of an
# object array holding None) can be passed straight to matplotlib.
y_pred_store = np.full(len(X), np.nan)
observed = []
predicted = []

for forecast_ind in test_ind:
    train_end = forecast_ind - target_gap
    X_train = X[:train_end]
    y_train = y[:train_end]
    X_test = X[forecast_ind].reshape(1, -1)
    y_test = y[forecast_ind]

    # Z-score, fitted on the training window only
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Create the RF model / GBM model (swap the comment to switch model)
    model = RandomForestRegressor(n_estimators=100, random_state=42)
#     model = GradientBoostingRegressor(n_estimators=100, random_state=42)

    # Fit the model to the training data
    model.fit(X_train, y_train)

    # Make the single-row prediction and record it
    y_pred = model.predict(X_test)
    y_pred_store[forecast_ind] = y_pred[0]
    observed.append(y_test)
    predicted.append(y_pred[0])

# Collected once after the loop instead of np.append on every iteration
y_test_array = np.array(observed, dtype=float)
y_pred_array = np.array(predicted, dtype=float)

In [None]:
# Squared and absolute error over all walk-forward test forecasts
mse = mean_squared_error(y_test_array, y_pred_array)
mae = mean_absolute_error(y_test_array, y_pred_array)

print('Mean Squared Error for {} step: {}'.format(n_steps, mse))
print('Mean Absolute Error for {} step: {}'.format(n_steps, mae))

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Both arguments are converted to numpy arrays. Each error is divided by
    the corresponding true value, so a zero in ``y_true`` yields an inf/NaN
    result.
    """
    actual = np.array(y_true)
    forecast = np.array(y_pred)
    relative_error = (actual - forecast) / actual
    return np.abs(relative_error).mean() * 100

# MAPE over all walk-forward test forecasts
mape = mean_absolute_percentage_error(y_true=y_test_array, y_pred=y_pred_array)
print('Mean Absolute Percentage Error for {} step: {:.2f}%'.format(n_steps, mape))

In [None]:
def mean_absolute_scaled_error(y_true, y_pred, horizon=None):
    """Return the forecast MAE scaled by the MAE of an h-step naive forecast.

    The naive forecast predicts each value with the value ``horizon``
    positions earlier in ``y_true``. Both errors are computed on the series
    passed in (not on separate training data), and both skip the first
    ``horizon`` positions so they cover the same points.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and forecast values, aligned and in time order.
    horizon : int, optional
        Shift used for the naive forecast. Defaults to the notebook-level
        ``n_steps``.

    Returns
    -------
    float
        ``mae / mae_naive``. If the naive error is zero the result is
        inf or NaN, as produced by numpy division.

    Raises
    ------
    ValueError
        If the inputs differ in shape, ``horizon`` is below 1, or the series
        is not longer than ``horizon``.
    """
    if horizon is None:
        horizon = n_steps  # forecast horizon defined in the notebook

    # Accept lists as well as arrays
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    if y_true.shape != y_pred.shape:
        raise ValueError('y_true and y_pred must have the same shape.')
    if horizon < 1 or len(y_true) <= horizon:
        raise ValueError('The series must be longer than a positive horizon.')

    # Mean absolute error of the forecasted values
    mae = np.mean(np.abs(y_true[horizon:] - y_pred[horizon:]))

    # Mean absolute error of the naive forecast (series shifted by horizon)
    naive_forecast = y_true[:-horizon]
    mae_naive = np.mean(np.abs(y_true[horizon:] - naive_forecast))

    return mae / mae_naive

# MASE over all walk-forward test forecasts
mase = mean_absolute_scaled_error(y_true=y_test_array, y_pred=y_pred_array)
print('Mean Absolute Scaled Error for {} step: {:.2f}'.format(n_steps, mase))

In [None]:
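# Year-boundary tick positions for each of the 8 forecast horizons. The sample
# set is offset differently for each horizon, so every boundary after the first
# moves one index to the left for each extra week ahead.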
# 1 week ahead: [0, 48, 100, 152, 204, 256, 309, 361, 413, 465, 517]
# 2 week ahead: [0, 47, 99, 151, 203, 255, 308, 360, 412, 464, 516]
# 3 week ahead: [0, 46, 98, 150, 202, 254, 307, 359, 411, 463, 515]
# 4 week ahead: [0, 45, 97, 149, 201, 253, 306, 358, 410, 462, 514]
# 5 week ahead: [0, 44, 96, 148, 200, 252, 305, 357, 409, 461, 513]
# 6 week ahead: [0, 43, 95, 147, 199, 251, 304, 356, 408, 460, 512]
# 7 week ahead: [0, 42, 94, 146, 198, 250, 303, 355, 407, 459, 511]
# 8 week ahead: [0, 41, 93, 145, 197, 249, 302, 354, 406, 458, 510]

In [None]:
# Observed series versus the walk-forward forecasts held in y_pred_store.
# y_pred_store comes from whichever model the training cell fitted last, so
# the figure is only drawn when that model really is a random forest.
# Otherwise the forecasts would be mislabelled and saved under the wrong name.
if isinstance(model, RandomForestRegressor):
    fig, ax = plt.subplots(figsize=(9, 4))

    # Set the x-axis limits
    ax.set_xlim(-30, len(X) + 40)

    # Shade the training region (everything before the first test observation)
    first_test_index = test_ind[0]
    ax.axvspan(-30, first_test_index, facecolor='lightblue', alpha=0.4)

    # Create tick locations and labels.
    # Year boundaries in the original (unlagged) series:
    tick_interval = 52
    year_starts = np.array([0, 52, 104, 156, 208, 260, 313, 365, 417, 469, 521])
    # Sample k of y corresponds to original row k + n_lag + n_steps - 1, so
    # every boundary after the first moves left by that offset
    # (lag 4, horizon 8 -> [0, 41, 93, ..., 510]).
    sample_offset = n_lag + n_steps - 1
    tick_locations = np.concatenate(([0], year_starts[1:] - sample_offset))
    tick_label_locations = tick_locations[:-1] + tick_interval / 2
    tick_labels = [f'{year:02d}' for year in range(9, 19)]

    ax.plot(y, label='Observed', linestyle='-', linewidth=0.6, alpha=1, color='black')
    ax.scatter(range(len(y_pred_store)), y_pred_store, label=F'Random forest {n_steps} week ahead forecast', marker='o', s=10, alpha=1, color='darkorchid')

    ax.set_xlabel('Year', fontsize=12)
    ax.set_ylabel('IPDs ED Attendances', fontsize=12)

    # Major ticks mark the boundaries (unlabelled); minor ticks carry the
    # year labels centred between boundaries.
    ax.set_xticks(tick_locations)
    ax.set_xticklabels([])
    ax.set_xticks(tick_label_locations, minor=True)
    ax.set_xticklabels(tick_labels, minor=True)
    ax.tick_params(axis='x', which='minor', length=0)

    ax.legend(loc='upper left', frameon=False)
    # File name follows the horizon ('RF8.tif' for n_steps = 8)
    fig.savefig('RF{}.tif'.format(n_steps), format='tif', dpi=400)
    plt.show()
else:
    print('Skipping random forest figure: the fitted model is {}.'.format(type(model).__name__))


In [None]:
# Observed series versus the walk-forward forecasts held in y_pred_store.
# y_pred_store comes from whichever model the training cell fitted last, so
# the figure is only drawn when that model really is a gradient boosted
# machine. Otherwise (e.g. on a plain Run All with the random forest active)
# random-forest forecasts would be labelled and saved as GBM results.
if isinstance(model, GradientBoostingRegressor):
    fig, ax = plt.subplots(figsize=(9, 4))

    # Set the x-axis limits
    ax.set_xlim(-30, len(X) + 40)

    # Shade the training region (everything before the first test observation)
    first_test_index = test_ind[0]
    ax.axvspan(-30, first_test_index, facecolor='lightblue', alpha=0.4)

    # Create tick locations and labels.
    # Year boundaries in the original (unlagged) series:
    tick_interval = 52
    year_starts = np.array([0, 52, 104, 156, 208, 260, 313, 365, 417, 469, 521])
    # Sample k of y corresponds to original row k + n_lag + n_steps - 1, so
    # every boundary after the first moves left by that offset
    # (lag 4, horizon 8 -> [0, 41, 93, ..., 510]).
    sample_offset = n_lag + n_steps - 1
    tick_locations = np.concatenate(([0], year_starts[1:] - sample_offset))
    tick_label_locations = tick_locations[:-1] + tick_interval / 2
    tick_labels = [f'{year:02d}' for year in range(9, 19)]

    ax.plot(y, label='Observed', linestyle='-', linewidth=0.6, alpha=1, color='black')
    ax.scatter(range(len(y_pred_store)), y_pred_store, label=F'Gradient boosted machines {n_steps} week ahead forecast', marker='o', s=10, alpha=1, color='darkorchid')

    ax.set_xlabel('Year', fontsize=12)
    ax.set_ylabel('IPDs ED Attendances', fontsize=12)

    # Major ticks mark the boundaries (unlabelled); minor ticks carry the
    # year labels centred between boundaries.
    ax.set_xticks(tick_locations)
    ax.set_xticklabels([])
    ax.set_xticks(tick_label_locations, minor=True)
    ax.set_xticklabels(tick_labels, minor=True)
    ax.tick_params(axis='x', which='minor', length=0)

    ax.legend(loc='upper left', frameon=False)
    # File name follows the horizon ('GBM8.tif' for n_steps = 8)
    fig.savefig('GBM{}.tif'.format(n_steps), format='tif', dpi=400)
    plt.show()
else:
    print('Skipping gradient boosted machines figure: the fitted model is {}.'.format(type(model).__name__))
