In [29]:
# 03 Submodels

In [30]:
# import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
weatherclimateED = pd.read_csv('weatherclimateED.csv', parse_dates = [0], dayfirst = True)
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
import itertools
import statsmodels.api as sm
from sklearn.exceptions import ConvergenceWarning
ConvergenceWarning('ignore')
from IPython.display import clear_output
import os
import joblib
from joblib import Parallel, delayed
import warnings
# warnings.simplefilter("ignore")
os.environ["PYTHONWARNINGS"] = "ignore" # this change is only for this time's session
import time
import itertools

In [31]:
## Epiweeks Module converts dates to CDC Epiweek format
## Further documentation on https://pypi.org/project/epiweeks/
from epiweeks import Week, Year
from datetime import date
def create_epiweek(date):
    return Week.fromdate(date)
def create_epiweekplot(epiweek):
    epiweek = str(epiweek)
    return F'Y{epiweek[:4]}W{epiweek[4:]}'
def create_epiweek_fromstr(str):
    return Week.fromstring(str)

In [32]:
## This section creates a full complete dataset that includes all the variables of interest that will be used
## iloc function selects the relevant variables of interest based on column number
## Problematic weather columns (i.e. don't select!): 6, 16, 17, 19, 20
## Disease columns excluded due to limited dataset: 21:24, 25
weatherclimateED['Date'] = pd.to_datetime(weatherclimateED['Date'])
weatherclimateED['epiweek'] = weatherclimateED['Date'].apply(create_epiweek)
weatherclimateED = weatherclimateED.set_index('epiweek')
weatherclimateED = weatherclimateED.iloc[:, np.r_[30:32, 33:39, 40, 42 , 45:47, 49:51,  52:54, 1:6, 8:15]]
weatherclimateED.info()

<class 'pandas.core.frame.DataFrame'>
Index: 866 entries, 200601 to 202231
Data columns (total 28 columns):
 #   Column                                                              Non-Null Count  Dtype  
---  ------                                                              --------------  -----  
 0   Cardiovascular disease                                              679 non-null    float64
 1   Chronic respiratory disease                                         679 non-null    float64
 2   Diabetes mellitus                                                   679 non-null    float64
 3   Digestive disease                                                   679 non-null    float64
 4   Endocrine disorders                                                 679 non-null    float64
 5   Factors influencing health status and contact with health services  679 non-null    float64
 6   Genitourinary disorders                                             679 non-null    float64
 7   Ill-defined di

In [33]:

## This function takes the full dataset and creates an initial dataset with the specified range
## also returns the name of the target variable for creation of the initial dataset
## note disease_var here is an integer based off the column number
def create_initial_dataset(dataset, disease_var: int):
    explore_df = dataset.copy()
    range_start = Week(2009,1)
    range_end = Week (2018,52)
    explore_df = explore_df.loc[range_start:range_end]
    target_var = explore_df.columns.values.tolist()[disease_var]

    if not os.path.exists(target_var):
        os.makedirs(target_var)
    path = os.path.join(target_var, F'initial_dataset.csv')
    
    explore_df.to_csv(path)
    #explore_df1 is pure AR and explore_df2 is with environmetal vairables
    explore_df_1 = explore_df[[target_var]] 
    explore_df_2 = pd.merge(explore_df[[target_var]], explore_df[explore_df.columns[16:28].to_list()], on='epiweek')
#     explore_df_pure = explore_df.drop(columns=target_var)
    return explore_df, explore_df_1, explore_df_2, target_var

In [34]:
def create_naive(dataset, step, target_var):
    naive = dataset.copy()
    naive = naive[[target_var]].shift(step)
    return naive.dropna()

In [35]:
def create_history_mean(dataset, lag, step, target_var):
    origin_history_mean = dataset.copy()
    history_mean = origin_history_mean[[target_var]].shift(step)
    for i in range(step + 1, step + lag):
        history_mean += origin_history_mean.shift(i)
    return history_mean.dropna() / lag

In [36]:
# Create lagged dataset
def create_lagged_dataset(dataset, lag, target_var):
    lagged_dataset = dataset.copy()
    columns_list = list(lagged_dataset.columns)
    data_join = {}
    for column in columns_list:
        if column == target_var:
            data_join[column] = lagged_dataset[column]
        for n in range(1,lag+1):
            data_join[F'{column}_L{n}'] = lagged_dataset[column].shift(n)
    lagged_dataset = pd.concat(data_join.values(), axis=1, ignore_index = True)
    lagged_dataset.columns = data_join.keys()
    return lagged_dataset.dropna()

In [37]:
## Step is the number of weeks ahead that we are forecasting, e.g. step=2 is 2 weeks ahead.
## Note step=1 results in no change to dataset, i.e. use generated lagged variables to forecast current. 
def create_stepped_dataset(dataset, step, target_var):
    stepped_dataset = dataset.copy()
    y = stepped_dataset[[target_var]].shift(-step+1)
    if step != 1:
        X = stepped_dataset.iloc[:-step+1, :]
    else:
        X = stepped_dataset
    return X.drop(target_var, axis = 1), y.dropna()
## So now target variable (y variable for exploration) is shifted back by 2 weeks. i.e., taking the y-value from 2 weeks later
## and setting it to the current index. So linear regression of y+2 with the current X values. X will have
## a smaller dataset with the last 2 time points removed because of the shift. 

In [38]:
def create_window(X, window_perc):
    return X.index[0], X.index[int(len(X)*window_perc)]
def create_output_dataset(y, window_end):
    return y.copy().loc[window_end+1:]

In [39]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from group_lasso import GroupLasso
import xgboost as xgb
import lightgbm as lgb
# from nixtlats import TimeGPT
from sklearn.decomposition import PCA
from rpy2.robjects import pandas2ri, r
import rpy2.robjects as ro
from rpy2.robjects import globalenv
from rpy2.robjects.packages import importr
from rpy2.robjects import Formula
from rpy2.robjects.conversion import localconverter

np.random.seed(0)

def block_bootstrap(data, block_size, num_resamples):
    np.random.seed(0)
    n = len(data)
    num_blks = n - block_size + 1
    resamples = []
    blks = []
    for i in range(num_blks):
        start_idx = i
        end_idx = start_idx + block_size
        blks.append(data[start_idx:end_idx])
    for _ in range(num_resamples):
        resampled_series = []
        resampled_indices = np.random.choice(num_blks, size=n // block_size + 1, replace=True)

        for j in resampled_indices:
            resampled_series.extend(blks[j])
        resamples.append(np.array(resampled_series))
    return resamples



def coefs(model, coefs_path, filename):
    coefs_path

## This function runs the first order regression for the target disease, for one specified lag and step

def regression_with_naive(X_dataset, y_dataset, X_dataset_1, y_dataset_1, X_dataset_2, y_dataset_2, window_start, window_end, y_pred_models, test_length, naive, history_mean, target_var, lag, step, block_size, num_bs, explore_dataset):
    count = 0
    df_end = X_dataset.index[-1]
    while window_end < df_end:
        if (window_end + 15) <= df_end:
            expand_test_length = 15
        else:
            expand_test_length = X_dataset.index.get_loc(df_end) - X_dataset.index.get_loc(window_end) 
    
        X = X_dataset.copy()
        y = y_dataset.copy()
        # Note: .loc is end-inclusive    
        X_train = X.loc[window_start:window_end]
        #print(X_train.info())
        ## values.ravel() converts y_train to numpy array for compatibility with models (update: already deleted this)
        y_train = y.loc[window_start:window_end]
        #print(len(y_train))
        ## double square brackets so X_test is extracted as a pandas df instead of series
        X_test = X.loc[window_end+1:window_end+expand_test_length]
        #print(X_test)
        y_test = y.loc[window_end+1:window_end+expand_test_length]
        #print(y_test)
    
        ## Scaling
        scaler = StandardScaler()
        ## .fit_transform stores the scaling parameters (fit), and transforms the training set
        X_train = scaler.fit_transform(X_train)
        ## .transform takes the previously stored scaling parameters to transform the test set
        ## Therefore, test set is transformed based on the training set parameters
        X_test = scaler.transform(X_test)
        # For all models using all variables, I use dataframe and not array
        X_train = pd.DataFrame(X_train)
        X_train.columns = X.columns
        X_test = pd.DataFrame(X_test)
        X_test.columns = X.columns

        
        ## data processing for pure AR 
        X_1 = X_dataset_1.copy()
        y_1 = y_dataset_1.copy()
        # Note: .loc is end-inclusive    
        X_train_1 = X_1.loc[window_start:window_end]
        y_train_1 = y_1.loc[window_start:window_end]
        ## double square brackets so X_test is extracted as a pandas df instead of series
        X_test_1 = X_1.loc[window_end+1:window_end+expand_test_length]
        y_test_1 = y_1.loc[window_end+1:window_end+expand_test_length]
    
        ## Scaling
        scaler = StandardScaler()
        ## .fit_transform stores the scaling parameters (fit), and transforms the training set
        X_train_1 = scaler.fit_transform(X_train_1)
        ## .transform takes the previously stored scaling parameters to transform the test set
        ## Therefore, test set is transformed based on the training set parameters
        X_test_1 = scaler.transform(X_test_1)
        # For pure factor model, I use dataframe and not array
        X_train_1 = pd.DataFrame(X_train_1)
        X_train_1.columns = X_1.columns
        X_test_1 = pd.DataFrame(X_test_1)
        X_test_1.columns = X_1.columns
        
        ## data processing for AR with environmental variables
        X_2 = X_dataset_2.copy()
        y_2 = y_dataset_2.copy()
        # Note: .loc is end-inclusive    
        X_train_2 = X_2.loc[window_start:window_end]
        y_train_2 = y_2.loc[window_start:window_end]
        ## double square brackets so X_test is extracted as a pandas df instead of series
        X_test_2 = X_2.loc[window_end+1:window_end+expand_test_length]
        y_test_2 = y_2.loc[window_end+1:window_end+expand_test_length]
    
        ## Scaling
        scaler = StandardScaler()
        ## .fit_transform stores the scaling parameters (fit), and transforms the training set
        X_train_2 = scaler.fit_transform(X_train_2)
        ## .transform takes the previously stored scaling parameters to transform the test set
        ## Therefore, test set is transformed based on the training set parameters
        X_test_2 = scaler.transform(X_test_2)
        
        X_train_2 = pd.DataFrame(X_train_2)
        X_train_2.columns = X_2.columns
        X_test_2 = pd.DataFrame(X_test_2)
        X_test_2.columns = X_2.columns
    
        ## evaluate variance
        
        ## Implement cross-validation split
        tscv = TimeSeriesSplit(n_splits = 5)
        
    
#         ## 1. Naive Forecast
#         residuals = [explore_dataset.loc[epiweek, target_var] - naive.loc[epiweek,target_var] for epiweek in naive.index]
#         resamples = block_bootstrap(residuals, block_size, num_bs)
#         resamples = np.array(resamples, dtype=object) # "dtype=object" to make sure the array can hold objects of any type
#         resamples  = resamples[:, :naive.shape[0]-resamples.shape[1]] # Excess samples are removed to align with the original data size
#         for i, resample in enumerate(resamples):
#             resample = pd.DataFrame(resample, index=naive.index, columns=naive.columns)
#             naive_tilde = resample + naive
#             naive_new = create_naive(naive_tilde, step, target_var)
#             y_pred_models['naive'].loc[window_end+1:window_end+expand_test_length, F'{i}'] = (naive_new.loc[window_end+1:window_end+expand_test_length, target_var]).values

        
#         ## 2. Historical mean (rolling window = lag)
#         residuals = [explore_dataset.loc[epiweek, target_var] - history_mean.loc[epiweek,target_var] for epiweek in history_mean.index]
#         resamples = block_bootstrap(residuals, block_size, num_bs)
#         resamples = np.array(resamples, dtype=object) # "dtype=object" to make sure the array can hold objects of any type
#         resamples  = resamples[:, :history_mean.shape[0]-resamples.shape[1]] # Excess samples are removed to align with the original data size
#         for i, resample in enumerate(resamples):
#             resample = pd.DataFrame(resample, index=history_mean.index, columns=history_mean.columns)
#             history_mean_tilde = resample + history_mean
#             history_mean_new = create_history_mean(history_mean_tilde, lag, step, target_var)
#             y_pred_models['historymean'].loc[window_end+1:window_end+expand_test_length, F'{i}'] = (history_mean_new.loc[window_end+1:window_end+expand_test_length, target_var]).values

#         ## 3. Pure AR
#         ar_pure = LinearRegression()
#         ar_pure.fit(X_train_1, y_train_1)
        
#         y_train_1_pred = ar_pure.predict(X_train_1)
#         residuals = [y_train_1.iloc[i,0] - y_train_1_pred[i] for i, epiweek in enumerate(X_train_1.index)]
#         resamples = block_bootstrap(residuals, block_size, num_bs)
#         resamples = np.array(resamples)
#         resamples  = resamples[:, :y_train_1_pred.shape[0]-resamples.shape[1]] # Excess samples are removed to align with the original data size
#         y_train_1_tilde = resamples + y_train_1_pred
#         # fit models on bootstrapping samples to estimate the empirical distribution
#         for i, sample in enumerate(y_train_1_tilde):
#             sample = pd.DataFrame(sample)
#             sample.index = y_train_1.index
#             sample.columns = y_train_1.columns
#             ar_pure_bs = LinearRegression()
#             ar_pure_bs.fit(X_train_1, sample)
#             y_pred_models['ar_pure'].loc[window_end+1:window_end+expand_test_length, F'{i}'] = ar_pure_bs.predict(X_test_1)
        
        
#         ## 4. AR only with environmental variables
#         ar_env = LinearRegression()
#         ar_env.fit(X_train_2, y_train_2)
        
#         y_train_2_pred = ar_env.predict(X_train_2)
#         residuals = [y_train_2.iloc[i,0] - y_train_2_pred[i] for i, epiweek in enumerate(X_train_2.index)]
#         resamples = block_bootstrap(residuals, block_size, num_bs)
#         resamples = np.array(resamples)
#         resamples  = resamples[:, :y_train_2_pred.shape[0]-resamples.shape[1]] # Excess samples are removed to align with the original data size
#         y_train_2_tilde = resamples + y_train_2_pred
#         # fit models on bootstrapping samples to estimate the empirical distribution
#         for i, sample in enumerate(y_train_2_tilde):
#             sample = pd.DataFrame(sample)
#             sample.index = y_train_2.index
#             sample.columns = y_train_2.columns
#             ar_env_bs = LinearRegression()
#             ar_env_bs.fit(X_train_2, sample)
#             y_pred_models['ar_env'].loc[window_end+1:window_end+expand_test_length, F'{i}'] = ar_env_bs.predict(X_test_2)
        

    
#         ## 5. Ridge model
#         ridge_cv = RidgeCV(cv = tscv)
#         ridge_cv.fit(X_train, y_train)
    
#         ridge_model = Ridge(alpha = ridge_cv.alpha_)
#         ridge_model.fit(X_train, y_train)

#         y_train_pred = ridge_model.predict(X_train)
#         residuals = [y_train.iloc[i,0] - y_train_pred[i] for i, epiweek in enumerate(X_train.index)]
#         resamples = block_bootstrap(residuals, block_size, num_bs)
#         resamples = np.array(resamples)
#         resamples  = resamples[:, :y_train_pred.shape[0]-resamples.shape[1]] # Excess samples are removed to align with the original data size
#         y_train_tilde = resamples + y_train_pred
#         # fit models on bootstrapping samples to estimate the empirical distribution
#         for i, sample in enumerate(y_train_tilde):
#             sample = pd.DataFrame(sample)
#             sample.index = y_train.index
#             sample.columns = y_train.columns
            
#             ridge_cv_bs = RidgeCV(cv = tscv)
#             ridge_cv_bs.fit(X_train, sample)
#             ridge_model_bs = Ridge(alpha = ridge_cv_bs.alpha_) 
#             ridge_model_bs.fit(X_train, sample)
#             y_pred_models['ridge'].loc[window_end+1:window_end+expand_test_length, F'{i}'] = ridge_model_bs.predict(X_test)

        
#         ## 6. Lasso Model
#         lasso_cv = LassoCV(cv = tscv, random_state = 18, max_iter = 100000)
#         lasso_cv.fit(X_train, y_train)
        
#         # Create the Lasso model with the optimal alpha value
#         lasso_model = Lasso(alpha = lasso_cv.alpha_)
#         lasso_model.fit(X_train, y_train)

#         y_train_pred = lasso_model.predict(X_train)
#         residuals = [y_train.iloc[i,0] - y_train_pred[i] for i, epiweek in enumerate(X_train.index)]
#         resamples = block_bootstrap(residuals, block_size, num_bs)
#         resamples = np.array(resamples)
#         resamples  = resamples[:, :y_train_pred.shape[0]-resamples.shape[1]] # Excess samples are removed to align with the original data size
#         y_train_tilde = resamples + y_train_pred
#         # fit models on bootstrapping samples to estimate the empirical distribution
#         for i, sample in enumerate(y_train_tilde):
#             sample = pd.DataFrame(sample)
#             sample.index = y_train.index
#             sample.columns = y_train.columns
            
#             lasso_cv_bs = LassoCV(cv = tscv, random_state = 18)
#             lasso_cv_bs.fit(X_train, sample)
#             lasso_model_bs = Lasso(alpha = lasso_cv_bs.alpha_)
#             lasso_model_bs.fit(X_train, sample)
#             y_pred_models['lasso'].loc[window_end+1:window_end+expand_test_length, F'{i}'] = lasso_model_bs.predict(X_test)
        
#         ## 7. Adaptive Lasso regression
#         linear_reg = LinearRegression()
#         linear_reg.fit(X_train, y_train)
#         initial_coef = linear_reg.coef_
#         # Calculate weights for the adaptive Lasso
#         weights = 1 / (np.abs(initial_coef) + 1e-5)
#         X_train_weighted = X_train / weights
        
#         lasso_adaptive = Lasso(alpha = lasso_cv.alpha_)
#         lasso_adaptive.fit(X_train_weighted, y_train)
        
#         lasso_adaptive.coef_ = lasso_adaptive.coef_ / weights
        
#         y_train_pred = lasso_adaptive.predict(X_train)
#         residuals = [y_train.iloc[i,0] - y_train_pred[i] for i, epiweek in enumerate(X_train.index)]
#         resamples = block_bootstrap(residuals, block_size, num_bs)
#         resamples = np.array(resamples)
#         resamples  = resamples[:, :y_train_pred.shape[0]-resamples.shape[1]] # Excess samples are removed to align with the original data size
#         y_train_tilde = resamples + y_train_pred
#         # fit models on bootstrapping samples to estimate the empirical distribution
#         for i, sample in enumerate(y_train_tilde):
#             sample = pd.DataFrame(sample)
#             sample.index = y_train.index
#             sample.columns = y_train.columns
            
#             linear_reg_bs = LinearRegression()
#             linear_reg_bs.fit(X_train, sample)
#             initial_coef_bs = linear_reg_bs.coef_
#             weights_bs = 1 / (np.abs(initial_coef_bs) + 1e-5)
#             X_train_weighted = X_train / weights_bs
#             lasso_cv_bs = LassoCV(cv = tscv, random_state = 18)
#             lasso_cv_bs.fit(X_train, sample)
#             lasso_adaptive_bs = Lasso(alpha = lasso_cv_bs.alpha_)
#             lasso_adaptive_bs.fit(X_train_weighted, sample)
#             lasso_adaptive_bs.coef_ = lasso_adaptive_bs.coef_ / weights_bs
#             y_pred_models['alasso'].loc[window_end+1:window_end+expand_test_length, F'{i}'] = lasso_adaptive_bs.predict(X_test)
        
        
        
#         ## 8. Sparse Group Lasso regression
#         group_sizes = [8 for i in range(int(X_train.shape[1]/lag))]
#         groups = np.concatenate(
#             [size * [i] for i, size in enumerate(group_sizes)]
#         ).reshape(-1, 1)
        
#         # Create the sgl model with the default group_reg and l1_reg
#         sgl_model = GroupLasso(groups=groups, random_state=18, scale_reg="inverse_group_size", fit_intercept=True, n_iter=100000, supress_warning=True)
#         sgl_model.fit(X_train, y_train)

#         y_train_pred = sgl_model.predict(X_train)
#         residuals = [y_train.iloc[i,0] - y_train_pred[i] for i, epiweek in enumerate(X_train.index)]
#         resamples = block_bootstrap(residuals, block_size, num_bs)
#         resamples = np.array(resamples)
#         resamples  = resamples[:, :y_train_pred.shape[0]-resamples.shape[1]] # Excess samples are removed to align with the original data size
#         y_train_tilde = resamples + y_train_pred
#         # fit models on bootstrapping samples to estimate the empirical distribution
#         for i, sample in enumerate(y_train_tilde):
#             sample = pd.DataFrame(sample)
#             sample.index = y_train.index
#             sample.columns = y_train.columns
#             sgl_model_bs = GroupLasso(groups=groups, random_state=18, scale_reg="inverse_group_size", fit_intercept=True, n_iter=100000, supress_warning=True)
#             sgl_model_bs.fit(X_train, sample)
#             y_pred_models['sgl'].loc[window_end+1:window_end+expand_test_length, F'{i}'] = sgl_model_bs.predict(X_test)
        
        
        ## 9. ElasticNet Model
        elasticnet_cv = ElasticNetCV(cv = tscv, max_iter = 100000)
        elasticnet_cv.fit(X_train, y_train)
    
        # Create the ElasticNet model with the optimal l1 and alpha values
        elasticnet_model = ElasticNet(alpha = elasticnet_cv.alpha_, l1_ratio = elasticnet_cv.l1_ratio_)
        elasticnet_model.fit(X_train, y_train)

        y_train_pred = elasticnet_model.predict(X_train)
        residuals = [y_train.iloc[i,0] - y_train_pred[i] for i, epiweek in enumerate(X_train.index)]
        resamples = block_bootstrap(residuals, block_size, num_bs)
        resamples = np.array(resamples)
        resamples  = resamples[:, :y_train_pred.shape[0]-resamples.shape[1]] # Excess samples are removed to align with the original data size
        y_train_tilde = resamples + y_train_pred
        # fit models on bootstrapping samples to estimate the empirical distribution
        for i, sample in enumerate(y_train_tilde):
            sample = pd.DataFrame(sample)
            sample.index = y_train.index
            sample.columns = y_train.columns
            
            elasticnet_cv_bs = ElasticNetCV(cv = tscv)
            elasticnet_cv_bs.fit(X_train, sample)
            elasticnet_model_bs = ElasticNet(alpha = elasticnet_cv_bs.alpha_, l1_ratio = elasticnet_cv_bs.l1_ratio_)
            elasticnet_model_bs.fit(X_train, sample)
            y_pred_models['elasticnet'].loc[window_end+1:window_end+expand_test_length, F'{i}'] = elasticnet_model_bs.predict(X_test)

        
#         ## 10. Pure factor model
# #         print(X_train)
#         remove_names = []
#         for name in X_train.columns:
#             if name[0:-3] == target_var:
#                 remove_names.append(name)

#         X_train_pure = X_train.drop(columns=remove_names)
#         X_test_pure = X_test.drop(columns=remove_names)
#         pca = PCA()
#         pca.fit(X_train_pure)
#         cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
#           #to explain more than 85% of the variance
#         num_components = np.where(cumulative_variance_ratio >= 0.85)[0][0] + 1 
#         pca_new = PCA(n_components=num_components)
#         X_train_pca = pca_new.fit_transform(X_train_pure)
#         X_train_pca = pd.DataFrame(X_train_pca)
#         X_train_pca.columns = X_train_pca.columns.astype(str)
#         X_train_pca = pd.merge(X_train_1, X_train_pca, left_index=True, right_index=True)

#         X_test_pca = pca_new.transform(X_test_pure)
#         X_test_pca = pd.DataFrame(X_test_pca)
#         X_test_pca.columns = X_test_pca.columns.astype(str)
#         X_test_pca = pd.merge(X_test_1, X_test_pca, left_index=True, right_index=True)
        
#         pure_factor_model = LinearRegression()
#         pure_factor_model.fit(X_train_pca, y_train)

#         y_train_pred = pure_factor_model.predict(X_train_pca)
#         residuals = [y_train.iloc[i,0] - y_train_pred[i] for i, epiweek in enumerate(X_train_pca.index)]
#         resamples = block_bootstrap(residuals, block_size, num_bs)
#         resamples = np.array(resamples)
#         resamples  = resamples[:, :y_train_pred.shape[0]-resamples.shape[1]] # Excess samples are removed to align with the original data size
#         y_train_tilde = resamples + y_train_pred
#         # fit models on bootstrapping samples to estimate the empirical distribution
#         for i, sample in enumerate(y_train_tilde):
#             sample = pd.DataFrame(sample)
#             sample.index = y_train.index
#             sample.columns = y_train.columns
#             pure_factor_model_bs = LinearRegression()
#             pure_factor_model_bs.fit(X_train_pca, sample)
#             y_pred_models['purefactor'].loc[window_end+1:window_end+expand_test_length, F'{i}'] = pure_factor_model_bs.predict(X_test_pca)

    
    
        ## 11. KNN
        knn_model = KNeighborsRegressor() #  default parameters
        knn_model.fit(X_train, y_train)

        y_train_pred = knn_model.predict(X_train)
        residuals = [y_train.iloc[i,0] - y_train_pred[i] for i, epiweek in enumerate(X_train.index)]
        resamples = block_bootstrap(residuals, block_size, num_bs)
        resamples = np.array(resamples)
        resamples  = resamples[:, :y_train_pred.shape[0]-resamples.shape[1]] # Excess samples are removed to align with the original data size
        y_train_tilde = resamples + y_train_pred
        # fit models on bootstrapping samples to estimate the empirical distribution
        for i, sample in enumerate(y_train_tilde):
            sample = pd.DataFrame(sample)
            sample.index = y_train.index
            sample.columns = y_train.columns
            knn_model_bs = KNeighborsRegressor()
            knn_model_bs.fit(X_train, sample)
            y_pred_models['knn'].loc[window_end+1:window_end+expand_test_length, F'{i}'] = knn_model_bs.predict(X_test)



#         ## 12. XGBoost
#         xgboost_model = xgb.XGBRegressor(n_estimators=1000, random_state=18)
#         xgboost_model.fit(X_train, y_train)

#         y_train_pred = xgboost_model.predict(X_train)
#         residuals = [y_train.iloc[i,0] - y_train_pred[i] for i, epiweek in enumerate(X_train.index)]
#         resamples = block_bootstrap(residuals, block_size, num_bs)
#         resamples = np.array(resamples)
#         resamples  = resamples[:, :y_train_pred.shape[0]-resamples.shape[1]] # Excess samples are removed to align with the original data size
#         y_train_tilde = resamples + y_train_pred
#         # fit models on bootstrapping samples to estimate the empirical distribution
#         for i, sample in enumerate(y_train_tilde):
#             sample = pd.DataFrame(sample)
#             sample.index = y_train.index
#             sample.columns = y_train.columns
#             xgboost_model_bs = xgb.XGBRegressor(n_estimators=1000, random_state=18)
#             xgboost_model_bs.fit(X_train, sample)
#             y_pred_models['xgboost'].loc[window_end+1:window_end+expand_test_length, F'{i}'] = xgboost_model_bs.predict(X_test)
        
    
        ##
        #keep track of model progress, every number of weeks
        tracking_interval = 5
        if window_end.weektuple()[1] % tracking_interval == 0:
            print(F'{target_var} done with {window_end+expand_test_length}; {count} out of {test_length}')
            
        ## Implement expanding window
        #window_start = window_start+1 (only for rolling window)
        window_end += 15
        count += 15

    print(F'The last epiweek for {target_var} to be predicted is: {window_end}')
    print(F'The total number of predicted epiweeks for {target_var} is: {count}')


In [40]:
## This function sets up the first order regression for the target disease, for one specified lag and step

def run_first_order_regression(dataset, dataset_1, dataset_2, lag, step, target_var, window_perc):
    print(F'Running first order regression for {target_var} lag {lag} step {step}')
    
    naive = create_naive(dataset, step, target_var)
    history_mean = create_history_mean(dataset, lag, step, target_var)
    
    lagged_dataset = create_lagged_dataset(dataset, lag, target_var)
    lagged_dataset_1 = create_lagged_dataset(dataset_1, lag, target_var)
    lagged_dataset_2 = create_lagged_dataset(dataset_2, lag, target_var)
    
    X, y = create_stepped_dataset(lagged_dataset, step, target_var)
    X_1, y_1 = create_stepped_dataset(lagged_dataset_1, step, target_var)
    X_2, y_2 = create_stepped_dataset(lagged_dataset_2, step, target_var)
    
    window_start, window_end = create_window(X, window_perc)

    print(F'The first epiweek to be predicted for {target_var} lag {lag} step {step} is: {window_end+1}')
    
    num_bs = 1000 # Bootstrapping times
    block_size = 20
#     model_list = ['naive', 'historymean', 'ar_pure', 'ar_env', 'ridge', 'lasso', 'alasso', 'sgl', 
#                  'elasticnet', 'purefactor', 'knn', 'xgboost']
    model_list = ['knn']
    
    y_pred_models = {key: create_output_dataset(y, window_end) for key in model_list}
    

    train_length = len(X.loc[window_start:window_end])
    print(F'The initial training dataset length for {target_var} lag {lag} step {step} is: {train_length}')


    test_length = len(X.loc[window_end+1:])
    print(F'The initial testing dataset length for {target_var} lag {lag} step {step} is: {test_length}')

        
    regression_with_naive(X, y, X_1, y_1, X_2, y_2, window_start, window_end, y_pred_models, test_length, naive, history_mean, target_var, lag, step, block_size, num_bs, dataset)
    
    pred_path = os.path.join(target_var, 'pred', F'L{lag}_S{step}')
    if not os.path.exists(pred_path):
        os.makedirs(pred_path)
    
    for model in model_list:
        pred_model_path = os.path.join(pred_path, F'{model}.csv')
        y_pred_models[model].to_csv(pred_model_path)

    

    

    print(F'Completed for {target_var} lag {lag} step {step}')
    clear_output(wait=False)

In [41]:
## This function runs the regression for one disease, for all lags and steps, hence the for loop

def run_disease_regression(dataset, disease_var, lag, step):
    
    ## Note how the integer disease_var is input into this function, and then
    ## the string target_var is returned for the remaining functions
    explore_df, explore_df_1, explore_df_2, target_var = create_initial_dataset(dataset, disease_var)

    with open("target_variables.txt") as target_variables_file:
        if target_var not in target_variables_file.read():
            with open("target_variables.txt", 'a') as target_variables_file:
                target_variables_file.write(F'{target_var}\n')
    
    ## run the first order regression for all lags and steps for this target variable
    print(F'Running regression for {target_var}')
    run_first_order_regression(explore_df, explore_df_1, explore_df_2, lag = lag, step = step, target_var = target_var, window_perc = 0.7)

In [42]:
start_time = time.time()
## Main function call using Parallel
## x in range (0,16) represents the 16 diseases that are the target variables. However, for this function we input them as integers
## the create_initial_dataset function will convert the integer format to string format
## Using parallel, each disease can be run on one computer core
combinations = list(itertools.product(range(0, 16), range(1, 13)))
Parallel(n_jobs=-1, verbose=51)(delayed(run_disease_regression)(weatherclimateED, x, 8, y) for x, y in combinations)
#run_full_regression(weatherclimateED, range(0,16), 8, 9, 1, 9)
end_time = time.time()
print(f"Time taken: {end_time - start_time} seconds")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
Running regression for Cardiovascular disease
Running first order regression for Cardiovascular disease lag 8 step 12
The first epiweek to be predicted for Cardiovascular disease lag 8 step 12 is: 201548
The initial training dataset length for Cardiovascular disease lag 8 step 12 is: 352
The initial testing dataset length for Cardiovascular disease lag 8 step 12 is: 150
Cardiovascular disease done with 201625; 15 out of 150
Cardiovascular disease done with 201640; 30 out of 150
Cardiovascular disease done with 201703; 45 out of 150
The last epiweek for Cardiovascular disease to be predicted is: 201841
The total number of predicted epiweeks for Cardiovascular disease is: 150
Completed for Cardiovascular disease lag 8 step 12
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   32.1s
Running regression for Cardiovascular disease
Running first order regression for Cardiovascular disease lag 8 step 11
The first

Running regression for Chronic respiratory disease
Running first order regression for Chronic respiratory disease lag 8 step 1
The first epiweek to be predicted for Chronic respiratory disease lag 8 step 1 is: 201604
The initial training dataset length for Chronic respiratory disease lag 8 step 1 is: 360
The initial testing dataset length for Chronic respiratory disease lag 8 step 1 is: 153
The last epiweek for Chronic respiratory disease to be predicted is: 201912
The total number of predicted epiweeks for Chronic respiratory disease is: 165
Completed for Chronic respiratory disease lag 8 step 1
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  1.1min
Running regression for Chronic respiratory disease
Running first order regression for Chronic respiratory disease lag 8 step 2
The first epiweek to be predicted for Chronic respiratory disease lag 8 step 2 is: 201603
The initial training dataset length for Chronic respiratory disease lag 8 step 2 is: 359
The initial testing dataset 

Running regression for Chronic respiratory disease
Running first order regression for Chronic respiratory disease lag 8 step 10
The first epiweek to be predicted for Chronic respiratory disease lag 8 step 10 is: 201549
The initial training dataset length for Chronic respiratory disease lag 8 step 10 is: 353
The initial testing dataset length for Chronic respiratory disease lag 8 step 10 is: 151
The last epiweek for Chronic respiratory disease to be predicted is: 201905
The total number of predicted epiweeks for Chronic respiratory disease is: 165
Completed for Chronic respiratory disease lag 8 step 10
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.2min
Running regression for Diabetes mellitus
Running first order regression for Diabetes mellitus lag 8 step 1
The first epiweek to be predicted for Diabetes mellitus lag 8 step 1 is: 201604
The initial training dataset length for Diabetes mellitus lag 8 step 1 is: 360
The initial testing dataset length for Diabetes mellitus lag 8 

Running regression for Digestive disease
Running first order regression for Digestive disease lag 8 step 2
The first epiweek to be predicted for Digestive disease lag 8 step 2 is: 201603
The initial training dataset length for Digestive disease lag 8 step 2 is: 359
The initial testing dataset length for Digestive disease lag 8 step 2 is: 153
Digestive disease done with 201725; 60 out of 153
Digestive disease done with 201740; 75 out of 153
Digestive disease done with 201803; 90 out of 153
The last epiweek for Digestive disease to be predicted is: 201911
The total number of predicted epiweeks for Digestive disease is: 165
Completed for Digestive disease lag 8 step 2
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  2.2min
Running regression for Digestive disease
Running first order regression for Digestive disease lag 8 step 1
The first epiweek to be predicted for Digestive disease lag 8 step 1 is: 201604
The initial training dataset length for Digestive disease lag 8 step 1 is: 36

Running regression for Endocrine disorders
Running first order regression for Endocrine disorders lag 8 step 2
The first epiweek to be predicted for Endocrine disorders lag 8 step 2 is: 201603
The initial training dataset length for Endocrine disorders lag 8 step 2 is: 359
The initial testing dataset length for Endocrine disorders lag 8 step 2 is: 153
Endocrine disorders done with 201725; 60 out of 153
Endocrine disorders done with 201740; 75 out of 153
Endocrine disorders done with 201803; 90 out of 153
The last epiweek for Endocrine disorders to be predicted is: 201911
The total number of predicted epiweeks for Endocrine disorders is: 165
Completed for Endocrine disorders lag 8 step 2
[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed:  2.8min
Running regression for Endocrine disorders
Running first order regression for Endocrine disorders lag 8 step 12
The first epiweek to be predicted for Endocrine disorders lag 8 step 12 is: 201548
The initial training dataset length for Endocri

Running regression for Factors influencing health status and contact with health services
Running first order regression for Factors influencing health status and contact with health services lag 8 step 12
The first epiweek to be predicted for Factors influencing health status and contact with health services lag 8 step 12 is: 201548
The initial training dataset length for Factors influencing health status and contact with health services lag 8 step 12 is: 352
The initial testing dataset length for Factors influencing health status and contact with health services lag 8 step 12 is: 150
Factors influencing health status and contact with health services done with 201625; 15 out of 150
Factors influencing health status and contact with health services done with 201640; 30 out of 150
Factors influencing health status and contact with health services done with 201703; 45 out of 150
The last epiweek for Factors influencing health status and contact with health services to be predicted is: 20

Running regression for Factors influencing health status and contact with health services
Running first order regression for Factors influencing health status and contact with health services lag 8 step 6
The first epiweek to be predicted for Factors influencing health status and contact with health services lag 8 step 6 is: 201552
The initial training dataset length for Factors influencing health status and contact with health services lag 8 step 6 is: 356
The initial testing dataset length for Factors influencing health status and contact with health services lag 8 step 6 is: 152
Factors influencing health status and contact with health services done with 201830; 120 out of 152
Factors influencing health status and contact with health services done with 201845; 135 out of 152
Factors influencing health status and contact with health services done with 201847; 150 out of 152
The last epiweek for Factors influencing health status and contact with health services to be predicted is: 201

[Parallel(n_jobs=-1)]: Done  79 tasks      | elapsed:  4.0min
Running regression for Genitourinary disorders
Running first order regression for Genitourinary disorders lag 8 step 6
The first epiweek to be predicted for Genitourinary disorders lag 8 step 6 is: 201552
The initial training dataset length for Genitourinary disorders lag 8 step 6 is: 356
The initial testing dataset length for Genitourinary disorders lag 8 step 6 is: 152
Genitourinary disorders done with 201830; 120 out of 152
Genitourinary disorders done with 201845; 135 out of 152
Genitourinary disorders done with 201847; 150 out of 152
The last epiweek for Genitourinary disorders to be predicted is: 201908
The total number of predicted epiweeks for Genitourinary disorders is: 165
Completed for Genitourinary disorders lag 8 step 6
[Parallel(n_jobs=-1)]: Done  80 tasks      | elapsed:  4.0min
Running regression for Genitourinary disorders
Running first order regression for Genitourinary disorders lag 8 step 7
The first epiw

Running regression for Ill-defined diseases
Running first order regression for Ill-defined diseases lag 8 step 6
The first epiweek to be predicted for Ill-defined diseases lag 8 step 6 is: 201552
The initial training dataset length for Ill-defined diseases lag 8 step 6 is: 356
The initial testing dataset length for Ill-defined diseases lag 8 step 6 is: 152
Ill-defined diseases done with 201830; 120 out of 152
Ill-defined diseases done with 201845; 135 out of 152
Ill-defined diseases done with 201847; 150 out of 152
The last epiweek for Ill-defined diseases to be predicted is: 201908
The total number of predicted epiweeks for Ill-defined diseases is: 165
Completed for Ill-defined diseases lag 8 step 6
[Parallel(n_jobs=-1)]: Done  92 tasks      | elapsed:  4.5min
Running regression for Ill-defined diseases
Running first order regression for Ill-defined diseases lag 8 step 7
The first epiweek to be predicted for Ill-defined diseases lag 8 step 7 is: 201551
The initial training dataset len

Running regression for Infectious and Parasitic Diseases
Running first order regression for Infectious and Parasitic Diseases lag 8 step 5
The first epiweek to be predicted for Infectious and Parasitic Diseases lag 8 step 5 is: 201601
The initial training dataset length for Infectious and Parasitic Diseases lag 8 step 5 is: 357
The initial testing dataset length for Infectious and Parasitic Diseases lag 8 step 5 is: 152
Infectious and Parasitic Diseases done with 201630; 15 out of 152
Infectious and Parasitic Diseases done with 201645; 30 out of 152
Infectious and Parasitic Diseases done with 201708; 45 out of 152
The last epiweek for Infectious and Parasitic Diseases to be predicted is: 201909
The total number of predicted epiweeks for Infectious and Parasitic Diseases is: 165
Completed for Infectious and Parasitic Diseases lag 8 step 5
[Parallel(n_jobs=-1)]: Done 103 tasks      | elapsed:  5.1min
Running regression for Infectious and Parasitic Diseases
Running first order regression 

Running regression for Malignant neoplasms
Running first order regression for Malignant neoplasms lag 8 step 4
The first epiweek to be predicted for Malignant neoplasms lag 8 step 4 is: 201602
The initial training dataset length for Malignant neoplasms lag 8 step 4 is: 358
The initial testing dataset length for Malignant neoplasms lag 8 step 4 is: 152
The last epiweek for Malignant neoplasms to be predicted is: 201910
The total number of predicted epiweeks for Malignant neoplasms is: 165
Completed for Malignant neoplasms lag 8 step 4
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:  5.6min
Running regression for Malignant neoplasms
Running first order regression for Malignant neoplasms lag 8 step 6
The first epiweek to be predicted for Malignant neoplasms lag 8 step 6 is: 201552
The initial training dataset length for Malignant neoplasms lag 8 step 6 is: 356
The initial testing dataset length for Malignant neoplasms lag 8 step 6 is: 152
Malignant neoplasms done with 201830; 120 ou

Running regression for Musculoskeletal disease
Running first order regression for Musculoskeletal disease lag 8 step 4
The first epiweek to be predicted for Musculoskeletal disease lag 8 step 4 is: 201602
The initial training dataset length for Musculoskeletal disease lag 8 step 4 is: 358
The initial testing dataset length for Musculoskeletal disease lag 8 step 4 is: 152
The last epiweek for Musculoskeletal disease to be predicted is: 201910
The total number of predicted epiweeks for Musculoskeletal disease is: 165
Completed for Musculoskeletal disease lag 8 step 4
[Parallel(n_jobs=-1)]: Done 126 tasks      | elapsed:  6.2min
Running regression for Musculoskeletal disease
Running first order regression for Musculoskeletal disease lag 8 step 8
The first epiweek to be predicted for Musculoskeletal disease lag 8 step 8 is: 201551
The initial training dataset length for Musculoskeletal disease lag 8 step 8 is: 355
The initial testing dataset length for Musculoskeletal disease lag 8 step 8 

Running regression for Neurological and sense disorders
Running first order regression for Neurological and sense disorders lag 8 step 3
The first epiweek to be predicted for Neurological and sense disorders lag 8 step 3 is: 201602
The initial training dataset length for Neurological and sense disorders lag 8 step 3 is: 358
The initial testing dataset length for Neurological and sense disorders lag 8 step 3 is: 153
The last epiweek for Neurological and sense disorders to be predicted is: 201910
The total number of predicted epiweeks for Neurological and sense disorders is: 165
Completed for Neurological and sense disorders lag 8 step 3
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  6.8min
Running regression for Neurological and sense disorders
Running first order regression for Neurological and sense disorders lag 8 step 5
The first epiweek to be predicted for Neurological and sense disorders lag 8 step 5 is: 201601
The initial training dataset length for Neurological and sense

Running regression for Oral Diseases
Running first order regression for Oral Diseases lag 8 step 1
The first epiweek to be predicted for Oral Diseases lag 8 step 1 is: 201604
The initial training dataset length for Oral Diseases lag 8 step 1 is: 360
The initial testing dataset length for Oral Diseases lag 8 step 1 is: 153
The last epiweek for Oral Diseases to be predicted is: 201912
The total number of predicted epiweeks for Oral Diseases is: 165
Completed for Oral Diseases lag 8 step 1
[Parallel(n_jobs=-1)]: Done 148 tasks      | elapsed:  7.3min
Running regression for Oral Diseases
Running first order regression for Oral Diseases lag 8 step 3
The first epiweek to be predicted for Oral Diseases lag 8 step 3 is: 201602
The initial training dataset length for Oral Diseases lag 8 step 3 is: 358
The initial testing dataset length for Oral Diseases lag 8 step 3 is: 153
The last epiweek for Oral Diseases to be predicted is: 201910
The total number of predicted epiweeks for Oral Diseases is:

Running regression for Other neoplasms
Running first order regression for Other neoplasms lag 8 step 7
The first epiweek to be predicted for Other neoplasms lag 8 step 7 is: 201551
The initial training dataset length for Other neoplasms lag 8 step 7 is: 355
The initial testing dataset length for Other neoplasms lag 8 step 7 is: 152
Other neoplasms done with 201613; 0 out of 152
The last epiweek for Other neoplasms to be predicted is: 201907
The total number of predicted epiweeks for Other neoplasms is: 165
Completed for Other neoplasms lag 8 step 7
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed:  7.9min
Running regression for Other neoplasms
Running first order regression for Other neoplasms lag 8 step 8
The first epiweek to be predicted for Other neoplasms lag 8 step 8 is: 201551
The initial training dataset length for Other neoplasms lag 8 step 8 is: 355
The initial testing dataset length for Other neoplasms lag 8 step 8 is: 151
Other neoplasms done with 201613; 0 out of 151
Th

Running regression for Respiratory Infection
Running first order regression for Respiratory Infection lag 8 step 6
The first epiweek to be predicted for Respiratory Infection lag 8 step 6 is: 201552
The initial training dataset length for Respiratory Infection lag 8 step 6 is: 356
The initial testing dataset length for Respiratory Infection lag 8 step 6 is: 152
Respiratory Infection done with 201830; 120 out of 152
Respiratory Infection done with 201845; 135 out of 152
Respiratory Infection done with 201847; 150 out of 152
The last epiweek for Respiratory Infection to be predicted is: 201908
The total number of predicted epiweeks for Respiratory Infection is: 165
Completed for Respiratory Infection lag 8 step 6
Running regression for Respiratory Infection
Running first order regression for Respiratory Infection lag 8 step 9
The first epiweek to be predicted for Respiratory Infection lag 8 step 9 is: 201550
The initial training dataset length for Respiratory Infection lag 8 step 9 is: 3

In [43]:
print('finish')

finish
[2K

### test (can deleteme)