In [1]:
# IMPORT LIBRARIES

import itertools
import time
from math import sqrt
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#--- Import Statsmodels
import statsmodels.api as sm

#--- Import Sklearn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

#--- Importing tqdm for the progress bar
from tqdm import tnrange, tqdm_notebook

In [2]:
# READ PICKLE OF THE ORIGINAL AND DIFFERENCED SERIES
Dengue_EastVis = pd.read_pickle('C:/Users/Claire/Documents/GitHub/nasa_hack/model/datasets/Dengue_EastVis_Clean.pickle')
Dengue_EastVis_diff = pd.read_pickle('C:/Users/Claire/Documents/GitHub/nasa_hack/model/datasets/Dengue_EastVis_Diff.pickle')

In [3]:
Dengue_EastVis_diff.columns

Index(['MTD_Deaths', 'GTrend_Dengue', 'GTrend_Dengue_Fever',
       'GTrend_Dengue_Sym', 'Mort_Rate', 'MTD_Cases', 'Reg_Ave_Temp_EastVis',
       'Reg_Ave_Rainfall_EastVis'],
      dtype='object')

In [4]:
# CREATE LAGGED VERSIONS OF THE PREDICTORS (MAX OF A QUARTER)
Dengue_EastVis_diff['Temp_L1'] = Dengue_EastVis_diff['Reg_Ave_Temp_EastVis'].shift(1)
Dengue_EastVis_diff['Temp_L2'] = Dengue_EastVis_diff['Reg_Ave_Temp_EastVis'].shift(2)
Dengue_EastVis_diff['Temp_L3'] = Dengue_EastVis_diff['Reg_Ave_Temp_EastVis'].shift(3)

Dengue_EastVis_diff['Rain_L1'] = Dengue_EastVis_diff['Reg_Ave_Rainfall_EastVis'].shift(1)
Dengue_EastVis_diff['Rain_L2'] = Dengue_EastVis_diff['Reg_Ave_Rainfall_EastVis'].shift(2)
Dengue_EastVis_diff['Rain_L3'] = Dengue_EastVis_diff['Reg_Ave_Rainfall_EastVis'].shift(3)

Dengue_EastVis_diff['GT_Dengue_L1'] = Dengue_EastVis_diff['GTrend_Dengue'].shift(1)
Dengue_EastVis_diff['GT_Dengue_L2'] = Dengue_EastVis_diff['GTrend_Dengue'].shift(2)
Dengue_EastVis_diff['GT_Dengue_L3'] = Dengue_EastVis_diff['GTrend_Dengue'].shift(3)

Dengue_EastVis_diff['GT_DengueFvr_L1'] = Dengue_EastVis_diff['GTrend_Dengue_Fever'].shift(1)
Dengue_EastVis_diff['GT_DengueFvr_L2'] = Dengue_EastVis_diff['GTrend_Dengue_Fever'].shift(2)
Dengue_EastVis_diff['GT_DengueFvr_L3'] = Dengue_EastVis_diff['GTrend_Dengue_Fever'].shift(3)

Dengue_EastVis_diff['GT_DengueSym_L1'] = Dengue_EastVis_diff['GTrend_Dengue_Sym'].shift(1)
Dengue_EastVis_diff['GT_DengueSym_L2'] = Dengue_EastVis_diff['GTrend_Dengue_Sym'].shift(2)
Dengue_EastVis_diff['GT_DengueSym_L3'] = Dengue_EastVis_diff['GTrend_Dengue_Sym'].shift(3)

#Dengue_EastVis_diff['Cases_L1'] = Dengue_EastVis_diff['MTD_Cases'].shift(1)
#Dengue_EastVis_diff['Cases_L2'] = Dengue_EastVis_diff['MTD_Cases'].shift(2)
#Dengue_EastVis_diff['Cases_L3'] = Dengue_EastVis_diff['MTD_Cases'].shift(3)

Dummies = pd.get_dummies(Dengue_EastVis_diff.index.month, prefix='m')
Dengue_EastVis_diff = Dengue_EastVis_diff.reset_index()
Dengue_EastVis_diff = Dengue_EastVis_diff.merge(Dummies, left_index=True, right_index=True)
Dengue_EastVis_diff.set_index('Date', inplace=True)

In [5]:
# SPLIT SERIES TO TRAINING AND TEST SETS
#--- Set 2018 as the test dataframe
nobs = 12
df_train, df_test = Dengue_EastVis_diff[0:-nobs], Dengue_EastVis_diff[-nobs:]
df_train = df_train.dropna()
df_test = df_test.dropna()

# Check size
print(df_train.shape)  
print(df_test.shape)  


(32, 35)
(12, 35)


In [6]:
# PERFORM UNIVARIATE REGRESSION TO TRIM DOWN THE PREDICTORS
predictor_col = df_train.columns[df_train.columns.str.contains(pat = '_L')]
pvals = pd.DataFrame()
for col in predictor_col:
    Y = df_train.MTD_Cases
    X = df_train[col]
    X2 = sm.add_constant(X)
    mod = sm.OLS(Y,X)
    fit = mod.fit()
    pval = fit.summary2().tables[1]['P>|t|']
    pval = pval.to_frame()
    pvals = pvals.append(pval)

# RETAIN ONLY THE LAGGED PREDICTORS WITH SIGNIFICANT P-VALUES
pvals = pvals[pvals['P>|t|'] <= 0.05].reset_index()
pvals = pvals.rename(columns={'index':'Variable'})
shortlist_predictor_col = pvals['Variable']
dummy = pd.Series(df_train.columns[df_train.columns.str.startswith('m_')])
shortlist_predictor_col = shortlist_predictor_col.append(dummy)
X = df_train[shortlist_predictor_col].drop(columns=['m_1'],axis=1)
#X = df_train[shortlist_predictor_col]
list(X)

['Temp_L2',
 'GT_Dengue_L1',
 'GT_Dengue_L3',
 'm_2',
 'm_3',
 'm_4',
 'm_5',
 'm_6',
 'm_7',
 'm_8',
 'm_9',
 'm_10',
 'm_11',
 'm_12']

In [7]:
def fit_linear_reg(X,Y):
    #Fit linear regression model 
    X2 = sm.add_constant(X)
    model_k = sm.OLS(Y,X2)
    fit = model_k.fit()
    pvalues = fit.pvalues
    worst_pval = pvalues.max() 
    worst_feature = pvalues.argmax()
    Y_Pred_Test = fit.predict(sm.add_constant(df_test[list(X)]))
    
    if worst_feature == 'const':
        model_k = sm.OLS(Y,X)
        fit = model_k.fit()
        pvalues = fit.pvalues
        worst_pval = pvalues.max() 
        worst_feature = pvalues.argmax()
        Y_Pred_Test = fit.predict(df_test[list(X)])
        
    pval = fit.pvalues.to_frame()
    features = list(pval.index)
    pvals = list(pval[0])
    sig = pval[pval[0]<=0.05]
    pct_sig = len(list(sig[0])) / len(list(pval[0]))
    rsq = fit.rsquared
    adjr = fit.rsquared_adj
    serial_corr = list(sm.stats.diagnostic.acorr_breusch_godfrey(fit, nlags=3))[3]
    het_arch = list(sm.stats.diagnostic.het_arch(fit.resid, maxlag=1))[3]
    normality = list(sm.stats.stattools.jarque_bera(fit.resid))[1]
    
    mae = mean_absolute_error(Y_True_Test,Y_Pred_Test)
    mse = mean_squared_error(Y_True_Test,Y_Pred_Test)
    rmse = sqrt(mean_squared_error(Y_True_Test,Y_Pred_Test))
    return features, pvals, pct_sig, rsq, adjr, serial_corr, het_arch, normality, mae, mse, rmse, worst_pval, worst_feature

In [8]:
# INITIALIZE VARIABLES
#X = df_train[df_train.columns[df_train.columns.str.contains(r'_L|m_')]]
Y = df_train.MTD_Cases
Y_True_Test = df_test.MTD_Cases
threshold_out = 0.05

remaining_features = list(X.columns.values)
features = []
R_squared_list, AdjR2_list, feature_list, pval_list = [],[],[],[]
pct_sig_list = []
num_features = []
serial_corr_list = []
het_arch_list = []
norm_list = []
mae_list, mse_list, rmse_list = [],[],[]

# RUN BACKWARD STEPWISE REGRESSION 
#--- Remove predictors one at a time until there is no more p-value exceeding the threshold
while True:
    changed = False
    tmp_result = fit_linear_reg(X[list(set(remaining_features))],Y)  
    num_features.append(len(remaining_features)) 
    feature_list.append(tmp_result[0])
    pval_list.append(tmp_result[1])
    pct_sig_list.append(tmp_result[2])
    R_squared_list.append(tmp_result[3])
    AdjR2_list.append(tmp_result[4])
    serial_corr_list.append(tmp_result[5])
    het_arch_list.append(tmp_result[6])
    norm_list.append(tmp_result[7])
    mae_list.append(tmp_result[8])
    mse_list.append(tmp_result[9])
    rmse_list.append(tmp_result[10])
    
    if tmp_result[11] > threshold_out:
        changed = True
        remaining_features.remove(tmp_result[12])
        
    if not changed:
        break

will be corrected to return the positional maximum in the future.
Use 'series.values.argmax' to get the position of the maximum now.
  
will be corrected to return the positional maximum in the future.
Use 'series.values.argmax' to get the position of the maximum now.
  app.launch_new_instance()


In [9]:
# STORE IN DATAFRAME
subsets_df = pd.DataFrame({'num_features': num_features, 'features': feature_list, 'P>|t|': pval_list, 'pct_sig': pct_sig_list, \
                          'rsq': R_squared_list, 'adj_rsq': AdjR2_list, 'serial_corr': serial_corr_list, 'het': het_arch_list, \
                          'normality': norm_list, 'mae': mae_list, 'mse': mse_list, 'rmse': rmse_list})

# RETAIN ONLY THE SUBSET MODELS WHICH PASSED THE DIAGNOSTIC TESTS
subsets_df = subsets_df[(subsets_df['serial_corr'] > 0.05) & (subsets_df['het'] > 0.05) & (subsets_df['normality'] > 0.05)]

# GET ONLY THE TOP 3 MODELS BY ADJUSTED R-SQUARED
top3_subsets_df = subsets_df.nlargest(3,'adj_rsq').reset_index().drop(columns=['index'])
top3_subsets_df.index += 1

In [11]:
# RERUN THE TOP 3 MODELS AND STORE THE FORECASTS FOR BOTH TRAINING AND TEST SETS
Y = df_train['MTD_Cases']

for i in range(1,4):
    top = top3_subsets_df['features'][i]
    if top.count('const') == 0:
        X = df_train[top]    
        reg = LinearRegression(fit_intercept=False)

    if top.count('const') > 0:
        top.remove('const')
        X = df_train[top]    
        reg = LinearRegression(fit_intercept=True)

    reg.fit(X,Y)
    prediction = pd.DataFrame(reg.predict(X))
    prediction.columns = ['Pred_Model_'] 
    prediction.columns += str(i)
    if i == 1:
        df_train2 = df_train.reset_index().join(prediction, how='inner')
    if i > 1:
        df_train2 = df_train2.reset_index().join(prediction, how='inner')

    prediction = pd.DataFrame(reg.predict(df_test[X.columns]))
    prediction.columns = ['Pred_Model_']
    prediction.columns += str(i)
    if i == 1:
        df_test2 = df_test.reset_index().join(prediction, how='inner')
    if i > 1:
        df_test2 = df_test2.reset_index().join(prediction, how='inner')

In [12]:
# REVERT THE FORECAST TO THE ORIGINAL FORM FROM PERCENTAGE CHANGE
Dengue_EastVis_Fct = df_train2.append(df_test2, ignore_index=True)
Dengue_EastVis_Fct = Dengue_EastVis_Fct[['Date','Pred_Model_1','Pred_Model_2','Pred_Model_3']]
Dengue_EastVis_Fct['Pred_Model_1'] += 1
Dengue_EastVis_Fct['Pred_Model_2'] += 1
Dengue_EastVis_Fct['Pred_Model_3'] += 1
Dengue_EastVis_Fct = Dengue_EastVis_Fct.rename(columns={'Pred_Model_1': 'Pred_PctChg_1', 'Pred_Model_2': 'Pred_PctChg_2', \
                'Pred_Model_3': 'Pred_PctChg_3'})
Dengue_EastVis_Fct.set_index('Date', inplace=True)

Dengue_EastVis2 = Dengue_EastVis.merge(Dengue_EastVis_Fct,how='left',on='Date')
Dengue_EastVis2['MTD_Cases_Fct_1'] = Dengue_EastVis2.MTD_Cases.shift(1) * Dengue_EastVis2['Pred_PctChg_1']
Dengue_EastVis2['MTD_Cases_Fct_2'] = Dengue_EastVis2.MTD_Cases.shift(1) * Dengue_EastVis2['Pred_PctChg_2']
Dengue_EastVis2['MTD_Cases_Fct_3'] = Dengue_EastVis2.MTD_Cases.shift(1) * Dengue_EastVis2['Pred_PctChg_3']
Dengue_EastVis2 = Dengue_EastVis2.drop(columns=['Pred_PctChg_1','Pred_PctChg_2','Pred_PctChg_3'])


In [13]:
# SAVE MODEL DATAFRAMES TO PICKLE
top3_subsets_df.to_pickle('C:/Users/Claire/Documents/GitHub/nasa_hack/model/datasets/Dengue_EastVis_Top3_Models.pickle')
Dengue_EastVis2.to_pickle('C:/Users/Claire/Documents/GitHub/nasa_hack/model/datasets/Dengue_EastVis_Fct.pickle')

In [14]:
# EXPORT PICKLES TO EXCEL 
writer = pd.ExcelWriter('C:/Users/Claire/Documents/GitHub/nasa_hack/model/datasets/Dengue_EastVis_Fct.xlsx', engine='xlsxwriter')
Dengue_EastVis2.to_excel(writer, sheet_name='Fct')
top3_subsets_df.to_excel(writer, sheet_name='Model')
writer.save()