In [1]:
# IMPORT LIBRARIES

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

#--- Import Statsmodels
import statsmodels.api as sm

#--- Import Sklearn
from sklearn.linear_model import LinearRegression
from scipy import stats

In [2]:
# READ PICKLE OF THE ORIGINAL AND DIFFERENCED SERIES
Dengue_PH = pd.read_pickle('C:/Users/Claire/Documents/GitHub/nasa_hack/model/datasets/Dengue_PH_Clean.pickle')
Dengue_PH_diff = pd.read_pickle('C:/Users/Claire/Documents/GitHub/nasa_hack/model/datasets/Dengue_PH_Diff.pickle')

In [3]:
Dengue_PH_diff.columns

Index(['GTrend_Dengue', 'GTrend_Dengue_Fever', 'GTrend_Dengue_Cure',
       'GTrend_Dengue_Med', 'GTrend_Dengue_Sym', 'Mort_Rate', 'MTD_Cases',
       'Reg_Ave_Temp_NCR', 'Reg_Ave_Rainfall_NCR'],
      dtype='object')

In [4]:
# CREATE LAGGED VERSIONS OF THE PREDICTORS (MAX OF A QUARTER)
Dengue_PH_diff['Temp_L1'] = Dengue_PH_diff['Reg_Ave_Temp_NCR'].shift(1)
Dengue_PH_diff['Temp_L2'] = Dengue_PH_diff['Reg_Ave_Temp_NCR'].shift(2)
Dengue_PH_diff['Temp_L3'] = Dengue_PH_diff['Reg_Ave_Temp_NCR'].shift(3)

Dengue_PH_diff['Rain_L1'] = Dengue_PH_diff['Reg_Ave_Rainfall_NCR'].shift(1)
Dengue_PH_diff['Rain_L2'] = Dengue_PH_diff['Reg_Ave_Rainfall_NCR'].shift(2)
Dengue_PH_diff['Rain_L3'] = Dengue_PH_diff['Reg_Ave_Rainfall_NCR'].shift(3)

Dengue_PH_diff['GT_Dengue_L1'] = Dengue_PH_diff['GTrend_Dengue'].shift(1)
Dengue_PH_diff['GT_Dengue_L2'] = Dengue_PH_diff['GTrend_Dengue'].shift(2)
Dengue_PH_diff['GT_Dengue_L3'] = Dengue_PH_diff['GTrend_Dengue'].shift(3)

Dengue_PH_diff['GT_DengueFvr_L1'] = Dengue_PH_diff['GTrend_Dengue_Fever'].shift(1)
Dengue_PH_diff['GT_DengueFvr_L2'] = Dengue_PH_diff['GTrend_Dengue_Fever'].shift(2)
Dengue_PH_diff['GT_DengueFvr_L3'] = Dengue_PH_diff['GTrend_Dengue_Fever'].shift(3)

Dengue_PH_diff['GT_DengueCure_L1'] = Dengue_PH_diff['GTrend_Dengue_Cure'].shift(1)
Dengue_PH_diff['GT_DengueCure_L2'] = Dengue_PH_diff['GTrend_Dengue_Cure'].shift(2)
Dengue_PH_diff['GT_DengueCure_L3'] = Dengue_PH_diff['GTrend_Dengue_Cure'].shift(3)

Dengue_PH_diff['GT_DengueMed_L1'] = Dengue_PH_diff['GTrend_Dengue_Med'].shift(1)
Dengue_PH_diff['GT_DengueMed_L2'] = Dengue_PH_diff['GTrend_Dengue_Med'].shift(2)
Dengue_PH_diff['GT_DengueMed_L3'] = Dengue_PH_diff['GTrend_Dengue_Med'].shift(3)

Dengue_PH_diff['GT_DengueSym_L1'] = Dengue_PH_diff['GTrend_Dengue_Sym'].shift(1)
Dengue_PH_diff['GT_DengueSym_L2'] = Dengue_PH_diff['GTrend_Dengue_Sym'].shift(2)
Dengue_PH_diff['GT_DengueSym_L3'] = Dengue_PH_diff['GTrend_Dengue_Sym'].shift(3)

Dummies = pd.get_dummies(Dengue_PH_diff.index.month, prefix='m')
Dengue_PH_diff = Dengue_PH_diff.reset_index()
Dengue_PH_diff = Dengue_PH_diff.merge(Dummies, left_index=True, right_index=True)
Dengue_PH_diff.set_index('Date', inplace=True)

In [5]:
# SPLIT SERIES TO TRAINING AND TEST SETS
#--- Set 2018 as the test dataframe
nobs = 12
df_train, df_test = Dengue_PH_diff[0:-nobs], Dengue_PH_diff[-nobs:]
df_train = df_train.dropna()
df_test = df_test.dropna()

# Check size
print(df_train.shape)  
print(df_test.shape)  


(32, 42)
(12, 42)


In [27]:
# PERFORM UNIVARIATE REGRESSION TO TRIM DOWN THE PREDICTORS
predictor_col = df_train.columns[df_train.columns.str.contains(pat = '_L')]
pvals = pd.DataFrame()
for col in predictor_col:
    Y = df_train.MTD_Cases
    X = df_train[col]
    X2 = sm.add_constant(X)
    mod = sm.OLS(Y,X)
    fit = mod.fit()
    pval = fit.summary2().tables[1]['P>|t|']
    pval = pval.to_frame()
    #print(pval)
    pvals = pvals.append(pval)
    
# RETAIN ONLY THE LAGGED PREDICTORS WITH SIGNIFICANT P-VALUES
pvals = pvals[pvals['P>|t|'] <= 0.05].reset_index()
pvals = pvals.rename(columns={'index':'Variable'})
shortlist_predictor_col = pvals['Variable']
dummy = pd.Series(df_train.columns[df_train.columns.str.contains(pat = 'm_')])
#shortlist_predictor_col = shortlist_predictor_col.append(dummy)
#X = df_train[shortlist_predictor_col].drop(columns=['m_1'],axis=1)
X = df_train[shortlist_predictor_col]

In [None]:
print(type(X))
print(type(Y))
print(Y.axes)

In [28]:
# PERFORM STEPWISE REGRESSION
def stepwise_selection(X, y, 
                       initial_list=[], 
                       threshold_in=0.05, 
                       threshold_out = 0.10, 
                       verbose=True):
    """ Perform a forward-backward feature selection 
    based on p-value from statsmodels.api.OLS
    Arguments:
        X - pandas.DataFrame with candidate features
        y - list-like with the target
        initial_list - list of features to start with (column names of X)
        threshold_in - include a feature if its p-value < threshold_in
        threshold_out - exclude a feature if its p-value > threshold_out
        verbose - whether to print the sequence of inclusions and exclusions
    Returns: list of selected features 
    Always set threshold_in < threshold_out to avoid infinite looping.
    See https://en.wikipedia.org/wiki/Stepwise_regression for the details
    """
    included = list(initial_list)
    while True:
        changed=False
        # forward step
        excluded = list(set(X.columns)-set(included))
        new_pval = pd.Series(index=excluded)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included+[new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.argmin()
            included.append(best_feature)
            changed=True
            if verbose:
                print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

        # backward step
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        # use all coefs except intercept
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max() # null if pvalues is empty
        if worst_pval > threshold_out:
            changed=True
            worst_feature = pvalues.argmax()
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
        if not changed:
            break
    return included

Y = Y.tolist()
result = stepwise_selection(X, Y)

print('resulting features:')
print(result)

Add  Temp_L3                        with p-value 0.0135999
Add  GT_DengueMed_L1                with p-value 0.0303546
resulting features:
['Temp_L3', 'GT_DengueMed_L1']


will be corrected to return the positional minimum in the future.
Use 'series.values.argmin' to get the position of the minimum now.


In [None]:
# PRINT THE RESULTS FOR THE FINAL MODEL
#X = df_train[result]
X2 = sm.add_constant(X)
est = sm.OLS(Y, X)
est2 = est.fit()
print(est2.summary())

In [26]:
# PRINT THE RESULTS FOR THE FINAL MODEL
X = df_train[['Temp_L3','GT_DengueMed_L1','m_11']]
X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.394
Model:                            OLS   Adj. R-squared:                  0.329
Method:                 Least Squares   F-statistic:                     6.071
Date:                Sun, 20 Oct 2019   Prob (F-statistic):            0.00257
Time:                        01:10:59   Log-Likelihood:                -30.093
No. Observations:                  32   AIC:                             68.19
Df Residuals:                      28   BIC:                             74.05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               0.2936      0.124     

In [None]:
Y