In [1]:
# IMPORT LIBRARIES

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

#--- Import Statsmodels
import statsmodels.api as sm

#--- Import Sklearn
from sklearn.linear_model import LinearRegression
from scipy import stats

In [2]:
# READ PICKLE OF THE ORIGINAL AND DIFFERENCED SERIES
Dengue_PH = pd.read_pickle('C:/Users/Claire/Documents/GitHub/nasa_hack/model/datasets/Dengue_PH_Clean.pickle')
Dengue_PH_diff = pd.read_pickle('C:/Users/Claire/Documents/GitHub/nasa_hack/model/datasets/Dengue_PH_Diff.pickle')

In [3]:
Dengue_PH_diff.columns

Index(['GTrend_Dengue', 'GTrend_Dengue_Fever', 'GTrend_Dengue_Cure',
       'GTrend_Dengue_Med', 'GTrend_Dengue_Sym', 'Mort_Rate', 'MTD_Cases',
       'Reg_Ave_Temp_NCR', 'Reg_Ave_Rainfall_NCR'],
      dtype='object')

In [4]:
# CREATE LAGGED VERSIONS OF THE PREDICTORS (MAX OF A QUARTER)
Dengue_PH_diff['Temp_L1'] = Dengue_PH_diff['Reg_Ave_Temp_NCR'].shift(1)
Dengue_PH_diff['Temp_L2'] = Dengue_PH_diff['Reg_Ave_Temp_NCR'].shift(2)
Dengue_PH_diff['Temp_L3'] = Dengue_PH_diff['Reg_Ave_Temp_NCR'].shift(3)

Dengue_PH_diff['Rain_L1'] = Dengue_PH_diff['Reg_Ave_Rainfall_NCR'].shift(1)
Dengue_PH_diff['Rain_L2'] = Dengue_PH_diff['Reg_Ave_Rainfall_NCR'].shift(2)
Dengue_PH_diff['Rain_L3'] = Dengue_PH_diff['Reg_Ave_Rainfall_NCR'].shift(3)

Dengue_PH_diff['GT_Dengue_L1'] = Dengue_PH_diff['GTrend_Dengue'].shift(1)
Dengue_PH_diff['GT_Dengue_L2'] = Dengue_PH_diff['GTrend_Dengue'].shift(2)
Dengue_PH_diff['GT_Dengue_L3'] = Dengue_PH_diff['GTrend_Dengue'].shift(3)

Dengue_PH_diff['GT_DengueFvr_L1'] = Dengue_PH_diff['GTrend_Dengue_Fever'].shift(1)
Dengue_PH_diff['GT_DengueFvr_L2'] = Dengue_PH_diff['GTrend_Dengue_Fever'].shift(2)
Dengue_PH_diff['GT_DengueFvr_L3'] = Dengue_PH_diff['GTrend_Dengue_Fever'].shift(3)

Dengue_PH_diff['GT_DengueCure_L1'] = Dengue_PH_diff['GTrend_Dengue_Cure'].shift(1)
Dengue_PH_diff['GT_DengueCure_L2'] = Dengue_PH_diff['GTrend_Dengue_Cure'].shift(2)
Dengue_PH_diff['GT_DengueCure_L3'] = Dengue_PH_diff['GTrend_Dengue_Cure'].shift(3)

Dengue_PH_diff['GT_DengueMed_L1'] = Dengue_PH_diff['GTrend_Dengue_Med'].shift(1)
Dengue_PH_diff['GT_DengueMed_L2'] = Dengue_PH_diff['GTrend_Dengue_Med'].shift(2)
Dengue_PH_diff['GT_DengueMed_L3'] = Dengue_PH_diff['GTrend_Dengue_Med'].shift(3)

Dengue_PH_diff['GT_DengueSym_L1'] = Dengue_PH_diff['GTrend_Dengue_Sym'].shift(1)
Dengue_PH_diff['GT_DengueSym_L2'] = Dengue_PH_diff['GTrend_Dengue_Sym'].shift(2)
Dengue_PH_diff['GT_DengueSym_L3'] = Dengue_PH_diff['GTrend_Dengue_Sym'].shift(3)

Dengue_PH_diff['Cases_L1'] = Dengue_PH_diff['MTD_Cases'].shift(1)
Dengue_PH_diff['Cases_L2'] = Dengue_PH_diff['MTD_Cases'].shift(2)
Dengue_PH_diff['Cases_L3'] = Dengue_PH_diff['MTD_Cases'].shift(3)

Dummies = pd.get_dummies(Dengue_PH_diff.index.month, prefix='m')
Dengue_PH_diff = Dengue_PH_diff.reset_index()
Dengue_PH_diff = Dengue_PH_diff.merge(Dummies, left_index=True, right_index=True)
Dengue_PH_diff.set_index('Date', inplace=True)

In [5]:
# SPLIT SERIES TO TRAINING AND TEST SETS
#--- Set 2018 as the test dataframe
nobs = 12
df_train, df_test = Dengue_PH_diff[0:-nobs], Dengue_PH_diff[-nobs:]
df_train = df_train.dropna()
df_test = df_test.dropna()

# Check size
print(df_train.shape)  
print(df_test.shape)  


(32, 45)
(12, 45)


In [18]:
# PERFORM UNIVARIATE REGRESSION TO TRIM DOWN THE PREDICTORS
predictor_col = df_train.columns[df_train.columns.str.contains(pat = '_L')]
pvals = pd.DataFrame()
for col in predictor_col:
    Y = df_train.MTD_Cases
    X = df_train[col]
    X2 = sm.add_constant(X)
    mod = sm.OLS(Y,X)
    fit = mod.fit()
    pval = fit.summary2().tables[1]['P>|t|']
    pval = pval.to_frame()
    #print(pval)
    pvals = pvals.append(pval)
    
# RETAIN ONLY THE LAGGED PREDICTORS WITH SIGNIFICANT P-VALUES
print(pvals)
pvals = pvals[pvals['P>|t|'] <= 0.05].reset_index()
pvals = pvals.rename(columns={'index':'Variable'})
shortlist_predictor_col = pvals['Variable']
dummy = pd.Series(df_train.columns[df_train.columns.str.contains(pat = 'm_')])
shortlist_predictor_col = shortlist_predictor_col.append(dummy)
X = df_train[shortlist_predictor_col].drop(columns=['m_1'],axis=1)


                     P>|t|
Temp_L1           0.058965
Temp_L2           0.444114
Temp_L3           0.009211
Rain_L1           0.371678
Rain_L2           0.343528
Rain_L3           0.040640
GT_Dengue_L1      0.043142
GT_Dengue_L2      0.808227
GT_Dengue_L3      0.213421
GT_DengueFvr_L1   0.024079
GT_DengueFvr_L2   0.551524
GT_DengueFvr_L3   0.574029
GT_DengueCure_L1  0.338337
GT_DengueCure_L2  0.761531
GT_DengueCure_L3  0.050721
GT_DengueMed_L1   0.020318
GT_DengueMed_L2   0.147536
GT_DengueMed_L3   0.129061
GT_DengueSym_L1   0.029047
GT_DengueSym_L2   0.490212
GT_DengueSym_L3   0.500757
Cases_L1          0.740008
Cases_L2          0.790330
Cases_L3          0.799209


In [7]:
# PERFORM STEPWISE REGRESSION
def stepwise_selection(X, y, 
                       initial_list=[], 
                       threshold_in=0.05, 
                       threshold_out = 0.10, 
                       verbose=True):
    """ Perform a forward-backward feature selection 
    based on p-value from statsmodels.api.OLS
    Arguments:
        X - pandas.DataFrame with candidate features
        y - list-like with the target
        initial_list - list of features to start with (column names of X)
        threshold_in - include a feature if its p-value < threshold_in
        threshold_out - exclude a feature if its p-value > threshold_out
        verbose - whether to print the sequence of inclusions and exclusions
    Returns: list of selected features 
    Always set threshold_in < threshold_out to avoid infinite looping.
    See https://en.wikipedia.org/wiki/Stepwise_regression for the details
    """
    included = list(initial_list)
    while True:
        changed=False
        # forward step
        excluded = list(set(X.columns)-set(included))
        new_pval = pd.Series(index=excluded)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included+[new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.argmin()
            included.append(best_feature)
            changed=True
            if verbose:
                print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

        # backward step
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        # use all coefs except intercept
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max() # null if pvalues is empty
        if worst_pval > threshold_out:
            changed=True
            worst_feature = pvalues.argmax()
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
        if not changed:
            break
    return included

Y = Y.tolist()
result = stepwise_selection(X, Y)

print('resulting features:')
print(result)

Add  Temp_L3                        with p-value 0.0135999
Add  GT_DengueMed_L1                with p-value 0.0303546
resulting features:
['Temp_L3', 'GT_DengueMed_L1']


will be corrected to return the positional minimum in the future.
Use 'series.values.argmin' to get the position of the minimum now.


In [19]:
# PRINT THE RESULTS FOR THE FINAL MODEL
#X = df_train[result]
X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())


                            OLS Regression Results                            
Dep. Variable:              MTD_Cases   R-squared:                       0.656
Model:                            OLS   Adj. R-squared:                  0.111
Method:                 Least Squares   F-statistic:                     1.205
Date:                Sun, 20 Oct 2019   Prob (F-statistic):              0.379
Time:                        01:29:38   Log-Likelihood:                -21.035
No. Observations:                  32   AIC:                             82.07
Df Residuals:                      12   BIC:                             111.4
Df Model:                          19                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               0.5682      0.949     

In [20]:
X = X.drop(columns='m_10')
X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())


                            OLS Regression Results                            
Dep. Variable:              MTD_Cases   R-squared:                       0.656
Model:                            OLS   Adj. R-squared:                  0.179
Method:                 Least Squares   F-statistic:                     1.377
Date:                Sun, 20 Oct 2019   Prob (F-statistic):              0.282
Time:                        01:30:03   Log-Likelihood:                -21.040
No. Observations:                  32   AIC:                             80.08
Df Residuals:                      13   BIC:                             107.9
Df Model:                          18                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               0.5301      0.701     

In [21]:
X = X.drop(columns='m_8')
X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())


                            OLS Regression Results                            
Dep. Variable:              MTD_Cases   R-squared:                       0.656
Model:                            OLS   Adj. R-squared:                  0.238
Method:                 Least Squares   F-statistic:                     1.569
Date:                Sun, 20 Oct 2019   Prob (F-statistic):              0.200
Time:                        01:30:28   Log-Likelihood:                -21.043
No. Observations:                  32   AIC:                             78.09
Df Residuals:                      14   BIC:                             104.5
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               0.5062      0.500     

In [22]:
X = X.drop(columns='m_7')
X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())


                            OLS Regression Results                            
Dep. Variable:              MTD_Cases   R-squared:                       0.655
Model:                            OLS   Adj. R-squared:                  0.288
Method:                 Least Squares   F-statistic:                     1.783
Date:                Sun, 20 Oct 2019   Prob (F-statistic):              0.135
Time:                        01:30:57   Log-Likelihood:                -21.064
No. Observations:                  32   AIC:                             76.13
Df Residuals:                      15   BIC:                             101.0
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               0.4767      0.435     

In [23]:
X = X.drop(columns='m_12')
X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())


                            OLS Regression Results                            
Dep. Variable:              MTD_Cases   R-squared:                       0.655
Model:                            OLS   Adj. R-squared:                  0.332
Method:                 Least Squares   F-statistic:                     2.026
Date:                Sun, 20 Oct 2019   Prob (F-statistic):             0.0863
Time:                        01:31:33   Log-Likelihood:                -21.079
No. Observations:                  32   AIC:                             74.16
Df Residuals:                      16   BIC:                             97.61
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               0.4366      0.256     

In [24]:
X = X.drop(columns='GT_DengueSym_L1')
X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())


                            OLS Regression Results                            
Dep. Variable:              MTD_Cases   R-squared:                       0.654
Model:                            OLS   Adj. R-squared:                  0.369
Method:                 Least Squares   F-statistic:                     2.297
Date:                Sun, 20 Oct 2019   Prob (F-statistic):             0.0528
Time:                        01:33:05   Log-Likelihood:                -21.121
No. Observations:                  32   AIC:                             72.24
Df Residuals:                      17   BIC:                             94.23
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               0.4281      0.246     

In [25]:
X = X.drop(columns='m_6')
X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              MTD_Cases   R-squared:                       0.652
Model:                            OLS   Adj. R-squared:                  0.401
Method:                 Least Squares   F-statistic:                     2.595
Date:                Sun, 20 Oct 2019   Prob (F-statistic):             0.0312
Time:                        01:34:01   Log-Likelihood:                -21.219
No. Observations:                  32   AIC:                             70.44
Df Residuals:                      18   BIC:                             90.96
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               0.3962      0.219     

In [26]:
X = X.drop(columns='m_4')
X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              MTD_Cases   R-squared:                       0.647
Model:                            OLS   Adj. R-squared:                  0.425
Method:                 Least Squares   F-statistic:                     2.908
Date:                Sun, 20 Oct 2019   Prob (F-statistic):             0.0184
Time:                        01:34:54   Log-Likelihood:                -21.427
No. Observations:                  32   AIC:                             68.85
Df Residuals:                      19   BIC:                             87.91
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               0.3462      0.190     

In [27]:
X = X.drop(columns='m_2')
X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              MTD_Cases   R-squared:                       0.644
Model:                            OLS   Adj. R-squared:                  0.448
Method:                 Least Squares   F-statistic:                     3.284
Date:                Sun, 20 Oct 2019   Prob (F-statistic):             0.0102
Time:                        01:35:39   Log-Likelihood:                -21.602
No. Observations:                  32   AIC:                             67.20
Df Residuals:                      20   BIC:                             84.79
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               0.3019      0.160     

In [28]:
X = X.drop(columns='m_3')
X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              MTD_Cases   R-squared:                       0.641
Model:                            OLS   Adj. R-squared:                  0.470
Method:                 Least Squares   F-statistic:                     3.744
Date:                Sun, 20 Oct 2019   Prob (F-statistic):            0.00520
Time:                        01:36:31   Log-Likelihood:                -21.733
No. Observations:                  32   AIC:                             65.47
Df Residuals:                      21   BIC:                             81.59
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               0.2841      0.151     

In [29]:
X = X.drop(columns='m_9')
X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              MTD_Cases   R-squared:                       0.638
Model:                            OLS   Adj. R-squared:                  0.490
Method:                 Least Squares   F-statistic:                     4.314
Date:                Sun, 20 Oct 2019   Prob (F-statistic):            0.00246
Time:                        01:36:57   Log-Likelihood:                -21.837
No. Observations:                  32   AIC:                             63.67
Df Residuals:                      22   BIC:                             78.33
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               0.2554      0.127     

In [30]:
X = X.drop(columns='GT_Dengue_L1')
X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              MTD_Cases   R-squared:                       0.625
Model:                            OLS   Adj. R-squared:                  0.495
Method:                 Least Squares   F-statistic:                     4.793
Date:                Sun, 20 Oct 2019   Prob (F-statistic):            0.00145
Time:                        01:37:45   Log-Likelihood:                -22.414
No. Observations:                  32   AIC:                             62.83
Df Residuals:                      23   BIC:                             76.02
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               0.2563      0.126     

In [31]:
X = X.drop(columns='GT_DengueMed_L1')
X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              MTD_Cases   R-squared:                       0.616
Model:                            OLS   Adj. R-squared:                  0.505
Method:                 Least Squares   F-statistic:                     5.510
Date:                Sun, 20 Oct 2019   Prob (F-statistic):           0.000718
Time:                        01:38:31   Log-Likelihood:                -22.778
No. Observations:                  32   AIC:                             61.56
Df Residuals:                      24   BIC:                             73.28
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               0.2405      0.123     

In [32]:
X = X.drop(columns='m_5')
X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              MTD_Cases   R-squared:                       0.571
Model:                            OLS   Adj. R-squared:                  0.468
Method:                 Least Squares   F-statistic:                     5.549
Date:                Sun, 20 Oct 2019   Prob (F-statistic):           0.000904
Time:                        01:38:57   Log-Likelihood:                -24.564
No. Observations:                  32   AIC:                             63.13
Df Residuals:                      25   BIC:                             73.39
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               0.1847      0.123     

In [33]:
X = X.drop(columns='m_11')
X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              MTD_Cases   R-squared:                       0.529
Model:                            OLS   Adj. R-squared:                  0.438
Method:                 Least Squares   F-statistic:                     5.838
Date:                Sun, 20 Oct 2019   Prob (F-statistic):           0.000962
Time:                        01:39:46   Log-Likelihood:                -26.067
No. Observations:                  32   AIC:                             64.13
Df Residuals:                      26   BIC:                             72.93
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               0.1215      0.119     

In [34]:
#X = X.drop(columns='m_4')
#X2 = sm.add_constant(X)
est = sm.OLS(Y, X)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              MTD_Cases   R-squared:                       0.558
Model:                            OLS   Adj. R-squared:                  0.476
Method:                 Least Squares   F-statistic:                     6.812
Date:                Sun, 20 Oct 2019   Prob (F-statistic):           0.000316
Time:                        01:40:40   Log-Likelihood:                -26.696
No. Observations:                  32   AIC:                             63.39
Df Residuals:                      27   BIC:                             70.72
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Temp_L3             5.7668      3.389     

In [36]:
#X = X.drop(columns='Temp_L3')
#X2 = sm.add_constant(X)
est = sm.OLS(Y, X)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              MTD_Cases   R-squared:                       0.510
Model:                            OLS   Adj. R-squared:                  0.440
Method:                 Least Squares   F-statistic:                     7.296
Date:                Sun, 20 Oct 2019   Prob (F-statistic):           0.000371
Time:                        01:41:46   Log-Likelihood:                -28.326
No. Observations:                  32   AIC:                             64.65
Df Residuals:                      28   BIC:                             70.52
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Rain_L3             0.0640      0.024     

In [63]:
reg = LinearRegression(fit_intercept=False)
reg.fit(X,Y)

#For retrieving the slope:
print(reg.coef_)

prediction = pd.DataFrame(reg.predict(X))
prediction.columns = ['Pred_MTD_Cases']
df_train2 = df_train.reset_index().join(prediction, how='inner')

prediction = pd.DataFrame(reg.predict(df_test[X.columns]))
prediction.columns = ['Pred_MTD_Cases']
df_test2 = df_test.reset_index().join(prediction, how='inner')

[ 0.06398832  7.74637729 -9.141377    4.20865082]


In [64]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

y_true = df_train2['MTD_Cases']
y_pred = df_train2['Pred_MTD_Cases']
print(mean_squared_error(y_true, y_pred))
print(mean_absolute_error(y_true, y_pred))

0.3438691715909843
0.44366227682497755


In [65]:
y_true = df_test2['MTD_Cases']
y_pred = df_test2['Pred_MTD_Cases']
print(mean_squared_error(y_true, y_pred))
print(mean_absolute_error(y_true, y_pred))

0.3557505444645807
0.49477182165631856


In [100]:
Dengue_PH_Fct = df_train2.append(df_test2, ignore_index=True)
Dengue_PH_Fct = Dengue_PH_Fct[['Date','Pred_MTD_Cases']]
Dengue_PH_Fct['Pred_MTD_Cases'] += 1
Dengue_PH_Fct = Dengue_PH_Fct.rename(columns={'Pred_MTD_Cases': 'Pred_Cases_PctChg'})
Dengue_PH_Fct.set_index('Date', inplace=True)

In [96]:
Dengue_PH2 = Dengue_PH.merge(Dengue_PH_Fct,how='left',on='Date')
Dengue_PH2['MTD_Cases_Fct'] = Dengue_PH2.MTD_Cases.shift(1) * Dengue_PH2['Pred_Cases_PctChg']
Dengue_PH2[['MTD_Cases','Pred_Cases_PctChg','MTD_Cases_Fct']]

Unnamed: 0_level_0,MTD_Cases,Pred_Cases_PctChg,MTD_Cases_Fct
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-01-31,10435.0,,
2015-02-28,4237.0,,
2015-03-31,5274.0,,
2015-04-30,5670.0,,
2015-05-31,2984.0,1.013546,5746.804614
2015-06-30,7684.0,1.393079,4156.946662
2015-07-31,23058.0,1.492608,11469.200399
2015-08-31,35580.0,1.869719,43111.975767
2015-09-30,32650.0,0.876893,31199.85741
2015-10-31,50195.0,2.299566,75080.835965


In [104]:
Dengue_PH2.to_excel('C:/Users/Claire/Documents/GitHub/nasa_hack/model/datasets/Dengue_PH_Fct.xlsx')