In [1]:
# IMPORT LIBRARIES

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

#--- Import Statsmodels
from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller
from statsmodels.tools.eval_measures import rmse, aic
from statsmodels.tsa.stattools import grangercausalitytests
from statsmodels.tsa.vector_ar.vecm import coint_johansen
from statsmodels.stats.stattools import durbin_watson
from statsmodels.tsa.stattools import acf

In [9]:
# IMPORT PICKLE OF CLEAN DATAFRAME FROM pgm1_Data_Preparation
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run on another machine until it points at a location inside the repository.
# NOTE(review): unpickling can execute arbitrary code -- only load pickles that
# were produced by this project's own data-preparation notebook.
Dengue_PH = pd.read_pickle('C:/Users/Claire/Documents/GitHub/nasa_hack/model/datasets/Dengue_PH_Clean.pickle')

# Preview the first rows only instead of rendering the whole frame as cell output.
Dengue_PH.head(10)

Unnamed: 0_level_0,MTD_Cases,MTD_Deaths,Reg_Ave_Temp_NCR,Reg_Ave_Rainfall_NCR,GTrend_Dengue,GTrend_Dengue_Fever,GTrend_Dengue_Cure,GTrend_Dengue_Med,GTrend_Dengue_Sym,Mort_Rate
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2015-01-31,10435.0,29.0,25.5,26.066667,0.1125,0.3025,0.1925,0.18,0.1875,0.002779
2015-02-28,4237.0,10.0,26.3,1.866667,0.1025,0.225,0.1875,0.1325,0.1525,0.00236
2015-03-31,5274.0,14.0,27.566667,6.7,0.07,0.152,0.078,0.036,0.114,0.002655
2015-04-30,5670.0,22.0,29.966667,23.1,0.0575,0.135,0.06,0.09,0.0875,0.00388
2015-05-31,2984.0,11.0,30.533333,73.333333,0.054,0.154,0.07,0.142,0.086,0.003686
2015-06-30,7684.0,30.0,30.2,192.333333,0.1,0.21,0.1325,0.2625,0.1325,0.003904
2015-07-31,23058.0,80.0,28.466667,464.133333,0.1525,0.36,0.235,0.2775,0.255,0.00347
2015-08-31,35580.0,100.0,28.766667,338.933333,0.202,0.45,0.286,0.306,0.302,0.002811
2015-09-30,32650.0,93.0,27.866667,392.1,0.295,0.595,0.55,0.475,0.46,0.002848
2015-10-31,50195.0,140.0,28.6,198.066667,0.34,0.6775,0.51,0.42,0.4925,0.002789


In [3]:
# CHECK FOR STATIONARITY OF THE TIME SERIES
def adfuller_test(series, signif=0.05, name='', verbose=False):
    """Run an Augmented Dickey-Fuller test on ``series`` and print a report.

    Parameters
    ----------
    series : array-like
        Observations, passed unchanged to ``adfuller`` with ``autolag='AIC'``.
    signif : float, default 0.05
        Significance level the p-value is compared against.
    name : str, default ''
        Label printed in the report header.
    verbose : bool, default False
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    None
        The report is written to stdout only.
    """
    r = adfuller(series, autolag='AIC')

    # Rounded copies are for display only.
    test_statistic = round(r[0], 4)
    shown_pvalue = round(r[1], 4)
    n_lags = r[2]

    # Decide on the unrounded p-value: rounding first would turn e.g. 0.05004
    # into 0.05 and wrongly reject the null hypothesis at signif=0.05.
    p_value = r[1]

    # Print Summary
    print(f'    Augmented Dickey-Fuller Test on "{name}"', "\n   ", '-'*47)
    print(f' Null Hypothesis: Data has unit root. Non-Stationary.')
    print(f' Significance Level    = {signif}')
    print(f' Test Statistic        = {test_statistic}')
    print(f' No. Lags Chosen       = {n_lags}')

    # r[4] maps each critical-value label ('1%', '5%', '10%') to its value;
    # the label is left-justified to 6 characters so the '=' signs line up.
    for key, val in r[4].items():
        print(f' Critical value {str(key).ljust(6)} = {round(val, 3)}')

    if p_value <= signif:
        print(f" => P-Value = {shown_pvalue}. Rejecting Null Hypothesis.")
        print(f" => Series is Stationary.")
    else:
        print(f" => P-Value = {shown_pvalue}. Weak evidence to reject the Null Hypothesis.")
        print(f" => Series is Non-Stationary.")

# ADF Test on each column
# DataFrame.iteritems() was removed in pandas 2.0; items() yields the same
# (column label, Series) pairs.
for name, column in Dengue_PH.items():
    adfuller_test(column, name=name)
    print('\n')


    Augmented Dickey-Fuller Test on "MTD_Cases" 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -3.0967
 No. Lags Chosen       = 0
 Critical value 1%     = -3.578
 Critical value 5%     = -2.925
 Critical value 10%    = -2.601
 => P-Value = 0.0268. Rejecting Null Hypothesis.
 => Series is Stationary.


    Augmented Dickey-Fuller Test on "MTD_Deaths" 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -3.4068
 No. Lags Chosen       = 0
 Critical value 1%     = -3.578
 Critical value 5%     = -2.925
 Critical value 10%    = -2.601
 => P-Value = 0.0107. Rejecting Null Hypothesis.
 => Series is Stationary.


    Augmented Dickey-Fuller Test on "Reg_Ave_Temp_NCR" 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 

In [12]:
# SINCE NOT ALL OF THE SERIES ARE STATIONARY, PERFORM DIFFERENCING.  USE PERCENTAGE DIFFERENCING FOR THE 
# SERIES CASES, TEMPERATURE, AND RAINFALL.  USE SIMPLE DIFFERENCING FOR THE SERIES MORTALITY AND GOOGLE TRENDS

# Optional step, left disabled as in the original notebook:
#Dengue_PH = Dengue_PH.drop(columns=['MTD_Deaths'])

# Columns that get percentage differencing; every other column gets a first difference.
pct_diff_cols = ['MTD_Cases', 'Reg_Ave_Temp_NCR', 'Reg_Ave_Rainfall_NCR']

#--- Calculate the first differences
Dengue_PH_diff = Dengue_PH.diff().dropna()
Dengue_PH_diff = Dengue_PH_diff.drop(columns=pct_diff_cols)

#--- Calculate the percentage differences for MTD_Cases, Reg_Ave_Temp_NCR, and Reg_Ave_Rainfall_NCR
# pct_change() is applied to the frame directly. Grouping by the index put each
# row in its own group, which makes every percentage change NaN on current pandas,
# and selecting group columns with a bare tuple is rejected by pandas 2.x.
# The assignment aligns on the index, so rows dropped above stay dropped.
Dengue_PH_diff[pct_diff_cols] = Dengue_PH[pct_diff_cols].pct_change().dropna()
print(Dengue_PH_diff)


            GTrend_Dengue  GTrend_Dengue_Fever  GTrend_Dengue_Cure  \
Date                                                                 
2015-02-28        -0.0100              -0.0775             -0.0050   
2015-03-31        -0.0325              -0.0730             -0.1095   
2015-04-30        -0.0125              -0.0170             -0.0180   
2015-05-31        -0.0035               0.0190              0.0100   
2015-06-30         0.0460               0.0560              0.0625   
2015-07-31         0.0525               0.1500              0.1025   
2015-08-31         0.0495               0.0900              0.0510   
2015-09-30         0.0930               0.1450              0.2640   
2015-10-31         0.0450               0.0825             -0.0400   
2015-11-30        -0.1100              -0.2235             -0.1320   
2015-12-31        -0.0875              -0.1840             -0.0155   
2016-01-31         0.0195               0.0360              0.0075   
2016-02-29        -0

In [13]:
# STORE DIFFERENCED DATAFRAME TO PICKLE
# Writes Dengue_PH_diff to disk with DataFrame.to_pickle.
# NOTE(review): the target is an absolute, machine-specific path; the directory
# must already exist, and the path should eventually be made relative to the repo.
Dengue_PH_diff.to_pickle('C:/Users/Claire/Documents/GitHub/nasa_hack/model/datasets/Dengue_PH_Diff.pickle')

In [14]:
# CHECK IF THE DIFFERENCED SERIES ARE STATIONARY
# ADF Test on each column
# DataFrame.iteritems() was removed in pandas 2.0; items() yields the same
# (column label, Series) pairs.
for name, column in Dengue_PH_diff.items():
    adfuller_test(column, name=name)
    print('\n')

    Augmented Dickey-Fuller Test on "GTrend_Dengue" 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -6.3968
 No. Lags Chosen       = 7
 Critical value 1%     = -3.61
 Critical value 5%     = -2.939
 Critical value 10%    = -2.608
 => P-Value = 0.0. Rejecting Null Hypothesis.
 => Series is Stationary.


    Augmented Dickey-Fuller Test on "GTrend_Dengue_Fever" 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -5.0126
 No. Lags Chosen       = 9
 Critical value 1%     = -3.621
 Critical value 5%     = -2.944
 Critical value 10%    = -2.61
 => P-Value = 0.0. Rejecting Null Hypothesis.
 => Series is Stationary.


    Augmented Dickey-Fuller Test on "GTrend_Dengue_Cure" 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Statio