In [1]:
# IMPORT LIBRARIES

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

#--- Import Statsmodels
from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller
from statsmodels.tools.eval_measures import rmse, aic
from statsmodels.tsa.stattools import grangercausalitytests
from statsmodels.tsa.vector_ar.vecm import coint_johansen
from statsmodels.stats.stattools import durbin_watson
from statsmodels.tsa.stattools import acf

In [2]:
# LOAD THE CLEAN DATAFRAME THAT pgm1_Data_Preparation SAVED AS A PICKLE
# NOTE(review): this is an absolute path on one machine; adjust it before running elsewhere.
# Only unpickle files you trust -- loading a pickle can execute arbitrary code.
clean_pickle_path = 'C:/Users/Claire/Documents/GitHub/nasa_hack/model/datasets/Dengue_NCR_Clean.pickle'
Dengue_NCR = pd.read_pickle(clean_pickle_path)
Dengue_NCR

Unnamed: 0_level_0,MTD_Cases,MTD_Deaths,Reg_Ave_Temp_NCR,Reg_Ave_Rainfall_NCR,GTrend_Dengue,GTrend_Dengue_Fever,GTrend_Dengue_Cure,GTrend_Dengue_Med,GTrend_Dengue_Sym,Mort_Rate
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2015-01-31,1696.0,7.0,25.5,26.066667,0.105,0.2025,0.225,0.1675,0.135,0.004127
2015-02-28,320.0,0.0,26.3,1.866667,0.09,0.16,0.245,0.11,0.1075,0.0
2015-03-31,612.0,3.0,27.566667,6.7,0.064,0.118,0.096,0.092,0.084,0.004902
2015-04-30,872.0,5.0,29.966667,23.1,0.055,0.1,0.0425,0.0575,0.065,0.005734
2015-05-31,308.0,0.0,30.533333,73.333333,0.056,0.104,0.1,0.09,0.074,0.0
2015-06-30,621.0,3.0,30.2,192.333333,0.0875,0.1325,0.23,0.26,0.1075,0.004831
2015-07-31,3270.0,18.0,28.466667,464.133333,0.15,0.275,0.315,0.225,0.205,0.005505
2015-08-31,2368.0,8.0,28.766667,338.933333,0.198,0.374,0.35,0.456,0.244,0.003378
2015-09-30,5162.0,22.0,27.866667,392.1,0.295,0.5075,0.5625,0.5525,0.335,0.004262
2015-10-31,7923.0,28.0,28.6,198.066667,0.3175,0.5375,0.6975,0.3775,0.37,0.003534


In [3]:
# CHECK FOR STATIONARITY OF THE TIME SERIES
def adfuller_test(series, signif=0.05, name='', verbose=False):
    """Run the Augmented Dickey-Fuller test on ``series`` and print a report.

    The null hypothesis is that the series has a unit root (is non-stationary).
    It is rejected when the p-value is less than or equal to ``signif``.

    Parameters
    ----------
    series : array-like
        Observations, passed unchanged to ``adfuller`` with ``autolag='AIC'``.
    signif : float, default 0.05
        Significance level the p-value is compared against.
    name : str, default ''
        Label shown in the report header.
    verbose : bool, default False
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    dict
        Summary with keys ``'test_statistic'`` and ``'pvalue'`` (both rounded
        to 4 decimals for display), ``'n_lags'`` and ``'n_obs'``.
    """
    r = adfuller(series, autolag='AIC')
    output = {
        'test_statistic': round(r[0], 4),
        'pvalue': round(r[1], 4),
        'n_lags': r[2],
        'n_obs': r[3],
    }
    # The rounded value is for display only. The decision below uses the raw
    # p-value so that e.g. 0.05004 is not rounded down to 0.05 and accepted.
    shown_p = output['pvalue']
    raw_p = r[1]

    # Print Summary
    print(f'    Augmented Dickey-Fuller Test on "{name}"', "\n   ", '-'*47)
    print(' Null Hypothesis: Data has unit root. Non-Stationary.')
    print(f' Significance Level    = {signif}')
    print(f' Test Statistic        = {output["test_statistic"]}')
    print(f' No. Lags Chosen       = {output["n_lags"]}')

    # r[4] maps each critical-value label (e.g. '1%') to its value.
    for key, val in r[4].items():
        print(f' Critical value {str(key).ljust(6)} = {round(val, 3)}')

    if raw_p <= signif:
        print(f" => P-Value = {shown_p}. Rejecting Null Hypothesis.")
        print(" => Series is Stationary.")
    else:
        print(f" => P-Value = {shown_p}. Weak evidence to reject the Null Hypothesis.")
        print(" => Series is Non-Stationary.")

    return output

# ADF Test on each column
# DataFrame.iteritems() was removed in pandas 2.0; items() yields the same
# (column name, Series) pairs on old and new pandas alike.
for name, column in Dengue_NCR.items():
    adfuller_test(column, name=name)
    print('\n')


    Augmented Dickey-Fuller Test on "MTD_Cases" 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -4.594
 No. Lags Chosen       = 3
 Critical value 1%     = -3.589
 Critical value 5%     = -2.93
 Critical value 10%    = -2.603
 => P-Value = 0.0001. Rejecting Null Hypothesis.
 => Series is Stationary.


    Augmented Dickey-Fuller Test on "MTD_Deaths" 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -2.4446
 No. Lags Chosen       = 1
 Critical value 1%     = -3.581
 Critical value 5%     = -2.927
 Critical value 10%    = -2.602
 => P-Value = 0.1295. Weak evidence to reject the Null Hypothesis.
 => Series is Non-Stationary.


    Augmented Dickey-Fuller Test on "Reg_Ave_Temp_NCR" 
    -----------------------------------------------
 Null Hypothesis: Data has unit roo

In [4]:
# SINCE NOT ALL OF THE SERIES ARE STATIONARY, PERFORM DIFFERENCING. USE PERCENTAGE DIFFERENCING FOR THE
# SERIES CASES, TEMPERATURE, AND RAINFALL. USE SIMPLE DIFFERENCING FOR THE REMAINING SERIES
# (DEATHS, MORTALITY RATE, AND GOOGLE TRENDS).

#Dengue_NCR = Dengue_NCR.drop(columns=['MTD_Deaths'])

#--- Columns that get percentage (rather than simple) differencing
pct_change_columns = ['MTD_Cases', 'Reg_Ave_Temp_NCR', 'Reg_Ave_Rainfall_NCR']

#--- Calculate the first differences for every other column
Dengue_NCR_diff = Dengue_NCR.diff().dropna().drop(columns=pct_change_columns)

#--- Calculate the percentage differences for MTD_Cases, Reg_Ave_Temp_NCR, and Reg_Ave_Rainfall_NCR
# A plain column-wise pct_change() is used here. The earlier
# groupby(Dengue_NCR.index)[...] form put every row in its own group (assuming
# the Date index is unique, as the displayed frame suggests), which yields NaN
# everywhere on current pandas. Its tuple-style column selection is also
# rejected by pandas >= 2.0.
# The assignment aligns on the Date index, so the first row (NaN) is skipped.
# NOTE(review): pct_change() gives +/-inf wherever the previous value is 0
# (e.g. a month with zero rainfall) -- confirm none occur before modelling.
Dengue_NCR_diff[pct_change_columns] = Dengue_NCR[pct_change_columns].pct_change().dropna()
print(Dengue_NCR_diff)


            MTD_Deaths  GTrend_Dengue  GTrend_Dengue_Fever  \
Date                                                         
2015-02-28   -7.000000        -0.0150              -0.0425   
2015-03-31    3.000000        -0.0260              -0.0420   
2015-04-30    2.000000        -0.0090              -0.0180   
2015-05-31   -5.000000         0.0010               0.0040   
2015-06-30    3.000000         0.0315               0.0285   
2015-07-31   15.000000         0.0625               0.1425   
2015-08-31  -10.000000         0.0480               0.0990   
2015-09-30   14.000000         0.0970               0.1335   
2015-10-31    6.000000         0.0225               0.0300   
2015-11-30  -14.000000        -0.0995              -0.1355   
2015-12-31  -11.000000        -0.0830              -0.1445   
2016-01-31    0.000000         0.0210               0.0025   
2016-02-29    1.000000        -0.0410              -0.0650   
2016-03-31   -1.000000        -0.0275              -0.0450   
2016-04-

In [5]:
# SAVE THE DIFFERENCED DATAFRAME AS A PICKLE
# NOTE(review): this is an absolute path on one machine; adjust it before running elsewhere.
diff_pickle_path = 'C:/Users/Claire/Documents/GitHub/nasa_hack/model/datasets/Dengue_NCR_Diff.pickle'
Dengue_NCR_diff.to_pickle(diff_pickle_path)

In [6]:
# CHECK IF THE DIFFERENCED SERIES ARE STATIONARY
# ADF Test on each column
# DataFrame.iteritems() was removed in pandas 2.0; items() yields the same
# (column name, Series) pairs on old and new pandas alike.
for name, column in Dengue_NCR_diff.items():
    adfuller_test(column, name=name)
    print('\n')

    Augmented Dickey-Fuller Test on "MTD_Deaths" 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -10.3864
 No. Lags Chosen       = 0
 Critical value 1%     = -3.581
 Critical value 5%     = -2.927
 Critical value 10%    = -2.602
 => P-Value = 0.0. Rejecting Null Hypothesis.
 => Series is Stationary.


    Augmented Dickey-Fuller Test on "GTrend_Dengue" 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -6.3276
 No. Lags Chosen       = 8
 Critical value 1%     = -3.616
 Critical value 5%     = -2.941
 Critical value 10%    = -2.609
 => P-Value = 0.0. Rejecting Null Hypothesis.
 => Series is Stationary.


    Augmented Dickey-Fuller Test on "GTrend_Dengue_Fever" 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.