In [1]:
# IMPORT LIBRARIES

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

#--- Import Statsmodels
from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller
from statsmodels.tools.eval_measures import rmse, aic
from statsmodels.tsa.stattools import grangercausalitytests
from statsmodels.tsa.vector_ar.vecm import coint_johansen
from statsmodels.stats.stattools import durbin_watson
from statsmodels.tsa.stattools import acf

In [2]:
# Load the cleaned dataframe pickled by pgm1_Data_Preparation, then display it.
# NOTE(review): unpickling can execute arbitrary code, so only load pickles from a trusted source.
clean_pickle_path = 'C:/Users/Claire/Documents/GitHub/nasa_hack/model/datasets/Dengue_EastVis_Clean.pickle'
Dengue_EastVis = pd.read_pickle(clean_pickle_path)
Dengue_EastVis

Unnamed: 0_level_0,MTD_Cases,MTD_Deaths,Reg_Ave_Temp_EastVis,Reg_Ave_Rainfall_EastVis,GTrend_Dengue,GTrend_Dengue_Fever,GTrend_Dengue_Sym,Mort_Rate
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-01-31,140.0,3.0,26.08,448.64,0.0,0.0,0.0,0.021429
2015-02-28,65.0,0.0,26.36,78.38,0.0,0.0,0.0,0.0
2015-03-31,72.0,0.0,27.12,70.7,0.0,0.0,0.0,0.0
2015-04-30,87.0,0.0,28.2,71.18,0.25,0.0,0.0,0.0
2015-05-31,26.0,0.0,29.22,21.06,0.0,0.0,0.0,0.0
2015-06-30,162.0,0.0,28.44,297.4,0.0,0.0,0.0,0.0
2015-07-31,290.0,1.0,28.433333,156.34,0.3225,0.235,0.0,0.003448
2015-08-31,110.0,0.0,28.82,150.84,0.066,0.0,0.0,0.0
2015-09-30,129.0,0.0,28.36,185.98,0.09,0.0,0.25,0.0
2015-10-31,443.0,0.0,28.44,155.54,0.145,0.25,0.0,0.0


In [3]:
# CHECK FOR STATIONARITY OF THE TIME SERIES
def adfuller_test(series, signif=0.05, name='', verbose=False):
    """Run an Augmented Dickey-Fuller test on ``series`` and print a report.

    Parameters
    ----------
    series : array-like
        Observations handed unchanged to ``adfuller`` (called with ``autolag='AIC'``).
    signif : float, default 0.05
        Significance level the p-value is compared against.
    name : str, default ''
        Label shown in the report header.
    verbose : bool, default False
        Accepted for backward compatibility; the report does not use it.

    Returns
    -------
    None
        The report is written to stdout.
    """
    result = adfuller(series, autolag='AIC')
    test_statistic, raw_p_value, used_lag = result[0], result[1], result[2]
    critical_values = result[4]

    # The rounded p-value is for display only. The decision below uses the
    # unrounded value so that e.g. p = 0.05004 is not rounded down to 0.05
    # and then wrongly treated as significant at signif = 0.05.
    shown_p_value = round(raw_p_value, 4)

    # Print Summary
    print(f'    Augmented Dickey-Fuller Test on "{name}"', "\n   ", '-'*47)
    print(' Null Hypothesis: Data has unit root. Non-Stationary.')
    print(f' Significance Level    = {signif}')
    print(f' Test Statistic        = {round(test_statistic, 4)}')
    print(f' No. Lags Chosen       = {used_lag}')

    for level, critical_value in critical_values.items():
        # Pad the level label ('1%', '5%', '10%') to 6 characters so the '=' signs line up.
        print(f' Critical value {str(level).ljust(6)} = {round(critical_value, 3)}')

    if raw_p_value <= signif:
        print(f" => P-Value = {shown_p_value}. Rejecting Null Hypothesis.")
        print(" => Series is Stationary.")
    else:
        print(f" => P-Value = {shown_p_value}. Weak evidence to reject the Null Hypothesis.")
        print(" => Series is Non-Stationary.")

# ADF Test on each column
# DataFrame.items() yields (column label, Series) pairs; it replaces iteritems(),
# which was removed in pandas 2.0.
for name, column in Dengue_EastVis.items():
    adfuller_test(column, name=name)
    print('\n')


    Augmented Dickey-Fuller Test on "MTD_Cases" 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -1.1403
 No. Lags Chosen       = 2
 Critical value 1%     = -3.585
 Critical value 5%     = -2.928
 Critical value 10%    = -2.602
 => P-Value = 0.6987. Weak evidence to reject the Null Hypothesis.
 => Series is Non-Stationary.


    Augmented Dickey-Fuller Test on "MTD_Deaths" 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -3.632
 No. Lags Chosen       = 0
 Critical value 1%     = -3.578
 Critical value 5%     = -2.925
 Critical value 10%    = -2.601
 => P-Value = 0.0052. Rejecting Null Hypothesis.
 => Series is Stationary.


    Augmented Dickey-Fuller Test on "Reg_Ave_Temp_EastVis" 
    -----------------------------------------------
 Null Hypothesis: Data has uni

In [4]:
# SINCE NOT ALL OF THE SERIES ARE STATIONARY, PERFORM DIFFERENCING.  USE PERCENTAGE DIFFERENCING FOR THE
# SERIES CASES, TEMPERATURE, AND RAINFALL.  USE SIMPLE DIFFERENCING FOR THE SERIES MORTALITY AND GOOGLE TRENDS

# Columns that get percentage differencing; every other column keeps its simple first difference.
pct_change_cols = ['MTD_Cases', 'Reg_Ave_Temp_EastVis', 'Reg_Ave_Rainfall_EastVis']

#--- Calculate the first differences (dropna() removes rows whose difference is undefined,
#--- e.g. the first row, which has no predecessor)
Dengue_EastVis_diff = Dengue_EastVis.diff().dropna().drop(columns=pct_change_cols)

#--- Calculate the percentage differences for MTD_Cases, Reg_Ave_Temp_EastVis, and Reg_Ave_Rainfall_EastVis
# pct_change() is taken directly over the selected columns. Grouping by the index first
# would put every date in its own one-row group (when dates are unique, as in the rows
# displayed above), leaving no previous row to compare against and producing only NaN.
# The result is aligned to Dengue_EastVis_diff by index on assignment.
Dengue_EastVis_diff[pct_change_cols] = Dengue_EastVis[pct_change_cols].pct_change().dropna()
print(Dengue_EastVis_diff)


            MTD_Deaths  GTrend_Dengue  GTrend_Dengue_Fever  GTrend_Dengue_Sym  \
Date                                                                            
2015-02-28   -3.000000         0.0000               0.0000             0.0000   
2015-03-31    0.000000         0.0000               0.0000             0.0000   
2015-04-30    0.000000         0.2500               0.0000             0.0000   
2015-05-31    0.000000        -0.2500               0.0000             0.0000   
2015-06-30    0.000000         0.0000               0.0000             0.0000   
2015-07-31    1.000000         0.3225               0.2350             0.0000   
2015-08-31   -1.000000        -0.2565              -0.2350             0.0000   
2015-09-30    0.000000         0.0240               0.0000             0.2500   
2015-10-31    0.000000         0.0550               0.2500            -0.2500   
2015-11-30    1.000000        -0.0570              -0.2500             0.0000   
2015-12-31   -1.000000      

In [5]:
# Write the differenced dataframe to a pickle file.
diff_pickle_path = 'C:/Users/Claire/Documents/GitHub/nasa_hack/model/datasets/Dengue_EastVis_Diff.pickle'
Dengue_EastVis_diff.to_pickle(diff_pickle_path)

In [6]:
# CHECK IF THE DIFFERENCED SERIES ARE STATIONARY
# ADF Test on each column
# DataFrame.items() yields (column label, Series) pairs; it replaces iteritems(),
# which was removed in pandas 2.0.
for name, column in Dengue_EastVis_diff.items():
    adfuller_test(column, name=name)
    print('\n')

    Augmented Dickey-Fuller Test on "MTD_Deaths" 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -5.4361
 No. Lags Chosen       = 3
 Critical value 1%     = -3.593
 Critical value 5%     = -2.932
 Critical value 10%    = -2.604
 => P-Value = 0.0. Rejecting Null Hypothesis.
 => Series is Stationary.


    Augmented Dickey-Fuller Test on "GTrend_Dengue" 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -4.727
 No. Lags Chosen       = 10
 Critical value 1%     = -3.627
 Critical value 5%     = -2.946
 Critical value 10%    = -2.612
 => P-Value = 0.0001. Rejecting Null Hypothesis.
 => Series is Stationary.


    Augmented Dickey-Fuller Test on "GTrend_Dengue_Fever" 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationar