In [1]:
# IMPORT LIBRARIES

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

#--- Import Statsmodels
from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller
from statsmodels.tools.eval_measures import rmse, aic
from statsmodels.tsa.stattools import grangercausalitytests
from statsmodels.tsa.vector_ar.vecm import coint_johansen
from statsmodels.stats.stattools import durbin_watson
from statsmodels.tsa.stattools import acf

In [2]:
# Load the cleaned dataframe that pgm1_Data_Preparation saved as a pickle.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files from a trusted source.
# NOTE(review): the path is an absolute location on one machine; adjust it before running elsewhere.
Dengue_ARMM = pd.read_pickle(
    'C:/Users/Claire/Documents/GitHub/nasa_hack/model/datasets/Dengue_ARMM_Clean.pickle'
)
# Leave the frame as the last expression so the notebook renders it.
Dengue_ARMM

Unnamed: 0_level_0,MTD_Cases,MTD_Deaths,Reg_Ave_Temp_ARMM,Reg_Ave_Rainfall_ARMM,GTrend_Dengue,GTrend_Dengue_Sym,Mort_Rate
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-01-31,120.0,0.0,27.5,88.9,0.0,0.0,0.0
2015-02-28,94.0,1.0,28.4,42.7,0.025,0.0,0.010638
2015-03-31,98.0,2.0,28.8,11.6,0.0,0.0,0.020408
2015-04-30,131.0,1.0,28.9,63.6,0.0,0.0,0.007634
2015-05-31,91.0,1.0,28.5,73.9,0.034,0.0,0.010989
2015-06-30,207.0,1.0,27.6,164.9,0.0,0.105,0.004831
2015-07-31,258.75,1.25,27.5,155.4,0.0,0.0,0.004831
2015-08-31,65.0,2.0,27.6,290.8,0.092,0.0,0.030769
2015-09-30,77.0,0.0,27.4,283.4,0.0,0.0,0.0
2015-10-31,127.0,1.0,27.9,110.1,0.0,0.0,0.007874


In [3]:
# CHECK FOR STATIONARITY OF THE TIME SERIES
def adfuller_test(series, signif=0.05, name='', verbose=False):
    """Run an Augmented Dickey-Fuller test on ``series`` and print a report.

    The null hypothesis is that the series has a unit root (is
    non-stationary).  The lag length is chosen with ``autolag='AIC'``.

    Parameters
    ----------
    series : array-like
        The series handed unchanged to ``adfuller``.
    signif : float, default 0.05
        Significance level the p-value is compared against.
    name : str, default ''
        Label shown in the report header.
    verbose : bool, default False
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    None
        The report is written with ``print``; nothing is returned.
    """
    result = adfuller(series, autolag='AIC')
    test_statistic = round(result[0], 4)
    raw_p_value = result[1]
    n_lags = round(result[2], 4)
    critical_values = result[4]

    # The rounded p-value is for display only.  The decision below uses the
    # unrounded value so that e.g. p = 0.05004 is not rounded down to 0.05
    # and mistaken for a significant result.
    shown_p_value = round(raw_p_value, 4)

    # Print Summary
    print(f'    Augmented Dickey-Fuller Test on "{name}"', "\n   ", '-'*47)
    print(' Null Hypothesis: Data has unit root. Non-Stationary.')
    print(f' Significance Level    = {signif}')
    print(f' Test Statistic        = {test_statistic}')
    print(f' No. Lags Chosen       = {n_lags}')

    for level, critical_value in critical_values.items():
        # Pad the level label to six characters so the '=' signs line up.
        print(f' Critical value {str(level).ljust(6)} = {round(critical_value, 3)}')

    if raw_p_value <= signif:
        print(f" => P-Value = {shown_p_value}. Rejecting Null Hypothesis.")
        print(" => Series is Stationary.")
    else:
        print(f" => P-Value = {shown_p_value}. Weak evidence to reject the Null Hypothesis.")
        print(" => Series is Non-Stationary.")   

# ADF Test on each column
# DataFrame.items() yields (column label, column Series) pairs; it replaces
# DataFrame.iteritems(), which newer pandas releases no longer provide.
for column_name, column in Dengue_ARMM.items():
    adfuller_test(column, name=column_name)
    print('\n')


    Augmented Dickey-Fuller Test on "MTD_Cases" 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -7.0695
 No. Lags Chosen       = 0
 Critical value 1%     = -3.578
 Critical value 5%     = -2.925
 Critical value 10%    = -2.601
 => P-Value = 0.0. Rejecting Null Hypothesis.
 => Series is Stationary.


    Augmented Dickey-Fuller Test on "MTD_Deaths" 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -4.9738
 No. Lags Chosen       = 0
 Critical value 1%     = -3.578
 Critical value 5%     = -2.925
 Critical value 10%    = -2.601
 => P-Value = 0.0. Rejecting Null Hypothesis.
 => Series is Stationary.


    Augmented Dickey-Fuller Test on "Reg_Ave_Temp_ARMM" 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Signi

In [4]:
# SINCE NOT ALL OF THE SERIES ARE STATIONARY, PERFORM DIFFERENCING.  USE PERCENTAGE DIFFERENCING FOR THE 
# SERIES CASES, TEMPERATURE, AND RAINFALL.  USE SIMPLE DIFFERENCING FOR EVERY OTHER COLUMN
# (DEATHS, MORTALITY RATE, AND GOOGLE TRENDS).

# Columns that get percentage differencing instead of simple differencing.
pct_change_columns = ['MTD_Cases', 'Reg_Ave_Temp_ARMM', 'Reg_Ave_Rainfall_ARMM']

#--- Calculate the first differences
# The first row has no predecessor, so its difference is NaN and dropna() removes it.
Dengue_ARMM_diff = Dengue_ARMM.diff().dropna().drop(columns=pct_change_columns)

#--- Calculate the percentage differences for MTD_Cases, Reg_Ave_Temp_ARMM, and Reg_Ave_Rainfall_ARMM
# pct_change() is taken over the whole frame in row order.  Grouping by the index first
# would put every distinct date in its own group, so a group-wise pct_change would have
# no previous row to compare against and would return only NaN.
# The assignment aligns on the Date index, so rows line up with the simple differences above.
# NOTE(review): a zero in the previous row yields inf here; confirm the data has none
# before running the ADF test on these columns.
Dengue_ARMM_diff[pct_change_columns] = Dengue_ARMM[pct_change_columns].pct_change().dropna()
print(Dengue_ARMM_diff)


            MTD_Deaths  GTrend_Dengue  GTrend_Dengue_Sym  Mort_Rate  \
Date                                                                  
2015-02-28    1.000000         0.0250             0.0000   0.010638   
2015-03-31    1.000000        -0.0250             0.0000   0.009770   
2015-04-30   -1.000000         0.0000             0.0000  -0.012775   
2015-05-31    0.000000         0.0340             0.0000   0.003355   
2015-06-30    0.000000        -0.0340             0.1050  -0.006158   
2015-07-31    0.250000         0.0000            -0.1050   0.000000   
2015-08-31    0.750000         0.0920             0.0000   0.025938   
2015-09-30   -2.000000        -0.0920             0.0000  -0.030769   
2015-10-31    1.000000         0.0000             0.0000   0.007874   
2015-11-30   -1.000000         0.1340             0.2000  -0.007874   
2015-12-31    3.000000        -0.1340            -0.2000   0.002698   
2016-01-31   -3.000000         0.0960             0.0820  -0.002698   
2016-0

In [5]:
# Save the differenced dataframe as a pickle file.
# NOTE(review): the path is an absolute location on one machine; adjust it before running elsewhere.
Dengue_ARMM_diff.to_pickle(
    'C:/Users/Claire/Documents/GitHub/nasa_hack/model/datasets/Dengue_ARMM_Diff.pickle'
)

In [6]:
# CHECK IF THE DIFFERENCED SERIES ARE STATIONARY
# ADF Test on each column
# DataFrame.items() yields (column label, column Series) pairs; it replaces
# DataFrame.iteritems(), which newer pandas releases no longer provide.
for column_name, column in Dengue_ARMM_diff.items():
    adfuller_test(column, name=column_name)
    print('\n')

    Augmented Dickey-Fuller Test on "MTD_Deaths" 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -6.1082
 No. Lags Chosen       = 2
 Critical value 1%     = -3.589
 Critical value 5%     = -2.93
 Critical value 10%    = -2.603
 => P-Value = 0.0. Rejecting Null Hypothesis.
 => Series is Stationary.


    Augmented Dickey-Fuller Test on "GTrend_Dengue" 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -4.9148
 No. Lags Chosen       = 5
 Critical value 1%     = -3.601
 Critical value 5%     = -2.935
 Critical value 10%    = -2.606
 => P-Value = 0.0. Rejecting Null Hypothesis.
 => Series is Stationary.


    Augmented Dickey-Fuller Test on "GTrend_Dengue_Sym" 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Si