In [1]:
# IMPORT LIBRARIES

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

#--- Import Statsmodels
from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller
from statsmodels.tools.eval_measures import rmse, aic
from statsmodels.tsa.stattools import grangercausalitytests
from statsmodels.tsa.vector_ar.vecm import coint_johansen
from statsmodels.stats.stattools import durbin_watson
from statsmodels.tsa.stattools import acf

In [2]:
# Load the cleaned dataframe that pgm1_Data_Preparation saved as a pickle.
# NOTE(review): the path below is absolute and machine-specific; it must exist
# on the machine running this notebook.
Dengue_WestVis = pd.read_pickle(
    'C:/Users/Claire/Documents/GitHub/nasa_hack/model/datasets/Dengue_WestVis_Clean.pickle'
)
Dengue_WestVis

Unnamed: 0_level_0,MTD_Cases,MTD_Deaths,Reg_Ave_Temp_WestVis,Reg_Ave_Rainfall_WestVis,GTrend_Dengue,GTrend_Dengue_Fever,GTrend_Dengue_Cure,GTrend_Dengue_Sym,Mort_Rate
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015-01-31,313.0,0.0,26.6,64.0,0.07,0.155,0.0,0.0825,0.0
2015-02-28,113.0,1.0,27.1,24.0,0.045,0.125,0.0,0.0,0.00885
2015-03-31,201.0,0.0,27.6,26.5,0.024,0.078,0.0,0.0,0.0
2015-04-30,216.0,0.0,28.8,27.0,0.0275,0.1875,0.0,0.045,0.0
2015-05-31,214.0,0.0,29.8,6.0,0.034,0.08,0.0,0.036,0.0
2015-06-30,351.0,2.0,29.2,196.0,0.0525,0.0,0.0,0.18,0.005698
2015-07-31,2163.0,6.0,28.6,310.5,0.09,0.54,0.0,0.1925,0.002774
2015-08-31,1200.0,3.0,28.5,207.5,0.122,0.396,0.0,0.154,0.0025
2015-09-30,1695.0,5.0,28.7,327.0,0.0725,0.3025,0.0,0.1275,0.00295
2015-10-31,2118.75,6.25,-999.0,-999.0,0.105,0.2375,0.0,0.13,0.00295


In [3]:
# CHECK FOR STATIONARITY OF THE TIME SERIES
def adfuller_test(series, signif=0.05, name='', verbose=False):
    """Run an Augmented Dickey-Fuller test on `series` and print a report.

    Parameters
    ----------
    series : array-like
        Observations handed unchanged to ``adfuller`` (called with
        ``autolag='AIC'``).
    signif : float
        Significance level the p-value is compared against.
    name : str
        Label printed in the report header.
    verbose : bool
        Currently unused; the report is always printed.

    Returns
    -------
    None
        The report is written to stdout only.
    """
    result = adfuller(series, autolag='AIC')
    test_statistic = round(result[0], 4)
    raw_p_value = result[1]
    n_lags = round(result[2], 4)
    critical_values = result[4]

    # Rounded copy is for display only. The decision below uses the raw
    # p-value so that, e.g., 0.05004 is not rounded down to 0.05 and then
    # treated as significant at signif=0.05.
    p_value = round(raw_p_value, 4)

    # Print Summary
    print(f'    Augmented Dickey-Fuller Test on "{name}"', "\n   ", '-'*47)
    print(' Null Hypothesis: Data has unit root. Non-Stationary.')
    print(f' Significance Level    = {signif}')
    print(f' Test Statistic        = {test_statistic}')
    print(f' No. Lags Chosen       = {n_lags}')

    for level, crit in critical_values.items():
        # Pad the level label ("1%", "5%", "10%") to 6 characters so the "=" signs line up.
        print(f' Critical value {str(level).ljust(6)} = {round(crit, 3)}')

    if raw_p_value <= signif:
        print(f" => P-Value = {p_value}. Rejecting Null Hypothesis.")
        print(" => Series is Stationary.")
    else:
        print(f" => P-Value = {p_value}. Weak evidence to reject the Null Hypothesis.")
        print(" => Series is Non-Stationary.")

# ADF Test on each column
# DataFrame.items() yields (column name, Series) pairs; the older iteritems()
# alias was removed in pandas 2.0.
for name, column in Dengue_WestVis.items():
    adfuller_test(column, name=name)
    print('\n')


    Augmented Dickey-Fuller Test on "MTD_Cases" 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -3.3496
 No. Lags Chosen       = 2
 Critical value 1%     = -3.585
 Critical value 5%     = -2.928
 Critical value 10%    = -2.602
 => P-Value = 0.0128. Rejecting Null Hypothesis.
 => Series is Stationary.


    Augmented Dickey-Fuller Test on "MTD_Deaths" 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -4.29
 No. Lags Chosen       = 0
 Critical value 1%     = -3.578
 Critical value 5%     = -2.925
 Critical value 10%    = -2.601
 => P-Value = 0.0005. Rejecting Null Hypothesis.
 => Series is Stationary.


    Augmented Dickey-Fuller Test on "Reg_Ave_Temp_WestVis" 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.

In [4]:
# SINCE NOT ALL OF THE SERIES ARE STATIONARY, PERFORM DIFFERENCING.  USE PERCENTAGE DIFFERENCING FOR THE 
# SERIES CASES, TEMPERATURE, AND RAINFALL.  USE SIMPLE DIFFERENCING FOR THE SERIES MORTALITY AND GOOGLE TRENDS

#Dengue_WestVis = Dengue_WestVis.drop(columns=['MTD_Deaths'])

# NOTE(review): the displayed data contains -999.0 in the temperature and
# rainfall columns (e.g. 2015-10-31), which looks like a missing-value
# sentinel -- confirm it is handled before differencing.

#--- Columns that get percentage differences instead of first differences
pct_change_cols = ['MTD_Cases', 'Reg_Ave_Temp_WestVis', 'Reg_Ave_Rainfall_WestVis']

#--- Calculate the first differences (the first row has no predecessor and is dropped)
Dengue_WestVis_diff = Dengue_WestVis.diff().dropna().drop(columns=pct_change_cols)

#--- Calculate the percentage differences for MTD_Cases, Reg_Ave_Temp_WestVis, and Reg_Ave_Rainfall_WestVis
# pct_change() is applied to the columns directly. Grouping by the date index
# (which appears to be unique) would place every row in its own group, leaving
# no previous row to compare against. Assignment aligns on the index, so rows
# already dropped above are not re-introduced.
Dengue_WestVis_diff[pct_change_cols] = Dengue_WestVis[pct_change_cols].pct_change()
print(Dengue_WestVis_diff)


            MTD_Deaths  GTrend_Dengue  GTrend_Dengue_Fever  \
Date                                                         
2015-02-28    1.000000        -0.0250              -0.0300   
2015-03-31   -1.000000        -0.0210              -0.0470   
2015-04-30    0.000000         0.0035               0.1095   
2015-05-31    0.000000         0.0065              -0.1075   
2015-06-30    2.000000         0.0185              -0.0800   
2015-07-31    4.000000         0.0375               0.5400   
2015-08-31   -3.000000         0.0320              -0.1440   
2015-09-30    2.000000        -0.0495              -0.0935   
2015-10-31    1.250000         0.0325              -0.0650   
2015-11-30    3.750000        -0.0470              -0.0575   
2015-12-31    3.750000         0.0545              -0.0625   
2016-01-31  -12.750000        -0.0445              -0.0235   
2016-02-29   -1.000000         0.0020               0.0260   
2016-03-31    3.000000        -0.0500              -0.0025   
2016-04-

In [5]:
# Persist the differenced dataframe as a pickle.
# NOTE(review): the path below is absolute and machine-specific.
Dengue_WestVis_diff.to_pickle(
    'C:/Users/Claire/Documents/GitHub/nasa_hack/model/datasets/Dengue_WestVis_Diff.pickle'
)

In [6]:
# CHECK IF THE DIFFERENCED SERIES ARE STATIONARY
# ADF Test on each column
# DataFrame.items() yields (column name, Series) pairs; the older iteritems()
# alias was removed in pandas 2.0.
for name, column in Dengue_WestVis_diff.items():
    adfuller_test(column, name=name)
    print('\n')

    Augmented Dickey-Fuller Test on "MTD_Deaths" 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -10.4595
 No. Lags Chosen       = 0
 Critical value 1%     = -3.581
 Critical value 5%     = -2.927
 Critical value 10%    = -2.602
 => P-Value = 0.0. Rejecting Null Hypothesis.
 => Series is Stationary.


    Augmented Dickey-Fuller Test on "GTrend_Dengue" 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.
 Significance Level    = 0.05
 Test Statistic        = -5.2934
 No. Lags Chosen       = 1
 Critical value 1%     = -3.585
 Critical value 5%     = -2.928
 Critical value 10%    = -2.602
 => P-Value = 0.0. Rejecting Null Hypothesis.
 => Series is Stationary.


    Augmented Dickey-Fuller Test on "GTrend_Dengue_Fever" 
    -----------------------------------------------
 Null Hypothesis: Data has unit root. Non-Stationary.