In [1]:
# IMPORT LIBRARIES
import pandas as pd
import numpy as np
from dateutil.parser import parse 
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.interpolate import interp1d
from sklearn.metrics import mean_squared_error

In [2]:
# READ DATASET
raw_data = pd.read_excel("C:/Users/Claire/Documents/GitHub/nasa_hack/model/datasets/Dengue_PH_Aggregated.xlsx")
raw_data = raw_data.set_index('Date')

# TRUNCATE DATA TO INCLUDE OBSERVATIONS UP TO 2018 ONLY
raw_data_trunc = raw_data[raw_data.index <= '2018-12-31']

In [3]:
# CHECK FOR MISSING VALUES
raw_data_trunc.isna().sum()

MTD_Cases               4
MTD_Deaths              4
Reg_Ave_Temp_NCR        0
Reg_Ave_Rainfall_NCR    0
GTrend_Dengue           0
GTrend_Dengue_Fever     0
GTrend_Dengue_Cure      0
GTrend_Dengue_Med       0
Gtrend_Dengue_Sym       0
dtype: int64

In [4]:
# TREAT THE MISSING VALUES USING SEASONAL MEAN
def seasonal_mean(ts, n, lr=0.7):
    """
    Compute the mean of corresponding seasonal periods
    ts: 1D array-like of the time series
    n: Seasonal window length of the time series
    """
    out = np.copy(ts)
    for i, val in enumerate(ts):
        if np.isnan(val):
            ts_seas = ts[i-1::-n]  # previous seasons only
            if np.isnan(np.nanmean(ts_seas)):
                ts_seas = np.concatenate([ts[i-1::-n], ts[i::n]])  # previous and forward
            out[i] = np.nanmean(ts_seas) * lr
    return out

#--- Get the seasonal means
raw_data_trunc['seas_mn_cases'] = seasonal_mean(raw_data_trunc.MTD_Cases, n=12, lr=1.25)
raw_data_trunc['seas_mn_deaths'] = seasonal_mean(raw_data_trunc.MTD_Deaths, n=12, lr=1.25)

#--- Replace the missing observations with the seasonal means
raw_data_trunc.loc[raw_data_trunc['MTD_Cases'].isnull(),'MTD_Cases'] = raw_data_trunc['seas_mn_cases']
raw_data_trunc.loc[raw_data_trunc['MTD_Deaths'].isnull(),'MTD_Deaths'] = raw_data_trunc['seas_mn_deaths']

#--- Drop the seasonal mean columns
raw_data_trunc = raw_data_trunc.drop(columns=['seas_mn_cases','seas_mn_deaths'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in

In [5]:
# CREATE A NEW COLUMN FOR MORTALITY RATE
#raw_data_trunc['Mort_Rate'] = raw_data_trunc.MTD_Deaths / raw_data_trunc.MTD_Cases

# TRANSFORM GOOGLE TREND COLUMNS TO PERCENTAGES
gtrend_cols = [col for col in raw_data_trunc if col.startswith('GTrend')]
raw_data_trunc[gtrend_cols]= raw_data_trunc[gtrend_cols].apply(lambda t: t / 100)

In [7]:
# STORE UPDATED DATAFRAME TO PICKLE
raw_data_trunc.to_pickle('C:/Users/Claire/Documents/GitHub/nasa_hack/model/datasets/Dengue_PH_Clean.pickle')