In [1]:
# import stuff
%matplotlib inline

from __future__ import print_function
from __future__ import division

import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
import statsmodels.api as sm

from statsmodels.tools import eval_measures
import statsmodels.formula.api as smf

from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from pyts.preprocessing import InterpolationImputer

# just for the sake of this blog post!
from warnings import filterwarnings
filterwarnings('ignore')

# Read the provided training features and labels. Both files are indexed by
# their first three CSV columns, which lets a whole city be selected with .loc.
train_features, train_labels = (
    pd.read_csv(csv_path, index_col=[0, 1, 2])
    for csv_path in ('data-processed/dengue_features_train.csv',
                     'data-processed/dengue_labels_train.csv')
)

# San Juan ('sj') subset of features and labels.
sj_train_features, sj_train_labels = train_features.loc['sj'], train_labels.loc['sj']

# Iquitos ('iq') subset of features and labels.
iq_train_features, iq_train_labels = train_features.loc['iq'], train_labels.loc['iq']

In [2]:
# Run every preprocessing step on both cities together; the cities are only
# separated in the final step.

# Locations of the training features (X) and labels (y).
X_path = 'data-processed/dengue_features_train.csv'
y_path = "data-processed/dengue_labels_train.csv"

# Predictor columns kept for modelling.
# ('station_max_temp_c' is currently left out of the selection.)
features = [
    'reanalysis_specific_humidity_g_per_kg',
    'reanalysis_dew_point_temp_k',
    'station_avg_temp_c',
    'station_min_temp_c',
]

# Load the features with the index set to city, year, weekofyear (the first
# three CSV columns) and keep only the selected predictors.
# Alternative kept for reference: use every column except the date via
#   df.drop('week_start_date', axis=1)
df = pd.read_csv(X_path, index_col=[0, 1, 2])[features]



In [3]:
# Fill missing values by carrying the last valid observation forward (ffill),
# so a gap is never filled from a later measurement.
# `DataFrame.ffill()` gives the same result as `fillna(method='ffill')`, whose
# `method` argument is deprecated in recent pandas releases. The result is
# rebound to `df` instead of mutating the column-subset frame in place.
# NOTE(review): both cities are still in one frame at this point, so a gap at
# the very start of the second city would be filled from the last row of the
# first city -- confirm this is acceptable or fill per city.
df = df.ffill()

In [4]:
# (disabled experiment) add a lagged copy of one feature
#lag        = 1
#var2change = 'station_max_temp_c'

#df['humid_lag'] = df[var2change].shift(lag)  # Lagged by 1 time step

# remove missing value again because of lag
#df.fillna(method='bfill', inplace=True)

# remove original 
#df = df.drop(var2change,axis=1)
#df.head()

In [5]:


# Attach the labels (the `total_cases` target) to the feature frame so that
# features and target are split by city together.
# NOTE(review): `df` is rebound to the joined frame, so running this cell a
# second time joins `total_cases` onto a frame that already has it, which
# pandas rejects as overlapping columns -- re-run from the loading cell.
y = pd.read_csv(y_path, index_col=[0, 1, 2])
df = df.join(y)

# Separate San Juan ('sj') and Iquitos ('iq').
df_sj, df_iq = df.loc['sj'], df.loc['iq']

# Pull the target back out of each city frame: X keeps the predictors,
# y holds `total_cases`.
X_sj, y_sj = df_sj.drop(columns='total_cases'), df_sj['total_cases']
X_iq, y_iq = df_iq.drop(columns='total_cases'), df_iq['total_cases']

In [6]:
# Separate the San Juan predictors by position into a "prior" training set
# and a "future" test set.
# (Original note: San Juan is 936 x 21.)
# NOTE(review): the training slice starts at position 1, so row 0 ends up in
# neither split -- confirm that is intended.
X_sj_train, X_sj_test = X_sj.iloc[1:701], X_sj.iloc[701:]

In [7]:
# Positional split of the San Juan target: positions 1-700 for training,
# position 701 onward for testing.
y_sj_train, y_sj_test = y_sj.iloc[1:701], y_sj.iloc[701:]

In [8]:
# my version of the benchmarking function

# San Juan
# best alpha =  1e-08
# best score =  756.9779411764706

# Peru
# best alpha =  1e-08
# best score =  119.31666666666666

def get_score(X_train, y_train, X_test, y_test, n_estimators=10, random_state=0):
    """Fit a random forest on the training split and return its test-set MSE.

    Parameters
    ----------
    X_train, y_train
        Predictors and target used to fit the ``RandomForestRegressor``.
    X_test, y_test
        Held-out predictors and target used only for scoring.
    n_estimators : int, optional
        Number of trees passed to ``RandomForestRegressor`` (default 10,
        the value that used to be hard-coded).
    random_state : int, optional
        Seed passed to ``RandomForestRegressor`` (default 0, the value that
        used to be hard-coded).

    Returns
    -------
    The value of ``metrics.mean_squared_error(y_test, predictions)``, where
    ``predictions`` are the fitted model's predictions for ``X_test``.
    """
    rf_regressor = RandomForestRegressor(n_estimators=n_estimators,
                                         random_state=random_state)
    rf_regressor.fit(X_train, y_train)

    # Score on the held-out split, not on the data the model was fitted on.
    y_pred_test = rf_regressor.predict(X_test)

    return metrics.mean_squared_error(y_test, y_pred_test)

# Score the San Juan hold-out split and print the mean squared error.
mse_sj = get_score(X_train=X_sj_train, y_train=y_sj_train,
                   X_test=X_sj_test, y_test=y_sj_test)
print(mse_sj)

# baseline model with 4 features: 1802.2099802009457
# baseline model with all features: 1458.708840425532 

# experiment on interpolation
# after interpolation with 4 feature: 1906.9913617021275 
# after interpolation, all features, ffill: 1762.8954468085103
# after interpolation, all features, bfill: 11281.0836463829787 (too many digits for a float repr; likely a typo for 1281.0836463829787 -- verify)

# experiment on lag, removing original avg_temp
# baseline model, 4 features, bfill: 1281.0836463829787
# baseline model, 4 features, bfill, avg_temp lagged by 1: 1623.894638297872
# baseline model, 4 features, bfill, avg_temp lagged by 2: 1454.7012340425533
# baseline model, 4 features, bfill, avg_temp lagged by 3: 1361.6014042553193

# experiment on lag, keeping original avg_temp
# baseline model, 4 features, bfill: 1281.0836463829787
# baseline model, 4 features, bfill, avg_temp lagged by 1: (no result recorded)

# experiment on lag, removing original humidity
# 1 weeks: 1538.7760425531915
# 2 weeks: 1617.551914893617
# 3 weeks: 1429.1278297872343

# experiment on lag, keeping original humidity
# 2 weeks: 1617.551914893617


1802.2099802009457


In [None]:
# list of changes to keep

### BETTER ###
# fill missing values
# df.fillna(method='bfill', inplace=True) better than ffill


### Worse ###
# avg_temp lag of 1-4 all worse