In [None]:
# import stuff

from __future__ import print_function
from __future__ import division

import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
import statsmodels.api as sm

from statsmodels.tools import eval_measures
import statsmodels.formula.api as smf

from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

In [None]:
def _fill_gaps(frame, method):
    """Fill NaNs separately for each city (index level 0).

    Values are first propagated in the direction given by `method`
    ('bfill' or 'ffill'); gaps at the edge of a city that this direction
    cannot reach are then closed from the opposite direction.
    """
    by_city = frame.groupby(level=0)
    if method == 'bfill':
        return by_city.bfill().groupby(level=0).ffill()
    return by_city.ffill().groupby(level=0).bfill()


def preprocess_data(X_path, interpolation, norm, mylag, interaction, ylabels):
    """Load the feature CSV and build lagged, scaled feature tables per city.

    Parameters
    ----------
    X_path : str
        Path of the feature CSV. Its first three columns become the index
        (the frame is later split with ``.loc['sj']`` / ``.loc['iq']``, so
        the first index level must hold those city codes) and it must
        contain a ``week_start_date`` column, which is dropped.
    interpolation : bool
        True = backfill missing values, False = forward fill.
    norm : bool
        True = min-max scaling, False = standardization (z-scores).
    mylag : int
        Number of rows (weeks) every feature is shifted by; 0 = no lag.
    interaction : bool
        If True, add the 0/1 column ``Humid_X_Temp26``.
    ylabels : bool
        True if labels exist for this data (training); False for
        submission data, which has none.

    Returns
    -------
    X_sj, y_sj, X_iq, y_iq
        Feature frames and ``total_cases`` series for San Juan and Iquitos.
        When ``ylabels`` is false the two y values are empty lists.

    Notes
    -----
    Changes compared with the earlier version of this function:

    * the label branch now tests the ``ylabels`` argument instead of a
      notebook-global called ``labels``;
    * lagging and gap filling are done per city, so rows of one city no
      longer leak into the other one;
    * the interaction thresholds (42 and 24) are applied to the unscaled
      humidity / temperature values; applied to scaled columns they can
      never be reached under min-max scaling (values lie in [0, 1]);
    * ``fillna(method=...)`` (deprecated in pandas) is replaced by
      ``bfill()`` / ``ffill()``.
    """
    humidity_col = 'reanalysis_relative_humidity_percent'
    temp_col = 'station_avg_temp_c'

    # load data and set index to city, year, weekofyear and remove week_start_date
    df = pd.read_csv(X_path, index_col=[0, 1, 2])
    df = df.drop('week_start_date', axis=1)

    ### fill missing values ###
    mymethod = 'bfill' if interpolation else 'ffill'
    df = _fill_gaps(df, mymethod)

    # keep the unscaled climate columns: the interaction thresholds below
    # are expressed in the raw units of these two columns
    if interaction:
        raw_climate = df[[humidity_col, temp_col]].copy()

    ### normalization ###
    if norm:
        df = (df - df.min()) / (df.max() - df.min())  # min-max scaling
    else:
        df = (df - df.mean()) / df.std()  # standardization

    ### lag every feature by mylag weeks (per city) and rename to <name>_lag ###
    df = df.groupby(level=0).shift(mylag).add_suffix('_lag')

    # the shift leaves NaNs at the start of each city; fill them again
    df = _fill_gaps(df, mymethod)

    ### create interaction features
    if interaction:
        # lag and fill the raw values exactly like the scaled features
        raw_lagged = _fill_gaps(raw_climate.groupby(level=0).shift(mylag), mymethod)
        df['Humid_X_Temp26'] = np.where(
            (raw_lagged[humidity_col] >= 42) & (raw_lagged[temp_col] >= 24), 1, 0)

    # if it is not submission data, load y and join it on the shared index
    if ylabels:
        y_path = "data-processed/dengue_labels_train.csv"
        y = pd.read_csv(y_path, index_col=[0, 1, 2])
        df = df.join(y)

    # separate san juan and iquitos
    df_sj = df.loc['sj']
    df_iq = df.loc['iq']

    if ylabels:
        # remove y from X again
        X_sj = df_sj.drop('total_cases', axis=1)
        y_sj = df_sj['total_cases']

        X_iq = df_iq.drop('total_cases', axis=1)
        y_iq = df_iq['total_cases']
    else:
        X_sj = df_sj
        X_iq = df_iq
        y_sj = []
        y_iq = []

    return X_sj, y_sj, X_iq, y_iq

In [None]:
# input locations
X_path = 'data-processed/dengue_features_train.csv'
y_path = "data-processed/dengue_labels_train.csv"

# preprocessing switches:
#   interpolation=True -> backfill, norm=True -> min-max scaling,
#   mylag = lag in weeks, interaction=True -> add interaction feature,
#   labels=True -> training data, so labels are available
interpolation, norm, interaction, labels = True, True, True, True
mylag = 2

X_sj, y_sj, X_iq, y_iq = preprocess_data(
    X_path, interpolation, norm, mylag, interaction, labels)
data = [X_sj, y_sj, X_iq, y_iq]

# save for Edith: one CSV per city for the features and for the targets
prefix = 'min-max-scaling' # for saving
for template, frame in (
        ("data-processed/{}_Xsj.csv", X_sj),
        ("data-processed/{}_Xiq.csv", X_iq),
        ("data-processed/{}_ysj.csv", y_sj),
        ("data-processed/{}_yiq.csv", y_iq)):
    frame.to_csv(template.format(prefix))

In [None]:
# Positional train/test split, roughly 3/4 vs. 1/4: the first rows of each
# city are used for training, the remaining rows for testing (no shuffling).
# Earlier notes give San Juan as 936 x 21 and Iquitos as 520 x 20 (not
# re-checked here); the cut-offs actually used are 701 and 391 rows.
sj_cut = 701
iq_cut = 391

# San Juan
X_sj_train, X_sj_test = X_sj.iloc[:sj_cut], X_sj.iloc[sj_cut:]
y_sj_train, y_sj_test = y_sj.iloc[:sj_cut], y_sj.iloc[sj_cut:]

# Iquitos
X_iq_train, X_iq_test = X_iq.iloc[:iq_cut], X_iq.iloc[iq_cut:]
y_iq_train, y_iq_test = y_iq.iloc[:iq_cut], y_iq.iloc[iq_cut:]

In [None]:
# get MSE to assess preprocessing changes

def get_score(X_train, y_train, X_test, y_test):
    """Fit a random forest on the training data and score it on the test data.

    A RandomForestRegressor with 10 trees and random_state 0 is fitted on
    (X_train, y_train). Its predictions for X_test are cast to int before
    the mean squared error against y_test is computed.

    Returns a tuple (mse, integer predictions, fitted regressor).
    """
    forest = RandomForestRegressor(random_state=0, n_estimators=10)
    forest.fit(X_train, y_train)

    # the integer cast drops the fractional part of each prediction
    raw_pred = forest.predict(X_test)
    y_pred = raw_pred.astype(int)

    return metrics.mean_squared_error(y_test, y_pred), y_pred, forest

In [None]:
# fit and score one random forest per city; keep the models for later use
mse_sj, sj_pred, model_sj = get_score(X_sj_train, y_sj_train, X_sj_test, y_sj_test)
print('San Juan: ', mse_sj, sep='')

mse_iq, iq_pred, model_iq = get_score(X_iq_train, y_iq_train, X_iq_test, y_iq_test)
print('Iquitos: ', mse_iq, sep='')

In [None]:
# plotting predicted timecourses
import plotly.graph_objects as go
import plotly as py

def prediction_plot(model_prediction, acutal_data, city_name):
    """Plot actual against predicted case counts and save the figure.

    Inputs (example values):
        model_prediction = sj_pred      # predicted cases
        acutal_data      = y_sj_test    # observed cases
        city_name        = 'San Juan'   # figure title and PNG file name

    The figure is shown and then written to figures/<city_name>.png.
    """
    # x axis: one position per predicted value
    weeks = np.arange(len(model_prediction))

    # one scatter trace per series, actual first, prediction second
    traces = [
        dict(type='scatter', x=weeks, y=series, name=label)
        for series, label in ((acutal_data, "Actual cases"),
                              (model_prediction, "Predicted cases"))
    ]

    layout = dict(
        title=city_name,
        xaxis=dict(title='Weeks'),
        yaxis=dict(title='Total Cases'))

    fig = go.Figure(data=traces, layout=layout)
    fig.show()

    fig.write_image("figures/"+ city_name + ".png")

In [None]:
# load Edith's data (Iquitos results) and plot predicted vs. actual cases
result_iq = pd.read_csv("results/iq_All_lagged_and_mm_gb.csv")

city_name = 'Iquitos: Gradient Boosting Classifier'
model_prediction, acutal_data = result_iq['pred_cases'], result_iq['total_cases']
prediction_plot(model_prediction=model_prediction,
                acutal_data=acutal_data,
                city_name=city_name)

In [None]:
# same plot for the stored San Juan results
result_sj = pd.read_csv("results/sj_All_lagged_and_mm_gb.csv")

city_name = 'San Juan: Gradient Boosting Classifier'
model_prediction, acutal_data = result_sj['pred_cases'], result_sj['total_cases']
prediction_plot(model_prediction=model_prediction,
                acutal_data=acutal_data,
                city_name=city_name)

In [None]:
# Submission file

# preprocess the submission features with the same settings as the training data
X_path        = 'data-processed/dengue_features_test.csv'
interpolation = True
norm          = True
mylag         = 2
interaction   = True
# The submission features have no labels, so preprocess_data must not load
# and join the training labels onto them.
labels        = False

# NOTE(review): preprocess_data rescales with the statistics of the file it
# loads, so these features are scaled with test-set values, not the training ones.
sj_test, sj_y, iq_test, iq_y = preprocess_data(X_path, interpolation, norm, mylag, interaction, labels)

# models fitted in the evaluation cell; predictions are cast to whole case counts
sj_predictions = model_sj.predict(sj_test).astype(int)
iq_predictions = model_iq.predict(iq_test).astype(int)

submission = pd.read_csv("data-processed/submission_format.csv",
                         index_col=[0, 1, 2])

# assumes the submission format lists all San Juan rows before the Iquitos rows -- TODO confirm
# explicit column assignment; attribute assignment only works if the column already exists
submission['total_cases'] = np.concatenate([sj_predictions, iq_predictions])
submission.to_csv("data-processed/our_model2.csv")