In [None]:
# import stuff

from __future__ import print_function
from __future__ import division

import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
import statsmodels.api as sm

from statsmodels.tools import eval_measures
import statsmodels.formula.api as smf

from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from pyts.preprocessing import InterpolationImputer

In [None]:
# run all preprocessing steps on both cities, then in the last step separate them

# Input locations: training features and the matching training labels
X_path = "data-processed/dengue_features_train.csv"
y_path = "data-processed/dengue_labels_train.csv"

def _fill_gaps_per_city(frame):
    """Fill missing values inside each city (index level 0) independently.

    Gaps are filled from the next valid observation of the same city; gaps at
    the very end of a city, which have no later observation, are filled from
    the previous one. Values never cross from one city into the other.
    """
    filled = frame.groupby(level=0, sort=False).bfill()
    return filled.groupby(level=0, sort=False).ffill()


def preprocess_data(X_path, labels, y_path="data-processed/dengue_labels_train.csv", lag=3):
    """Load the feature CSV, engineer features and split the result by city.

    Parameters
    ----------
    X_path : str
        Path to the features CSV. Its first three columns become the row
        index (the first one is the city code, 'sj' or 'iq'), and its
        ``week_start_date`` column is dropped.
    labels : bool
        Truthy when a labels file should be joined (training data); falsy for
        submission data, which has no ``total_cases`` column.
    y_path : str, optional
        Path to the labels CSV, indexed by the same first three columns.
        Only read when ``labels`` is truthy.
    lag : int, optional
        Number of rows each selected weather variable is shifted by.

    Returns
    -------
    X_sj, y_sj, X_iq, y_iq
        Feature frames and ``total_cases`` series for San Juan and Iquitos.
        When ``labels`` is falsy, ``y_sj`` and ``y_iq`` are empty lists.
    """
    # load data and set index to city, year, weekofyear and remove week_start_date
    df = pd.read_csv(X_path, index_col=[0, 1, 2])
    df = df.drop('week_start_date', axis=1)

    ### fill missing values ###
    # Done per city so that a gap at the end of one city is not filled with
    # the first observation of the other city.
    df = _fill_gaps_per_city(df)

    ### add a lagged copy of the 4 variables below and remove the originals ###
    # max temp, precipitation, humidity and average temp
    var2change = ['station_max_temp_c',
                  'station_precip_mm',
                  'reanalysis_relative_humidity_percent',
                  'station_avg_temp_c']

    # Shift inside each city: a plain shift over the stacked frame would hand
    # the last `lag` rows of the first city to the first rows of the second.
    lagged = df.groupby(level=0, sort=False)[var2change].shift(lag)
    lagged.columns = [name + '_lag' for name in var2change]
    df = pd.concat([df.drop(var2change, axis=1), lagged], axis=1)

    # the first `lag` rows of every city are now empty; fill them per city
    df = _fill_gaps_per_city(df)

    ### create interaction features ###
    # 1 when lagged humidity >= 42 % and lagged average temperature >= 24 degrees.
    # NOTE: the column name says "26" although the threshold is 24; the name is
    # kept unchanged so existing consumers of this column keep working.
    humid_and_warm = ((df['reanalysis_relative_humidity_percent_lag'] >= 42)
                      & (df['station_avg_temp_c_lag'] >= 24))
    df['Humid_X_Temp26'] = np.where(humid_and_warm, 1, 0)

    # if it is not a submission, add the target so it is split by city as well
    if labels:
        y = pd.read_csv(y_path, index_col=[0, 1, 2])
        df = df.join(y)

    # separate san juan and iquitos
    df_sj = df.loc['sj']
    df_iq = df.loc['iq']

    if labels:
        # take the target back out of the feature frames
        X_sj = df_sj.drop('total_cases', axis=1)
        y_sj = df_sj['total_cases']

        X_iq = df_iq.drop('total_cases', axis=1)
        y_iq = df_iq['total_cases']
    else:
        X_sj = df_sj
        X_iq = df_iq
        y_sj = []
        y_iq = []

    return X_sj, y_sj, X_iq, y_iq

# Build the per-city training features and targets (labels are available here)
X_sj, y_sj, X_iq, y_iq = preprocess_data(X_path, labels=True)

In [None]:
# removing variables based on OLS didn't improve things

# # run OLS, check for non-important variables
# X = sm.add_constant(X_sj, prepend=False)
# y = y_sj

# Fit and summarize OLS model
# mod = sm.OLS(y, X).fit()
# print(mod.summary())

# remove some unimportant regressors for San Juan according to OLS
# X_sj = X_sj.drop(columns=['ndvi_se','ndvi_sw'])

In [None]:
# Separate data into train (~3/4) vs. test (~1/4).
# Rows are split by position (no shuffling): the first N rows train, the rest test.

# San Juan is 936 x 21
N_TRAIN_SJ = 700
# Iquitos is 520x20: split 390 vs 130
N_TRAIN_IQ = 390

# iloc is 0-based and end-exclusive: [:n] is the first n rows (row 0 included)
# and [n:] is everything after them, so no row is dropped between the splits.

# San Juan
X_sj_train = X_sj.iloc[:N_TRAIN_SJ]
X_sj_test = X_sj.iloc[N_TRAIN_SJ:]

# Iquitos
X_iq_train = X_iq.iloc[:N_TRAIN_IQ]
X_iq_test = X_iq.iloc[N_TRAIN_IQ:]

# SJ
y_sj_train = y_sj.iloc[:N_TRAIN_SJ]
y_sj_test = y_sj.iloc[N_TRAIN_SJ:]

# IQ
y_iq_train = y_iq.iloc[:N_TRAIN_IQ]
y_iq_test = y_iq.iloc[N_TRAIN_IQ:]

In [None]:
# get MSE to assess preprocessing changes

def get_score(X_train, y_train, X_test, y_test, n_estimators=10, random_state=0):
    """Fit a random forest on the training split and score it on the test split.

    Parameters
    ----------
    X_train, y_train
        Features and target used to fit the model.
    X_test, y_test
        Features and target used to compute the error.
    n_estimators : int, optional
        Number of trees in the forest.
    random_state : int, optional
        Seed passed to the forest so repeated runs give the same model.

    Returns
    -------
    my_mse
        Mean squared error between ``y_test`` and the integer predictions.
    y_pred
        Predictions for ``X_test`` converted to integers.
    rf_regressor
        The fitted ``RandomForestRegressor``.
    """
    rf_regressor = RandomForestRegressor(n_estimators=n_estimators,
                                         random_state=random_state)
    rf_regressor.fit(X_train, y_train)

    # astype(int) truncates toward zero (it does not round to nearest)
    y_pred = rf_regressor.predict(X_test).astype(int)
    my_mse = metrics.mean_squared_error(y_test, y_pred)

    return my_mse, y_pred, rf_regressor

# Fit one model per city and report its MSE on the held-out rows
mse_sj, sj_pred, model_sj = get_score(X_sj_train, y_sj_train, X_sj_test, y_sj_test)
print('San Juan: ', mse_sj, sep='')

mse_iq, iq_pred, model_iq = get_score(X_iq_train, y_iq_train, X_iq_test, y_iq_test)
print('Iquitos: ', mse_iq, sep='')


In [None]:
# Submission file
# The test features go through the same preprocessing; there are no labels,
# so sj_y and iq_y come back as empty lists.
sj_test, sj_y, iq_test, iq_y = preprocess_data('data-processed/dengue_features_test.csv', False)

# astype(int) truncates the float predictions toward zero
sj_predictions = model_sj.predict(sj_test).astype(int)
iq_predictions = model_iq.predict(iq_test).astype(int)

submission = pd.read_csv("data-processed/submission_format.csv",
                         index_col=[0, 1, 2])

# Bracket assignment always writes a column. Attribute assignment
# (submission.total_cases = ...) would only set a Python attribute if the
# column were missing, and the CSV would be saved without predictions.
# NOTE(review): assumes the submission rows list all 'sj' weeks first, then 'iq' - confirm.
submission["total_cases"] = np.concatenate([sj_predictions, iq_predictions])
submission.to_csv("data-processed/our_model2.csv")