In [22]:
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from matplotlib import pyplot as plt
from sklearn import linear_model, tree
from sklearn.cluster import KMeans
from sklearn.feature_selection import SelectKBest, chi2, f_regression
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
%matplotlib inline
plt.style.use('seaborn-white')


warnings.filterwarnings("ignore")

rng = 0


In [2]:
# Read the three CSV inputs from ./data: the modified training features,
# the training labels and the modified test features (in that order).
train_features, train_labels, test_features = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/train_features_modified.csv',
        './data/dengue_labels_train.csv',
        './data/test_features_modified.csv',
    )
)


In [3]:
# Slice train_features, test_features and train_labels by city.
#
# Every subset is built as an independent DataFrame rather than a slice of its
# parent that is then mutated in place: the original `drop(..., inplace=True)`
# on a boolean-mask slice (and the later `frame['cluster'] = ...` assignments
# on the label slices) are exactly what pandas reports as
# SettingWithCopyWarning, which the global warning filter was hiding.

# 'city' and 'week_start_date' hold strings, so they are removed from the
# feature tables.
STRING_COLUMNS = ['city', 'week_start_date']


def select_city(frame, city, drop_columns=None):
    """Return an independent copy of the rows of ``frame`` for one city.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table that contains a ``city`` column.
    city : str
        Value of ``city`` to keep (``'sj'`` or ``'iq'`` in this notebook).
    drop_columns : list of str, optional
        Columns to remove from the result. When omitted, all columns are kept.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``frame`` itself is never modified.
    """
    subset = frame.loc[frame['city'] == city]
    if drop_columns:
        # DataFrame.drop returns a new object, so no extra copy is needed.
        return subset.drop(columns=drop_columns)
    return subset.copy()


# Separate data for San Juan
sj_train_features = select_city(train_features, 'sj', STRING_COLUMNS)
sj_train_labels = select_city(train_labels, 'sj')
sj_test_features = select_city(test_features, 'sj', STRING_COLUMNS)

# Separate data for Iquitos
iq_train_features = select_city(train_features, 'iq', STRING_COLUMNS)
iq_train_labels = select_city(train_labels, 'iq')
iq_test_features = select_city(test_features, 'iq', STRING_COLUMNS)


## Feature Selection


In [4]:
# Feature-selection pipeline: scale each feature to unit variance (without
# centering), then keep the k highest-scoring features.
pipeline = Pipeline([
    ('scaler', StandardScaler(with_mean=False)),
    ('selectkbest', SelectKBest(score_func=f_regression)),
])

# Hyperparameter grid for SelectKBest.
#
# `score_func` must be a callable taking (X, y) and returning one score per
# feature. `r2_score` is a (y_true, y_pred) metric and cannot rank features,
# so the univariate regression F-test (`f_regression`), which suits a
# continuous target such as a case count, is used instead.
hyperparameters = {
    'selectkbest__k': range(10, 80, 10),
    'selectkbest__score_func': [f_regression],
}



In [5]:
def get_best_estimator(X, y, regressor=None):
    """Grid-search the module-level ``hyperparameters`` and return the best set.

    The module-level ``pipeline`` ends in a feature selector, which has no
    ``predict`` method, so it cannot be scored with ``scoring='r2'`` on its
    own. When that is the case a regressor is appended as the final step for
    the search; the module-level ``pipeline`` object itself is not modified.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training features.
    y : array-like of shape (n_samples,)
        Regression target.
    regressor : estimator, optional
        Final estimator used to score each candidate. Defaults to an ordinary
        least-squares ``LinearRegression``.

    Returns
    -------
    dict
        ``best_params_`` of the fitted ``GridSearchCV`` (5-fold CV, R^2 score).
    """
    if regressor is None:
        regressor = linear_model.LinearRegression()

    search_pipeline = pipeline
    if not hasattr(pipeline, 'predict'):
        # GridSearchCV clones the estimator for every fit, so sharing the step
        # objects with the module-level pipeline does not fit them in place.
        search_pipeline = Pipeline(list(pipeline.steps) + [('regressor', regressor)])

    grid_search = GridSearchCV(
        search_pipeline,
        hyperparameters,
        return_train_score=True,
        scoring='r2',
        cv=5,
    )
    grid_search.fit(X, y)
    return grid_search.best_params_

In [6]:
# select kbest features for sj_train_features and sj_train_labels
# sj_train_features_kbest = pipeline.fit_transform(sj_train_features, sj_train_labels['total_cases'])
# iq_train_features_kbest = pipeline.fit_transform(iq_train_features, iq_train_labels['total_cases'])

# sj_kbest_param = get_best_estimator(sj_train_features, sj_train_labels['total_cases'])

## Clustering


In [7]:
# Fit a 5-cluster k-means model on the San Juan training features, then label
# both the training rows and the test rows with that fitted model.
kmeans_sj = KMeans(n_clusters=5, random_state=rng)
kmeans_sj.fit(sj_train_features)

clusters_sj = kmeans_sj.predict(sj_train_features)
clusters_sj_test = kmeans_sj.predict(sj_test_features)

In [8]:
# Fit a 5-cluster k-means model on the Iquitos training features, then label
# both the training rows and the test rows with that fitted model.
kmeans_iq = KMeans(n_clusters=5, random_state=rng)
kmeans_iq.fit(iq_train_features)

clusters_iq = kmeans_iq.predict(iq_train_features)
clusters_iq_test = kmeans_iq.predict(iq_test_features)

In [9]:
# Plan: use the k-means labels to pick the subset of data each model is
# trained on, then combine the per-cluster models (ensemble / average).
# The test rows carry the label predicted by the same k-means model.

# San Juan: features and labels share the same training-row labels.
sj_train_features['cluster'] = sj_train_labels['cluster'] = clusters_sj
sj_test_features['cluster'] = clusters_sj_test

# Iquitos: features and labels share the same training-row labels.
iq_train_features['cluster'] = iq_train_labels['cluster'] = clusters_iq
iq_test_features['cluster'] = clusters_iq_test

#sns.scatterplot(data=sj_train_merged, x="lag_1_station_avg_temp_c", y="total_cases", hue="cluster")

#sns.pairplot(sj_train_merged, hue="cluster")


# Splitting the dataset into clusters

In [10]:
# Partition the San Juan frames on the 'cluster' column. K-means labels 0..4
# map onto the *_cluster_1 .. *_cluster_5 names, in that order.

# Training features
(
    sj_train_cluster_1,
    sj_train_cluster_2,
    sj_train_cluster_3,
    sj_train_cluster_4,
    sj_train_cluster_5,
) = (
    sj_train_features[sj_train_features['cluster'] == label]
    for label in range(5)
)

# Training labels
(
    sj_train_labels_cluster_1,
    sj_train_labels_cluster_2,
    sj_train_labels_cluster_3,
    sj_train_labels_cluster_4,
    sj_train_labels_cluster_5,
) = (
    sj_train_labels[sj_train_labels['cluster'] == label]
    for label in range(5)
)

# Test features
(
    sj_test_cluster_1,
    sj_test_cluster_2,
    sj_test_cluster_3,
    sj_test_cluster_4,
    sj_test_cluster_5,
) = (
    sj_test_features[sj_test_features['cluster'] == label]
    for label in range(5)
)

# Display the second training cluster as the cell output.
sj_train_cluster_2

Unnamed: 0,year,weekofyear,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,...,lag_2_reanalysis_tdtr_k,lag_1_station_avg_temp_c,lag_2_station_avg_temp_c,lag_1_station_diur_temp_rng_c,lag_2_station_diur_temp_rng_c,lag_1_station_max_temp_c,lag_2_station_max_temp_c,lag_1_station_min_temp_c,lag_2_station_min_temp_c,cluster
7,1990,25,0.072500,0.072500,0.151471,0.133029,151.12,299.591429,299.528571,296.531429,...,2.100000,27.414286,28.114286,6.771429,6.942857,32.2,34.4,23.3,23.9,1
8,1990,26,0.102450,0.146175,0.125571,0.123600,19.32,299.578571,299.557143,296.378571,...,2.042857,28.371429,27.414286,7.685714,6.771429,33.9,32.2,22.8,23.3,1
15,1990,33,0.150567,0.128033,0.206957,0.168243,90.75,299.958571,299.957143,297.035714,...,2.585714,28.200000,28.242857,7.557143,8.085714,33.3,34.4,23.3,22.8,1
16,1990,34,0.190233,0.168800,0.167657,0.172286,32.40,300.332857,300.414286,296.728571,...,2.328571,28.042857,28.200000,6.685714,7.557143,32.8,33.3,22.8,23.3,1
17,1990,35,0.252900,0.330750,0.264171,0.284314,40.94,300.118571,300.221429,297.017143,...,1.857143,28.342857,28.042857,7.014286,6.685714,33.3,32.8,23.3,22.8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
910,2007,44,0.124300,0.054300,0.156814,0.123529,137.55,299.458571,299.542857,296.020000,...,3.185714,27.957143,29.100000,6.442857,7.542857,32.2,33.9,24.4,24.4,1
911,2007,45,-0.251700,-0.048600,0.205171,0.172883,15.25,300.604286,300.685714,295.838571,...,2.471429,26.200000,27.957143,5.400000,6.442857,30.6,32.2,22.2,24.4,1
913,2007,47,-0.058900,-0.062550,0.204486,0.156286,73.37,299.821429,299.885714,295.752857,...,2.257143,26.814286,27.442857,6.685714,6.857143,31.1,32.2,22.8,22.8,1
914,2007,48,-0.059500,-0.041667,0.090917,0.129086,15.95,299.090000,299.192857,293.351429,...,3.542857,26.900000,26.814286,6.200000,6.685714,31.1,31.1,22.8,22.8,1


In [11]:
# Partition the Iquitos frames on the 'cluster' column. K-means labels 0..4
# map onto the *_cluster_1 .. *_cluster_5 names, in that order.

# Training features
(
    iq_train_cluster_1,
    iq_train_cluster_2,
    iq_train_cluster_3,
    iq_train_cluster_4,
    iq_train_cluster_5,
) = (
    iq_train_features[iq_train_features['cluster'] == label]
    for label in range(5)
)

# Training labels
(
    iq_train_labels_cluster_1,
    iq_train_labels_cluster_2,
    iq_train_labels_cluster_3,
    iq_train_labels_cluster_4,
    iq_train_labels_cluster_5,
) = (
    iq_train_labels[iq_train_labels['cluster'] == label]
    for label in range(5)
)

# Test features
(
    iq_test_cluster_1,
    iq_test_cluster_2,
    iq_test_cluster_3,
    iq_test_cluster_4,
    iq_test_cluster_5,
) = (
    iq_test_features[iq_test_features['cluster'] == label]
    for label in range(5)
)

# Building regression model pipelines

In [25]:
# Per-cluster regressors for San Juan: each model is fitted on one cluster's
# training rows and predicts the test rows assigned to that same cluster.

# Cluster 1: linear regression constrained to non-negative coefficients.
# `normalize=True` is dropped because the parameter was removed from
# LinearRegression in scikit-learn 1.2; per-column rescaling should not change
# the fitted predictions of a least-squares model beyond floating-point noise.
lin = linear_model.LinearRegression(positive=True)
lin.fit(sj_train_cluster_1, sj_train_labels_cluster_1['total_cases'])
sj_pred_cluster_1 = lin.predict(sj_test_cluster_1)

# Cluster 2: decision tree. Predictions must come from `clf`, the model fitted
# on this cluster (the original called `lin.predict`, i.e. the cluster-1
# model). The tree is seeded so repeated runs give the same result.
clf = tree.DecisionTreeRegressor(random_state=rng)
clf.fit(sj_train_cluster_2, sj_train_labels_cluster_2['total_cases'])
sj_pred_cluster_2 = clf.predict(sj_test_cluster_2)

# Cluster 3: the linear model is refitted in place, so from here on `lin`
# holds the cluster-3 coefficients.
lin.fit(sj_train_cluster_3, sj_train_labels_cluster_3['total_cases'])
sj_pred_cluster_3 = lin.predict(sj_test_cluster_3)

sj_pred_cluster_1

array([18.10186263, 16.39820662, 24.26920267, 26.41314226, 28.43013381,
       30.73261475, 26.81791663, 47.51114089, 33.79546138, 39.53632891,
       38.00214048, 24.87588669, 15.31623792, 23.6600611 , 18.86442825,
       15.35455445, 11.82045689,  7.64070009, 22.89155004, 23.2021282 ,
       22.67701086, 20.46346536, 23.0207813 , 14.4627141 , 10.98702982,
       13.48381572,  6.32273163, 10.47255717, 13.94502431, 15.96530591,
       22.76389146, 27.97838788, 27.9922419 , 33.86663077, 36.36969124,
       32.22763434, 36.17084571, 26.67123378, 30.24687986, 28.54296574,
       31.9527639 , 28.12773807, 12.14575206, 19.2259686 , 28.78082211,
       37.3369146 , 31.1174451 , 20.11013484, 16.88960998, 18.28401433,
       24.74783306, 22.01098714, 28.6024444 , 20.40933595, 15.80384316,
       11.8055483 , 14.52184188, 13.43357402, 29.95419987, 22.571873  ,
       14.219098  , 23.07983778, 16.08159964, 18.24851583, 16.70670874,
       18.55896882, 14.88980263, 13.37932094, 10.00931102, 13.94