In [1]:
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")

# Integer seed passed as `random_state` to the estimators built below.
rng = 0


In [2]:
# Read the three input tables: training features, training labels and test features.
train_features, train_labels, test_features = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/train_features_modified.csv',
        './data/dengue_labels_train.csv',
        './data/test_features_modified.csv',
    )
)


In [3]:
# Split train_features, test_features and train_labels by city.
# Every frame is built as an independent object (`.drop(columns=...)` and
# `.copy()` both return new frames) instead of dropping columns in place on a
# filtered slice, so columns added to these frames later are written to a real
# frame and never to a view of the combined data.

# `city` and `week_start_date` are string columns, so they are removed from
# the feature frames before modelling.
string_columns = ['city', 'week_start_date']

# Separate data for San Juan
sj_train_features = train_features[train_features['city'] == 'sj'].drop(
    columns=string_columns)
sj_train_labels = train_labels[train_labels['city'] == 'sj'].copy()
sj_test_features = test_features[test_features['city'] == 'sj'].drop(
    columns=string_columns)

# Separate data for Iquitos
iq_train_features = train_features[train_features['city'] == 'iq'].drop(
    columns=string_columns)
iq_train_labels = train_labels[train_labels['city'] == 'iq'].copy()
iq_test_features = test_features[test_features['city'] == 'iq'].drop(
    columns=string_columns)


## Clustering


In [4]:
# Fit a 5-cluster KMeans on the San Juan training features, then assign a
# cluster id to every training row and every test row.
kmeans_sj = KMeans(n_clusters=5, random_state=rng)
kmeans_sj.fit(sj_train_features)
clusters_sj = kmeans_sj.predict(sj_train_features)
clusters_sj_test = kmeans_sj.predict(sj_test_features)


In [5]:
# Fit a 5-cluster KMeans on the Iquitos training features, then assign a
# cluster id to every training row and every test row.
kmeans_iq = KMeans(n_clusters=5, random_state=rng)
kmeans_iq.fit(iq_train_features)
clusters_iq = kmeans_iq.predict(iq_train_features)
clusters_iq_test = kmeans_iq.predict(iq_test_features)


In [6]:
# Attach the cluster id of every row as a `cluster` column so that features
# and labels can later be filtered cluster by cluster.
sj_train_features = sj_train_features.assign(cluster=clusters_sj)
sj_train_labels = sj_train_labels.assign(cluster=clusters_sj)
sj_test_features = sj_test_features.assign(cluster=clusters_sj_test)

iq_train_features = iq_train_features.assign(cluster=clusters_iq)
iq_train_labels = iq_train_labels.assign(cluster=clusters_iq)
iq_test_features = iq_test_features.assign(cluster=clusters_iq_test)


## Building regression models


In [7]:
class AverageRegressor(BaseEstimator):
    """Ensemble whose prediction is the unweighted mean of its members.

    Parameters
    ----------
    regressors : list
        Regressor objects exposing ``fit(X, y)``, ``predict(X)`` and
        ``score(X, y)``. They are fitted in place by :meth:`fit`.
    """

    def __init__(self, regressors):
        self.regressors = regressors

    def fit(self, X, y):
        """Fit every member regressor on the same ``X`` and ``y``.

        Returns
        -------
        self
            The fitted ensemble, so calls can be chained.

        Raises
        ------
        ValueError
            If the ensemble has no member regressors.
        """
        if not self.regressors:
            raise ValueError("AverageRegressor needs at least one regressor.")
        for regressor in self.regressors:
            regressor.fit(X, y)
        return self

    def predict(self, X):
        """Return, for each row of ``X``, the mean of the members' predictions."""
        # After the transpose, each row holds one sample and each column one member.
        predictions = np.array([regressor.predict(X)
                               for regressor in self.regressors]).T
        return np.mean(predictions, axis=1)

    def score(self, X, y, sample_weight=None):
        """Return the mean of the members' own ``score(X, y)`` values.

        ``y`` is required: a member cannot score itself without targets, and
        ``Pipeline.score`` / ``GridSearchCV`` call ``score(X, y)``.
        Note that this is the average of the individual scores, not the score
        of the averaged prediction.
        """
        return np.mean([regressor.score(X, y, sample_weight=sample_weight)
                        for regressor in self.regressors])


In [8]:
def get_regressors(X, y):
    """Return fresh, unfitted base regressors for one ensemble.

    The list holds, in order, a gradient boosting, a random forest and an
    AdaBoost regressor, each seeded with the notebook-level ``rng``.
    ``X`` and ``y`` are accepted but not used.
    """
    regressor_classes = (
        GradientBoostingRegressor,
        RandomForestRegressor,
        AdaBoostRegressor,
    )
    return [regressor_class(random_state=rng)
            for regressor_class in regressor_classes]


In [9]:
def get_regression_models(train_features, train_labels):
    """Train one tuned pipeline per cluster.

    Parameters
    ----------
    train_features : pandas.DataFrame
        Feature rows; must contain an integer ``cluster`` column.
    train_labels : pandas.DataFrame
        Label rows with a ``cluster`` column and a ``total_cases`` column.

    Returns
    -------
    list
        Position ``i`` holds the fitted pipeline for cluster id ``i``, so the
        list can be indexed directly with a cluster id. A position stays
        ``None`` when no training row belongs to that cluster id.
    """
    # `unique()` yields ids in order of appearance; sort them and store each
    # model at the position equal to its id, because callers look models up
    # by cluster id rather than by order of appearance.
    cluster_ids = np.sort(train_features['cluster'].unique()).astype(int)
    if len(cluster_ids) == 0:
        return []
    best_regressors = [None] * (int(cluster_ids.max()) + 1)

    for cluster_id in cluster_ids:
        train_features_cluster = train_features[train_features['cluster'] == cluster_id]
        train_labels_cluster = train_labels[train_labels['cluster'] == cluster_id]

        regressors = get_regressors(
            train_features_cluster, train_labels_cluster)
        avg_reg = AverageRegressor(regressors)

        # build pipeline
        pipeline = Pipeline([
            ('scalar', StandardScaler(with_mean=False)),
            # The target is a count, so rank features with the regression
            # F-test instead of SelectKBest's classification default.
            ('selectkbest', SelectKBest(score_func=f_regression)),
            ('avg_regressor', avg_reg)
        ])

        # hyperparameter tuning
        # Only keep k values the data can satisfy; fall back to every feature
        # when even the smallest candidate exceeds the number of columns.
        n_features = train_features_cluster.shape[1]
        k_candidates = [int(k) for k in np.arange(20, 80, 10)
                        if k <= n_features] or ['all']
        hyperparameters = {
            'selectkbest__k': k_candidates,
            # tuning of the averaged regressors is skipped because it
            # requires too much compute power to run
        }

        # Score candidates by the R^2 of the pipeline's own predictions, so
        # the search does not depend on the final estimator's `score` method.
        estimator = GridSearchCV(
            pipeline, hyperparameters, cv=5, scoring='r2')
        estimator.fit(train_features_cluster,
                      train_labels_cluster['total_cases'])

        best_regressors[cluster_id] = estimator.best_estimator_
    return best_regressors


In [10]:
# Train the per-cluster models for each city.
sj_regression_models = get_regression_models(
    train_features=sj_train_features, train_labels=sj_train_labels)
iq_regression_models = get_regression_models(
    train_features=iq_train_features, train_labels=iq_train_labels)


## Predictions


In [11]:
# Predict the San Juan test rows one at a time: each row is routed to the
# model stored at the position given by its cluster id, and the single
# prediction is collected in row order.
sj_predictions = []
for _, test_row in sj_test_features.iterrows():
    model_position = test_row['cluster'].astype(int)
    sample = test_row.values.reshape(1, -1)
    sj_predictions.append(
        sj_regression_models[model_position].predict(sample)[0])


In [12]:
# Predict the Iquitos test rows one at a time: each row is routed to the
# model stored at the position given by its cluster id, and the single
# prediction is collected in row order.
iq_predictions = []
for _, test_row in iq_test_features.iterrows():
    model_position = test_row['cluster'].astype(int)
    sample = test_row.values.reshape(1, -1)
    iq_predictions.append(
        iq_regression_models[model_position].predict(sample)[0])


In [13]:
# Stack the San Juan predictions ahead of the Iquitos ones.
# `np.concatenate` joins the two sequences end to end whether they are lists
# or arrays (`+` would add arrays elementwise).
predictions = np.concatenate([sj_predictions, iq_predictions])
# Case counts are integers: round to the nearest whole number. A direct cast
# to int would truncate toward zero and bias every prediction downward.
predictions = np.rint(predictions).astype(int)


In [14]:
# Fill the submission template with the predictions and save it.
# The first three CSV columns are used as the index.
submission = pd.read_csv("./data/submission_format.csv", index_col=[0, 1, 2])
# Bracket assignment always targets a column; attribute assignment would only
# set a plain Python attribute if `total_cases` were missing from the template.
submission['total_cases'] = predictions

# Create the output directory first so a fresh checkout does not fail at the
# very last step of the run.
output_path = Path("./output/approach_4.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(output_path)
