In [3]:
import tensorflow as tf
import numpy as np
import pandas as pd
import xgboost as xgb
import time
import datetime
import math
from sklearn import datasets
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.cross_validation import KFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import SelectFromModel, VarianceThreshold



In [4]:
# Read the training features and labels. In both CSV files the first three
# columns are used as the row index.
train_features, train_labels = (
    pd.read_csv(csv_name, index_col=[0, 1, 2])
    for csv_name in ('dengue_features_train.csv', 'dengue_labels_train.csv')
)

In [5]:
# Notebook-wide seed.
RANDOM_SEED = 42

# `tf.set_random_seed` only exists in the TensorFlow 1.x top-level namespace;
# TensorFlow 2.x exposes the same functionality as `tf.random.set_seed`.
# Pick whichever one the installed version provides so the cell runs on both.
if hasattr(tf, 'set_random_seed'):
    tf.set_random_seed(RANDOM_SEED)
else:
    tf.random.set_seed(RANDOM_SEED)

In [9]:
def preprocess_data(data, labels):
    """Forward-fill the features, drop the date column and split by city.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table containing a 'week_start_date' column and indexed so
        that ``.loc['sj']`` and ``.loc['iq']`` select the rows of each city.
    labels : pandas.DataFrame
        Label table indexed the same way as ``data``.

    Returns
    -------
    tuple
        ``(sjfeats, iqfeats, sjlabs, iqlabs)``: the San Juan and Iquitos
        feature DataFrames followed by the San Juan and Iquitos labels as
        NumPy arrays (one array column per label column).

    Notes
    -----
    Neither input is modified; the forward fill is performed on a copy.
    """
    # `ffill()` returns a new frame, so the caller's DataFrame keeps its NaNs
    # and this function can safely be called more than once on the same data.
    # NOTE(review): the fill runs over the whole frame before the city split,
    # so a gap in the first rows of one city can be filled from the last row
    # of the city stored directly above it -- confirm this is acceptable.
    df = data.ffill().drop(['week_start_date'], axis=1)

    # separate san juan and iquitos
    sjfeats = df.loc['sj']
    iqfeats = df.loc['iq']

    # `.values` replaces `DataFrame.as_matrix()`, which was removed in
    # pandas 1.0; both return the same NumPy array.
    sjlabs = labels.loc['sj'].values
    iqlabs = labels.loc['iq'].values

    return sjfeats, iqfeats, sjlabs, iqlabs

In [10]:
def preprocess_data_test(data):
    """Forward-fill the features, drop the date column and split by city.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table containing a 'week_start_date' column and indexed so
        that ``.loc['sj']`` and ``.loc['iq']`` select the rows of each city.

    Returns
    -------
    tuple
        ``(sjfeats, iqfeats)``: the San Juan and Iquitos feature DataFrames.

    Notes
    -----
    The input is not modified; the forward fill is performed on a copy.
    """
    # `ffill()` returns a new frame, so the caller's DataFrame keeps its NaNs
    # and this function can safely be called more than once on the same data.
    # NOTE(review): the fill runs over the whole frame before the city split,
    # so a gap in the first rows of one city can be filled from the last row
    # of the city stored directly above it -- confirm this is acceptable.
    df = data.ffill().drop(['week_start_date'], axis=1)

    # separate san juan and iquitos
    sjfeats = df.loc['sj']
    iqfeats = df.loc['iq']

    return sjfeats, iqfeats

In [11]:
# Read the competition test features (first three CSV columns form the row
# index) and split them into one feature table per city.
test_features = pd.read_csv('dengue_features_test.csv', index_col=[0, 1, 2])
sj_test, iq_test = preprocess_data_test(data=test_features)

In [12]:
# Clean the training data and split it into per-city feature tables and
# per-city label arrays.
sj_train, iq_train, sj_target, iq_target = preprocess_data(
    data=train_features,
    labels=train_labels,
)

In [13]:
# Hold out 20% of each city's rows for validation. The split is seeded with
# the notebook-wide RANDOM_SEED (instead of a second hard-coded 42) so the seed
# is defined in exactly one place.
sjx_train, sjx_test, sjy_train, sjy_test = train_test_split(
    sj_train, sj_target, test_size=0.2, random_state=RANDOM_SEED)

iqx_train, iqx_test, iqy_train, iqy_test = train_test_split(
    iq_train, iq_target, test_size=0.2, random_state=RANDOM_SEED)

In [14]:
def feature_boost(train_feats, train_labs, test_feats, comp_feats):
    """Select features with a gradient-boosting model and apply the selection.

    A ``GradientBoostingRegressor`` is fitted on the training data and handed
    to ``SelectFromModel(prefit=True)`` (no explicit threshold is passed, so
    the selector's default applies). The resulting selector is then applied
    to all three feature sets so they end up with the same columns.

    Parameters
    ----------
    train_feats : array-like
        Features used to fit the gradient-boosting model.
    train_labs : array-like
        Targets for ``train_feats``; a single-column 2-D array is accepted and
        flattened to 1-D before fitting.
    test_feats : array-like
        Second feature set to reduce with the fitted selector.
    comp_feats : array-like
        Third feature set to reduce with the fitted selector.

    Returns
    -------
    tuple
        ``(train, test, comp)``: the three inputs after ``fs.transform``.
    """
    clf = GradientBoostingRegressor(random_state=8001)

    # scikit-learn expects a 1-D target; passing a column vector makes it emit
    # a DataConversionWarning and flatten the array itself. Flattening here
    # yields the same fit without the warning.
    clf.fit(train_feats, np.ravel(train_labs))

    fs = SelectFromModel(clf, prefit=True)
    train = fs.transform(train_feats)
    test = fs.transform(test_feats)
    comp = fs.transform(comp_feats)

    return train, test, comp

In [15]:
# Fit one feature selector per city on that city's training split, then apply
# it to the training split, the hold-out split and the competition features.
sj_train_feats, sj_test_feats, sj_comp = feature_boost(
    train_feats=sjx_train, train_labs=sjy_train,
    test_feats=sjx_test, comp_feats=sj_test)
iq_train_feats, iq_test_feats, iq_comp = feature_boost(
    train_feats=iqx_train, train_labs=iqy_train,
    test_feats=iqx_test, comp_feats=iq_test)

  y = column_or_1d(y, warn=True)


In [16]:
def xgb_model(train_feats, train_labs, test_feats):
    """Cross-validate an XGBoost regressor, then fit it on all training rows.

    The data is split into 10 consecutive folds. Each fold is used once for
    validation while the remaining 9 folds form the training set, and the
    mean absolute error of every validation fold is printed. Afterwards a
    fresh regressor with the same hyper-parameters is fitted on the complete
    training set and returned, so the returned model does not depend on which
    fold happened to be processed last.

    Parameters
    ----------
    train_feats : numpy.ndarray
        Training features; must support indexing with integer index arrays.
    train_labs : numpy.ndarray
        Training targets aligned with ``train_feats``.
    test_feats : array-like
        Not used by this function; kept so existing three-argument calls
        continue to work.

    Returns
    -------
    xgboost.XGBRegressor
        Regressor fitted on all of ``train_feats`` / ``train_labs``.
    """
    # `sklearn.cross_validation` was removed in scikit-learn 0.20. Its
    # replacement lives in `sklearn.model_selection`, which this notebook
    # already relies on for `train_test_split`. Imported locally so this
    # function does not depend on the legacy module-level `KFold` name.
    from sklearn.model_selection import KFold

    params = dict(
        n_estimators=560,         # number of boosted trees
        learning_rate=0.0202047,  # step size shrinkage used in update to prevent overfitting
        max_depth=5,              # maximum depth of a tree
        subsample=0.6815,         # subsample ratio of the training set (stochastic gradient boosting)
        colsample_bytree=0.701,   # subsample ratio of the features per tree
    )

    # Split dataset into k = 10 consecutive folds (no shuffling, as before).
    kf = KFold(n_splits=10)

    for train_index, val_index in kf.split(train_feats):
        fold_model = xgb.XGBRegressor(**params)
        fold_model.fit(train_feats[train_index], train_labs[train_index])
        fold_pred = fold_model.predict(train_feats[val_index])

        print(metrics.mean_absolute_error(train_labs[val_index], fold_pred))

    # Final model: trained on every available row rather than on the 9/10
    # subset used for the last validation fold.
    final_model = xgb.XGBRegressor(**params)
    final_model.fit(train_feats, train_labs)

    return final_model

In [17]:
# Run the 10-fold cross-validation for San Juan (one MAE is printed per fold)
# and keep the regressor that xgb_model returns.
sj_xgbr = xgb_model(
    train_feats=sj_train_feats,
    train_labs=sjy_train,
    test_feats=sj_test_feats,
)

22.7433259201
26.0321743393
25.0134624577
26.8701399485
21.2800697072
27.7215311305
20.7993202909
20.8004616737
20.2175810015
20.1293031581


In [18]:
# Run the 10-fold cross-validation for Iquitos (one MAE is printed per fold)
# and keep the regressor that xgb_model returns.
iq_xgbr = xgb_model(
    train_feats=iq_train_feats,
    train_labs=iqy_train,
    test_feats=iq_test_feats,
)

8.18418627977
5.49578926961
9.36237240121
6.71961743846
5.35679241305
6.6936224699
8.33327088123
5.47649719221
5.36567196759
4.95160611228


In [112]:
# Predict for the San Juan competition rows; sj_comp is the competition
# feature set after the San Juan feature selection.
sj_pred = sj_xgbr.predict(sj_comp)

In [113]:
# Predict for the Iquitos competition rows; iq_comp is the competition
# feature set after the Iquitos feature selection.
iq_pred = iq_xgbr.predict(iq_comp)

In [115]:
# Convert the predictions to whole case counts. int() truncates toward zero as
# before; values are additionally floored at 0 because the regressor's output
# is unconstrained and a negative number is not a valid count.
sj_pred = [max(0, int(p)) for p in sj_pred]
iq_pred = [max(0, int(p)) for p in iq_pred]

In [116]:
# Load the submission template (first three CSV columns form the row index).
submission = pd.read_csv("submission_format.csv",
                         index_col=[0, 1, 2])

# Use bracket assignment: attribute-style assignment (`submission.total_cases`)
# only updates a column that already exists and would otherwise silently set a
# plain Python attribute, leaving the written CSV without predictions.
# NOTE(review): this assumes the template lists all San Juan rows first,
# followed by all Iquitos rows -- confirm against submission_format.csv.
submission['total_cases'] = np.concatenate([sj_pred, iq_pred])
submission.to_csv("submission_MLP.csv")