In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import xgboost as xgb
import time
import datetime
import math
from sklearn import datasets
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.cross_validation import KFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import SelectFromModel, VarianceThreshold



In [2]:
train_features = pd.read_csv('dengue_features_train.csv',
                             index_col=[0,1,2])
train_labels = pd.read_csv('dengue_labels_train.csv',
                          index_col=[0,1,2])

In [3]:
RANDOM_SEED = 42
tf.set_random_seed(RANDOM_SEED)

In [29]:
def preprocess_data(data, labels):
    
    df = data
    
    # fill missing values
    df.fillna(method='ffill', inplace=True)
    
    df = df.drop(['week_start_date'],axis=1)

    dfl = labels
    
    
    # separate san juan and iquitos
    sjfeats = df.loc['sj']
    iqfeats = df.loc['iq']
    
    sjlabs = dfl.loc['sj']
    iqlabs = dfl.loc['iq']

    
    sjlabs = sjlabs.as_matrix()
    iqlabs = iqlabs.as_matrix()
    
    return sjfeats, iqfeats, sjlabs, iqlabs

In [35]:
def preprocess_data_test(data):
    
    df = data
    
    # fill missing values
    df.fillna(method='ffill', inplace=True)
    
    df = df.drop(['week_start_date'],axis=1)
  

    # separate san juan and iquitos
    sjfeats = df.loc['sj']
    iqfeats = df.loc['iq']
    
    
    return sjfeats, iqfeats

In [36]:
test_features = pd.read_csv('dengue_features_test.csv',
                             index_col=[0,1,2])

sj_test, iq_test = preprocess_data_test(test_features)

In [37]:
sj_train, iq_train, sj_target, iq_target = preprocess_data(train_features, train_labels)

In [13]:
#sjx_train, sjx_test, sjy_train, sjy_test = train_test_split(sj_train, sj_target, 
#                                                                    test_size=0.2, random_state=42)

#iqx_train, iqx_test, iqy_train, iqy_test = train_test_split(iq_train, iq_target, 
#                                                                    test_size=0.2, random_state=42)

In [38]:
def feature_boost(train_feats, train_labs, comp_feats):    
    clf = GradientBoostingRegressor(random_state = 8001)

    selector = clf.fit(train_feats, train_labs)
    importances = selector.feature_importances_
    fs = SelectFromModel(selector, prefit=True)
    train = fs.transform(train_feats)
    test = fs.transform(comp_feats)
    
    return train, test

In [39]:
sj_train_feats, sj_test_feats = feature_boost(sj_train, sj_target, sj_test)
iq_train_feats, iq_test_feats = feature_boost(iq_train, iq_target, iq_test)

  y = column_or_1d(y, warn=True)


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [27]:
def xgb_model(train_feats, train_labs):

# Create an empty array for prediction
    predictedResult = np.zeros(train_feats.shape[0])

# Split dataset into k = 10 consecutive folds
# Each fold is used once as a validation while the k - 1 remaining folds form the training set
    kf = KFold(train_feats.shape[0], n_folds=5)

    testPred = []

    for trainIndex, testIndex in kf:
        trainFold, testFold = train_feats[trainIndex], train_feats[testIndex]
        trainFoldTarget, testFoldTarget = train_labs[trainIndex], train_labs[testIndex]
    
        xgbr = xgb.XGBRegressor(n_estimators = 100, # number of boosted trees
                             learning_rate = 0.1, # step size shrinkage used in update to prevent overfitting
                             max_depth = 9, # maximum depth of a tree
                             min_child_weight = 5,   
                             subsample = 0.8, # subsample ratio of the training set (Stochastic gradient boosting)
                             colsample_bytree = 0.8) # subsample features
    
        xgbr.fit(trainFold, trainFoldTarget)
        xgbpred =xgbr.predict(testFold)
        #testPred.append(xgbr.predict(test_feats))
        predictedResult[testIndex] = xgbpred
    
    
        print(metrics.mean_absolute_error(testFoldTarget, xgbpred))
    
    return xgbr

In [28]:
sj_xgbr = xgb_model(sj_train_feats, sj_target)

26.0248900956
42.1109017091
31.3446045693
23.8472454229
27.1324818695


In [20]:
iq_xgbr = xgb_model(iq_train_feats, iq_target)

6.3059001267
6.70005340186
8.7585333729
6.02006647449
10.2521702464
6.91526018656
6.02462487037
7.47143287899
10.7773930144
7.63899595692


In [14]:
sj_pred = sj_xgbr.predict(sj_test_feats)

In [15]:
iq_pred = iq_xgbr.predict(iq_test_feats)

In [16]:
sj_pred = list(map(int, sj_pred))
iq_pred = list(map(int, iq_pred))

In [17]:
submission = pd.read_csv("submission_format.csv",
                         index_col=[0, 1, 2])

submission.total_cases = np.concatenate([sj_pred, iq_pred])
submission.to_csv("submission_MLP.csv")