In [12]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

import seaborn as sns
sns.set(style="white", color_codes=True)

import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel, VarianceThreshold
from sklearn.cross_validation import KFold
from sklearn import metrics
import xgboost as xgb

In [13]:
total_labels = pd.read_csv('./input/dengue_labels_train.csv')
total_features = pd.read_csv('./input/dengue_features_train.csv')
total_test_features = pd.read_csv('./input/dengue_features_test.csv')
target = total_labels.drop(total_labels.columns[[0, 1, 2]], axis=1) 

In [14]:
train = total_features.drop(total_features.columns[[1, 2, 3]], axis=1) 
total_test_features = total_test_features.drop(total_test_features.columns[[1, 2, 3]], axis=1)

In [15]:
#differentiates the two cities with 0 or 1. XGBoost is decision tree based so this should not decrease accuracy
mymap = {'sj':0, 'iq':1}
train = train.applymap(lambda s: mymap.get(s) if s in mymap else s)
total_test_features = total_test_features.applymap(lambda s: mymap.get(s) if s in mymap else s)

In [16]:
#frontfill the missing data as the features are in chronological order
train = train.fillna(method='ffill')
test = total_test_features.fillna(method='ffill')

In [17]:
#removes unnecessary columns, though none are present in this case
remove = []
for col in train.columns:
    if train[col].std() == 0:
        remove.append(col)

train.drop(remove, axis=1, inplace=True)
test.drop(remove, axis=1, inplace=True)

print(train.shape, test.shape)

(1456, 21) (416, 21)


The next lines of code before the model training reduce the training and testing features to the most important features

In [18]:
Cols = train.columns.values.tolist()
clf = GradientBoostingRegressor(random_state = 8001)

In [19]:
selector = clf.fit(train, target)

In [20]:
importances = selector.feature_importances_
fs = SelectFromModel(selector, prefit=True)
train = fs.transform(train)
test = fs.transform(test)

In [21]:
selectedCols = train.shape[1]
sortedCols = [col for importance, col  in sorted(zip(importances, Cols))]
sortedCols = sortedCols[0:selectedCols]
train = pd.DataFrame(train)
test = pd.DataFrame(test)
train.columns = sortedCols
test.columns = sortedCols

print(sortedCols[0:5])

['city', 'reanalysis_avg_temp_k', 'station_max_temp_c', 'reanalysis_max_air_temp_k', 'station_min_temp_c']


In [25]:
Cols = train.columns.values.tolist()
clf = GradientBoostingRegressor(random_state=1729)
selector = clf.fit(train, target)

importances = selector.feature_importances_
fs = SelectFromModel(selector, prefit=True)
train = fs.transform(train)
test = fs.transform(test)

selectedCols = train.shape[1]
sortedCols = [col for importance, col  in sorted(zip(importances, Cols))]
sortedCols = sortedCols[0:selectedCols]

In [26]:
target = np.array(target)

In [27]:
predictedResult = np.zeros(train.shape[0])

kf = KFold(train.shape[0], n_folds=10)

testPred = []

for trainIndex, testIndex in kf:
    trainFold, testFold = train[trainIndex], train[testIndex]
    trainFoldTarget, testFoldTarget = target[trainIndex], target[testIndex]
    
    xgbc = xgb.XGBRegressor(n_estimators = 125, # number of boosted trees
                             learning_rate = 0.01, # step size shrinkage used in update to prevent overfitting
                             max_depth = 8, # maximum depth of a tree
                             subsample = 0.7, # subsample ratio of the training set (Stochastic gradient boosting)
                             colsample_bytree = 0.75) # subsample features
    
    xgbc.fit(trainFold, trainFoldTarget)
    
    xgbpred =xgbc.predict(testFold)
    testPred.append(xgbc.predict(test))
    predictedResult[testIndex] = xgbpred
    
    # Print the AUC
    print(metrics.mean_absolute_error(testFoldTarget, xgbpred))

28.69583868
47.235344374
29.7537791827
17.714219949
15.7721701713
17.478118436
15.233830276
6.30928999473
5.47018019578
8.05103472841


In [49]:
xgbpred = [int(round(i)) for i in xgbc.predict(test)]

In [50]:
submission = pd.read_csv('./input/submission_format.csv',
                         index_col=[0, 1, 2])

submission.total_cases = xgbpred
submission.to_csv('./input/xgb_submission.csv')