# Gradient Boosting Model

In [5]:
import datetime
import os

import numpy as np
import pandas as pd
from sklearn.cross_validation import cross_val_score
from sklearn.preprocessing import LabelEncoder
from xgboost.sklearn import XGBClassifier

from utils.metrics import ndcg_scorer

In [6]:
def generate_submission(y_pred, test_users_ids, label_encoder, top_n=5):
    """Create a valid submission file given the predictions.

    For every user, the classes are ranked by descending score and the
    ``top_n`` best ones are emitted, one row per (user, country) pair.

    Parameters
    ----------
    y_pred : 2-D array-like
        One row of per-class scores for each user, in the same order as
        ``test_users_ids``.
    test_users_ids : sequence or pandas.Series
        User identifiers. They are read positionally, so a Series with a
        non-default index is handled correctly.
    label_encoder : object with ``inverse_transform``
        Maps class indices back to country labels.
    top_n : int, optional
        Maximum number of countries emitted per user (default 5). When fewer
        classes are available, all of them are emitted.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``'id'`` and ``'country'``.

    Raises
    ------
    IndexError
        If ``y_pred`` has fewer rows than there are user ids.
    """
    # Iterating over the values gives positional access; indexing a Series
    # with ``series[i]`` is label-based and breaks on non-default indexes.
    user_ids = list(test_users_ids)

    ids = []
    cts = []
    for i, user_id in enumerate(user_ids):
        # Highest score first; slice before decoding so only the rows that
        # are actually kept go through ``inverse_transform``.
        ranked = np.argsort(y_pred[i])[::-1][:top_n]
        countries = label_encoder.inverse_transform(ranked).tolist()

        # Repeat the id once per emitted country so both lists stay aligned
        # even when fewer than ``top_n`` classes exist.
        ids += [user_id] * len(countries)
        cts += countries

    sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
    return sub

In [7]:
# Read the encoded train and test user tables from the processed-data folder.
# Only the first 10000 rows of each file are loaded.
path = '../data/processed/'
train_users, test_users = [
    pd.read_csv(path + csv_name, nrows=10000)
    for csv_name in ('encoded_train_users.csv', 'encoded_test_users.csv')
]

In [8]:
# Split the training frame into target and features. `drop` is used without
# `inplace=True` so `train_users` / `test_users` are left untouched and this
# cell can be re-run without a KeyError on already-removed columns.
y_train = train_users['country_destination']
train_features = train_users.drop(['country_destination', 'id'], axis=1)

# Missing values are replaced by -1 before converting to a plain array.
x_train = train_features.fillna(-1).values

# Keep the test ids aside for the submission, then build the test matrix the
# same way as the training matrix.
test_users_ids = test_users['id']
test_features = test_users.drop('id', axis=1)

x_test = test_features.fillna(-1).values

# Encode the destination labels as integers; the fitted encoder is passed to
# `generate_submission` to map predicted class indices back to countries.
label_encoder = LabelEncoder()
encoded_y_train = label_encoder.fit_transform(y_train)

In [11]:
# Hyper-parameters of the gradient boosting classifier, collected in a single
# mapping and unpacked into the constructor.
xgb_params = {
    'max_depth': 10,
    'learning_rate': 0.18,
    'n_estimators': 30,
    'gamma': 0,
    'min_child_weight': 1,
    'max_delta_step': 0,
    'subsample': 1,
    'colsample_bytree': 1,
    'colsample_bylevel': 1,
    'reg_alpha': 0,
    'reg_lambda': 1,
    'scale_pos_weight': 1,
    'base_score': 0.5,
    'missing': None,
    'silent': True,
    'nthread': -1,
    'seed': 42,
}
clf = XGBClassifier(**xgb_params)

# Fit on the training matrix, then score every test row with `predict_proba`;
# `y_pred` is what `generate_submission` ranks per user.
clf.fit(x_train, encoded_y_train)
y_pred = clf.predict_proba(x_test)

In [12]:
# 5-fold cross-validation of the classifier on the training data, scored with
# the project's `ndcg_scorer`; `verbose=10` logs each fold as it finishes.
ndcg = cross_val_score(clf, x_train, encoded_y_train,
                       verbose=10, cv=5, scoring=ndcg_scorer)

# `%s` formatting prints exactly what the Python 2 print statement printed,
# while also being valid syntax on Python 3. The one-element tuples keep the
# parameter dict from being interpreted as a mapping of format arguments.
print('Parameters: %s' % (clf.get_params(),))
print('Score: %s' % (ndcg.mean(),))

[CV] no parameters to be set .........................................
[CV] ................ no parameters to be set, score=0.674606 -  24.4s
[CV] no parameters to be set .........................................
[CV] ................ no parameters to be set, score=0.473992 -  24.4s
[CV] no parameters to be set .........................................
[CV] ................ no parameters to be set, score=0.680919 -  27.8s
[CV] no parameters to be set .........................................
[CV] ................ no parameters to be set, score=0.635626 -  25.9s

[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:   24.4s
[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed:  1.7min



[CV] no parameters to be set .........................................
[CV] ................ no parameters to be set, score=0.713322 -  27.1s
Parameters: {'reg_alpha': 0, 'colsample_bytree': 1, 'silent': True, 'colsample_bylevel': 1, 'scale_pos_weight': 1, 'learning_rate': 0.18, 'missing': None, 'max_delta_step': 0, 'nthread': -1, 'base_score': 0.5, 'n_estimators': 30, 'subsample': 1, 'reg_lambda': 1, 'seed': 42, 'min_child_weight': 1, 'objective': 'multi:softprob', 'max_depth': 10, 'gamma': 0}
Score: 0.635692883924


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.2min finished


In [None]:
submission = generate_submission(y_pred, test_users_ids, label_encoder)

# `__file__` only exists when this code runs as a script; inside a notebook
# kernel it is undefined and raised NameError, so fall back to a fixed name.
try:
    source_name = __file__
except NameError:
    source_name = 'gradient_boosting'

# Keep only the file stem: `split('.')[0]` returned an empty or wrong prefix
# for paths such as './model.py' or '../model.py', and a directory part would
# otherwise leak into the output file name.
base_name = os.path.splitext(os.path.basename(source_name))[0]

# The timestamp makes every submission file name unique per run.
date = datetime.datetime.now().strftime("%m-%d-%H:%M:%S")
name = base_name + '_' + str(date) + '.csv'
submission.to_csv('../data/submissions/' + name, index=False)