In [1]:
import sys
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from xgboost.sklearn import XGBClassifier

In [2]:
def dcg_score(y_true, y_score, k=10, gains="exponential"):
    """Compute the discounted cumulative gain (DCG) of a ranking, cut at k.

    Items are ranked by decreasing ``y_score``; the relevance of the first
    ``k`` items is turned into a gain and divided by ``log2(rank + 1)``,
    with ranks starting at 1.

    Parameters
    ----------
    y_true : array-like, shape = [n_samples]
        Ground truth (true relevance labels).
    y_score : array-like, shape = [n_samples]
        Predicted scores used to order the items.
    k : int
        Number of top-ranked items taken into account.
    gains : str
        "exponential" (default) uses ``2 ** relevance - 1`` as the gain,
        "linear" uses the relevance itself.

    Returns
    -------
    DCG @k : float

    Raises
    ------
    ValueError
        If ``gains`` is neither "exponential" nor "linear".

    Reference
    ---------
    http://en.wikipedia.org/wiki/Discounted_cumulative_gain
    """
    # Indices of the k best-scored items, best first.
    top_k = np.argsort(y_score)[::-1][:k]
    relevance = np.take(y_true, top_k)

    if gains == "exponential":
        numerators = 2 ** relevance - 1
    elif gains == "linear":
        numerators = relevance
    else:
        raise ValueError("Invalid gains option.")

    # Positions are 0-based while ranks are 1-based, and the discount for
    # rank r is log2(r + 1) -- hence the offset of 2.
    positions = np.arange(len(relevance))
    return np.sum(numerators / np.log2(positions + 2))


def ndcg_score(y_true, y_score, k=10, gains="exponential"):
    """Normalized discounted cumulative gain (NDCG) at rank k

    The DCG obtained by ranking with ``y_score`` is divided by the DCG of
    the ideal ranking (ranking the items by ``y_true`` itself).

    Parameters
    ----------
    y_true : array-like, shape = [n_samples]
        Ground truth (true relevance labels).
    y_score : array-like, shape = [n_samples]
        Predicted scores. Forwarded unchanged to ``dcg_score``, so it must
        have one score per entry of ``y_true``.
    k : int
        Rank.
    gains : str
        Whether gains should be "exponential" (default) or "linear".

    Returns
    -------
    NDCG @k : float
        ``0.0`` when the ideal DCG is zero (e.g. every relevance is 0),
        instead of the ``nan`` a plain 0 / 0 division would produce.

    Raises
    ------
    ValueError
        Propagated from ``dcg_score`` when ``gains`` is not a valid option.

    Reference
    ---------
    http://en.wikipedia.org/wiki/Discounted_cumulative_gain
    """
    # Ideal ordering: rank the items by their true relevance.
    best = dcg_score(y_true, y_true, k, gains)
    if best == 0:
        # Nothing relevant in the top k of the ideal ranking: the ratio is
        # undefined, and a nan here would poison any mean taken over samples.
        return 0.0
    actual = dcg_score(y_true, y_score, k, gains)
    return actual / best

In [45]:
def generate_submission(y_pred, test_users_ids, label_encoder, top_k=5):
    """Build a long-format submission with the best-scored countries per user.

    Parameters
    ----------
    y_pred : array-like, shape = [n_users, n_classes]
        Per-class scores for every user, one row per entry of
        ``test_users_ids`` and in the same order.
    test_users_ids : array-like, shape = [n_users]
        User identifiers. A pandas Series is read positionally, so its index
        does not have to be 0..n-1.
    label_encoder : object
        Anything exposing ``inverse_transform(codes)`` that maps the column
        indices of ``y_pred`` back to country labels.
    top_k : int
        Maximum number of countries emitted per user (default 5).

    Returns
    -------
    sub : pandas.DataFrame
        Columns ``id`` and ``country``; for each user up to ``top_k`` rows,
        ordered from highest to lowest score.

    Raises
    ------
    ValueError
        If ``y_pred`` and ``test_users_ids`` do not have the same length.
    """
    user_ids = np.asarray(test_users_ids)
    if len(y_pred) != len(user_ids):
        # Pairing predictions with ids of a different data set would yield a
        # well-formed but meaningless submission, so fail loudly instead.
        raise ValueError(
            "y_pred has %d rows but %d user ids were given."
            % (len(y_pred), len(user_ids))
        )

    ids = []
    countries = []
    for user_id, scores in zip(user_ids, y_pred):
        # Class indices ordered by decreasing score, truncated before the
        # (comparatively expensive) label lookup.
        best = np.argsort(scores)[::-1][:top_k]
        labels = np.asarray(label_encoder.inverse_transform(best)).tolist()
        # Repeat the id once per emitted label so both columns stay aligned
        # even when there are fewer than top_k classes.
        ids.extend([user_id] * len(labels))
        countries.extend(labels)

    # Build from separate columns so ids and labels keep their own dtypes.
    return pd.DataFrame({'id': ids, 'country': countries}, columns=['id', 'country'])



# ---- Load the raw user tables ----
path = '../datasets/raw/'
train_users = pd.read_csv(path + 'train_users.csv')
test_users = pd.read_csv(path + 'test_users.csv')

# ---- Training data: pull out the target, then strip target and id ----
y_train = train_users['country_destination']
train_users = train_users.drop(['country_destination', 'id'], axis=1)
# NOTE(review): these are the frame's raw values. The traceback recorded
# under the grid-search cell ("could not convert string to float: -unknown-")
# suggests they still contain strings -- confirm whether a feature-encoding
# step is missing.
x_train = train_users.values

# ---- Test data: keep the ids for the submission, the rest are features ----
test_users_ids = test_users['id']
test_users = test_users.drop('id', axis=1)
x_test = test_users.values

# ---- Encode the destination labels as integer class codes ----
label_encoder = LabelEncoder()
encoded_y_train = label_encoder.fit_transform(y_train)

# ---- Base classifier; all hyper-parameters spelled out explicitly ----
xgb = XGBClassifier(
    # tree shape and boosting schedule
    max_depth=8,
    learning_rate=0.2,
    n_estimators=45,
    # split / leaf constraints
    gamma=0,
    min_child_weight=1,
    max_delta_step=0,
    # row and column sub-sampling
    subsample=0.6,
    colsample_bytree=0.6,
    colsample_bylevel=1,
    # regularisation
    reg_alpha=0,
    reg_lambda=1,
    # misc
    scale_pos_weight=1,
    base_score=0.5,
    seed=42,
)

In [3]:
# Train on the full training set using the integer-encoded destinations.
xgb.fit(x_train, encoded_y_train)

# Score the *test* users: the submission pairs each row of y_pred with the
# id at the same position in test_users_ids, so the probabilities must come
# from x_test (predicting on x_train would attach training-row predictions
# to test ids).
y_pred = xgb.predict_proba(x_test)

submission = generate_submission(y_pred, test_users_ids, label_encoder)

In [46]:
import xgboost
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    # Older scikit-learn releases only ship the legacy module.
    from sklearn.grid_search import GridSearchCV


def mean_ndcg_scorer(estimator, X, y, k=5):
    """Mean per-sample NDCG@k of ``estimator.predict_proba(X)`` against ``y``.

    Uses the ``scorer(estimator, X, y)`` signature that ``GridSearchCV``
    expects from a callable ``scoring`` argument. Each sample has a single
    relevant class (its true label); ``k`` defaults to 5 to match the five
    countries per user emitted by ``generate_submission``.

    Parameters
    ----------
    estimator : fitted classifier
        Must expose ``predict_proba`` and ``classes_``.
    X : array-like, shape = [n_samples, n_features]
    y : array-like, shape = [n_samples]
        True labels, expressed in the same values as ``estimator.classes_``.
    k : int
        Rank cut-off.

    Returns
    -------
    score : float
        Higher is better.
    """
    proba = estimator.predict_proba(X)
    classes = np.asarray(estimator.classes_)
    # One-hot relevance matrix: 1 in the column of the true class.
    relevance = (classes[np.newaxis, :] == np.asarray(y)[:, np.newaxis]).astype(int)
    per_sample = [
        # A label absent from the training fold has no column to rank, so
        # that sample simply scores 0 instead of dividing by a zero ideal DCG.
        ndcg_score(rel, scores, k=k) if rel.any() else 0.0
        for rel, scores in zip(relevance, proba)
    ]
    return float(np.mean(per_sample))


# NOTE(review): the recorded run of this cell failed with
# "could not convert string to float: -unknown-", which suggests x_train
# still holds raw string values -- confirm the features are encoded first.
clf = GridSearchCV(
    xgb,
    {
        'max_depth': [1, 2],
        'n_estimators': [5, 6],
        'learning_rate': [0.6, 0.9],
    },
    cv=3,
    verbose=5,
    n_jobs=1,
    scoring=mean_ndcg_scorer,
)

# Fit on the integer-encoded labels, as the other cells that call the model do.
clf.fit(x_train, encoded_y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] n_estimators=5, learning_rate=0.6, max_depth=1 ..................


ValueError: could not convert string to float: -unknown-

In [43]:
A = make_scorer(ndcg, needs_proba=True)

%timeit A(xgb, x_train, encoded_y_train)

100 loops, best of 3: 5.44 ms per loop
