In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `src.*` helpers imported below) are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

import cudf
import pandas as pd
import numpy as np

from src.ranker import LGBModel
from src.utils import period_extraction, get_data_period
from src.metrics import ndcg_score
from src.evaluation import get_pred_items, get_ndcg_score

In [2]:
# Ranker wrapper imported from src.ranker; it is fitted in the training cell.
model = LGBModel()

# Hyper-parameters handed to `model.train` as its first argument.
# NOTE(review): the training log is emitted by LightGBM, so these are
# presumably forwarded unchanged to LightGBM -- confirm in src.ranker.
params = dict(
    objective='regression',
    metric='rmse',
    learning_rate=0.01,
    num_leaves=256,
    max_bin=255,
    max_depth=-1,          # LightGBM convention: <= 0 means no depth limit
    bagging_freq=1,
    bagging_fraction=0.9,
    feature_fraction=0.9,
)

# Settings handed to `model.train` through its `train_params` keyword.
# The recorded log reports progress every 100 rounds and stops after
# 50 rounds without validation improvement, matching the values below.
train_params = dict(
    num_boost_round=2000,
    early_stopping_rounds=50,
    verbose_eval=100,
)

In [3]:
# Load the pre-computed feature tables. `train` and `valid` keep only the rows
# whose `rated` flag equals 1; `test` and `pl_test` are kept unfiltered for the
# later evaluation and submission cells.
train = pd.read_csv('features_2017-04-09_t7_e7_n30.csv').loc[
    lambda frame: frame['rated'] == 1
]
valid = pd.read_csv('features_2017-04-16_t7_e7_n30.csv').loc[
    lambda frame: frame['rated'] == 1
]
test = pd.read_csv('features_2017-04-23_t7_e7_n30.csv')
pl_test = pd.read_csv('test_features_2017-04-30_t7_e7_n30.csv')

# Columns from position 4 onward are used as model inputs and `target` is the
# label. NOTE(review): assumes the first four columns hold ids / label / flag
# rather than features -- confirm against the feature-building code.
X_train, y_train = train.iloc[:, 4:], train['target']
X_valid, y_valid = valid.iloc[:, 4:], valid['target']

# Fit on the training split, using the validation split for monitoring.
model.train(params, X_train, y_train, X_valid, y_valid, train_params=train_params)

# Predictions on the validation features (overwritten by the evaluation cell).
y_pred = model.predict(X_valid)



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3052
[LightGBM] [Info] Number of data points in the train set: 375090, number of used features: 12
[LightGBM] [Info] Start training from score 0.105495
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 0.389367	valid_1's rmse: 0.390033
[200]	training's rmse: 0.378703	valid_1's rmse: 0.386588
[300]	training's rmse: 0.372149	valid_1's rmse: 0.386133
Early stopping, best iteration is:
[322]	training's rmse: 0.370915	valid_1's rmse: 0.386127


In [4]:
# Arguments later passed to `get_ndcg_score`. The threshold date matches the
# date embedded in the hold-out feature file name loaded into `test`.
# NOTE(review): the two periods are presumably expressed in days -- confirm.
date_th = '2017-04-23'
train_period = eval_period = 7

# Interaction log used as ground truth when scoring; `time_stamp` is parsed
# into datetimes on load.
df = pd.read_csv(
    '../data/processed/train.csv',
    parse_dates=['time_stamp'],
)

In [5]:
# Score the hold-out feature table with the fitted model and report NDCG.
X_test = test.iloc[:, 4:]
y_pred = model.predict(X_test)

# Pair every prediction with its (user, product) ids to build per-user items.
test_pairs = test[['user_id', 'product_id']]
pred_items = get_pred_items(test_pairs, y_pred)

score = get_ndcg_score(df, pred_items, date_th, train_period, eval_period)
print(f'ndcg: {score:.4f}')

ndcg: 0.2095


In [6]:
# Build the submission file from the 2017-04-30 feature table.
#
# The table was already loaded as `pl_test` in the data-loading cell, so it is
# reused here instead of being read a second time. Distinct variable names are
# used on purpose: the previous version overwrote `test`, `X_test`, `y_pred`
# and `pred_items`, so re-running the NDCG evaluation cell afterwards would
# silently score the submission features against the 2017-04-23 ground truth.
X_submit = pl_test.iloc[:, 4:]
y_submit = model.predict(X_submit)
submit_items = get_pred_items(pl_test[['user_id', 'product_id']], y_submit)

# One row per (user, item, position); the position is the 0-based index of the
# item within that user's list as returned by `get_pred_items`.
submission = pd.DataFrame(
    [
        [user, item, position]
        for user, items in submit_items.items()
        for position, item in enumerate(items)
    ]
)

# Tab-separated, no header and no index column.
submission.to_csv('submission.tsv', sep='\t', index=False, header=False)