In [1]:
%load_ext autoreload
%autoreload 2

import cudf
import pandas as pd
import numpy as np

from src.ranker import LGBModel
from src.utils import period_extraction, get_data_period
from src.metrics import ndcg_score
from src.evaluation import get_pred_items, get_ndcg_score

In [2]:
# Instantiate the ranker wrapper imported from src.ranker; it is trained and
# used for prediction in the cells below.
# NOTE(review): presumably a thin wrapper around LightGBM (the training log is
# emitted by LightGBM) — confirm against src/ranker.py.
model = LGBModel()

# Booster hyper-parameters handed to model.train() as its first argument.
params = dict(
    # Learning task: pointwise regression on the target, monitored with RMSE.
    objective='regression',
    metric='rmse',
    # Small step size; paired with a large boosting budget in train_params.
    learning_rate=0.01,
    # Tree shape: up to 256 leaves per tree, 255 histogram bins per feature;
    # -1 is LightGBM's "no depth limit" value.
    num_leaves=256,
    max_bin=255,
    max_depth=-1,
    # Row subsampling re-drawn every iteration (90% of rows), plus 90% of the
    # features sampled per tree.
    bagging_freq=1,
    bagging_fraction=0.9,
    feature_fraction=0.9,
)

# Training-loop controls, forwarded to model.train() via its `train_params`
# keyword: at most 2000 boosting rounds, stop once the validation metric has
# not improved for 50 rounds, and log the metric every 100 rounds.
train_params = dict(
    num_boost_round=2000,
    early_stopping_rounds=50,
    verbose_eval=100,
)

In [8]:
# Load the pre-computed feature tables for the training and validation
# windows, keeping only the rows whose `rated` flag equals 1.
train = pd.read_csv('features_2017-04-16_t7_e7_n30.csv').loc[
    lambda frame: frame['rated'] == 1
]
valid = pd.read_csv('features_2017-04-23_t7_e7_n30.csv').loc[
    lambda frame: frame['rated'] == 1
]

# Features are selected by position: everything from the fifth column onwards.
# NOTE(review): assumes the first four columns are identifiers/labels and that
# `target` is not among the remaining columns — TODO confirm against the
# feature-file schema.
X_train, y_train = train.iloc[:, 4:], train['target']
X_valid, y_valid = valid.iloc[:, 4:], valid['target']

# Fit on the training window, using the validation window for monitoring.
# NOTE(review): the captured log for this cell reports identical 'training'
# and 'valid_1' RMSE at every checkpoint even though the two sets come from
# different files — worth confirming that LGBModel.train really evaluates on
# the validation data passed here.
model.train(params, X_train, y_train, X_valid, y_valid, train_params=train_params)

# Validation-set predictions; consumed by the NDCG evaluation cell.
y_pred = model.predict(X_valid)



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3052
[LightGBM] [Info] Number of data points in the train set: 360330, number of used features: 12
[LightGBM] [Info] Start training from score 0.101501
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 0.381328	valid_1's rmse: 0.381328
[200]	training's rmse: 0.370675	valid_1's rmse: 0.370675
[300]	training's rmse: 0.364146	valid_1's rmse: 0.364146
[400]	training's rmse: 0.358971	valid_1's rmse: 0.358971
[500]	training's rmse: 0.354614	valid_1's rmse: 0.354614
[600]	training's rmse: 0.35086	valid_1's rmse: 0.35086
[700]	training's rmse: 0.347614	valid_1's rmse: 0.347614
[800]	training's rmse: 0.344779	valid_1's rmse: 0.344779
[900]	training's rmse: 0.34215	valid_1's rmse: 0.34215
[1000]	training's rmse: 0.339713	valid_1's rmse: 0.339713
[1100]	training's rmse: 0.337414	valid_1's rmse: 0.337414
[1200]	trai

In [9]:
# Settings passed to get_ndcg_score() in the evaluation cell.
# NOTE(review): the date equals the one in the validation feature file name and
# the two 7s look like the `t7`/`e7` parts of that name — presumably
# window lengths in days; confirm against src/evaluation.py.
date_th = '2017-04-23'
train_period, eval_period = 7, 7

# Processed interaction log, with `time_stamp` parsed to datetimes on load;
# handed to get_ndcg_score() as its first argument in the evaluation cell.
df = pd.read_csv(
    '../data/processed/train.csv',
    parse_dates=['time_stamp'],
)

In [10]:
# Turn the validation predictions into per-user item lists and score them.
# `y_pred` must still hold the *validation* predictions here: the test
# inference cell rebinds both `y_pred` and `pred_items`, so this cell has to
# run before it (or after re-running the training cell).
pred_items = get_pred_items(valid.loc[:, ['user_id', 'product_id']], y_pred)
score = get_ndcg_score(df, pred_items, date_th, train_period, eval_period)
print(f'ndcg: {score:.4f}')

ndcg: 0.2082


In [6]:
# Score the test-window candidates with the trained model.
# Unlike the train/valid tables, no `rated == 1` filter is applied here.
test = pd.read_csv('test_features_2017-04-30_t7_e7_n30.csv')

# Same positional feature selection as in training (fifth column onwards).
# NOTE(review): assumes the test file has the same column layout as the
# training feature files — TODO confirm.
X_test = test.iloc[:, 4:]

# Rebinds `y_pred` / `pred_items`, replacing the validation results.
y_pred = model.predict(X_test)
pred_items = get_pred_items(test.loc[:, ['user_id', 'product_id']], y_pred)

In [7]:
# Disabled submission writer, kept for reference. When uncommented it flattens
# `pred_items` (iterated as user -> sequence of items) into rows of
# [user, item, position] and writes them to 'submission.tsv' as tab-separated
# values without index or header.
#submission = []
#for user, items in pred_items.items():
#    for i, item in enumerate(items):
#        submission.append([user, item, i])
#submission = pd.DataFrame(submission)
#submission.to_csv('submission.tsv', sep='\t', index=False, header=False)
# NOTE(review): presumably the leaderboard score obtained with this
# submission — confirm.
# LB: 0.071