In [1]:
%load_ext autoreload
%autoreload 2

import cudf
import pandas as pd
import numpy as np

from src.ranker import LGBModel
from src.utils import period_extraction, get_data_period
from src.metrics import ndcg_score
from src.evaluation import get_pred_items, get_ndcg_score

In [2]:
# Instantiate the ranker wrapper imported from src.ranker; it is fitted by
# the `model.train(...)` call further down.
model = LGBModel()

# Booster hyper-parameters, passed as the first argument of `model.train`.
params = dict(
    objective='regression',
    metric='rmse',
    learning_rate=0.01,
    num_leaves=256,
    max_bin=255,
    max_depth=-1,
    bagging_freq=1,
    bagging_fraction=0.9,
    feature_fraction=0.9,
)

# Training-loop options, forwarded via `model.train(..., train_params=...)`.
train_params = dict(
    num_boost_round=2000,
    early_stopping_rounds=50,
    verbose_eval=100,
)

In [3]:
# Load the train / valid / test / pl_test feature tables.
train, valid, test, pl_test = map(pd.read_csv, (
    'features_2017-04-09_t7_e7_n30.csv',
    'features_2017-04-16_t7_e7_n30.csv',
    'features_2017-04-23_t7_e7_n30.csv',
    'test_features_2017-04-30_t7_e7_n30.csv',
))

# Only rows with rated == 1 are used for training and validation
# (`test` and `pl_test` are left unfiltered here).
train = train.loc[train['rated'].eq(1)]
valid = valid.loc[valid['rated'].eq(1)]

# Columns from position 4 onward are the model inputs; `target` is the label.
X_train, y_train = train.iloc[:, 4:], train['target']
X_valid, y_valid = valid.iloc[:, 4:], valid['target']

model.train(params, X_train, y_train, X_valid, y_valid, train_params=train_params)

# Validation predictions (rebound by the later test-set prediction cell).
y_pred = model.predict(X_valid)



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3052
[LightGBM] [Info] Number of data points in the train set: 375090, number of used features: 12
[LightGBM] [Info] Start training from score 0.105495
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 0.389367	valid_1's rmse: 0.390033
[200]	training's rmse: 0.378703	valid_1's rmse: 0.386588
[300]	training's rmse: 0.372149	valid_1's rmse: 0.386133
Early stopping, best iteration is:
[322]	training's rmse: 0.370915	valid_1's rmse: 0.386127


In [4]:
# Settings later passed to `get_ndcg_score`: cut-off date and the
# train / eval period values.
date_th = '2017-04-23'
train_period = eval_period = 7

# Event log, with `time_stamp` parsed into datetimes.
df = pd.read_csv('../data/processed/train.csv', parse_dates=['time_stamp'])

In [5]:
# Predict on the `test` feature table (same feature columns as training).
X_test = test.iloc[:, 4:]
y_pred = model.predict(X_test)

# Turn the per-row predictions into per-user item lists, then score them.
pred_items = get_pred_items(test[['user_id', 'product_id']], y_pred)
score = get_ndcg_score(df, pred_items, date_th, train_period, eval_period)
print(f'ndcg: {score:.4f}')

ndcg: 0.2095


In [6]:
# NOTE: this rebinds `test` to the 2017-04-30 test features (the same file
# that was loaded as `pl_test` earlier).
test = pd.read_csv('test_features_2017-04-30_t7_e7_n30.csv')
X_test = test.iloc[:, 4:]
y_pred = model.predict(X_test)
pred_items = get_pred_items(test[['user_id', 'product_id']], y_pred)

# One row per (user, item, position), where position is the item's index
# within that user's list; written as a headerless TSV.
submission = pd.DataFrame([
    [user, item, i]
    for user, items in pred_items.items()
    for i, item in enumerate(items)
])
submission.to_csv('submission.tsv', sep='\t', index=False, header=False)

In [142]:
# Reload all four feature tables from disk for the drift analysis below.
train, valid, test, pl_test = map(pd.read_csv, (
    'features_2017-04-09_t7_e7_n30.csv',
    'features_2017-04-16_t7_e7_n30.csv',
    'features_2017-04-23_t7_e7_n30.csv',
    'test_features_2017-04-30_t7_e7_n30.csv',
))

In [143]:
# Keep only rated == 1 rows in the three labelled splits (pl_test is untouched).
train = train.loc[train['rated'].eq(1)]
valid = valid.loc[valid['rated'].eq(1)]
test = test.loc[test['rated'].eq(1)]

In [144]:
# (rows, columns) of each split, in train / valid / test / pl_test order.
tuple(frame.shape for frame in (train, valid, test, pl_test))

((375090, 16), (360330, 16), (345030, 16), (347940, 16))

In [145]:
# Number of distinct product_ids present in both train and valid.
len(set(train['product_id']).intersection(valid['product_id']))

16436

In [146]:
# Number of distinct product_ids present in both valid and test.
len(set(valid['product_id']).intersection(test['product_id']))

16401

In [147]:
# Number of distinct product_ids present in both test and pl_test.
len(set(test['product_id']).intersection(pl_test['product_id']))

18428

In [148]:
# Number of distinct product_ids present in both valid and pl_test.
len(set(valid['product_id']).intersection(pl_test['product_id']))

19683

In [149]:
# Print the per-split mean of every feature column (position 4 onward) so the
# splits can be compared side by side. Swap `.mean()` for `.median()` to
# compare medians instead.
for col in train.columns[4:]:
    avg_train, avg_valid, avg_test, avg_pl_test = (
        frame[col].mean() for frame in (train, valid, test, pl_test)
    )
    print(
        '-' * 30,
        f'# {col}',
        f'- train: {avg_train:.2f}',
        f'- valid: {avg_valid:.2f}',
        f'- test: {avg_test:.2f}',
        f'- pl_test: {avg_pl_test:.2f}',
        sep='\n',
    )

------------------------------
# cv-score-r0.9_by_user
- train: 0.51
- valid: 0.50
- test: 0.54
- pl_test: 0.52
------------------------------
# click-score-r0.9_by_user
- train: 0.41
- valid: 0.38
- test: 0.33
- pl_test: 0.21
------------------------------
# pv-score-r0.9_by_user
- train: 10.71
- valid: 10.00
- test: 10.50
- pl_test: 12.20
------------------------------
# other-score-r0.9_by_user
- train: 6.29
- valid: 5.90
- test: 6.34
- pl_test: 6.19
------------------------------
# cv-score-r0.9_by_item
- train: 327.89
- valid: 288.21
- test: 311.67
- pl_test: 211.10
------------------------------
# click-score-r0.9_by_item
- train: 509.22
- valid: 501.23
- test: 426.45
- pl_test: 310.68
------------------------------
# pv-score-r0.9_by_item
- train: 149.14
- valid: 129.55
- test: 126.28
- pl_test: 83.43
------------------------------
# other-score-r0.9_by_item
- train: 126.25
- valid: 113.42
- test: 98.67
- pl_test: 80.22
------------------------------
# cv-score-r0.9_by_user-item

In [3]:
# Extraction settings for the 2017-04-30 cut-off. These are notebook-level
# globals; `train_end_date` and `decay_rate` are later read by `f`.
date_th = '2017-04-30'
train_period = 7
eval_period = 7
top_n = 30
decay_rate = 0.9

# NOTE(review): the same CSV is re-read on the GPU in each of the three
# extraction cells; this block is copy-pasted with only date_th changing.
df = cudf.read_csv('../data/processed/train.csv', parse_dates=['time_stamp'])
# Only the first two of the four returned values are used.
train_start_date, train_end_date, _, _ = get_data_period(date_th, train_period, eval_period)
# Presumably restricts df to [train_start_date, train_end_date] — confirm in src.utils.
tmp1 = period_extraction(df, train_start_date, train_end_date)

# Candidate pairs for this cut-off. Unlike the other two extraction cells, this
# one reads a `test_pairs_` file and applies no `rated` filter.
filename = f'test_pairs_{date_th}_t{train_period}_e{eval_period}_n{top_n}.csv'
pairs1 = cudf.read_csv(filename)

In [4]:
# Extraction settings for the 2017-04-23 cut-off. These are notebook-level
# globals; `train_end_date` and `decay_rate` are later read by `f`.
date_th = '2017-04-23'
train_period = 7
eval_period = 7
top_n = 30
decay_rate = 0.9

# NOTE(review): re-reads the full event CSV; this block is copy-pasted from the
# 2017-04-30 cell with only date_th and the pairs file changing.
df = cudf.read_csv('../data/processed/train.csv', parse_dates=['time_stamp'])
# Only the first two of the four returned values are used.
train_start_date, train_end_date, _, _ = get_data_period(date_th, train_period, eval_period)
# Presumably restricts df to [train_start_date, train_end_date] — confirm in src.utils.
tmp2 = period_extraction(df, train_start_date, train_end_date)

# Candidate pairs for this cut-off, restricted to rated == 1 and re-indexed from 0.
filename = f'pairs_{date_th}_t{train_period}_e{eval_period}_n{top_n}.csv'
pairs2 = cudf.read_csv(filename)
pairs2 = pairs2[pairs2['rated']==1].reset_index(drop=True)

In [5]:
# Extraction settings for the 2017-04-16 cut-off. These are notebook-level
# globals; `train_end_date` and `decay_rate` are later read by `f`.
date_th = '2017-04-16'
train_period = 7
eval_period = 7
top_n = 30
decay_rate = 0.9

# NOTE(review): re-reads the full event CSV; this block is copy-pasted from the
# 2017-04-23 cell with only date_th changing.
df = cudf.read_csv('../data/processed/train.csv', parse_dates=['time_stamp'])
# Only the first two of the four returned values are used.
train_start_date, train_end_date, _, _ = get_data_period(date_th, train_period, eval_period)
# Presumably restricts df to [train_start_date, train_end_date] — confirm in src.utils.
tmp3 = period_extraction(df, train_start_date, train_end_date)

# Candidate pairs for this cut-off, restricted to rated == 1 and re-indexed from 0.
filename = f'pairs_{date_th}_t{train_period}_e{eval_period}_n{top_n}.csv'
pairs3 = cudf.read_csv(filename)
pairs3 = pairs3[pairs3['rated']==1].reset_index(drop=True)

In [65]:
def f(tmp, pairs, end_date=None, decay=None):
    """Build decay-weighted per-item event scores and join them onto ``pairs``.

    For each action in (cv=3, click=2, pv=1, other=0), the events in ``tmp``
    whose ``event_type`` matches are weighted by ``decay ** day_diff`` and
    summed per ``product_id``. ``day_diff`` is the number of days between
    ``end_date`` and the event's ``time_stamp``. Each per-item table is
    left-joined onto ``pairs``; rows of ``pairs`` holding any missing value are
    dropped after each join, so later actions are joined onto the reduced table.

    Works with any frame exposing the pandas-style API used here (pandas or
    cudf), because the join goes through the frame's own ``merge`` method.

    Args:
        tmp: Event frame with ``product_id``, ``time_stamp`` and ``event_type``
            columns. It is not modified.
        pairs: Candidate frame with a ``product_id`` column. The caller's
            object is not modified.
        end_date: Reference timestamp for an event's age. Defaults to the
            notebook-level ``train_end_date``.
        decay: Per-day decay base. Defaults to the notebook-level
            ``decay_rate``.

    Returns:
        ``(features, features2)``. ``features`` holds one per-item score frame
        per action. ``features2`` holds the matching score column of ``pairs``,
        taken after the rows with missing values were dropped.
    """
    # Fall back to the notebook globals so existing `f(tmp, pairs)` calls keep working.
    if end_date is None:
        end_date = train_end_date
    if decay is None:
        decay = decay_rate
    print(end_date)

    # Age of each event in (fractional) days and its decay weight, computed
    # without adding helper columns to the caller's frame.
    day_diff = (end_date - tmp["time_stamp"]) / np.timedelta64(1, "D")
    weight_decay = decay ** day_diff

    action_list = [
        ("cv", 3),
        ("click", 2),
        ("pv", 1),
        ("other", 0),
    ]

    # Scratch frame: the grouping key plus a per-action `score` column.
    scored = tmp[["product_id"]].copy()

    features = []
    features2 = []
    for event_name, event_type in action_list:
        feature_name = f"{event_name}-score-r{decay}_by_item"

        # The weight counts only where the event is of the current type.
        scored["score"] = weight_decay * (tmp["event_type"] == event_type)
        feature = scored.groupby("product_id")["score"].sum().reset_index()
        feature = feature.rename(columns={"score": feature_name})
        features.append(feature)

        pairs = pairs.merge(feature, how="left", on="product_id")
        # Reported before the NaN rows of this join are dropped.
        print(len(pairs), pairs[feature_name].median())
        # Drops every row with a missing value in any column, so pairs whose
        # item has no event in `tmp` are removed.
        pairs = pairs.dropna()
        features2.append(pairs[feature_name])

    return features, features2

In [66]:
# Position in the lists returned by `f`; 0 is the first action ("cv").
i = 0

# `f` reads `train_end_date` and `decay_rate` from the notebook namespace, so
# they are rebound here. With decay_rate = 1.0 every event weighs 1.
date_th = '2017-04-30'
train_period = eval_period = 7
top_n = 30
decay_rate = 1.0
train_start_date, train_end_date, _, _ = get_data_period(date_th, train_period, eval_period)

feats1, feats2 = f(tmp1, pairs1)
# (mean per-item score, mean score over the surviving pairs)
(feats1[i].iloc[:, 1].mean(), feats2[i].mean())

2017-04-30 00:00:00
347940 1.7100529624038217
322340 0.5113534833909036
322340 63.88999795233936
322340 24.245238884147092


(0.035919981556410636, 8.184859801334028)

In [67]:
# `f` reads `train_end_date` and `decay_rate` from the notebook namespace, so
# they are rebound here. With decay_rate = 1.0 every event weighs 1.
date_th = '2017-04-23'
train_period = eval_period = 7
top_n = 30
decay_rate = 1.0
train_start_date, train_end_date, _, _ = get_data_period(date_th, train_period, eval_period)

feats1, feats2 = f(tmp2, pairs2)
# (mean per-item score, mean score over the surviving pairs)
(feats1[i].iloc[:, 1].mean(), feats2[i].mean())

2017-04-23 00:00:00
345030 2.461259354711475
345030 0.8372669157117523
345030 100.05107928242064
345030 34.48034622286071


(0.03348005852536735, 10.43461484784274)

In [68]:
# `f` reads `train_end_date` and `decay_rate` from the notebook namespace, so
# they are rebound here. With decay_rate = 1.0 every event weighs 1.
date_th = '2017-04-16'
train_period = eval_period = 7
top_n = 30
decay_rate = 1.0
train_start_date, train_end_date, _, _ = get_data_period(date_th, train_period, eval_period)

feats1, feats2 = f(tmp3, pairs3)
# (mean per-item score, mean score over the surviving pairs)
(feats1[i].iloc[:, 1].mean(), feats2[i].mean())

2017-04-16 00:00:00
360330 2.7245515377237384
360330 0.996718224353803
360330 107.41211022673926
360330 38.03065421812528


(0.033559931218916386, 9.672188229614793)

In [115]:
# Distinct users in tmp1 (the frame extracted with date_th = '2017-04-30').
tmp1['user_id'].nunique()

152621

In [45]:
# Difference in distinct-user counts, tmp2 minus tmp1. A positive value means
# tmp2 has more distinct users.
tmp2['user_id'].nunique() - tmp1['user_id'].nunique()

3101

In [17]:
# Distinct products in tmp2 with at least one event of type > 0.
tmp2['product_id'][tmp2['event_type'] > 0].nunique()

301838

In [18]:
# Distinct products in tmp1 with at least one event of type > 0.
tmp1['product_id'][tmp1['event_type'] > 0].nunique()

303442

In [19]:
# Sum of the event_type codes over tmp1 rows with event_type > 0.
# NOTE(review): this adds up the codes themselves, so it is not a row count.
tmp1['event_type'][tmp1['event_type'] > 0].sum()

1468411

In [20]:
# Sum of the event_type codes over tmp2 rows with event_type > 0.
# NOTE(review): this adds up the codes themselves, so it is not a row count.
tmp2['event_type'][tmp2['event_type'] > 0].sum()

1441715

In [21]:
# Distinct users in tmp1 with at least one event of type > 0.
tmp1['user_id'][tmp1['event_type'] > 0].nunique()

133060

In [22]:
# Distinct users in tmp2 with at least one event of type > 0.
tmp2['user_id'][tmp2['event_type'] > 0].nunique()

135798

In [23]:
# (rows, columns) of tmp1.
tmp1.shape

(2313281, 6)

In [24]:
# (rows, columns) of tmp2.
tmp2.shape

(2293937, 6)