In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import tensorflow as tf

from libreco.data import split_by_ratio_chrono, DatasetPure
from libreco.algorithms import (
    SVD, SVDpp, NCF, ALS, UserCF, ItemCF, RNN4Rec, KnnEmbedding,
    KnnEmbeddingApproximate, BPR)

Instructions for updating:
non-resource variables are not supported in the long term


In [2]:
# Load the transactions and products tables from CSV files in the working directory.
df = pd.read_csv("transactions.csv")
prods = pd.read_csv("products.csv")
# The products table is not joined into df; the merge is kept for reference only.
# df = trans.merge(prods, on=['product_id'])

In [3]:
# Preprocessing and memory optimisation.
# Record which columns are 64-bit before any values or dtypes are changed.
int64 = df.select_dtypes('int64').columns
float64 = df.select_dtypes('float64').columns

# Missing values are replaced by the sentinel 9999 so the column can be cast
# to an integer dtype below.
df['days_since_prior_order'] = df['days_since_prior_order'].fillna(9999)

# Downcast every column that was int64 or float64 to int32.
df = df.astype({col: 'int32' for col in [*int64, *float64]})

# Order rows by user, then order number, then position in the cart.
df = df.sort_values(
    by=['user_id', 'order_number', 'add_to_cart_order'],
    ignore_index=True,
)

# The raw data has no datetime column, so the row position after sorting
# (0..n-1) is used as a sequential "time" value.
df['time'] = np.arange(len(df), dtype='int32')

In [4]:
# Create a rating score tied to purchase frequency.
# cnt = number of rows (non-null order_id) per (user_id, product_id) pair.
aggr = (
    df.groupby(['user_id', 'product_id'])['order_id']
    .count()
    .rename('cnt')
    .to_frame()
    .sort_values('cnt', ascending=False)
)
# Percentile rank of cnt scaled by 5, then rounded to an integer (0..5).
aggr['rating'] = (aggr['cnt'].rank(pct=True) * 5).apply(round)
agg_rank = aggr.reset_index()
# Attach cnt and rating to every transaction row.
df = df.merge(agg_rank, on=["user_id", "product_id"])

In [5]:
# Keep only the user / item / label / time columns, then free the large frames.
data = df.rename(
    columns={"user_id": "user", "product_id": "item", "rating": "label"}
)[["user", "item", "label", "time"]]

# NOTE(review): agg_rank is not deleted here and still holds the aggregated frame.
del aggr, df, prods

data.head()

Unnamed: 0,user,item,label,time
0,1,196,5,0
1,1,196,5,5
2,1,196,5,11
3,1,196,5,16
4,1,196,5,21


In [6]:
# df.isna().sum()

In [7]:
def reset_state(name):
    """Reset the default TF1-compat graph and print a banner containing ``name``.

    Args:
        name: Label printed between two rows of 30 ``=`` characters.
    """
    banner = "=" * 30
    tf.compat.v1.reset_default_graph()
    print("\n", banner, name, banner)

In [19]:
# Split with split_by_ratio_chrono (test_size=0.2) and wrap both parts as
# libreco datasets; data_info comes from the training part.
train_data, eval_data = split_by_ratio_chrono(data, test_size=0.2)
train_data, data_info = DatasetPure.build_trainset(train_data)
eval_data = DatasetPure.build_evalset(eval_data)
print(data_info)

# Do negative sampling, assuming the data only contains positive feedback.
# Train and eval sets use different seeds.
for dataset, sampling_seed in ((train_data, 2020), (eval_data, 2222)):
    dataset.build_negative_samples(
        data_info, item_gen_mode="random", num_neg=1, seed=sampling_seed
    )

reset_state("SVD")

# Metrics reported on eval_data after every epoch.
eval_metrics = [
    "loss", "balanced_accuracy", "roc_auc", "pr_auc",
    "precision", "recall", "map", "ndcg",
]
svd = SVD(
    "ranking",
    data_info,
    embed_size=16,
    n_epochs=3,
    lr=0.001,
    reg=None,
    batch_size=512,
    batch_sampling=False,
    num_neg=1,
)
svd.fit(
    train_data,
    verbose=2,
    shuffle=True,
    eval_data=eval_data,
    metrics=eval_metrics,
)

n_users: 100000, n_items: 49133, data sparsity: 0.4300 %
random neg item sampling elapsed: 10.800s
random neg item sampling elapsed: 2.780s

Training start time: [35m2021-04-07 10:43:20[0m


train: 100%|██████████| 82526/82526 [01:37<00:00, 842.40it/s]
eval_pred:   9%|▉         | 117/1290 [00:00<00:01, 1163.63it/s]

Epoch 1 elapsed: 109.629s
	 [32mtrain_loss: 0.2885[0m


eval_pred: 100%|██████████| 1290/1290 [00:01<00:00, 1126.48it/s]
eval_rec: 100%|██████████| 2048/2048 [00:01<00:00, 1348.29it/s]


	 eval log_loss: 0.2603
	 eval balanced accuracy: 0.9016
	 eval roc_auc: 0.9629
	 eval pr_auc: 0.9608
	 eval precision@10: 0.0419
	 eval recall@10: 0.0108
	 eval map@10: 0.1227
	 eval ndcg@10: 0.1723


train: 100%|██████████| 82526/82526 [01:34<00:00, 870.72it/s]
eval_pred:   9%|▉         | 118/1290 [00:00<00:00, 1179.25it/s]

Epoch 2 elapsed: 106.358s
	 [32mtrain_loss: 0.2212[0m


eval_pred: 100%|██████████| 1290/1290 [00:01<00:00, 1144.84it/s]
eval_rec: 100%|██████████| 2048/2048 [00:01<00:00, 1258.18it/s]


	 eval log_loss: 0.2419
	 eval balanced accuracy: 0.9126
	 eval roc_auc: 0.9693
	 eval pr_auc: 0.9671
	 eval precision@10: 0.0407
	 eval recall@10: 0.0106
	 eval map@10: 0.1174
	 eval ndcg@10: 0.1664


train: 100%|██████████| 82526/82526 [01:34<00:00, 869.26it/s]
eval_pred:   9%|▉         | 118/1290 [00:00<00:00, 1175.50it/s]

Epoch 3 elapsed: 106.513s
	 [32mtrain_loss: 0.1964[0m


eval_pred: 100%|██████████| 1290/1290 [00:01<00:00, 1136.66it/s]
eval_rec: 100%|██████████| 2048/2048 [00:01<00:00, 1428.61it/s]


	 eval log_loss: 0.2377
	 eval balanced accuracy: 0.9171
	 eval roc_auc: 0.9715
	 eval pr_auc: 0.9690
	 eval precision@10: 0.0396
	 eval recall@10: 0.0100
	 eval map@10: 0.1158
	 eval ndcg@10: 0.1623


In [26]:
# Sanity check for user 1: score a known pair, then see whether the user's
# most-bought item (196) appears in the top-100 recommendations.
print("prediction:  for item #196", svd.predict(user=1, item=196))

# Fetch the recommendation list once and reuse it for printing and the check.
top100 = svd.recommend_user(user=1, n_rec=100)
print("TOP-100 recommendation: ", top100)

# recommend_user yields (item, score) pairs, so the membership test must look
# at item ids only; `196 in top100` compares an int with tuples and is always False.
top100_items = {item for item, _score in top100}
print(f'Is most likely item (196) for user 1 in recomendation? {196 in top100_items}')

prediction:  for item #196 [0.99929833]
TOP-100 recommendation:  [(22362, 0.9999939), (13575, 0.9999902), (41400, 0.9999889), (42282, 0.9999869), (31759, 0.9999838), (37710, 0.9999826), (31651, 0.999977), (33783, 0.9999747), (116, 0.9999622), (38768, 0.9999548), (35561, 0.9999287), (21572, 0.99991965), (5258, 0.9999168), (26900, 0.9999167), (21769, 0.9998933), (9434, 0.9998815), (11365, 0.9998747), (21137, 0.999869), (17207, 0.9998654), (39954, 0.99986315), (36472, 0.99986184), (8803, 0.9998615), (10441, 0.99983954), (20518, 0.9998394), (15680, 0.9998375), (30633, 0.9998167), (16732, 0.9997992), (45051, 0.9997774), (12341, 0.99977213), (10310, 0.99976164), (19887, 0.99974555), (22507, 0.9997451), (13042, 0.9997354), (19660, 0.99973124), (907, 0.9997291), (29015, 0.9997197), (32478, 0.9997185), (21386, 0.99970514), (30486, 0.9996941), (38928, 0.9996848), (20738, 0.9996712), (33147, 0.9996644), (22802, 0.9996604), (17313, 0.99964523), (21873, 0.999635), (27038, 0.99962103), (4938, 0.9995

In [27]:
# Number of rows per (user, item) pair, largest counts first.
user_item = (
    data.groupby(['user', 'item'])
    .agg(cnt=("label", 'count'))
    .sort_values(by='cnt', ascending=False)
)

# The most frequently bought items for user #1; item #196 is at the top.
user_item.loc[1].head(10)

Unnamed: 0_level_0,cnt
item,Unnamed: 1_level_1
196,10
12427,10
10258,9
25133,8
46149,3
13032,3
49235,2
13176,2
26088,2
26405,2


In [12]:
# Users who bought only one distinct item over their whole history (first 10).
(
    data.groupby('user')['item']
    .nunique()
    .reset_index()
    .query('item==1')
    .head(10)
)

Unnamed: 0,user,item
472,986,1
2114,4433,1
4190,8702,1
5200,10798,1
5558,11534,1
5792,12025,1
9725,20121,1
15549,32121,1
17964,37075,1
23408,48242,1


In [18]:
# Purchase history for user #986: every row is the same single item, #4444.
data.query('user==986')

Unnamed: 0,user,item,label,time
119360,986,4444,5,119360
119361,986,4444,5,119361
119362,986,4444,5,119362
119363,986,4444,5,119363
119364,986,4444,5,119364
119365,986,4444,5,119365
119366,986,4444,5,119366
119367,986,4444,5,119367
119368,986,4444,5,119368
119369,986,4444,5,119369


In [30]:
# Sanity check for user 986, whose whole history is item 4444: score that pair,
# then see whether the item appears in this user's top-100 recommendations.
print("prediction:  for user #986 item=4444", svd.predict(user=986, item=4444))

# Fetch the list for user 986 once. The original check queried user=1 by mistake.
top100_986 = svd.recommend_user(user=986, n_rec=100)
print("TOP-100 recommendation: ", top100_986)

# recommend_user yields (item, score) pairs, so compare against item ids only;
# testing the int directly against the list of tuples is always False.
top100_986_items = {item for item, _score in top100_986}
print(f'Is most likely item (4444) for user #986 in recomendation? {4444 in top100_986_items}')

prediction:  for user #986 item=4444 [0.9602553]
TOP-100 recommendation:  [(22901, 0.9986553), (9589, 0.9979254), (24964, 0.99730515), (31553, 0.9972361), (34126, 0.9969591), (6801, 0.99651885), (28509, 0.99521506), (6368, 0.99491477), (41220, 0.99483585), (5427, 0.99467623), (29846, 0.9945832), (44359, 0.9944595), (1157, 0.99432266), (21903, 0.99432015), (5450, 0.994132), (37266, 0.99412495), (23532, 0.99387157), (35163, 0.99385506), (23803, 0.99376786), (19613, 0.993755), (1365, 0.99373215), (49217, 0.9934157), (46979, 0.9933328), (12914, 0.992941), (31717, 0.9929196), (45007, 0.9928763), (47766, 0.9928474), (11185, 0.99284476), (30881, 0.99271214), (26530, 0.99238235), (17795, 0.992326), (39147, 0.992303), (2178, 0.9922988), (26477, 0.99222517), (34487, 0.99221426), (43789, 0.99217623), (9913, 0.99168944), (41330, 0.99162054), (7049, 0.99139106), (42768, 0.99126124), (42398, 0.99118173), (48894, 0.9908404), (18860, 0.99075174), (19660, 0.99055344), (28601, 0.99054396), (12697, 0.990