In [1]:
import os
# GPU selection via CUDA environment variables: enumerate devices by PCI bus id
# and expose only device "1" to this process. This cell runs before the cell
# that imports the GPU-backed `implicit` library.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import sys
# Put the parent directory on the import path (presumably so the project-local
# `src` package imported in the next cell resolves; verify against repo layout).
sys.path.append('../')

In [None]:
import pandas as pd
import numpy as np
import time
from implicit.bpr import BayesianPersonalizedRanking
from scipy.sparse import coo_matrix

from src.preprocess import add_time_idx
from sklearn.model_selection import train_test_split

## Load Data

In [3]:
# Path to the ml-1m interactions CSV, relative to the current working directory.
DATA_PATH = "../data/ml-1m.csv"

In [4]:
# Read the interactions CSV and pass it straight through add_time_idx
# (project helper from src.preprocess), then show the shape and first rows.
data = add_time_idx(pd.read_csv(DATA_PATH))
print(data.shape)
data.head()

(1000209, 6)


Unnamed: 0,user_id,item_id,rating,timestamp,time_idx,time_idx_reversed
0,1,3186,4,2000-12-31 22:00:19,0,52
1,1,1270,5,2000-12-31 22:00:55,1,51
2,1,1721,4,2000-12-31 22:00:55,2,50
3,1,1022,5,2000-12-31 22:00:55,3,49
4,1,2340,3,2000-12-31 22:01:43,4,48


In [6]:
# (number of distinct users, number of distinct items) in the full dataset
data["user_id"].nunique(), data["item_id"].nunique()

(6040, 3416)

In [7]:
# Summary statistics of the number of interactions per user
data["user_id"].value_counts().describe()

count    6040.000000
mean      165.498510
std       192.543909
min        18.000000
25%        44.000000
50%        96.000000
75%       207.250000
max      2277.000000
Name: user_id, dtype: float64

In [8]:
# Summary statistics of the number of interactions per item
data["item_id"].value_counts().describe()

count    3416.000000
mean      292.626171
std       391.674786
min         5.000000
25%        47.000000
50%       146.000000
75%       374.250000
max      3428.000000
Name: item_id, dtype: float64

In [6]:
# Next-10-items prediction setup: rows with time_idx_reversed < 10 are held out
# (presumably each user's 10 most recent interactions; confirm against
# add_time_idx), everything else is used for training.
train = add_time_idx(data[data["time_idx_reversed"] >= 10])
holdout = data[data["time_idx_reversed"] < 10]

# Split the held-out *users* 50/50 so validation and test share no users.
users_validation, users_test = train_test_split(
    holdout["user_id"].unique(),
    test_size=0.5,
    random_state=42,
)

# Time indices are recomputed on each subset after slicing.
validation = add_time_idx(holdout[holdout["user_id"].isin(users_validation)])
test = add_time_idx(holdout[holdout["user_id"].isin(users_test)])

In [12]:
## for the first element of test
# test = test[test.time_idx==0]
# test.head()

Unnamed: 0,user_id,item_id,rating,timestamp,time_idx,time_idx_reversed
43,1,2687,3,2001-01-06 23:37:48,0,9
544,7,474,5,2000-12-31 03:54:02,0,9
789,9,349,4,2000-12-31 01:36:04,0,9
1458,13,2470,3,2000-12-30 18:55:24,0,9
1684,15,1291,2,2000-12-30 21:44:05,0,9


## Dataloaders

In [14]:
# Sparse user x item interaction matrix with a 1 for every training interaction.
# NOTE(review): despite its name, this matrix is built from `train`, not `test`;
# the name is kept because later cells reference it.
# Ids are shifted by one to get 0-based row/column indices (ids appear to start at 1).
test_matrix = coo_matrix(
    (
        np.ones(len(train)),
        (train["user_id"] - 1, train["item_id"] - 1),
    ),
    shape=(data["user_id"].max(), data["item_id"].max()),
)

In [15]:
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, recall_at_k

def compute_metrics(ground_truth, preds, k=10):
    """Evaluate ranked predictions against held-out interactions at cutoff ``k``.

    Args:
        ground_truth: held-out interactions, handed to the metric functions with
            ``user_id`` / ``item_id`` / ``rating`` as column names. When it
            exposes no ``rating`` attribute, a constant ``rating`` of 1 is
            added through ``assign`` before scoring.
        preds: predictions, handed to the metric functions with
            ``user_id`` / ``item_id`` / ``prediction`` as column names.
        k: cutoff forwarded to every metric.

    Returns:
        dict with the keys ``'ndcg'``, ``'recall'`` and ``'map'``.
    """
    has_rating = hasattr(ground_truth, 'rating')
    if not has_rating:
        ground_truth = ground_truth.assign(rating=1)

    # Column naming shared by all three metric calls.
    column_spec = dict(
        col_user='user_id',
        col_item='item_id',
        col_prediction='prediction',
        col_rating='rating',
    )
    scorers = (
        ('ndcg', ndcg_at_k),
        ('recall', recall_at_k),
        ('map', map_at_k),
    )

    # when we have 1 true positive, HitRate == Recall and MRR == MAP
    return {
        label: scorer(ground_truth, preds, k=k, **column_spec)
        for label, scorer in scorers
    }

In [None]:
# Train BPR several times with different seeds, evaluate each model on the
# held-out `test` interactions, and store the mean fit time and mean top-10
# metrics as a small JSON document.
N_RUNS = 10                      # independent training runs to average over
TOP_N = 100                      # recommendations requested per user
RESULTS_PATH = "bpr_ml1m.json"

# Draw the per-run seeds from a fixed-seed generator so that re-running this
# cell trains the same sequence of models (the global NumPy RNG is unseeded).
seed_rng = np.random.RandomState(42)

# Loop-invariant inputs, built once instead of on every run.
user_indices = np.arange(data.user_id.max())         # 0-based matrix rows
interactions_csr = test_matrix.tocsr()
eval_user_ids = np.repeat(user_indices + 1, TOP_N)   # original ids, one row per recommendation

time_list = []
recall10 = []
ndcg10 = []
map10 = []
for _ in range(N_RUNS):
    seed = int(seed_rng.randint(1000))
    bpr = BayesianPersonalizedRanking(
        factors=256,
        use_gpu=True,
        learning_rate=0.003,
        regularization=0.001,
        iterations=200,
        random_state=seed,
    )

    # perf_counter is monotonic, so the measured interval is unaffected by
    # system clock adjustments.
    start_time = time.perf_counter()
    bpr.fit(test_matrix)
    time_list.append(time.perf_counter() - start_time)

    # Top-N items per user; items the user already interacted with in the
    # fitted matrix are filtered out of the recommendations.
    item_indices, scores = bpr.recommend(
        user_indices,
        interactions_csr,
        N=TOP_N,
        filter_already_liked_items=True,
        recalculate_user=False,
    )
    result_df = pd.DataFrame({
        "user_id": eval_user_ids,
        "item_id": item_indices.ravel() + 1,   # back to the original item ids
        "prediction": scores.ravel(),
    })

    metrics = compute_metrics(test, result_df, k=10)
    recall10.append(metrics["recall"])
    ndcg10.append(metrics["ndcg"])
    map10.append(metrics["map"])

# Open the output file only after every run has finished: opening with "w"
# truncates the file, so doing it up front would wipe earlier results if
# training failed part-way through.
with open(RESULTS_PATH, "w") as f:
    f.write(f"""{{"time": {np.mean(time_list)},
"recall10": {np.mean(recall10)},
"ndcg10": {np.mean(ndcg10)},
"map10": {np.mean(map10)}}}\n""")
    