In [12]:
import os
# Enumerate GPUs in PCI bus order and expose only device "1" to this process.
# NOTE(review): presumably set here so the variables are in place before any
# CUDA-using library is imported in later cells — confirm.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import sys
# Add the parent directory to the import path; presumably this is what makes
# the project-local `src` package importable from the notebook — verify.
sys.path.append('../')

In [13]:
import pandas as pd
import numpy as np
import time
from implicit.bpr import BayesianPersonalizedRanking
from scipy.sparse import coo_matrix

from src.preprocess import add_time_idx
from sklearn.model_selection import train_test_split

## Load Data

In [3]:
# Location of the ratings CSV, relative to the notebook's working directory.
# NOTE(review): file name suggests the MovieLens-20M dataset — confirm provenance.
DATA_PATH = '../data/ml-20m.csv'

In [4]:
# Read the ratings and pass them through `add_time_idx`, which yields the
# `time_idx` / `time_idx_reversed` columns visible in the preview below.
data = add_time_idx(pd.read_csv(DATA_PATH))
print(data.shape)
data.head()

(20000263, 6)


Unnamed: 0,user_id,item_id,rating,timestamp,time_idx,time_idx_reversed
0,1,924,3.5,2004-09-10 03:06:38,0,174
1,1,919,3.5,2004-09-10 03:07:01,1,173
2,1,2683,3.5,2004-09-10 03:07:30,2,172
3,1,1584,3.5,2004-09-10 03:07:36,3,171
4,1,1079,4.0,2004-09-10 03:07:45,4,170


In [13]:
# (number of distinct users, number of distinct items)
data["user_id"].nunique(), data["item_id"].nunique()

(138493, 26744)

In [14]:
# Distribution of the number of interactions per user
data["user_id"].value_counts().describe()

count    138493.000000
mean        144.413530
std         230.267257
min          20.000000
25%          35.000000
50%          68.000000
75%         155.000000
max        9254.000000
Name: user_id, dtype: float64

In [15]:
# Distribution of the number of interactions per item
data["item_id"].value_counts().describe()

count    26744.000000
mean       747.841123
std       3085.818268
min          1.000000
25%          3.000000
50%         18.000000
75%        205.000000
max      67310.000000
Name: item_id, dtype: float64

In [5]:
# Next-10-items prediction: rows with time_idx_reversed < 10 are held out for
# evaluation, rows with time_idx_reversed >= 10 are used for training.
in_train = data.time_idx_reversed >= 10
in_holdout = data.time_idx_reversed < 10
train = data.loc[in_train]
holdout = data.loc[in_holdout]

# Split the held-out *users* 50/50 into validation and test groups.
users_validation, users_test = train_test_split(
    holdout.user_id.unique(),
    test_size=0.5,
    random_state=42,
)
validation = holdout.loc[holdout.user_id.isin(users_validation)]
test = holdout.loc[holdout.user_id.isin(users_test)]

# Re-run the time indexing on each split separately.
train, validation, test = (
    add_time_idx(train),
    add_time_idx(validation),
    add_time_idx(test),
)

In [7]:
## for the next-item prediction: keep only the held-out rows with time_idx == 0 (uncomment to enable)
# test = test[test.time_idx==0]
# test.head()

Unnamed: 0,user_id,item_id,rating,timestamp,time_idx,time_idx_reversed
165,1,7454,4.0,2005-04-02 23:55:08,0,9
226,2,1972,2.0,2000-11-21 15:36:09,0,9
441,4,370,4.0,1996-08-24 09:34:03,0,9
507,5,594,5.0,1996-12-26 16:26:29,0,9
877,8,508,3.0,1996-06-05 13:55:57,0,9


## Dataloaders

In [9]:
# User-item interaction matrix with a 1 for every row of the *training* split
# (despite the variable name). Ids are shifted by one to act as 0-based
# row/column indices — assumes user and item ids start at 1.
test_matrix = coo_matrix(
    (
        np.ones(len(train)),
        (train.user_id - 1, train.item_id - 1),
    ),
    shape=(data.user_id.max(), data.item_id.max()),
)

In [10]:
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, recall_at_k

def compute_metrics(ground_truth, preds, k=10):
    """Evaluate top-k recommendations with NDCG, recall and MAP.

    Args:
        ground_truth: frame of true interactions with `user_id` and `item_id`
            columns; a constant `rating` of 1 is added when it has none.
        preds: frame of recommendations with `user_id`, `item_id` and
            `prediction` columns.
        k: cut-off forwarded to every metric.

    Returns:
        dict with the keys 'ndcg', 'recall' and 'map'.
    """
    # The evaluators are told to read a `rating` column, so supply one if absent.
    if not hasattr(ground_truth, 'rating'):
        ground_truth = ground_truth.assign(rating=1)

    column_spec = dict(
        col_user='user_id',
        col_item='item_id',
        col_prediction='prediction',
        col_rating='rating',
        k=k,
    )
    evaluators = (
        ('ndcg', ndcg_at_k),
        ('recall', recall_at_k),
        ('map', map_at_k),
    )

    # when we have 1 true positive, HitRate == Recall and MRR == MAP
    return {
        label: evaluate(ground_truth, preds, **column_spec)
        for label, evaluate in evaluators
    }

In [None]:
import json

# Train BPR several times with different seeds, evaluate the top-k
# recommendations of every run against `test`, and store the averaged fit
# time and metrics as JSON.

# --- experiment configuration --------------------------------------------
N_RUNS = 10                      # independent trainings to average over
TOP_N = 100                      # recommendations retrieved per user
EVAL_K = 10                      # cut-off used for the reported metrics
BASE_SEED = 42                   # makes the per-run model seeds reproducible
RESULTS_PATH = "bpr_ml20m.json"

# --- loop-invariant inputs (built once, not on every run) -----------------
n_users = data.user_id.max()
user_items = test_matrix.tocsr()         # interactions of the training split
user_rows = np.arange(n_users)           # 0-based matrix rows
# ids are shifted back by one to undo the 0-based indexing of the matrix
user_id_column = np.repeat(np.arange(1, n_users + 1), TOP_N)

# A seeded generator yields the same sequence of model seeds on every
# Restart & Run All, while still giving each run a different seed.
seed_rng = np.random.default_rng(BASE_SEED)

time_list = []
recall10 = []
ndcg10 = []
map10 = []
for _ in range(N_RUNS):
    seed = int(seed_rng.integers(1000))
    bpr = BayesianPersonalizedRanking(
        factors=128,
        use_gpu=True,
        learning_rate=0.05,
        regularization=0.01,
        random_state=seed,
    )

    # Only the model fit is timed.
    start_time = time.time()
    bpr.fit(user_items)
    time_list.append(time.time() - start_time)

    # Items a user already interacted with in training are filtered out.
    result = bpr.recommend(
        user_rows,
        user_items,
        N=TOP_N,
        filter_already_liked_items=True,
        recalculate_user=False,
    )
    item_rows, scores = result[0], result[1]
    result_df = pd.DataFrame({
        "user_id": user_id_column,
        "item_id": item_rows.ravel() + 1,
        "prediction": scores.ravel(),
    })

    metrics = compute_metrics(test, result_df, k=EVAL_K)
    recall10.append(metrics["recall"])
    ndcg10.append(metrics["ndcg"])
    map10.append(metrics["map"])

summary = {
    "time": float(np.mean(time_list)),
    "recall10": float(np.mean(recall10)),
    "ndcg10": float(np.mean(ndcg10)),
    "map10": float(np.mean(map10)),
}

# The file is opened only after every run has finished, so a failure during
# training cannot leave an empty or truncated results file behind.
with open(RESULTS_PATH, "w") as f:
    json.dump(summary, f)
    f.write("\n")
    