In [3]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import implicit

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
train_path = r"C:\Users\dlbol\Downloads\likes\likes\likes_data\train"
test_path = r"C:\Users\dlbol\Downloads\likes\likes\likes_data\test"
pred_path = r"C:\Users\dlbol\Downloads\likes\likes\likes_data\implicit_bpr_hyper_pred"
target_path = r"C:\Users\dlbol\Downloads\likes\likes\likes_data\implicit_bpr_hyper_target"

In [5]:
def calc_mrr(predict, answer):
    for i in range(len(predict)):
        if predict[i] == answer:
            return 1. / (i + 1)
    return 0

max_prediction_len = 100

def calc_score(target_path, predict_path):
    with open(target_path) as f:
        y_true = [int(x.strip()) for x in f.readlines()]

    with open(predict_path) as f:
        y_pred = [[int(x) for x in line.strip().split(' ')] for line in f.readlines()]

    mrr_score = 0
    for (pred, answer) in zip(y_pred, y_true):
        if len(pred) > max_prediction_len:
            raise ValueError('$maximum prediction length is {}, got {}$'.format(max_prediction_len, len(y_pred[i])))
        mrr_score += calc_mrr(pred, answer)

    print(f"MRR@100 = {(mrr_score / len(y_true)):.4f}")

## Full valid

In [6]:
%%time

list_coord = []
list_coord_val = []
target_val = []
np.random.seed(42)  
val_id = np.random.choice(range(100000), size=25000, replace=False)
val_id.sort()

with open(train_path) as f:
    lines = f.readlines()
    idx = 0
    idx_val = 0
    for line in lines:
        tracks = line.strip().split(' ')
        if len(tracks) == 0:
            print(0)
        if len(tracks) == 1:
            print(1)
        for i, track in enumerate(tracks):
            if idx in val_id and i+1 == len(tracks):
                target_val.append(track)
                idx_val += 1
                continue
            if idx in val_id:
                list_coord_val.append((idx_val, track))
            list_coord.append((idx, track))
        idx += 1
        if idx >= 100000:
            break

CPU times: total: 4min 41s
Wall time: 4min 41s


In [7]:
%%time

arr_rows = np.array([i[0] for i in list_coord])
arr_cols = np.array([i[1] for i in list_coord]).astype(int)
ones = np.ones(arr_rows.shape[0])
user_item = csr_matrix((ones, (arr_rows, arr_cols)), dtype=int)
#item_user = csr_matrix((ones, (arr_cols, arr_rows)), dtype=int)

arr_rows_val = np.array([i[0] for i in list_coord_val])
arr_cols_val = np.array([i[1] for i in list_coord_val]).astype(int)
ones_val = np.ones(arr_rows_val.shape[0])
user_item_val = csr_matrix((ones_val, (arr_rows_val, arr_cols_val)), dtype=int)

CPU times: total: 5.64 s
Wall time: 5.63 s


In [8]:
import time

def optimize(factors=100,
             learning_rate=0.01,
             regularization=0.01,
             iterations=100,
             verify_negative_samples=True):
    
    model = implicit.bpr.BayesianPersonalizedRanking(factors=factors, 
                                                     learning_rate=learning_rate,
                                                     regularization=regularization,
                                                     iterations=iterations,
                                                     verify_negative_samples=verify_negative_samples,
                                                     random_state=42)
 
    model.fit(user_item)
    
    start_time = time.time()
    
    userid = val_id
    recommendations = model.recommend(userid, user_item_val, N=100, filter_already_liked_items=True)
    result = [' '.join(map(str, i)) + '\n' for i in recommendations[0]]
    
    print("--- %s seconds ---" % (time.time() - start_time))
    
    with open(pred_path, 'w') as f:
        f.writelines(result)
    
    with open(target_path, 'w') as f:
        for i in target_val:
            f.write(f'{i}\n')
            
    calc_score(target_path, pred_path)

In [7]:
for factors in [400, 600, 800]:
    for iterations in [550, 750, 950]:
                    print(f'factors: {factors}, iterations: {iterations}')
                    optimize(factors=factors,
                             iterations=iterations,
                             learning_rate=0.1,
                             regularization=0.01,
                             verify_negative_samples=True)

factors: 400, iterations: 550


100%|███████████████████████████████████████████████| 550/550 [22:06<00:00,  2.41s/it, train_auc=98.41%, skipped=0.79%]


--- 141.99913883209229 seconds ---
MRR@100 = 0.0246
factors: 400, iterations: 750


100%|███████████████████████████████████████████████| 750/750 [34:07<00:00,  2.73s/it, train_auc=98.42%, skipped=0.79%]


--- 142.1598563194275 seconds ---
MRR@100 = 0.0254
factors: 400, iterations: 950


100%|█████████████████████████████████████████████| 950/950 [1:02:44<00:00,  3.96s/it, train_auc=98.42%, skipped=0.79%]


--- 256.67894291877747 seconds ---
MRR@100 = 0.0261
factors: 600, iterations: 550


100%|███████████████████████████████████████████████| 550/550 [49:47<00:00,  5.43s/it, train_auc=98.43%, skipped=0.79%]


--- 337.18114161491394 seconds ---
MRR@100 = 0.0251
factors: 600, iterations: 750


100%|█████████████████████████████████████████████| 750/750 [1:09:20<00:00,  5.55s/it, train_auc=98.43%, skipped=0.79%]


--- 388.8504390716553 seconds ---
MRR@100 = 0.0255
factors: 600, iterations: 950


100%|█████████████████████████████████████████████| 950/950 [1:29:12<00:00,  5.63s/it, train_auc=98.44%, skipped=0.79%]


--- 369.13466024398804 seconds ---
MRR@100 = 0.0259
factors: 800, iterations: 550


100%|█████████████████████████████████████████████| 550/550 [1:03:59<00:00,  6.98s/it, train_auc=98.42%, skipped=0.79%]


--- 485.3033549785614 seconds ---
MRR@100 = 0.0250
factors: 800, iterations: 750


100%|█████████████████████████████████████████████| 750/750 [1:26:29<00:00,  6.92s/it, train_auc=98.43%, skipped=0.80%]


--- 478.5154519081116 seconds ---
MRR@100 = 0.0265
factors: 800, iterations: 950


100%|█████████████████████████████████████████████| 950/950 [1:49:38<00:00,  6.92s/it, train_auc=98.44%, skipped=0.80%]


--- 520.904402256012 seconds ---
MRR@100 = 0.0252


In [None]:
for factors, iterations in [(400, 900), (400, 1000), (800, 700), (800, 800), (800, 1500)]:
    print(f'factors: {factors}, iterations: {iterations}')
    optimize(factors=factors,
             iterations=iterations,
             learning_rate=0.1,
             regularization=0.01,
             verify_negative_samples=True)

factors: 400, iterations: 900


100%|███████████████████████████████████████████████| 900/900 [38:42<00:00,  2.58s/it, train_auc=98.42%, skipped=0.80%]


--- 118.19736409187317 seconds ---
MRR@100 = 0.0252
factors: 400, iterations: 1000


100%|█████████████████████████████████████████████| 1000/1000 [43:56<00:00,  2.64s/it, train_auc=98.42%, skipped=0.79%]


--- 130.74716877937317 seconds ---
MRR@100 = 0.0246
factors: 800, iterations: 700


100%|███████████████████████████████████████████████| 700/700 [52:52<00:00,  4.53s/it, train_auc=98.43%, skipped=0.79%]


--- 232.59178042411804 seconds ---
MRR@100 = 0.0255
factors: 800, iterations: 800


100%|█████████████████████████████████████████████| 800/800 [1:00:14<00:00,  4.52s/it, train_auc=98.43%, skipped=0.79%]


--- 236.3770673274994 seconds ---
MRR@100 = 0.0253
factors: 800, iterations: 1500


  1%|▋                                            | 22/1500 [01:32<1:41:42,  4.13s/it, train_auc=96.35%, skipped=0.79%]

In [7]:
optimize(factors=800,
         iterations=1500,
         learning_rate=0.1,
         regularization=0.01,
         verify_negative_samples=True)

100%|███████████████████████████████████████████| 1500/1500 [1:55:17<00:00,  4.61s/it, train_auc=98.45%, skipped=0.79%]


--- 218.1873495578766 seconds ---
MRR@100 = 0.0259


In [9]:
l_arr = [0.05, 0.1, 0.15]
r_arr =  [0.005, 0.01, 0.015]

for factors in [400, 350, 450]:
    for learning_rate in l_arr:
        for regularization in r_arr:
            print(f'factors: {factors}, learning_rate: {learning_rate}, regularization: {regularization}')
            optimize(factors=factors,
                     iterations=950,
                     learning_rate=learning_rate,
                     regularization=regularization,
                     verify_negative_samples=True)

factors: 400, learning_rate: 0.05, regularization: 0.005


100%|███████████████████████████████████████████████| 950/950 [41:29<00:00,  2.62s/it, train_auc=99.47%, skipped=0.79%]


--- 146.0083863735199 seconds ---
MRR@100 = 0.0312
factors: 400, learning_rate: 0.05, regularization: 0.01


100%|███████████████████████████████████████████████| 950/950 [41:58<00:00,  2.65s/it, train_auc=98.60%, skipped=0.79%]


--- 147.9226667881012 seconds ---
MRR@100 = 0.0268
factors: 400, learning_rate: 0.05, regularization: 0.015


100%|███████████████████████████████████████████████| 950/950 [42:05<00:00,  2.66s/it, train_auc=97.69%, skipped=0.79%]


--- 136.74054837226868 seconds ---
MRR@100 = 0.0220
factors: 400, learning_rate: 0.1, regularization: 0.005


100%|███████████████████████████████████████████████| 950/950 [43:16<00:00,  2.73s/it, train_auc=99.35%, skipped=0.79%]


--- 149.81246185302734 seconds ---
MRR@100 = 0.0302
factors: 400, learning_rate: 0.1, regularization: 0.01


100%|███████████████████████████████████████████████| 950/950 [50:20<00:00,  3.18s/it, train_auc=98.42%, skipped=0.79%]


--- 148.43332195281982 seconds ---
MRR@100 = 0.0252
factors: 400, learning_rate: 0.1, regularization: 0.015


100%|███████████████████████████████████████████████| 950/950 [49:48<00:00,  3.15s/it, train_auc=97.44%, skipped=0.79%]


--- 147.8036606311798 seconds ---
MRR@100 = 0.0197
factors: 400, learning_rate: 0.15, regularization: 0.005


100%|███████████████████████████████████████████████| 950/950 [49:56<00:00,  3.15s/it, train_auc=99.18%, skipped=0.79%]


--- 148.83624577522278 seconds ---
MRR@100 = 0.0291
factors: 400, learning_rate: 0.15, regularization: 0.01


 90%|██████████████████████████████████████████▏    | 852/950 [45:22<05:13,  3.20s/it, train_auc=98.18%, skipped=0.80%]


KeyboardInterrupt: 

In [10]:
r_arr =  [0.0001, 0.0025, 0.0075]

for regularization in r_arr:
    print(f'regularization: {regularization}')
    optimize(factors=400,
             iterations=950,
             learning_rate=0.05,
             regularization=regularization,
             verify_negative_samples=True)

regularization: 0.0001


100%|██████████████████████████████████████████████| 950/950 [50:22<00:00,  3.18s/it, train_auc=100.00%, skipped=0.79%]


--- 149.0697159767151 seconds ---
MRR@100 = 0.0287
regularization: 0.0025


100%|███████████████████████████████████████████████| 950/950 [54:30<00:00,  3.44s/it, train_auc=99.80%, skipped=0.79%]


--- 165.58738327026367 seconds ---
MRR@100 = 0.0326
regularization: 0.0075


100%|█████████████████████████████████████████████| 950/950 [1:03:52<00:00,  4.03s/it, train_auc=99.07%, skipped=0.79%]


--- 232.23652505874634 seconds ---
MRR@100 = 0.0297


In [11]:
l_arr = [0.025, 0.075]

for learning_rate in l_arr:
    print(f'learning_rate: {learning_rate}')
    optimize(factors=400,
             iterations=950,
             learning_rate=learning_rate,
             regularization=0.0025,
             verify_negative_samples=True)

learning_rate: 0.025


100%|███████████████████████████████████████████████| 950/950 [49:03<00:00,  3.10s/it, train_auc=99.81%, skipped=0.79%]


--- 130.15966725349426 seconds ---
MRR@100 = 0.0316
learning_rate: 0.075


100%|███████████████████████████████████████████████| 950/950 [46:15<00:00,  2.92s/it, train_auc=99.76%, skipped=0.79%]


--- 129.81613063812256 seconds ---
MRR@100 = 0.0329
