In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import implicit
import time
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def calc_mrr(predict, answer):
    for i in range(len(predict)):
        if predict[i] == answer:
            return 1. / (i + 1)
    return 0

max_prediction_len = 100

def calc_score(target_path, predict_path):
    with open(target_path) as f:
        y_true = [int(x.strip()) for x in f.readlines()]

    with open(predict_path) as f:
        y_pred = [[int(x) for x in line.strip().split(' ')] for line in f.readlines()]

    mrr_score = 0
    for (pred, answer) in zip(y_pred, y_true):
        if len(pred) > max_prediction_len:
            raise ValueError('$maximum prediction length is {}, got {}$'.format(max_prediction_len, len(y_pred[i])))
        mrr_score += calc_mrr(pred, answer)

    print(f"MRR@100 = {(mrr_score / len(y_true)):.4f}")

In [3]:
train_path = r"C:\Users\dlbol\Downloads\likes\likes\likes_data\train"
test_path = r"C:\Users\dlbol\Downloads\likes\likes\likes_data\test"
target_val_path = r"C:\Users\dlbol\Downloads\likes\likes\likes_data\target_val"
pred_val_path = r"C:\Users\dlbol\Downloads\likes\likes\likes_data\pred_val"

In [4]:
%%time

list_coord = []
list_coord_val = []
target_val = []
np.random.seed(42)  
val_id = np.random.choice(range(10000), size=2500, replace=False)
val_id.sort()

with open(train_path) as f:
    lines = f.readlines()
    idx = 0
    idx_val = 0
    for line in tqdm(lines):
        tracks = line.strip().split(' ')
        if len(tracks) == 0:
            print(0)
        if len(tracks) == 1:
            print(1)
        for i, track in enumerate(tracks):
            if idx in val_id and i+1 == len(tracks):
                target_val.append(track)
                idx_val += 1
                continue
            if idx in val_id:
                list_coord_val.append((idx_val, track))
            list_coord.append((idx, track))
        idx += 1
        if idx >= 10000:
            break

  1%|▋                                                                         | 9999/1160084 [00:10<20:47, 922.06it/s]

CPU times: total: 7.62 s
Wall time: 12.2 s





In [5]:
%%time

arr_rows = np.array([i[0] for i in list_coord])
arr_cols = np.array([i[1] for i in list_coord]).astype(int)
ones = np.ones(arr_rows.shape[0])
user_item = csr_matrix((ones, (arr_rows, arr_cols)), dtype=int)
#item_user = csr_matrix((ones, (arr_cols, arr_rows)), dtype=int)

arr_rows_val = np.array([i[0] for i in list_coord_val])
arr_cols_val = np.array([i[1] for i in list_coord_val]).astype(int)
ones_val = np.ones(arr_rows_val.shape[0])
user_item_val = csr_matrix((ones_val, (arr_rows_val, arr_cols_val)), dtype=int)

CPU times: total: 766 ms
Wall time: 1.55 s


In [6]:
def optimize(factors=100,
             regularization=0.01,
             alpha=1.0,
             use_native=True,
             use_cg=True,
             iterations=15,
             calculate_training_loss=False):
    
    model = implicit.als.AlternatingLeastSquares(factors=factors, 
                                                 regularization=regularization,
                                                 alpha=alpha,
                                                 use_native=use_native,
                                                 use_cg=use_cg,
                                                 iterations=iterations,
                                                 random_state=42)
 
    model.fit(user_item)
    
    start_time = time.time()
    
    userid = val_id
    recommendations = model.recommend(userid, user_item_val, N=100, filter_already_liked_items=True)
    result = [' '.join(map(str, i)) + '\n' for i in recommendations[0]]
    
    print("--- %s seconds ---" % (time.time() - start_time))
    
    with open(pred_val_path, 'w') as f:
        f.writelines(result)
    
    with open(target_val_path, 'w') as f:
        for i in target_val:
            f.write(f'{i}\n')
            
    calc_score(target_val_path, pred_val_path)

MRR@100 = 0.0234
optimize(factors=200,
     regularization=10,
     alpha=10,
     use_native=True,
     use_cg=False,
     iterations=15,
     calculate_training_loss=False)

In [8]:
alpha_arr = [10, 20, 30, 40, 50]
regularization_arr = [5, 10, 15]
for alpha in alpha_arr:
    for regularization in regularization_arr:
        print(f' alpha: {alpha}, regularization: {regularization}')
        optimize(factors=100,
             regularization=regularization,
             alpha=alpha,
             use_native=True,
             use_cg=False,
             iterations=15,
             calculate_training_loss=False)

 alpha: 10, regularization: 5


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [01:07<00:00,  4.53s/it]


--- 13.787473201751709 seconds ---
MRR@100 = 0.0185
 alpha: 10, regularization: 10


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [01:04<00:00,  4.31s/it]


--- 16.918511629104614 seconds ---
MRR@100 = 0.0185
 alpha: 10, regularization: 15


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [01:26<00:00,  5.73s/it]


--- 17.48112940788269 seconds ---
MRR@100 = 0.0184
 alpha: 20, regularization: 5


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [01:03<00:00,  4.22s/it]


--- 13.648868083953857 seconds ---
MRR@100 = 0.0190
 alpha: 20, regularization: 10


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [01:04<00:00,  4.27s/it]


--- 13.512147188186646 seconds ---
MRR@100 = 0.0191
 alpha: 20, regularization: 15


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [01:22<00:00,  5.52s/it]


--- 12.228473424911499 seconds ---
MRR@100 = 0.0190
 alpha: 30, regularization: 5


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [01:02<00:00,  4.19s/it]


--- 10.312493324279785 seconds ---
MRR@100 = 0.0193
 alpha: 30, regularization: 10


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [01:02<00:00,  4.19s/it]


--- 9.404845476150513 seconds ---
MRR@100 = 0.0194
 alpha: 30, regularization: 15


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [01:23<00:00,  5.58s/it]


--- 12.96262502670288 seconds ---
MRR@100 = 0.0194
 alpha: 40, regularization: 5


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [01:04<00:00,  4.27s/it]


--- 12.220439434051514 seconds ---
MRR@100 = 0.0198
 alpha: 40, regularization: 10


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [01:17<00:00,  5.14s/it]


--- 9.967333555221558 seconds ---
MRR@100 = 0.0198
 alpha: 40, regularization: 15


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [01:11<00:00,  4.75s/it]


--- 10.50281023979187 seconds ---
MRR@100 = 0.0198
 alpha: 50, regularization: 5


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [01:03<00:00,  4.25s/it]


--- 15.335431098937988 seconds ---
MRR@100 = 0.0195
 alpha: 50, regularization: 10


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [01:22<00:00,  5.53s/it]


--- 10.859648704528809 seconds ---
MRR@100 = 0.0196
 alpha: 50, regularization: 15


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [01:05<00:00,  4.35s/it]


--- 10.556763887405396 seconds ---
MRR@100 = 0.0198


alpha: 40, regularization: 10
100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [01:17<00:00,  5.14s/it]
--- 9.967333555221558 seconds ---
MRR@100 = 0.0198

In [9]:
%%time

list_coord = []
list_coord_val = []
target_val = []
np.random.seed(42)  
val_id = np.random.choice(range(100000), size=25000, replace=False)
val_id.sort()

with open(train_path) as f:
    lines = f.readlines()
    idx = 0
    idx_val = 0
    for line in tqdm(lines):
        tracks = line.strip().split(' ')
        if len(tracks) == 0:
            print(0)
        if len(tracks) == 1:
            print(1)
        for i, track in enumerate(tracks):
            if idx in val_id and i+1 == len(tracks):
                target_val.append(track)
                idx_val += 1
                continue
            if idx in val_id:
                list_coord_val.append((idx_val, track))
            list_coord.append((idx, track))
        idx += 1
        if idx >= 100000:
            break

  9%|██████                                                                 | 99999/1160084 [16:38<2:56:24, 100.15it/s]

CPU times: total: 6min 11s
Wall time: 16min 42s





In [10]:
%%time

arr_rows = np.array([i[0] for i in list_coord])
arr_cols = np.array([i[1] for i in list_coord]).astype(int)
ones = np.ones(arr_rows.shape[0])
user_item = csr_matrix((ones, (arr_rows, arr_cols)), dtype=int)
#item_user = csr_matrix((ones, (arr_cols, arr_rows)), dtype=int)

arr_rows_val = np.array([i[0] for i in list_coord_val])
arr_cols_val = np.array([i[1] for i in list_coord_val]).astype(int)
ones_val = np.ones(arr_rows_val.shape[0])
user_item_val = csr_matrix((ones_val, (arr_rows_val, arr_cols_val)), dtype=int)

CPU times: total: 7.59 s
Wall time: 18.4 s


In [11]:
alpha_arr = [35, 40, 45]
regularization_arr = [7, 10, 18]
for alpha in alpha_arr:
    for regularization in regularization_arr:
        print(f' alpha: {alpha}, regularization: {regularization}')
        optimize(factors=100,
             regularization=regularization,
             alpha=alpha,
             use_native=True,
             use_cg=False,
             iterations=15,
             calculate_training_loss=False)

 alpha: 35, regularization: 7


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [05:15<00:00, 21.02s/it]


--- 102.15359735488892 seconds ---
MRR@100 = 0.0210
 alpha: 35, regularization: 10


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [05:00<00:00, 20.05s/it]


--- 107.34534931182861 seconds ---
MRR@100 = 0.0210
 alpha: 35, regularization: 18


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [04:52<00:00, 19.49s/it]


--- 109.57082676887512 seconds ---
MRR@100 = 0.0210
 alpha: 40, regularization: 7


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [04:58<00:00, 19.90s/it]


--- 107.25627875328064 seconds ---
MRR@100 = 0.0209
 alpha: 40, regularization: 10


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [05:17<00:00, 21.17s/it]


--- 109.84203124046326 seconds ---
MRR@100 = 0.0210
 alpha: 40, regularization: 18


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [04:55<00:00, 19.67s/it]


--- 112.18263387680054 seconds ---
MRR@100 = 0.0210
 alpha: 45, regularization: 7


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [04:53<00:00, 19.54s/it]


--- 114.1187219619751 seconds ---
MRR@100 = 0.0210
 alpha: 45, regularization: 10


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [05:03<00:00, 20.21s/it]


--- 82.34618306159973 seconds ---
MRR@100 = 0.0210
 alpha: 45, regularization: 18


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [03:58<00:00, 15.93s/it]


--- 50.82361459732056 seconds ---
MRR@100 = 0.0210


In [None]:
optimize(factors=200,
     regularization=10,
     alpha=40,
     use_native=True,
     use_cg=False,
     iterations=15,
     calculate_training_loss=False)

  0%|                                                                                           | 0/15 [00:00<?, ?it/s]