In [8]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import implicit
import time
from tqdm import tqdm

In [2]:
def calc_mrr(predict, answer):
    for i in range(len(predict)):
        if predict[i] == answer:
            return 1. / (i + 1)
    return 0

max_prediction_len = 100

def calc_score(target_path, predict_path):
    with open(target_path) as f:
        y_true = [int(x.strip()) for x in f.readlines()]

    with open(predict_path) as f:
        y_pred = [[int(x) for x in line.strip().split(' ')] for line in f.readlines()]

    mrr_score = 0
    for (pred, answer) in zip(y_pred, y_true):
        if len(pred) > max_prediction_len:
            raise ValueError('$maximum prediction length is {}, got {}$'.format(max_prediction_len, len(y_pred[i])))
        mrr_score += calc_mrr(pred, answer)

    print(f"MRR@100 = {(mrr_score / len(y_true)):.4f}")

In [3]:
train_path = r"C:\Users\dlbol\Downloads\likes\likes\likes_data\train"
test_path = r"C:\Users\dlbol\Downloads\likes\likes\likes_data\test"
target_val_path = r"C:\Users\dlbol\Downloads\likes\likes\likes_data\target_val"
pred_val_path = r"C:\Users\dlbol\Downloads\likes\likes\likes_data\pred_val"

In [9]:
%%time

list_coord = []
list_coord_val = []
target_val = []
np.random.seed(42)  
val_id = np.random.choice(range(10000), size=2500, replace=False)
val_id.sort()

with open(train_path) as f:
    lines = f.readlines()
    idx = 0
    idx_val = 0
    for line in tqdm(lines):
        tracks = line.strip().split(' ')
        if len(tracks) == 0:
            print(0)
        if len(tracks) == 1:
            print(1)
        for i, track in enumerate(tracks):
            if idx in val_id and i+1 == len(tracks):
                target_val.append(track)
                idx_val += 1
                continue
            if idx in val_id:
                list_coord_val.append((idx_val, track))
            list_coord.append((idx, track))
        idx += 1
        if idx >= 10000:
            break

  1%|▌                                                                       | 9999/1160084 [00:39<1:16:31, 250.48it/s]

CPU times: total: 16.6 s
Wall time: 45.6 s





In [10]:
%%time

arr_rows = np.array([i[0] for i in list_coord])
arr_cols = np.array([i[1] for i in list_coord]).astype(int)
ones = np.ones(arr_rows.shape[0])
user_item = csr_matrix((ones, (arr_rows, arr_cols)), dtype=int)
#item_user = csr_matrix((ones, (arr_cols, arr_rows)), dtype=int)

arr_rows_val = np.array([i[0] for i in list_coord_val])
arr_cols_val = np.array([i[1] for i in list_coord_val]).astype(int)
ones_val = np.ones(arr_rows_val.shape[0])
user_item_val = csr_matrix((ones_val, (arr_rows_val, arr_cols_val)), dtype=int)

CPU times: total: 812 ms
Wall time: 2.55 s


In [11]:
def optimize(factors=100,
             regularization=0.01,
             alpha=1.0,
             use_native=True,
             use_cg=True,
             iterations=15,
             calculate_training_loss=False):
    
    model = implicit.als.AlternatingLeastSquares(factors=factors, 
                                                 regularization=regularization,
                                                 alpha=alpha,
                                                 use_native=use_native,
                                                 use_cg=use_cg,
                                                 iterations=iterations,
                                                 random_state=42)
 
    model.fit(user_item)
    
    start_time = time.time()
    
    userid = val_id
    recommendations = model.recommend(userid, user_item_val, N=100, filter_already_liked_items=True)
    result = [' '.join(map(str, i)) + '\n' for i in recommendations[0]]
    
    print("--- %s seconds ---" % (time.time() - start_time))
    
    with open(pred_val_path, 'w') as f:
        f.writelines(result)
    
    with open(target_val_path, 'w') as f:
        for i in target_val:
            f.write(f'{i}\n')
            
    calc_score(target_val_path, pred_val_path)

In [7]:
for alpha in [0.1, 1, 10, 100]:
    for regularization in [0.1, 1, 10, 100]:
        print(f' alpha: {alpha}, regularization: {regularization}')
        optimize(factors=100,
             regularization=regularization,
             alpha=alpha,
             use_native=True,
             use_cg=False,
             iterations=20,
             calculate_training_loss=False)

 alpha: 0.1, regularization: 0.1


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:37<00:00,  4.87s/it]


--- 16.62998628616333 seconds ---
MRR@100 = 0.0105
 alpha: 0.1, regularization: 1


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:39<00:00,  4.97s/it]


--- 21.377461433410645 seconds ---
MRR@100 = 0.0109
 alpha: 0.1, regularization: 10


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:44<00:00,  5.23s/it]


--- 17.024970054626465 seconds ---
MRR@100 = 0.0085
 alpha: 0.1, regularization: 100


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [15:42<00:00, 47.13s/it]


--- 1650.7862496376038 seconds ---
MRR@100 = 0.0000
 alpha: 1, regularization: 0.1


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:47<00:00,  2.38s/it]


--- 16.715065240859985 seconds ---
MRR@100 = 0.0176
 alpha: 1, regularization: 1


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:47<00:00,  2.38s/it]


--- 16.871630907058716 seconds ---
MRR@100 = 0.0176
 alpha: 1, regularization: 10


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:47<00:00,  2.39s/it]


--- 16.716830492019653 seconds ---
MRR@100 = 0.0175
 alpha: 1, regularization: 100


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:47<00:00,  2.38s/it]


--- 16.5326726436615 seconds ---
MRR@100 = 0.0081
 alpha: 10, regularization: 0.1


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:38<00:00,  4.93s/it]


--- 16.622385025024414 seconds ---
MRR@100 = 0.0205
 alpha: 10, regularization: 1


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:38<00:00,  4.95s/it]


--- 16.626078844070435 seconds ---
MRR@100 = 0.0205
 alpha: 10, regularization: 10


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:38<00:00,  4.94s/it]


--- 16.535271406173706 seconds ---
MRR@100 = 0.0206
 alpha: 10, regularization: 100


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:38<00:00,  4.94s/it]


--- 16.731224060058594 seconds ---
MRR@100 = 0.0204
 alpha: 100, regularization: 0.1


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:40<00:00,  5.01s/it]


--- 16.46440100669861 seconds ---
MRR@100 = 0.0209
 alpha: 100, regularization: 1


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:39<00:00,  4.97s/it]


--- 16.3795645236969 seconds ---
MRR@100 = 0.0209
 alpha: 100, regularization: 10


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:38<00:00,  4.95s/it]


--- 16.676282167434692 seconds ---
MRR@100 = 0.0210
 alpha: 100, regularization: 100


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:39<00:00,  4.98s/it]


--- 16.642898559570312 seconds ---
MRR@100 = 0.0209


In [9]:
for iterations in [15, 20, 30, 50, 100]:
    print(f' alpha: {alpha}, regularization: {regularization}')
    optimize(factors=200,
         regularization=10,
         alpha=10,
         use_native=True,
         use_cg=False,
         iterations=iterations,
         calculate_training_loss=False)

 alpha: 100, regularization: 100


100%|█████████████████████████████████████████████████████████████████████████████████| 15/15 [41:30<00:00, 166.03s/it]


--- 39.98883414268494 seconds ---
MRR@100 = 0.0234
 alpha: 100, regularization: 100


100%|███████████████████████████████████████████████████████████████████████████████| 20/20 [1:04:16<00:00, 192.83s/it]


--- 40.81911063194275 seconds ---
MRR@100 = 0.0232
 alpha: 100, regularization: 100


100%|███████████████████████████████████████████████████████████████████████████████| 30/30 [1:34:49<00:00, 189.66s/it]


--- 40.72044062614441 seconds ---
MRR@100 = 0.0233
 alpha: 100, regularization: 100


100%|███████████████████████████████████████████████████████████████████████████████| 50/50 [2:26:27<00:00, 175.75s/it]


--- 26.991891384124756 seconds ---
MRR@100 = 0.0231
 alpha: 100, regularization: 100


100%|█████████████████████████████████████████████████████████████████████████████| 100/100 [3:34:14<00:00, 128.54s/it]


--- 26.690924167633057 seconds ---
MRR@100 = 0.0231


In [None]:
alpha_arr = np.linspace(5, 1000, num=10)
regularization_arr = np.linspace(0.001, 100, num=10)
for alpha in alpha_arr:
    for regularization in regularization_arr:
        print(f' alpha: {alpha}, regularization: {regularization}')
        optimize(factors=200,
             regularization=regularization,
             alpha=alpha,
             use_native=True,
             use_cg=False,
             iterations=15,
             calculate_training_loss=False)

 alpha: 5.0, regularization: 0.001


  0%|                                                                                           | 0/15 [00:00<?, ?it/s]