In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import implicit
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_path = r"C:\Users\dlbol\Downloads\likes\likes\likes_data\train"
test_path = r"C:\Users\dlbol\Downloads\likes\likes\likes_data\test"
pred_path = r"C:\Users\dlbol\Downloads\likes\likes\likes_data\implicit_lfm_hyper_pred"
target_path = r"C:\Users\dlbol\Downloads\likes\likes\likes_data\implicit_lfm_hyper_target"

In [3]:
def calc_mrr(predict, answer):
    for i in range(len(predict)):
        if predict[i] == answer:
            return 1. / (i + 1)
    return 0

max_prediction_len = 100

def calc_score(target_path, predict_path):
    with open(target_path) as f:
        y_true = [int(x.strip()) for x in f.readlines()]

    with open(predict_path) as f:
        y_pred = [[int(x) for x in line.strip().split(' ')] for line in f.readlines()]

    mrr_score = 0
    for (pred, answer) in zip(y_pred, y_true):
        if len(pred) > max_prediction_len:
            raise ValueError('$maximum prediction length is {}, got {}$'.format(max_prediction_len, len(y_pred[i])))
        mrr_score += calc_mrr(pred, answer)

    print(f"MRR@100 = {(mrr_score / len(y_true)):.4f}")

In [4]:
%%time

list_coord = []
list_coord_val = []
target_val = []
np.random.seed(42)  
val_id = np.random.choice(range(100000), size=25000, replace=False)
val_id.sort()

with open(train_path) as f:
    lines = f.readlines()
    idx = 0
    idx_val = 0
    for line in tqdm(lines):
        tracks = line.strip().split(' ')
        if len(tracks) == 0:
            print(0)
        if len(tracks) == 1:
            print(1)
        for i, track in enumerate(tracks):
            if idx in val_id and i+1 == len(tracks):
                target_val.append(track)
                idx_val += 1
                continue
            if idx in val_id:
                list_coord_val.append((idx_val, track))
            list_coord.append((idx, track))
        idx += 1
        if idx >= 100000:
            break

  9%|██████                                                                 | 99999/1160084 [15:05<2:40:03, 110.39it/s]

CPU times: total: 5min 31s
Wall time: 15min 7s





In [5]:
%%time

arr_rows = np.array([i[0] for i in list_coord])
arr_cols = np.array([i[1] for i in list_coord]).astype(int)
ones = np.ones(arr_rows.shape[0])
user_item = csr_matrix((ones, (arr_rows, arr_cols)), dtype=int)
#item_user = csr_matrix((ones, (arr_cols, arr_rows)), dtype=int)

arr_rows_val = np.array([i[0] for i in list_coord_val])
arr_cols_val = np.array([i[1] for i in list_coord_val]).astype(int)
ones_val = np.ones(arr_rows_val.shape[0])
user_item_val = csr_matrix((ones_val, (arr_rows_val, arr_cols_val)), dtype=int)

CPU times: total: 7.58 s
Wall time: 16.5 s


In [6]:
import time

def optimize(factors=30,
             learning_rate=1.00,
             regularization=0.6,
             iterations=30,
             neg_prop=30):
    
    model = implicit.lmf.LogisticMatrixFactorization(factors=factors,
                                                     learning_rate=learning_rate,
                                                     regularization=regularization,
                                                     iterations=iterations,
                                                     neg_prop=neg_prop,
                                                     random_state=42)
    
    model.fit(user_item)
    
    start_time = time.time()
    
    userid = val_id
    recommendations = model.recommend(userid, user_item_val, N=100, filter_already_liked_items=True)
    result = [' '.join(map(str, i)) + '\n' for i in recommendations[0]]
    
    print("--- %s seconds ---" % (time.time() - start_time))
    
    with open(pred_path, 'w') as f:
        f.writelines(result)
    
    with open(target_path, 'w') as f:
        for i in target_val:
            f.write(f'{i}\n')
            
    calc_score(target_path, pred_path)

In [9]:
for factors, iterations in [(100, 200), (200, 300), (300, 200), (400, 300), (400, 600)]:
    for neg_prop in [1, 3, 30, 60]:
        for regularization in [1.00, 0.6, 0.1]:
                print(f'factors: {factors}, iterations: {iterations}, neg_prop: {neg_prop}, regularization: {regularization}')
                optimize(factors=factors,
                         learning_rate=1.00,
                         regularization=regularization,
                         iterations=iterations,
                         neg_prop=neg_prop)

factors: 100, iterations: 200, neg_prop: 1, regularization: 1.0


100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [19:10<00:00,  5.75s/it]


--- 102.39990568161011 seconds ---
MRR@100 = 0.0100
factors: 100, iterations: 200, neg_prop: 1, regularization: 0.6


100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [17:59<00:00,  5.40s/it]


--- 100.60209846496582 seconds ---
MRR@100 = 0.0099
factors: 100, iterations: 200, neg_prop: 1, regularization: 0.1


100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [19:00<00:00,  5.70s/it]


--- 98.14575743675232 seconds ---
MRR@100 = 0.0110
factors: 100, iterations: 200, neg_prop: 3, regularization: 1.0


100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [23:06<00:00,  6.93s/it]


--- 77.39326190948486 seconds ---
MRR@100 = 0.0133
factors: 100, iterations: 200, neg_prop: 3, regularization: 0.6


100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [22:47<00:00,  6.84s/it]


--- 100.79625582695007 seconds ---
MRR@100 = 0.0141
factors: 100, iterations: 200, neg_prop: 3, regularization: 0.1


100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [23:11<00:00,  6.96s/it]


--- 117.01272439956665 seconds ---
MRR@100 = 0.0153
factors: 100, iterations: 200, neg_prop: 30, regularization: 1.0


100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [35:28<00:00, 10.64s/it]


--- 109.6192979812622 seconds ---
MRR@100 = 0.0133
factors: 100, iterations: 200, neg_prop: 30, regularization: 0.6


100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [35:36<00:00, 10.68s/it]


--- 106.53899049758911 seconds ---
MRR@100 = 0.0143
factors: 100, iterations: 200, neg_prop: 30, regularization: 0.1


100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [35:48<00:00, 10.74s/it]


--- 106.6573531627655 seconds ---
MRR@100 = 0.0142
factors: 100, iterations: 200, neg_prop: 60, regularization: 1.0


100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [38:31<00:00, 11.56s/it]


--- 103.50028872489929 seconds ---
MRR@100 = 0.0129
factors: 100, iterations: 200, neg_prop: 60, regularization: 0.6


100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [39:05<00:00, 11.73s/it]


--- 72.89613175392151 seconds ---
MRR@100 = 0.0145
factors: 100, iterations: 200, neg_prop: 60, regularization: 0.1


100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [38:44<00:00, 11.62s/it]


--- 100.28653764724731 seconds ---
MRR@100 = 0.0133
factors: 200, iterations: 300, neg_prop: 1, regularization: 1.0


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [50:43<00:00, 10.14s/it]


--- 118.18065643310547 seconds ---
MRR@100 = 0.0100
factors: 200, iterations: 300, neg_prop: 1, regularization: 0.6


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [49:42<00:00,  9.94s/it]


--- 157.13664555549622 seconds ---
MRR@100 = 0.0098
factors: 200, iterations: 300, neg_prop: 1, regularization: 0.1


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [50:26<00:00, 10.09s/it]


--- 165.28684830665588 seconds ---
MRR@100 = 0.0101
factors: 200, iterations: 300, neg_prop: 3, regularization: 1.0


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [40:23<00:00,  8.08s/it]


--- 65.22787833213806 seconds ---
MRR@100 = 0.0104
factors: 200, iterations: 300, neg_prop: 3, regularization: 0.6


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [31:00<00:00,  6.20s/it]


--- 67.4002616405487 seconds ---
MRR@100 = 0.0105
factors: 200, iterations: 300, neg_prop: 3, regularization: 0.1


100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [30:56<00:00,  6.19s/it]


--- 73.21377754211426 seconds ---
MRR@100 = 0.0106
factors: 200, iterations: 300, neg_prop: 30, regularization: 1.0


 60%|████████████████████████████████████████████████▎                               | 181/300 [35:48<23:32, 11.87s/it]


KeyboardInterrupt: 

In [None]:
for factors in [200, 350, 500]:
    for iterations in [300, 500, 750]:
        if factors <= iterations:
            for neg_prop in [3, 30, 60, 100]:
                if neg_prop == 3:
                    regularization = 0.1
                else:
                    regularization = 0.6
                print(f'factors: {factors}, iterations: {iterations}, neg_prop: {neg_prop}, regularization: {regularization}')
                optimize(factors=factors,
                         learning_rate=1.00,
                         regularization=regularization,
                         iterations=iterations,
                         neg_prop=neg_prop)