In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.preprocessing import minmax_scale, scale
from tqdm import tqdm
import time
import implicit

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_path = r"C:\Users\dlbol\Downloads\likes\likes\likes_data\train"
test_path = r"C:\Users\dlbol\Downloads\likes\likes\likes_data\test"
pred_path = r"C:\Users\dlbol\Downloads\likes\likes\likes_data\impicit_mix_als_bpr_hyper_full_valid_v3_pred"
target_path = r"C:\Users\dlbol\Downloads\likes\likes\likes_data\impicit_mix_als_bpr_hyper_full_valid_v3_target"

In [3]:
def calc_mrr(predict, answer):
    for i in range(len(predict)):
        if predict[i] == answer:
            return 1. / (i + 1)
    return 0

max_prediction_len = 100

def calc_score(target_path, predict_path):
    with open(target_path) as f:
        y_true = [int(x.strip()) for x in f.readlines()]

    with open(predict_path) as f:
        y_pred = [[int(x) for x in line.strip().split(' ')] for line in f.readlines()]

    mrr_score = 0
    for (pred, answer) in zip(y_pred, y_true):
        if len(pred) > max_prediction_len:
            raise ValueError('$maximum prediction length is {}, got {}$'.format(max_prediction_len, len(y_pred[i])))
        mrr_score += calc_mrr(pred, answer)

    print(f"MRR@100 = {(mrr_score / len(y_true)):.4f}")
    
    return mrr_score / len(y_true)

In [4]:
%%time

list_coord = []
list_coord_val = []
target_val = []
np.random.seed(42)  
val_id = np.random.choice(range(100000), size=25000, replace=False)
val_id.sort()

with open(train_path) as f:
    lines = f.readlines()
    idx = 0
    idx_val = 0
    for line in tqdm(lines):
        tracks = line.strip().split(' ')
        if len(tracks) == 0:
            print(0)
        if len(tracks) == 1:
            print(1)
        for i, track in enumerate(tracks):
            if idx in val_id and i+1 == len(tracks):
                target_val.append(track)
                idx_val += 1
                continue
            if idx in val_id:
                list_coord_val.append((idx_val, track))
            list_coord.append((idx, track))
        idx += 1
        if idx >= 100000:
            break

  9%|██████▎                                                                  | 99999/1160084 [03:08<33:17, 530.78it/s]

CPU times: total: 3min 9s
Wall time: 3min 9s





In [5]:
%%time

arr_rows = np.array([i[0] for i in list_coord])
arr_cols = np.array([i[1] for i in list_coord]).astype(int)
ones = np.ones(arr_rows.shape[0])
user_item = csr_matrix((ones, (arr_rows, arr_cols)), dtype=int)
#item_user = csr_matrix((ones, (arr_cols, arr_rows)), dtype=int)

arr_rows_val = np.array([i[0] for i in list_coord_val])
arr_cols_val = np.array([i[1] for i in list_coord_val]).astype(int)
ones_val = np.ones(arr_rows_val.shape[0])
user_item_val = csr_matrix((ones_val, (arr_rows_val, arr_cols_val)), dtype=int)

CPU times: total: 5.12 s
Wall time: 5.11 s


In [6]:
def N_opt(model_als, model_bpr, N=100, q_bpr=1):
    
    N_als = N
    N_bpr = int(N * q_bpr)
    
    userid = val_id
        
    start_time = time.time()
    rec_als = model_als.recommend(userid, user_item_val, N=N_als, filter_already_liked_items=True)
    print("\n\n--- %s seconds model_als ---" % (time.time() - start_time))

    start_time = time.time()
    rec_bpr = model_bpr.recommend(userid, user_item_val, N=N_bpr, filter_already_liked_items=True)
    print("\n--- %s seconds model_bpr ---" % (time.time() - start_time))
    
    return rec_als, rec_bpr, N_als, N_bpr

In [7]:
def optimize(rec_als,
             rec_bpr,
             N_als,
             N_bpr,
             coef_bpr=1,
             norm_all=False,
             preproc=None):
    
    if not isinstance(norm_all, bool) or preproc not in ['minmax', 'standart', None]:
        raise ValueError('wrong params')
        
    if preproc is None:
        als_score = rec_als[1]
        bpr_score = rec_bpr[1]
    else:
        if norm_all:
            if preproc == 'minmax':
                als_score = minmax_scale(rec_als[1].flatten(), feature_range=(0, 1), axis=0).reshape((-1, N_als))
                bpr_score = minmax_scale(rec_bpr[1].flatten(), feature_range=(0, 1), axis=0).reshape((-1, N_bpr))
            if preproc == 'standart':
                als_score = scale(rec_als[1].flatten(), with_mean=True, with_std=True, axis=0).reshape((-1, N_als))
                bpr_score = scale(rec_bpr[1].flatten(), with_mean=True, with_std=True, axis=0).reshape((-1, N_bpr))    
        else:
            if preproc == 'minmax':
                als_score = minmax_scale(rec_als[1], feature_range=(0, 1), axis=1)
                bpr_score = minmax_scale(rec_bpr[1], feature_range=(0, 1), axis=1)   
            if preproc == 'standart':
                als_score = scale(rec_als[1], with_mean=True, with_std=True, axis=1)
                bpr_score = scale(rec_bpr[1], with_mean=True, with_std=True, axis=1)     

    result = []           
    for i in tqdm(range(rec_als[0].shape[0])):
        als_df = pd.DataFrame({'trak_id': rec_als[0][i], 'als_score': als_score[i]})
        bpr_df = pd.DataFrame({'trak_id': rec_bpr[0][i], 'bpr_score': bpr_score[i] * coef_bpr})
        df = pd.merge(als_df, bpr_df, how="outer", on='trak_id', sort=False)
        df['als_prior'] = df.als_score.where(~df.als_score.isna(), df.bpr_score)
        df['total_score'] = np.where((df.als_score.isna()) | (df.bpr_score.isna()), df.als_prior, 
                                     (df.als_score + df.bpr_score) / 2)
        final_rec = df.sort_values(by='total_score', ascending=False).trak_id.values[:100]
        result.append(' '.join(map(str, final_rec)) + '\n')
        
        
    with open(pred_path, 'w') as f:
        f.writelines(result)

    with open(target_path, 'w') as f:
        for i in target_val:
            f.write(f'{i}\n')

    mrr = calc_score(target_path, pred_path)

    return mrr

In [8]:
model_als = implicit.als.AlternatingLeastSquares(factors=200, 
                                                 regularization=10,
                                                 alpha=40,
                                                 use_native=True,
                                                 use_cg=False,
                                                 iterations=15,
                                                 random_state=42)
print('model_als.fit_1:')
model_als.fit(user_item)

model_als.fit_1:


100%|█████████████████████████████████████████████████████████████████████████████████| 15/15 [46:24<00:00, 185.63s/it]


In [9]:
print('model_als.fit_2:')
model_als.fit(user_item)

model_als.fit_2:


100%|█████████████████████████████████████████████████████████████████████████████████| 15/15 [46:20<00:00, 185.37s/it]


In [10]:
print('model_als.fit_3:')
model_als.fit(user_item)

model_als.fit_3:


100%|█████████████████████████████████████████████████████████████████████████████████| 15/15 [45:55<00:00, 183.69s/it]


In [11]:
print('model_als.fit_4:')
model_als.fit(user_item)

model_als.fit_4:


100%|█████████████████████████████████████████████████████████████████████████████████| 15/15 [45:53<00:00, 183.54s/it]


In [12]:
model_bpr = implicit.bpr.BayesianPersonalizedRanking(factors=400, 
                                                     learning_rate=0.075,
                                                     regularization=0.0025,
                                                     iterations=950,
                                                     verify_negative_samples=True,
                                                     random_state=42)   
print('model_bpr.fit:')
model_bpr.fit(user_item)

model_bpr.fit:


100%|███████████████████████████████████████████████| 950/950 [39:21<00:00,  2.49s/it, train_auc=99.76%, skipped=0.79%]


In [13]:
rec_als, rec_bpr, N_als, N_bpr = N_opt(model_als, model_bpr, N=1625, q_bpr=0.425)



--- 80.56382703781128 seconds model_als ---

--- 137.1595857143402 seconds model_bpr ---


In [14]:
optimize(rec_als,
           rec_bpr,
           N_als,
           N_bpr,
           coef_bpr=0.925,
           norm_all=True,
           preproc='standart')

100%|███████████████████████████████████████████████████████████████████████████| 25000/25000 [01:14<00:00, 334.58it/s]


MRR@100 = 0.0310


0.031032396843047264

In [15]:
rec_als, rec_bpr, N_als, N_bpr = N_opt(model_als, model_bpr, N=1500, q_bpr=0.50)



--- 83.31612849235535 seconds model_als ---

--- 130.41967940330505 seconds model_bpr ---


In [16]:
optimize(rec_als,
           rec_bpr,
           N_als,
           N_bpr,
           coef_bpr=1.0,
           norm_all=True,
           preproc='standart')

100%|███████████████████████████████████████████████████████████████████████████| 25000/25000 [01:17<00:00, 320.95it/s]


MRR@100 = 0.0313


0.03125459985605099