# Final BPR

In [1]:
# Hyperparameters for the BPR model that is built and fitted further down.
FACTORS = 400  # passed as `factors`
ITERATIONS = 950  # passed as `iterations`
LR = 0.075  # passed as `learning_rate`
REG = 0.0025  # passed as `regularization`

In [2]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import implicit
import time
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Every input and output of this notebook lives in one local data directory.
DATA_DIR = r"C:\Users\dlbol\Downloads\likes\likes\likes_data"

train_path = DATA_DIR + r"\train"
test_path = DATA_DIR + r"\test"
# Output names appear to encode the BPR settings (factors, iterations, lr, reg).
pred_path = DATA_DIR + r"\implicit_bpr_400_950_0075_00025"
score_path = pred_path + "_score"

In [4]:
%%time

def _append_likes(path, coords, next_row):
    """Append one (row, track) pair per token of every line in ``path``.

    Each line of the file becomes one matrix row; its space-separated tokens
    are kept as strings. Rows are numbered consecutively from ``next_row``
    and the first unused row number is returned.
    """
    with open(path) as handle:
        for record in tqdm(handle.readlines()):
            coords.extend((next_row, token) for token in record.strip().split(' '))
            next_row += 1
    return next_row


list_coord = []

# Train lines take rows [0, first_id_test); test lines are appended after them.
first_id_test = _append_likes(train_path, list_coord, 0)
idx = _append_likes(test_path, list_coord, first_id_test)

last_id_test = idx - 1

100%|████████████████████████████████████████████████████████████████████| 1160084/1160084 [00:11<00:00, 102927.70it/s]
100%|███████████████████████████████████████████████████████████████████████| 289914/289914 [00:04<00:00, 67318.12it/s]

CPU times: total: 17 s
Wall time: 17.1 s





In [5]:
# Show the first and last row index assigned to lines of the test file.
first_id_test, last_id_test

(1160084, 1449997)

In [6]:
%%time

# Split the (row, track) pairs into two parallel coordinate lists.
rows, cols = [], []
for row_id, track_id in list_coord:
    rows.append(row_id)
    cols.append(track_id)

arr_rows = np.array(rows)
# Track ids were read from text, so cast them to int32 column indices.
arr_cols = np.array(cols).astype(np.int32)

# One unit entry per (row, track) pair; the matrix values are stored as int32.
ones = np.ones(len(list_coord))
user_item = csr_matrix((ones, (arr_rows, arr_cols)), dtype=np.int32)

CPU times: total: 1min 8s
Wall time: 1min 8s


In [7]:
# Keep the BPR settings together so they are easy to inspect.
bpr_params = dict(
    factors=FACTORS,
    learning_rate=LR,
    regularization=REG,
    iterations=ITERATIONS,
    verify_negative_samples=True,
    random_state=42,
)
model = implicit.bpr.BayesianPersonalizedRanking(**bpr_params)

# Fit on the matrix that holds both the train and the test rows.
model.fit(user_item)

100%|████████████████████████████████████████████| 950/950 [10:21:14<00:00, 39.24s/it, train_auc=99.03%, skipped=0.78%]


In [8]:
%%time

# Recommend for every row of the matrix (train and test alike); the cells
# below keep only the test rows when writing the output files.
n_users = last_id_test + 1
userid = np.arange(n_users)
recommendations = model.recommend(
    userid,
    user_item,
    N=100,
    filter_already_liked_items=True,
)

CPU times: total: 16h 13min 56s
Wall time: 2h 10min 46s


In [9]:
# recommendations[0] is iterated row by row; only rows whose index falls in
# the test range are written, one space-separated line per row.
result = []
for row_no, row_values in tqdm(enumerate(recommendations[0])):
    if first_id_test <= row_no <= last_id_test:
        result.append(' '.join(map(str, row_values)) + '\n')

with open(pred_path, 'w') as f:
    f.writelines(result)

1449998it [00:12, 115896.99it/s] 


In [10]:
# Same export as for recommendations[0], but for recommendations[1], written
# to the score file; again only rows in the test range are kept.
result = []
for row_no, row_values in tqdm(enumerate(recommendations[1])):
    if first_id_test <= row_no <= last_id_test:
        result.append(' '.join(map(str, row_values)) + '\n')

with open(score_path, 'w') as f:
    f.writelines(result)

1449998it [00:09, 155379.20it/s] 


In [11]:
# Drop the large intermediates; `model`, `user_item` and the test row bounds
# are still used by the cells below and are kept.
del result, recommendations, userid
del list_coord, rows, cols
del arr_rows, arr_cols, ones

In [12]:
# Settings for blending the ALS and BPR recommendations.
N_q = 1500  # passed to N_opt as `N`
Q_BPR = 0.015  # passed to N_opt as `q_bpr`
COEF_BPR = 3  # passed to optimize as `coef_bpr`
PREPROC = 'minmax'  # passed to optimize as `preproc`
NORM_ALL = False  # passed to optimize as `norm_all`

In [13]:
from sklearn.preprocessing import minmax_scale, scale

In [14]:
# Every input and output of this notebook lives in one local data directory.
DATA_DIR = r"C:\Users\dlbol\Downloads\likes\likes\likes_data"

train_path = DATA_DIR + r"\train"
test_path = DATA_DIR + r"\test"
# Output files for the blended ALS + BPR recommendations.
pred_path = DATA_DIR + r"\impicit_mix_als_bpr_1500_0.015_3_minmax_false"
score_path = pred_path + "_score"

In [15]:
# Keep the ALS settings together so they are easy to inspect.
als_params = dict(
    factors=200,
    regularization=10,
    alpha=40,
    use_native=True,
    use_cg=False,
    iterations=15,
    random_state=42,
)
model_als = implicit.als.AlternatingLeastSquares(**als_params)

print('model_als.fit:')
model_als.fit(user_item)

model_als.fit:


100%|██████████████████████████████████████████████████████████████████████████████| 15/15 [4:28:08<00:00, 1072.57s/it]


In [16]:
def N_opt(model_als, model_bpr, N=100, q_bpr=1):
    """Fetch ALS and BPR candidate recommendations for the test rows.

    Reads the notebook globals ``user_item``, ``first_id_test`` and
    ``last_id_test``.

    Parameters
    ----------
    model_als, model_bpr
        Fitted models exposing ``recommend(userid, user_items, N=...,
        filter_already_liked_items=...)``.
    N : int
        Number of candidates requested from the ALS model per row.
    q_bpr : float
        Fraction of ``N`` requested from the BPR model (``int(N * q_bpr)``).

    Returns
    -------
    rec_als, rec_bpr
        Whatever ``recommend`` returns for the test rows; ``optimize`` reads
        element 0 as item ids and element 1 as scores.
    N_als, N_bpr : int
        The candidate counts that were requested from each model.
    """
    N_als = N
    N_bpr = int(N * q_bpr)

    # Only the test rows are needed downstream. `recommend` returns a tuple,
    # so the previous `rec[:, first_id_test:]` slicing raised TypeError;
    # asking for the test rows directly avoids both that and scoring the
    # train rows. `user_items` must contain exactly one row per requested id.
    test_users = np.arange(first_id_test, last_id_test + 1)
    test_user_items = user_item[first_id_test:last_id_test + 1]

    start_time = time.time()
    rec_als = model_als.recommend(test_users, test_user_items, N=N_als, filter_already_liked_items=True)
    print("\n\n--- %s seconds model_als ---" % (time.time() - start_time))

    start_time = time.time()
    rec_bpr = model_bpr.recommend(test_users, test_user_items, N=N_bpr, filter_already_liked_items=True)
    print("\n--- %s seconds model_bpr ---" % (time.time() - start_time))

    return rec_als, rec_bpr, N_als, N_bpr

In [17]:
def _normalize_scores(score_matrix, width, preproc, norm_all):
    """Rescale a 2-D score matrix according to ``preproc``.

    With ``norm_all=True`` all values are scaled together (flattened, scaled,
    reshaped back to rows of ``width``); otherwise every row is scaled on its
    own. ``preproc=None`` returns the matrix untouched.
    """
    if preproc is None:
        return score_matrix

    if preproc == 'minmax':
        def rescale(values, axis):
            return minmax_scale(values, feature_range=(0, 1), axis=axis)
    else:
        def rescale(values, axis):
            return scale(values, with_mean=True, with_std=True, axis=axis)

    if norm_all:
        return rescale(score_matrix.flatten(), 0).reshape((-1, width))
    return rescale(score_matrix, 1)


def optimize(rec_als,
             rec_bpr,
             N_als,
             N_bpr,
             coef_bpr=1,
             norm_all=False,
             preproc=None,
             top_k=100):
    """Blend ALS and BPR candidates per row and write the top items to disk.

    For every row the two candidate lists are outer-joined on item id. Items
    proposed by both models get the mean of the two scores; items proposed by
    only one model keep that model's score. The ``top_k`` best item ids go to
    the global ``pred_path`` and their blended scores to the global
    ``score_path``, one line per row.

    Parameters
    ----------
    rec_als, rec_bpr
        Indexable pairs: element 0 holds item ids, element 1 the scores, both
        2-D with one row per user.
    N_als, N_bpr : int
        Row widths of the ALS / BPR score matrices (used when ``norm_all``).
    coef_bpr : float
        Multiplier applied to the (normalised) BPR scores before blending.
    norm_all : bool
        Normalise all scores together instead of row by row.
    preproc : {'minmax', 'standart', 'standard', None}
        Score normalisation; ``'standard'`` is an alias of ``'standart'``.
    top_k : int
        Number of items kept per row (default 100).

    Raises
    ------
    ValueError
        If ``norm_all`` is not a bool or ``preproc`` is not a known option.
    """
    if not isinstance(norm_all, bool) or preproc not in ['minmax', 'standart', 'standard', None]:
        raise ValueError('wrong params')

    als_score = _normalize_scores(rec_als[1], N_als, preproc, norm_all)
    bpr_score = _normalize_scores(rec_bpr[1], N_bpr, preproc, norm_all)

    result = []
    scores = []
    for i in tqdm(range(rec_als[0].shape[0])):
        als_df = pd.DataFrame({'trak_id': rec_als[0][i], 'als_score': als_score[i]})
        bpr_df = pd.DataFrame({'trak_id': rec_bpr[0][i], 'bpr_score': bpr_score[i] * coef_bpr})
        df = pd.merge(als_df, bpr_df, how="outer", on='trak_id', sort=False)

        # Items seen by a single model keep that model's score; items seen by
        # both get the average of the two.
        single_source = df.als_score.isna() | df.bpr_score.isna()
        df['total_score'] = np.where(single_source,
                                     df.als_score.fillna(df.bpr_score),
                                     (df.als_score + df.bpr_score) / 2)

        # Sort once so ids and scores are guaranteed to stay aligned.
        ranked = df.sort_values(by='total_score', ascending=False).head(top_k)
        result.append(' '.join(map(str, ranked.trak_id.values)) + '\n')
        # Scores belong in their own list; appending them to `result` left the
        # score file empty and interleaved scores with ids in the prediction file.
        scores.append(' '.join(map(str, ranked.total_score.values)) + '\n')

    with open(pred_path, 'w') as f:
        f.writelines(result)

    with open(score_path, 'w') as f:
        f.writelines(scores)

In [None]:
# Fetch ALS and BPR candidates; `model` is the BPR model fitted earlier in
# this notebook, so that cell must have run in the current kernel.
rec_als, rec_bpr, N_als, N_bpr = N_opt(model_als, model, N=N_q, q_bpr=Q_BPR)

In [None]:
# Blend the two candidate sets and write the files at pred_path / score_path.
optimize(rec_als,
         rec_bpr,
         N_als,
         N_bpr,
         coef_bpr=COEF_BPR,
         norm_all=NORM_ALL,
         preproc=PREPROC)

In [1]:
# Displays the fitted BPR model. The NameError recorded below comes from
# running this cell in a fresh kernel (execution count 1) before the training
# cell; it needs `model` to be defined in the current session.
model

NameError: name 'model' is not defined