In [1]:
import os
import time
import warnings
from collections import Counter

import implicit
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.preprocessing import minmax_scale, scale
from surprise import Dataset, Reader
from surprise import SVD
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Folder that holds every input and output file of this notebook.
# Set the LIKES_DATA_DIR environment variable to point the notebook at another
# machine's copy (e.g. /Users/david/Documents/GitHub/YandexCup_RecSys) instead of
# commenting path lines in and out; the default keeps the original location.
DATA_DIR = os.environ.get(
    "LIKES_DATA_DIR",
    r"C:\Users\dlbol\Downloads\likes\likes\likes_data",
)

# Inputs.
df_path = os.path.join(DATA_DIR, "track_artists.csv")   # read with pd.read_csv
train_path = os.path.join(DATA_DIR, "train")            # one line of track ids per user
test_path = os.path.join(DATA_DIR, "test")

# Outputs written while scoring the validation split.
pred_path = os.path.join(DATA_DIR, "ensemble_als_bpr_artist_hyper_pred")
target_path = os.path.join(DATA_DIR, "ensemble_als_bpr_artist_hyper_target")
log_path = os.path.join(DATA_DIR, "ensemble_als_bpr_artist_hyper_log")

In [3]:
def calc_mrr(predict, answer):
    """Return the reciprocal rank of ``answer`` inside ``predict``.

    Args:
        predict: ordered sequence of candidates, best first.
        answer: the value to look for.

    Returns:
        ``1 / position`` (1-based) of the first element equal to ``answer``,
        or ``0`` when no element matches.
    """
    for rank, candidate in enumerate(predict, start=1):
        if candidate == answer:
            return 1. / rank
    return 0

# Longest recommendation list accepted per user by calc_score.
max_prediction_len = 100

def calc_score(target_path, predict_path):
    """Compute and print the mean reciprocal rank of a prediction file.

    Args:
        target_path: text file with one integer answer per line.
        predict_path: text file with one space-separated list of integer
            predictions per line; line k is scored against answer k.

    Returns:
        Sum of the per-line reciprocal ranks divided by the number of answers.

    Raises:
        ValueError: a prediction line holds more than ``max_prediction_len``
            items (or a token cannot be parsed as an integer).
        ZeroDivisionError: the target file contains no answers.
    """
    with open(target_path) as f:
        y_true = [int(x.strip()) for x in f.readlines()]

    with open(predict_path) as f:
        y_pred = [[int(x) for x in line.strip().split(' ')] for line in f.readlines()]

    mrr_score = 0
    for (pred, answer) in zip(y_pred, y_true):
        if len(pred) > max_prediction_len:
            # Report the length of the offending row itself; the previous code
            # indexed y_pred with a name `i` that is not defined in this function.
            raise ValueError('$maximum prediction length is {}, got {}$'.format(max_prediction_len, len(pred)))
        mrr_score += calc_mrr(pred, answer)

    score = mrr_score / len(y_true)
    print(f"MRR@100 = {score:.4f}")

    return score

In [4]:
%%time
df = pd.read_csv(df_path)
df.index = df['trackId']

CPU times: total: 62.5 ms
Wall time: 47.8 ms


In [5]:
# (user index, artistId, number of liked tracks by that artist) triples.
artist_coord = []

# (user index, track id) pairs for every user, and the same pairs re-indexed by
# validation row (0, 1, 2, ...) for the validation users only.
list_coord = []
list_coord_val = []
# Held-out last track of each validation user, in validation-row order.
target_val = []

n_users = 10000      # only the first n_users lines of the train file are used
n_val_users = 2500   # users whose last track is held out for validation

np.random.seed(42)
val_id = np.random.choice(range(n_users), size=n_val_users, replace=False)
val_id.sort()
# `idx in val_id` on the ndarray compares against every element on each call;
# a set gives the same answer with a constant-time lookup.
val_id_set = set(val_id.tolist())

with open(train_path) as f:
    idx_val = 0
    # zip() stops after n_users lines, so the rest of the file is never loaded.
    for idx, line in tqdm(zip(range(n_users), f), total=n_users):
        tracks = line.strip().split(' ')
        is_val = idx in val_id_set

        track_ids = np.array(tracks).astype(np.int32)
        artist_arr = np.array([df.at[track, 'artistId'] for track in track_ids])
        if is_val:
            # The held-out track must not contribute to the artist counts.
            artist_arr = artist_arr[:-1]
        artist_coord.extend(
            (idx, artist, count) for artist, count in Counter(artist_arr).items()
        )

        if is_val:
            history = tracks[:-1]
            list_coord_val.extend((idx_val, track) for track in history)
            target_val.append(tracks[-1])
            idx_val += 1
        else:
            history = tracks
        list_coord.extend((idx, track) for track in history)

  1%|▋                                                                        | 9999/1160084 [00:06<12:12, 1570.08it/s]


In [6]:
%%time

data = pd.DataFrame(artist_coord, columns=['uid', 'iid', 'rating_row'])
data['rating_max'] = data.rating_row / data.groupby('uid')['rating_row'].transform('max')

CPU times: total: 2.47 s
Wall time: 2.48 s


In [7]:
%%time

arr_rows = np.array([i[0] for i in list_coord])
arr_cols = np.array([i[1] for i in list_coord]).astype(int)
ones = np.ones(arr_rows.shape[0])
user_item = csr_matrix((ones, (arr_rows, arr_cols)), dtype=int)
#item_user = csr_matrix((ones, (arr_cols, arr_rows)), dtype=int)

arr_rows_val = np.array([i[0] for i in list_coord_val])
arr_cols_val = np.array([i[1] for i in list_coord_val]).astype(int)
ones_val = np.ones(arr_rows_val.shape[0])
user_item_val = csr_matrix((ones_val, (arr_rows_val, arr_cols_val)), dtype=int)

CPU times: total: 500 ms
Wall time: 506 ms


In [8]:
%%time

reader = Reader(rating_scale=(0, 1))
dataset = Dataset.load_from_df(data[['uid', 'iid', 'rating_max']], reader)
trainset = dataset.build_full_trainset()

CPU times: total: 531 ms
Wall time: 528 ms


In [9]:
%%time

algo_s = SVD(n_factors=300, 
            n_epochs=50, 
            biased=False, 
            init_mean=0, 
            init_std_dev=0.1, 
            lr_bu=0.01625, lr_bi=0.01625, lr_pu=0.01625, lr_qi=0.00005, 
            reg_bu=0.005, reg_bi=0.005, reg_pu=0.05, reg_qi=0.001,
            verbose=True)

print('algo.fit:')
algo_s.fit(trainset)

algo.fit:
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Processing epoch 30
Processing epoch 31
Processing epoch 32
Processing epoch 33
Processing epoch 34
Processing epoch 35
Processing epoch 36
Processing epoch 37
Processing epoch 38
Processing epoch 39
Processing epoch 40
Processing epoch 41
Processing epoch 42
Processing epoch 43
Processing epoch 44
Processing epoch 45
Processing epoch 46
Processing epoch 47
Processing epoch 48
Processing epoch 49


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1bde8bcc370>

In [10]:
# Hyper-parameters of the track-level BPR model fitted on the user x track matrix.
bpr_params = dict(
    factors=50,
    learning_rate=0.075,
    regularization=0.0025,
    iterations=50,
    verify_negative_samples=True,
    random_state=42,
)
model_s = implicit.bpr.BayesianPersonalizedRanking(**bpr_params)

print('model_bpr.fit:')
model_s.fit(user_item)

model_bpr.fit:


100%|█████████████████████████████████████████████████| 50/50 [00:04<00:00, 10.90it/s, train_auc=98.92%, skipped=0.78%]


In [11]:
def N_opt(model, N=100, q_bpr=1):
    """Ask ``model`` for ``N`` recommendations for the validation users.

    Calls ``model.recommend`` with the notebook-level ``val_id`` and
    ``user_item_val``, filtering already-liked items, and prints how long the
    call took.

    Args:
        model: object exposing ``recommend(userid, user_items, N=...,
            filter_already_liked_items=...)``.
        N: number of recommendations requested per user.
        q_bpr: not used by this function.

    Returns:
        Tuple ``(rec, N)`` where ``rec`` is whatever ``model.recommend`` returned.
    """
    t0 = time.time()
    rec = model.recommend(val_id, user_item_val, N=N, filter_already_liked_items=True)
    elapsed = time.time() - t0
    print("\n--- %s seconds model_bpr ---" % elapsed)

    return rec, N

In [12]:
# Scaling modes understood by optimize(). 'standart' (sic) is the spelling the
# function has always accepted and is kept so existing calls keep working.
_PREPROC_METHODS = ('minmax', 'standart')


def _scale_scores(scores, method, axis):
    """Rescale ``scores`` along ``axis`` with the scaler named by ``method``."""
    if method == 'minmax':
        return minmax_scale(scores, feature_range=(0, 1), axis=axis)
    if method == 'standart':
        return scale(scores, with_mean=True, with_std=True, axis=axis)
    raise ValueError(
        'unknown preprocessing method {!r}; expected None or one of {}'.format(
            method, _PREPROC_METHODS))


def optimize(algo,
             rec,
             N,
             coef_art=1,
             norm_all=True,
             preproc_model=None,
             preproc_algo=None):
    """Blend track scores with artist scores, write predictions and return MRR.

    For every row of ``rec`` the track scores are combined with
    ``algo.predict(uid, artistId).est`` as
    ``final = model_score + coef_art * artist_score``; the best
    ``max_prediction_len`` tracks are written to ``pred_path``, the held-out
    tracks in ``target_val`` to ``target_path``, and both files are scored with
    ``calc_score``. Row k of ``rec`` is paired with ``val_id[k]``.

    Args:
        algo: object exposing ``predict(uid=..., iid=...)`` whose result has ``.est``.
        rec: pair ``(track ids, scores)`` of equally shaped 2-D arrays.
        N: number of candidates per user; kept for backward compatibility, the
            score matrix shape is now taken from ``rec`` itself.
        coef_art: weight of the artist score in the blend.
        norm_all: when scaling track scores, ``True`` scales all values
            together, ``False`` scales each row separately.
        preproc_model: ``None``, ``'minmax'`` or ``'standart'`` - scaling of the
            track scores.
        preproc_algo: ``None``, ``'minmax'`` or ``'standart'`` - per-user scaling
            of the artist scores. Any other value is ignored with a warning.

    Returns:
        The value returned by ``calc_score``.

    Raises:
        ValueError: ``preproc_model`` is not one of the accepted values
            (previously this surfaced later as an ``UnboundLocalError``).
    """
    track_ids, raw_scores = rec[0], rec[1]

    if preproc_model is None:
        model_score = raw_scores
    elif norm_all:
        # Scale every score together, then restore the original layout.
        flat_scores = _scale_scores(raw_scores.flatten(), preproc_model, axis=0)
        model_score = flat_scores.reshape(raw_scores.shape)
    else:
        model_score = _scale_scores(raw_scores, preproc_model, axis=1)

    if preproc_algo is not None and preproc_algo not in _PREPROC_METHODS:
        # Unknown values were always a silent no-op; keep that result but say so.
        warnings.warn(
            'preproc_algo={!r} is not one of {}; artist scores are left unscaled'.format(
                preproc_algo, _PREPROC_METHODS))
        preproc_algo = None

    result = []
    n_rows = min(len(track_ids), len(val_id))
    for user_tracks, user_scores, val_i in tqdm(zip(track_ids, model_score, val_id), total=n_rows):
        df_final = pd.DataFrame({'track_id': user_tracks, 'model_score': user_scores})
        # Inner join against the track table attaches artistId to each candidate.
        df_final = pd.merge(df_final, df, how="inner", left_on='track_id', right_on=df.index, sort=False)
        df_final['artist_score'] = np.array(
            [algo.predict(uid=val_i, iid=iid).est for iid in df_final.artistId.values])
        if preproc_algo is not None:
            df_final['artist_score'] = _scale_scores(df_final.artist_score.values, preproc_algo, axis=0)
        df_final['final_score'] = df_final.model_score + df_final.artist_score * coef_art
        # Cut to the same limit calc_score enforces instead of a separate literal.
        final_rec = df_final.sort_values(by='final_score', ascending=False).track_id.values[:max_prediction_len]
        result.append(' '.join(map(str, final_rec)) + '\n')

    with open(pred_path, 'w') as f:
        f.writelines(result)

    with open(target_path, 'w') as f:
        for target in target_val:
            f.write(f'{target}\n')

    mrr = calc_score(target_path, pred_path)

    return mrr

In [13]:
# Top-100 BPR candidates for the validation users, from the freshly fitted model.
rec, N = N_opt(model=model_s, N=100)


--- 2.627439022064209 seconds model_bpr ---


In [14]:
# Baseline blend: unscaled track and artist scores with equal weight.
optimize(
    algo=algo_s,
    rec=rec,
    N=N,
    coef_art=1,
    norm_all=True,
    preproc_model=None,
    preproc_algo=None,
)

2500it [01:42, 24.40it/s]


MRR@100 = 0.0142


0.014157982407580207

In [16]:
# NOTE(review): this import sits mid-notebook rather than with the other imports
# in the first cell.
import pickle
# Backup files for the fitted BPR and SVD models; the cells below write them
# with pickle.dump and read them back with pickle.load.
# NOTE(review): both paths are absolute and specific to one machine.
bpr_path = r"C:\Users\dlbol\Downloads\likes\likes\likes_data\testing_bpr_model_backup"
svd_path = r"C:\Users\dlbol\Downloads\likes\likes\likes_data\testing_svd_model_backup"

In [17]:
# Persist both fitted models. The context managers close each file as soon as
# it is written, so the data is flushed before anything reads it back.
with open(bpr_path, "wb") as f:
    pickle.dump(model_s, f)
with open(svd_path, "wb") as f:
    pickle.dump(algo_s, f)

In [20]:
# Reload the backups to check that the models survive a pickle round trip.
# SECURITY: pickle.load can execute arbitrary code from the file - only load
# backups that this notebook wrote itself.
with open(bpr_path, "rb") as f:
    model = pickle.load(f)
with open(svd_path, "rb") as f:
    algo = pickle.load(f)

In [21]:
# Same candidate generation as before, now with the BPR model reloaded from disk.
rec, N = N_opt(model=model, N=100)


--- 3.2218894958496094 seconds model_bpr ---


In [22]:
# Re-score with the reloaded models, using the same settings as the earlier
# run so the two MRR values can be compared directly.
optimize(
    algo=algo,
    rec=rec,
    N=N,
    coef_art=1,
    norm_all=True,
    preproc_model=None,
    preproc_algo=None,
)

2500it [01:40, 24.93it/s]


MRR@100 = 0.0142


0.014157982407580207