In [1]:
import os
import pickle
import time
from collections import Counter

import implicit
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.preprocessing import minmax_scale, scale
from surprise import Dataset, Reader
from surprise import SVD
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [29]:
# Directory that holds the likes dataset and the pickled model artefacts.
# Set the LIKES_DATA_DIR environment variable to run the notebook on another
# machine; the fallback is the folder the notebook was originally run from.
DATA_DIR = os.environ.get(
    "LIKES_DATA_DIR",
    r"C:\Users\dlbol\Downloads\likes\likes\likes_data",
)

df_path = os.path.join(DATA_DIR, "track_artists.csv")  # CSV read with pd.read_csv, indexed by trackId
train_path = os.path.join(DATA_DIR, "train")  # one line of space-separated track ids per train user
test_path = os.path.join(DATA_DIR, "test")  # same layout, test users
pred_path = os.path.join(DATA_DIR, "ensemble_bpr_artist_svd_625_075_ss")  # prediction file written at the end
bpr_path = os.path.join(DATA_DIR, "final_bpr_model_backup")  # pickle loaded as `model`
svd_path = os.path.join(DATA_DIR, "final_svd_model_backup")  # pickle loaded as `algo`
rec_bpr_path = os.path.join(DATA_DIR, "final_bpr_rec_625_backup")  # pickle dump of `rec`

In [3]:
# Inclusive row range of the test users in the user-item matrix.
# NOTE(review): the data-loading cell further down recomputes both values from
# the train/test files; these literals match the output that cell displays.
first_id_test = 1_160_084
last_id_test = 1_449_997

In [4]:
def N_opt(model, N=100):
    """Recommend the top-``N`` items for every test user.

    Relies on the notebook globals ``user_item`` (user x item interaction
    matrix) and ``first_id_test`` / ``last_id_test`` (inclusive row range of
    the test users).

    Parameters
    ----------
    model : object
        Fitted recommender exposing
        ``recommend(userid, user_items, N=..., filter_already_liked_items=...)``
        that returns a pair ``(ids, scores)`` with one row per requested user.
    N : int, default 100
        Number of recommendations requested per user.

    Returns
    -------
    rec : tuple
        ``(ids, scores)`` for the test users only, one row per test user.
    N : int
        The ``N`` argument, returned unchanged so it can be forwarded.
    """
    # Only the test users are part of the result, so only they are scored.
    # Each row of the matrix handed to recommend() has to line up with the
    # corresponding entry of `userid`, hence the row selection.
    userid = np.arange(first_id_test, last_id_test + 1)

    start_time = time.time()
    ids, scores = model.recommend(userid, user_item[userid], N=N, filter_already_liked_items=True)
    print("\n--- %s seconds model_bpr ---" % (time.time() - start_time))

    # The result stays a pair of arrays: the previous `rec[:, first_id_test:]`
    # cannot index the pair returned by recommend() and raised a TypeError.
    return (ids, scores), N

In [5]:
def optimize(algo,
             rec,
             N,
             coef_art=1,
             norm_all=True,
             preproc_model=None,
             preproc_algo=None,
             impossible=False,
             top_k=100):
    """Re-rank per-user recommendations by adding a weighted artist score.

    Relies on the notebook globals ``df`` (table indexed by track id with an
    ``artistId`` column) and ``first_id_test`` (user id of the first row of
    ``rec``).

    Parameters
    ----------
    algo : object
        Model whose ``predict(uid=..., iid=...)`` returns an object with an
        ``est`` attribute; queried with (user id, artist id).
    rec : sequence
        ``rec[0]`` holds the recommended track ids and ``rec[1]`` the matching
        model scores, both with one row per user and ``N`` columns.
        Row ``i`` is treated as user ``first_id_test + i``, so ``rec`` is
        expected to contain the test users only.
    N : int
        Number of recommendations per row of ``rec``.
    coef_art : float, default 1
        Weight of the artist score in the final score.
    norm_all : bool, default True
        If True, the model scores are normalised over all users at once;
        otherwise every user's row is normalised separately.
    preproc_model : {None, 'minmax', 'standart'}
        Normalisation applied to the model scores ('standart' is the literal
        option string for standardisation).
    preproc_algo : {None, 'minmax', 'standart'}
        Normalisation applied to each user's artist scores. Any other value
        leaves the artist scores untouched.
    impossible : bool, default False
        If True, the row index itself is used as the user id instead of
        ``first_id_test + i``.
    top_k : int, default 100
        Maximum number of track ids written per user.

    Returns
    -------
    list of str
        One newline-terminated line per row of ``rec`` with the space-separated
        track ids sorted by final score, best first.

    Raises
    ------
    ValueError
        If ``preproc_model`` is not one of the supported options.
    """
    # Fail early: an unknown option used to surface only inside the loop as an
    # UnboundLocalError on `model_score`.
    if preproc_model not in (None, 'minmax', 'standart'):
        raise ValueError(
            "preproc_model must be None, 'minmax' or 'standart', got %r" % (preproc_model,)
        )

    raw_scores = rec[1]
    if preproc_model is None:
        model_score = raw_scores
    elif norm_all:
        # Global statistics: flatten, normalise, restore the (users, N) layout.
        flat_scores = raw_scores.flatten()
        if preproc_model == 'minmax':
            flat_scores = minmax_scale(flat_scores, feature_range=(0, 1), axis=0)
        else:
            flat_scores = scale(flat_scores, with_mean=True, with_std=True, axis=0)
        model_score = flat_scores.reshape((-1, N))
    elif preproc_model == 'minmax':
        model_score = minmax_scale(raw_scores, feature_range=(0, 1), axis=1)
    else:
        model_score = scale(raw_scores, with_mean=True, with_std=True, axis=1)

    result = []
    for i in tqdm(range(rec[0].shape[0])):
        user_id = i if impossible else first_id_test + i

        # Ids may arrive as floats when ids and scores were stacked into a
        # single array; cast so they are written as "123", not "123.0".
        track_ids = np.asarray(rec[0][i]).astype(np.int64)

        df_final = pd.DataFrame({'trak_id': track_ids, 'model_score': model_score[i]})
        # Inner join: recommendations whose track id is missing from `df` are dropped.
        df_final = pd.merge(df_final, df, how="inner", left_on='trak_id', right_on=df.index, sort=False)
        df_final['artist_score'] = np.array([algo.predict(uid=user_id, iid=iid).est for iid in df_final.artistId.values])
        if preproc_algo == 'minmax':
            df_final['artist_score'] = minmax_scale(df_final.artist_score.values, feature_range=(0, 1), axis=0)
        if preproc_algo == 'standart':
            df_final['artist_score'] = scale(df_final.artist_score.values, with_mean=True, with_std=True, axis=0)
        df_final['final_score'] = df_final.model_score + df_final.artist_score * coef_art
        final_rec = df_final.sort_values(by='final_score', ascending=False).trak_id.values[:top_k]
        result.append(' '.join(map(str, final_rec)) + '\n')

    return result

In [6]:
%%time

def _append_likes(path, coords, next_user_idx):
    """Append one ``(user_idx, track_id)`` pair per like found in ``path``.

    Every line of the file lists the space-separated track ids of one user;
    users are numbered consecutively starting at ``next_user_idx``.
    Returns the index the first user after this file would receive.
    """
    with open(path) as f:
        for line in tqdm(f.readlines()):
            # split() without an argument ignores repeated/trailing whitespace,
            # so a blank line adds no pairs but still consumes a user index.
            # Ids are parsed here so a malformed token fails next to its cause.
            coords.extend((next_user_idx, int(track)) for track in line.split())
            next_user_idx += 1
    return next_user_idx


list_coord = []

# Train users take the first rows; the test users follow directly after them.
idx = _append_likes(train_path, list_coord, 0)
first_id_test = idx

idx = _append_likes(test_path, list_coord, idx)
last_id_test = idx - 1

100%|█████████████████████████████████████████████████████████████████████| 1160084/1160084 [00:13<00:00, 87857.88it/s]
100%|███████████████████████████████████████████████████████████████████████| 289914/289914 [00:05<00:00, 53748.30it/s]

CPU times: total: 20 s
Wall time: 20.1 s





In [7]:
%%time

# Split the (user row, track id) pairs into two parallel coordinate arrays.
arr_rows = np.array([user_row for user_row, _ in list_coord])
arr_cols = np.array([track for _, track in list_coord]).astype(int)

# One unit weight per recorded pair.
ones = np.ones(len(arr_rows))

# No explicit shape is passed, so it is derived from the coordinate arrays.
user_item = csr_matrix((ones, (arr_rows, arr_cols)), dtype=int)

CPU times: total: 1min 6s
Wall time: 1min 6s


In [8]:
# The sparse matrix is built; drop the large intermediates it was built from.
del arr_rows, arr_cols, ones, list_coord

In [9]:
# Display the inclusive row range of the test users computed while reading the files.
first_id_test, last_id_test

(1160084, 1449997)

In [10]:
%%time

# NOTE: unpickling can execute arbitrary code; only load files you trust.
# The context manager closes the file handle once the model is loaded.
with open(bpr_path, "rb") as model_file:
    model = pickle.load(model_file)

CPU times: total: 891 ms
Wall time: 898 ms


N: 625, norm_all: False, preproc_model: standart, preproc_algo: standart, coef_art: 0.75

In [12]:
# Display the row index of the first test user (rows before it are train users).
first_id_test

1160084

In [13]:
%%time

# Request the top-N tracks for every user row, train and test alike;
# a later cell keeps only the rows of the test users.
N = 625
n_users = last_id_test + 1
userid = np.arange(n_users)
rec = model.recommend(userid, user_item, filter_already_liked_items=True, N=N)

CPU times: total: 16h 10min 8s
Wall time: 2h 9min 38s


In [15]:
# Recommendations are computed; release the model and the interaction matrix.
del model, user_item

In [23]:
%%time

# Keep only the rows of the test users. Ids and scores stay two separate
# arrays in a tuple: stacking them with np.array() forces one common dtype,
# which turns integer track ids into floats (later written as e.g. "123.0")
# and enlarges the data that gets pickled.
# The length check makes the cell safe to re-run; slicing a second time
# would cut real test users away.
n_test_users = last_id_test - first_id_test + 1
ids, scores = rec[0], rec[1]
if len(ids) != n_test_users:
    ids, scores = ids[first_id_test:], scores[first_id_test:]
rec = (ids, scores)

CPU times: total: 1.08 s
Wall time: 1.19 s


In [28]:
# Sanity check: the id row of the second kept user should hold N entries.
rec[0][1].shape

(625,)

In [30]:
# Cache the recommendations on disk. The context manager guarantees the file
# is flushed and closed, which a bare open() inside the call does not.
with open(rec_bpr_path, "wb") as rec_file:
    pickle.dump(rec, rec_file)

In [31]:
%%time
# Load the track table and index it by track id; drop=False keeps
# `trackId` available as a regular column as well.
df = pd.read_csv(df_path).set_index('trackId', drop=False)

CPU times: total: 78.1 ms
Wall time: 92 ms


In [32]:
%%time

# NOTE: unpickling can execute arbitrary code; only load files you trust.
# The context manager closes the file handle once the model is loaded.
with open(svd_path, "rb") as algo_file:
    algo = pickle.load(algo_file)

CPU times: total: 2min 4s
Wall time: 2min 40s


In [18]:
# Settings of this ensemble run: per-user standardisation of both score
# sources, artist score weighted by 0.75.
ensemble_kwargs = {
    'coef_art': 0.75,
    'norm_all': False,
    'preproc_model': 'standart',
    'preproc_algo': 'standart',
}
result = optimize(algo, rec, N, **ensemble_kwargs)

 11%|███████▎                                                            | 154632/1449998 [1:49:44<15:19:16, 23.49it/s]


KeyboardInterrupt: 

In [None]:
%%time

# Write the predictions; every entry of `result` already ends with a newline.
with open(pred_path, 'w') as out_file:
    for user_line in result:
        out_file.write(user_line)