In [1]:
import os
import pickle
import time
from collections import Counter

import implicit
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.preprocessing import minmax_scale, scale
from surprise import Dataset, Reader
from surprise import SVD
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory holding every input and output file of this notebook.
# Set the LIKES_DATA_DIR environment variable to run on another machine;
# the fallback is the directory the notebook was originally written against.
DATA_DIR = os.environ.get(
    "LIKES_DATA_DIR",
    r"C:\Users\dlbol\Downloads\likes\likes\likes_data",
)

# Inputs.
df_path = os.path.join(DATA_DIR, "track_artists.csv")
train_path = os.path.join(DATA_DIR, "train")
test_path = os.path.join(DATA_DIR, "test")

# Outputs: the prediction file and the pickled models.
pred_path = os.path.join(DATA_DIR, "ensemble_bpr_artist_svd_625_075_ss")
bpr_path = os.path.join(DATA_DIR, "final_bpr_model_backup")
svd_path = os.path.join(DATA_DIR, "final_svd_model_backup")

In [3]:
%%time
# Track table keyed by trackId; drop=False keeps trackId available as a
# regular column as well as the index.
df = pd.read_csv(df_path).set_index('trackId', drop=False)

CPU times: total: 46.9 ms
Wall time: 56 ms


In [4]:
%%time

def _load_likes(path, first_uid, track_artist, artist_coord, list_coord):
    """Read one likes file and append its coordinates to the shared lists.

    Every line of the file is treated as one user and is expected to hold
    whitespace-separated integer track ids. Users are numbered consecutively
    starting at ``first_uid``. A blank line still consumes a user index (it
    simply contributes no coordinates), so user indices stay aligned with
    line numbers.

    Args:
        path: file to read.
        first_uid: user index given to the first line of the file.
        track_artist: mapping from integer track id to artist id.
        artist_coord: list extended in place with ``(uid, artist_id, count)``
            tuples, one per distinct artist of the user.
        list_coord: list extended in place with ``(uid, track)`` tuples; the
            track is kept as the string token read from the file.

    Returns:
        The user index following the last line that was read.

    Raises:
        KeyError: a track id is not present in ``track_artist``.
        ValueError: a token cannot be parsed as an integer.
    """
    uid = first_uid
    with open(path) as f:
        for line in tqdm(f.readlines()):
            # split() without arguments tolerates repeated spaces and blank lines.
            tracks = line.split()
            artist_cnt = Counter(track_artist[int(track)] for track in tracks)
            artist_coord.extend((uid, artist, cnt) for artist, cnt in artist_cnt.items())
            list_coord.extend((uid, track) for track in tracks)
            uid += 1
    return uid


# trackId -> artistId lookup built once, so the loops below do a plain dict
# access per liked track instead of a pandas indexer call.
# NOTE(review): assumes trackId is unique in df; with duplicates the last row wins.
track_artist = dict(zip(df.index.tolist(), df['artistId'].tolist()))

artist_coord = []
list_coord = []

# Train users take indices 0 .. first_id_test - 1.
idx = _load_likes(train_path, 0, track_artist, artist_coord, list_coord)
first_id_test = idx

# Test users continue the numbering: first_id_test .. last_id_test.
idx = _load_likes(test_path, first_id_test, track_artist, artist_coord, list_coord)
last_id_test = idx - 1

100%|██████████████████████████████████████████████████████████████████████| 1160084/1160084 [05:12<00:00, 3713.79it/s]
100%|████████████████████████████████████████████████████████████████████████| 289914/289914 [01:18<00:00, 3708.61it/s]

CPU times: total: 6min 34s
Wall time: 6min 32s





In [5]:
%%time

# One row per (user, artist) pair; rating_row is how many of the user's
# liked tracks belong to that artist.
data = pd.DataFrame(artist_coord, columns=['uid', 'iid', 'rating_row'])

# Divide each count by the largest count of the same user.
per_user_peak = data.groupby('uid')['rating_row'].transform('max')
data['rating_max'] = data['rating_row'].div(per_user_peak)

CPU times: total: 6min 18s
Wall time: 6min 18s


In [6]:
%%time

# Split the (user, track) pairs into row and column index arrays; track
# tokens are strings at this point and are cast to integers here.
arr_rows = np.array([uid for uid, _track in list_coord])
arr_cols = np.array([track for _uid, track in list_coord]).astype(int)

# One unit entry per like, assembled into a users x tracks CSR matrix.
ones = np.ones(len(arr_rows))
user_item = csr_matrix((ones, (arr_rows, arr_cols)), dtype=int)

CPU times: total: 1min 17s
Wall time: 1min 17s


In [7]:
%%time

# Columns handed to surprise, in (user, item, rating) order.
rating_columns = ['uid', 'iid', 'rating_max']

reader = Reader(rating_scale=(0, 1))
dataset = Dataset.load_from_df(data[rating_columns], reader)

# Use every rating for training; no hold-out split is made here.
trainset = dataset.build_full_trainset()

CPU times: total: 7min 50s
Wall time: 10min 58s


In [8]:
def N_opt(model, N=100):
    """Fetch the top-``N`` recommendations of ``model`` for every test user.

    Reads the notebook globals ``first_id_test``, ``last_id_test`` and
    ``user_item``. Only the test users are scored: the matching rows of
    ``user_item`` are passed along so the model can filter out tracks each
    user already liked.

    Args:
        model: fitted recommender exposing implicit's
            ``recommend(userid, user_items, N=..., filter_already_liked_items=...)``
            and returning an ``(ids, scores)`` pair with one row per
            requested user.
        N: number of candidate items requested per user.

    Returns:
        ``(rec, N)`` where ``rec`` is the ``(ids, scores)`` pair for users
        ``first_id_test .. last_id_test`` in that order, and ``N`` is echoed
        back unchanged.
    """
    userid = np.arange(first_id_test, last_id_test + 1)

    start_time = time.time()
    # recommend() returns a tuple, which cannot be sliced with [:, a:];
    # unpack it and request the test rows directly instead of scoring every
    # user and throwing the train part away.
    ids, scores = model.recommend(
        userid,
        user_item[first_id_test:last_id_test + 1],
        N=N,
        filter_already_liked_items=True,
    )
    print("\n--- %s seconds model_bpr ---" % (time.time() - start_time))

    return (ids, scores), N

In [10]:
def optimize(algo,
             rec,
             N,
             coef_art=1,
             norm_all=True,
             preproc_model=None,
             preproc_algo=None,
             impossible=False,
             top_k=100):
    """Blend track scores with artist scores and re-rank each user's candidates.

    Reads the notebook globals ``df`` (track table indexed by track id, with
    an ``artistId`` column) and ``first_id_test``.

    Args:
        algo: fitted model whose ``predict(uid=..., iid=...)`` result exposes
            an ``.est`` attribute (the artist score).
        rec: pair-like; ``rec[0]`` holds candidate track ids and ``rec[1]``
            their scores, both 2-D with one row per user.
        N: number of candidates per row; used to restore the 2-D shape after
            the scores were normalised all together.
        coef_art: weight of the artist score in the final score.
        norm_all: when True the track scores of all users are normalised
            together, otherwise each row is normalised on its own. Has no
            effect when ``preproc_model`` is None.
        preproc_model: None (raw scores), ``'minmax'`` or ``'standart'``;
            normalisation applied to the track scores.
        preproc_algo: None, ``'minmax'`` or ``'standart'``; normalisation
            applied to each user's artist scores.
        impossible: when True the row position ``i`` is used as the user id
            passed to ``algo`` instead of ``first_id_test + i``.
        top_k: number of tracks kept per user (100, the former hard-coded
            value, by default).

    Returns:
        List of strings, one per row of ``rec``: the kept track ids joined by
        single spaces and terminated by a newline. A user whose candidates
        are all missing from ``df`` yields a bare newline.

    Raises:
        ValueError: ``preproc_model`` or ``preproc_algo`` is not one of the
            supported values.
    """
    # Fail fast on a misspelt option instead of crashing (or silently
    # skipping the scaling) deep inside the per-user loop.
    allowed = (None, 'minmax', 'standart')
    if preproc_model not in allowed:
        raise ValueError(
            "preproc_model must be None, 'minmax' or 'standart', got %r" % (preproc_model,))
    if preproc_algo not in allowed:
        raise ValueError(
            "preproc_algo must be None, 'minmax' or 'standart', got %r" % (preproc_algo,))

    track_ids, raw_scores = rec[0], rec[1]

    if preproc_model is None:
        model_score = raw_scores
    elif norm_all:
        # Normalise every score of every user as one long vector, then
        # restore the per-user rows.
        flat_scores = raw_scores.flatten()
        if preproc_model == 'minmax':
            flat_scores = minmax_scale(flat_scores, feature_range=(0, 1), axis=0)
        else:
            flat_scores = scale(flat_scores, with_mean=True, with_std=True, axis=0)
        model_score = flat_scores.reshape((-1, N))
    elif preproc_model == 'minmax':
        model_score = minmax_scale(raw_scores, feature_range=(0, 1), axis=1)
    else:
        model_score = scale(raw_scores, with_mean=True, with_std=True, axis=1)

    result = []
    for i in tqdm(range(track_ids.shape[0])):
        user_id = i if impossible else first_id_test + i

        df_final = pd.DataFrame({'trak_id': track_ids[i], 'model_score': model_score[i]})
        # The inner join attaches artistId and drops candidates that are not in df.
        df_final = pd.merge(df_final, df, how="inner", left_on='trak_id', right_on=df.index, sort=False)
        if df_final.empty:
            # Nothing to rank (and the scalers reject empty input); keep the
            # output aligned with the users by emitting an empty line.
            result.append('\n')
            continue

        df_final['artist_score'] = np.array(
            [algo.predict(uid=user_id, iid=iid).est for iid in df_final.artistId.values])
        if preproc_algo == 'minmax':
            df_final['artist_score'] = minmax_scale(df_final.artist_score.values, feature_range=(0, 1), axis=0)
        elif preproc_algo == 'standart':
            df_final['artist_score'] = scale(df_final.artist_score.values, with_mean=True, with_std=True, axis=0)

        df_final['final_score'] = df_final.model_score + df_final.artist_score * coef_art
        final_rec = df_final.sort_values(by='final_score', ascending=False).trak_id.values[:top_k]
        result.append(' '.join(map(str, final_rec)) + '\n')

    return result

In [None]:
%%time

# Hyper-parameters of the artist-level SVD, gathered in one mapping so they
# can be read (and tweaked) at a glance.
svd_params = dict(
    n_factors=300,
    n_epochs=770,
    biased=False,
    init_mean=0,
    init_std_dev=0.1,
    lr_bu=0.01625,
    lr_bi=0.01625,
    lr_pu=0.01625,
    lr_qi=0.00005,
    reg_bu=0.005,
    reg_bi=0.005,
    reg_pu=0.05,
    reg_qi=0.001,
    verbose=True,
    random_state=42,
)
algo = SVD(**svd_params)

print('algo.fit:')
algo.fit(trainset)

algo.fit:
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5


In [None]:
%%time

# Persist the fitted SVD model. The context manager closes (and therefore
# flushes) the file even if pickling fails part-way.
with open(svd_path, "wb") as svd_file:
    pickle.dump(algo, svd_file)

In [None]:
# Hyper-parameters of the track-level BPR model.
bpr_params = dict(
    factors=400,
    learning_rate=0.075,
    regularization=0.0025,
    iterations=950,
    verify_negative_samples=True,
    random_state=42,
)
model = implicit.bpr.BayesianPersonalizedRanking(**bpr_params)

print('model_bpr.fit:')
model.fit(user_item)

In [None]:
%%time

# Persist the fitted BPR model. The context manager closes (and therefore
# flushes) the file even if pickling fails part-way.
with open(bpr_path, "wb") as bpr_file:
    pickle.dump(model, bpr_file)

Ensemble settings used for the final prediction — N: 625, norm_all: False, preproc_model: standart, preproc_algo: standart, coef_art: 0.75

In [None]:
# Size of the candidate pool requested from the BPR model for each user.
n_candidates = 625
rec, N = N_opt(model, N=n_candidates)

In [None]:
# Re-ranking options: both score sources are standardised (track scores
# per user row), and the artist score enters with weight 0.75.
ensemble_params = dict(
    coef_art=0.75,
    norm_all=False,
    preproc_model='standart',
    preproc_algo='standart',
)
result = optimize(algo, rec, N, **ensemble_params)

In [None]:
%%time

# Entries of `result` are written back to back with no separator added
# between them.
with open(pred_path, 'w') as out_file:
    out_file.write(''.join(result))