In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.preprocessing import minmax_scale, scale
from tqdm import tqdm
import time
import implicit
from collections import Counter
from surprise import Dataset, Reader
from surprise import SVD
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df_path = r"C:\Users\dlbol\Downloads\likes\likes\likes_data\track_artists.csv"
train_path = r"C:\Users\dlbol\Downloads\likes\likes\likes_data\train"
test_path = r"C:\Users\dlbol\Downloads\likes\likes\likes_data\test"
pred_path = r"C:\Users\dlbol\Downloads\likes\likes\likes_data\ensemble_bpr_artist_svd_625_075_ss"
bpr_path = r"C:\Users\dlbol\Downloads\likes\likes\likes_data\final_bpr_model_backup"
svd_path = r"C:\Users\dlbol\Downloads\likes\likes\likes_data\final_svd_model_backup"

In [3]:
%%time
df = pd.read_csv(df_path)
df.index = df['trackId']

CPU times: total: 62.5 ms
Wall time: 51.1 ms


In [4]:
%%time

artist_coord = []
idx = 0

with open(train_path) as f:
    lines = f.readlines()
    for line in tqdm(lines):
        tracks = line.strip().split(' ')
        tracks_arr = np.array(tracks).astype(np.int32)
        artist_arr = np.array([df.at[track, 'artistId'] for track in tracks_arr])
        artist_cnt = Counter(artist_arr)
        artist_coord_user = [(idx, i[0], i[1]) for i in artist_cnt.items()]
        artist_coord.extend(artist_coord_user)
        idx += 1
        
first_id_test = idx

with open(test_path) as f:
    lines = f.readlines()
    for line in tqdm(lines):
        tracks = line.strip().split(' ')
        tracks_arr = np.array(tracks).astype(np.int32)
        artist_arr = np.array([df.at[track, 'artistId'] for track in tracks_arr])
        artist_cnt = Counter(artist_arr)
        artist_coord_user = [(idx, i[0], i[1]) for i in artist_cnt.items()]
        artist_coord.extend(artist_coord_user)
        idx += 1
        
last_id_test = idx - 1

100%|██████████████████████████████████████████████████████████████████████| 1160084/1160084 [04:53<00:00, 3946.93it/s]
100%|████████████████████████████████████████████████████████████████████████| 289914/289914 [01:14<00:00, 3912.95it/s]

CPU times: total: 6min 12s
Wall time: 6min 9s





In [5]:
%%time

data = pd.DataFrame(artist_coord, columns=['uid', 'iid', 'rating_row'])
data['rating_max'] = data.rating_row / data.groupby('uid')['rating_row'].transform('max')

CPU times: total: 6min 22s
Wall time: 6min 22s


In [6]:
%%time

reader = Reader(rating_scale=(0, 1))
dataset = Dataset.load_from_df(data[['uid', 'iid', 'rating_max']], reader)
trainset = dataset.build_full_trainset()

CPU times: total: 2min
Wall time: 2min 3s


In [7]:
%%time

del data
del dataset
del artist_coord

CPU times: total: 14.8 s
Wall time: 19.7 s


In [8]:
%%time

algo = SVD(n_factors=300, 
            n_epochs=770, 
            biased=False, 
            init_mean=0, 
            init_std_dev=0.1, 
            lr_bu=0.01625, lr_bi=0.01625, lr_pu=0.01625, lr_qi=0.00005, 
            reg_bu=0.005, reg_bi=0.005, reg_pu=0.05, reg_qi=0.001,
            verbose=True, random_state=42)

print('algo.fit:')
algo.fit(trainset)

algo.fit:
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Processing epoch 30
Processing epoch 31
Processing epoch 32
Processing epoch 33
Processing epoch 34
Processing epoch 35
Processing epoch 36
Processing epoch 37
Processing epoch 38
Processing epoch 39
Processing epoch 40
Processing epoch 41
Processing epoch 42
Processing epoch 43
Processing epoch 44
Processing epoch 45
Processing epoch 46
Processing epoch 47
Processing epoch 48
Processing epoch 49


Processing epoch 395
Processing epoch 396
Processing epoch 397
Processing epoch 398
Processing epoch 399
Processing epoch 400
Processing epoch 401
Processing epoch 402
Processing epoch 403
Processing epoch 404
Processing epoch 405
Processing epoch 406
Processing epoch 407
Processing epoch 408
Processing epoch 409
Processing epoch 410
Processing epoch 411
Processing epoch 412
Processing epoch 413
Processing epoch 414
Processing epoch 415
Processing epoch 416
Processing epoch 417
Processing epoch 418
Processing epoch 419
Processing epoch 420
Processing epoch 421
Processing epoch 422
Processing epoch 423
Processing epoch 424
Processing epoch 425
Processing epoch 426
Processing epoch 427
Processing epoch 428
Processing epoch 429
Processing epoch 430
Processing epoch 431
Processing epoch 432
Processing epoch 433
Processing epoch 434
Processing epoch 435
Processing epoch 436
Processing epoch 437
Processing epoch 438
Processing epoch 439
Processing epoch 440
Processing epoch 441
Processing ep

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2e2825bc400>

In [9]:
%%time

pickle.dump(algo, open(svd_path, "wb"))

CPU times: total: 1min 10s
Wall time: 1min 14s
