In [1]:
!pip install surprise
!pip install scikit-surprise



In [1]:
%%time
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, coo_matrix
from tqdm.auto import tqdm
from collections import Counter
from surprise import Dataset, Reader

CPU times: user 1.89 s, sys: 114 ms, total: 2.01 s
Wall time: 345 ms


In [2]:
def calc_mrr(predict, answer):
    for i in range(len(predict)):
        if predict[i] == answer:
            return 1. / (i + 1)
    return 0

max_prediction_len = 100

def calc_score(target_path, predict_path):
    with open(target_path) as f:
        y_true = [int(x.strip()) for x in f.readlines()]

    with open(predict_path) as f:
        y_pred = [[int(x) for x in line.strip().split(' ')] for line in f.readlines()]

    mrr_score = 0
    for (pred, answer) in zip(y_pred, y_true):
        if len(pred) > max_prediction_len:
            raise ValueError('$maximum prediction length is {}, got {}$'.format(max_prediction_len, len(y_pred[i])))
        mrr_score += calc_mrr(pred, answer)

    print(f"MRR@100 = {(mrr_score / len(y_true)):.4f}")

In [3]:
%%time
df = pd.read_csv('/Users/david/Documents/GitHub/YandexCup_RecSys/track_artists.csv')
df.index = df['trackId']

CPU times: user 58.1 ms, sys: 11.2 ms, total: 69.3 ms
Wall time: 69.3 ms


In [64]:
%%time

list_coord = []
target_val = []
np.random.seed(42)  
val_id = np.random.choice(range(10000), size=2500, replace=False)
val_id.sort()
all_artist = set()
ls_artists = []

with open('/Users/david/Documents/GitHub/YandexCup_RecSys/train') as f:
    lines = f.readlines()
    idx = 0 
    for line in tqdm(lines):
        tracks = line.strip().split(' ')
        tracks_arr = np.array(tracks)
        tracks_arr = tracks_arr.astype(np.int32)
        artist_arr = np.array([df.at[track, 'artistId'] for track in tracks_arr])
        if idx in val_id:
            target_val.append(artist_arr[-1])
            artist_arr = artist_arr[:-1]
        ls_artists.extend(artist_arr)
        all_artist = all_artist.union(set(artist_arr))
        artist_cnt = Counter(artist_arr)
        list_coord_user = [(idx, i[0], i[1]) for i in artist_cnt.items()]
        list_coord.extend(list_coord_user)
        idx += 1
        if idx > 10000:
            break

  0%|          | 0/1160084 [00:00<?, ?it/s]

CPU times: user 6.1 s, sys: 1.36 s, total: 7.46 s
Wall time: 13.1 s


In [5]:
%%time

data = pd.DataFrame(list_coord, columns=['uid', 'iid', 'rating_row'])
data['rating_max'] = data.rating_row / data.groupby('uid')['rating_row'].transform('max')
data['rating_part'] = data.rating_row / data.groupby('uid')['rating_row'].transform('sum')

CPU times: user 197 ms, sys: 44.2 ms, total: 242 ms
Wall time: 268 ms


# Baseline

In [102]:
all_artist_cnt = [i[0] for i in Counter(ls_artists).most_common(100)]

result = [' '.join(map(str, all_artist_cnt)) + '\n' for _ in val_id]

with open('/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_pred', 'w') as f:
    f.writelines(result)

with open('/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_target', 'w') as f:
    for i in target_val:
        f.write(f'{i}\n')

calc_score('/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_target',
           "/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_pred")

MRR@100 = 0.0145


# RATING_MAX

In [103]:
%%time

reader = Reader(rating_scale=(0, 1))
dataset = Dataset.load_from_df(data[['uid', 'iid', 'rating_part']], reader)
trainset = dataset.build_full_trainset()

CPU times: user 312 ms, sys: 123 ms, total: 435 ms
Wall time: 836 ms


## KNNWithMeans

In [None]:
%%time
from surprise import KNNWithMeans

sim_options = {
    "name": "pearson_baseline",
    "user_based": True
}
algo = KNNWithMeans(k=40, min_k=1, sim_options=sim_options, verbose=True)
algo.fit(trainset)

In [None]:
algo.predict(uid=10, iid=25735).est

In [None]:
pred = []
for uid in tqdm(val_id):
    pred_user = []
    for iid in all_artist:
        score = algo.predict(uid=uid, iid=iid).est
        pred_user.append((iid, score))
    pred.append(pred_user)

In [None]:
res = []
for user in tqdm(pred):
    ls = sorted(user, key=lambda x: x[1])
    temp = [i[0] for i in ls[-100:]]
    temp.reverse()
    res.append(temp)

result = [' '.join(map(str, i)) + '\n' for i in res]

    
with open('/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_pred', 'w') as f:
    f.writelines(result)

with open('/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_target', 'w') as f:
    for i in target_val:
        f.write(f'{i}\n')

calc_score('/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_target',
           "/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_pred")

MRR@100 = 0.0084


## KNNBasic

In [107]:
%%time
from surprise import KNNBasic

sim_options = {
    "name": "pearson_baseline",
    "user_based": True
}
algo = KNNBasic(k=40, min_k=1, sim_options=sim_options, verbose=True)
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
CPU times: user 5.44 s, sys: 13.3 s, total: 18.7 s
Wall time: 28.6 s


<surprise.prediction_algorithms.knns.KNNBasic at 0x14b8e9540>

In [108]:
pred = []
for uid in tqdm(val_id):
    pred_user = []
    for iid in all_artist:
        score = algo.predict(uid=uid, iid=iid).est
        pred_user.append((iid, score))
    pred.append(pred_user)
    
res = []
for user in tqdm(pred):
    ls = sorted(user, key=lambda x: x[1])
    temp = [i[0] for i in ls[-100:]]
    temp.reverse()
    res.append(temp)

result = [' '.join(map(str, i)) + '\n' for i in res]

    
with open('/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_pred', 'w') as f:
    f.writelines(result)

with open('/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_target', 'w') as f:
    for i in target_val:
        f.write(f'{i}\n')

calc_score('/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_target',
           "/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_pred")

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

MRR@100 = 0.0658


MRR@100 = 0.0038

## KNNWithZScore

In [109]:
%%time
from surprise import KNNWithZScore

sim_options = {
    "name": "pearson_baseline",
    "user_based": True
}
algo = KNNWithZScore(k=40, min_k=1, sim_options=sim_options, verbose=True)
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
CPU times: user 5.58 s, sys: 12.9 s, total: 18.5 s
Wall time: 27.8 s


<surprise.prediction_algorithms.knns.KNNWithZScore at 0x69a3d1ba0>

In [110]:
pred = []
for uid in tqdm(val_id):
    pred_user = []
    for iid in all_artist:
        score = algo.predict(uid=uid, iid=iid).est
        pred_user.append((iid, score))
    pred.append(pred_user)
    
res = []
for user in tqdm(pred):
    ls = sorted(user, key=lambda x: x[1])
    temp = [i[0] for i in ls[-100:]]
    temp.reverse()
    res.append(temp)

result = [' '.join(map(str, i)) + '\n' for i in res]

    
with open('/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_pred', 'w') as f:
    f.writelines(result)

with open('/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_target', 'w') as f:
    for i in target_val:
        f.write(f'{i}\n')

calc_score('/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_target',
           "/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_pred")

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

MRR@100 = 0.0097


MRR@100 = 0.0010


## KNNBaseline

In [111]:
%%time
from surprise import KNNBaseline

sim_options = {
    "name": "pearson_baseline",
    "user_based": True
}
algo = KNNBaseline(k=40, min_k=1, sim_options=sim_options, verbose=True)
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
CPU times: user 5.59 s, sys: 16.3 s, total: 21.8 s
Wall time: 46.9 s


<surprise.prediction_algorithms.knns.KNNBaseline at 0x14b89c7f0>

In [112]:
pred = []
for uid in tqdm(val_id):
    pred_user = []
    for iid in all_artist:
        score = algo.predict(uid=uid, iid=iid).est
        pred_user.append((iid, score))
    pred.append(pred_user)
    
res = []
for user in tqdm(pred):
    ls = sorted(user, key=lambda x: x[1])
    temp = [i[0] for i in ls[-100:]]
    temp.reverse()
    res.append(temp)

result = [' '.join(map(str, i)) + '\n' for i in res]

    
with open('/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_pred', 'w') as f:
    f.writelines(result)

with open('/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_target', 'w') as f:
    for i in target_val:
        f.write(f'{i}\n')

calc_score('/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_target',
           "/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_pred")

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

MRR@100 = 0.0583


MRR@100 = 0.0034


## SVD

In [113]:
%%time
from surprise import SVD

algo = SVD(n_factors=100, 
            n_epochs=20, 
            biased=True, 
            init_mean=0, 
            init_std_dev=0.1, 
            lr_all=0.005, 
            reg_all=0.02, 
            random_state=42, 
            verbose=False)

algo.fit(trainset)

CPU times: user 2.4 s, sys: 82.8 ms, total: 2.49 s
Wall time: 2.78 s


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x69a3d0970>

In [114]:
pred = []
for uid in tqdm(val_id):
    pred_user = []
    for iid in all_artist:
        score = algo.predict(uid=uid, iid=iid).est
        pred_user.append((iid, score))
    pred.append(pred_user)
    
res = []
for user in tqdm(pred):
    ls = sorted(user, key=lambda x: x[1])
    temp = [i[0] for i in ls[-100:]]
    temp.reverse()
    res.append(temp)

result = [' '.join(map(str, i)) + '\n' for i in res]

    
with open('/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_pred', 'w') as f:
    f.writelines(result)

with open('/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_target', 'w') as f:
    for i in target_val:
        f.write(f'{i}\n')

calc_score('/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_target',
           "/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_pred")

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

KeyboardInterrupt: 

## SVDpp

In [89]:
%%time
from surprise import SVDpp

algo = SVDpp(n_factors=20, 
             n_epochs=20, 
             init_mean=0, 
             init_std_dev=0.1, 
             lr_all=0.007, 
             reg_all=0.02,
             random_state=42, 
             verbose=False, 
             cache_ratings=False)

algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x14b33d180>

In [90]:
pred = []
for uid in tqdm(val_id):
    pred_user = []
    for iid in all_artist:
        score = algo.predict(uid=uid, iid=iid).est
        pred_user.append((iid, score))
    pred.append(pred_user)
    
res = []
for user in tqdm(pred):
    ls = sorted(user, key=lambda x: x[1])
    temp = [i[0] for i in ls[-100:]]
    temp.reverse()
    res.append(temp)

result = [' '.join(map(str, i)) + '\n' for i in res]

    
with open('/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_pred', 'w') as f:
    f.writelines(result)

with open('/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_target', 'w') as f:
    for i in target_val:
        f.write(f'{i}\n')

calc_score('/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_target',
           "/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_pred")

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

MRR@100 = 0.0031


## NMF

In [96]:
%%time
from surprise import NMF

algo = SVDpp(n_factors=15, 
             n_epochs=50,
             random_state=42, 
             verbose=False)

algo.fit(trainset)

CPU times: user 1min 11s, sys: 156 ms, total: 1min 11s
Wall time: 1min 12s


<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x14b8e96f0>

In [97]:
pred = []
for uid in tqdm(val_id):
    pred_user = []
    for iid in all_artist:
        score = algo.predict(uid=uid, iid=iid).est
        pred_user.append((iid, score))
    pred.append(pred_user)
    
res = []
for user in tqdm(pred):
    ls = sorted(user, key=lambda x: x[1])
    temp = [i[0] for i in ls[-100:]]
    temp.reverse()
    res.append(temp)

result = [' '.join(map(str, i)) + '\n' for i in res]

    
with open('/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_pred', 'w') as f:
    f.writelines(result)

with open('/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_target', 'w') as f:
    for i in target_val:
        f.write(f'{i}\n')

calc_score('/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_target',
           "/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_pred")

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/2500 [00:00<?, ?it/s]

MRR@100 = 0.0009


## SlopeOne

In [98]:
%%time
from surprise import SlopeOne

algo = SlopeOne()
algo.fit(trainset)

CPU times: user 4.7 s, sys: 13.8 s, total: 18.5 s
Wall time: 27.4 s


<surprise.prediction_algorithms.slope_one.SlopeOne at 0x14b89ca30>

In [99]:
pred = []
for uid in tqdm(val_id):
    pred_user = []
    for iid in all_artist:
        score = algo.predict(uid=uid, iid=iid).est
        pred_user.append((iid, score))
    pred.append(pred_user)
    
res = []
for user in tqdm(pred):
    ls = sorted(user, key=lambda x: x[1])
    temp = [i[0] for i in ls[-100:]]
    temp.reverse()
    res.append(temp)

result = [' '.join(map(str, i)) + '\n' for i in res]

    
with open('/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_pred', 'w') as f:
    f.writelines(result)

with open('/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_target', 'w') as f:
    for i in target_val:
        f.write(f'{i}\n')

calc_score('/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_target',
           "/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_pred")

  0%|          | 0/2500 [00:00<?, ?it/s]

KeyboardInterrupt: 

## CoClustering

In [None]:
%%time
from surprise import CoClustering

algo = CoClustering(n_cltr_u=3, n_cltr_i=3, n_epochs=20, random_state=42, verbose=False)
algo.fit(trainset)

In [None]:
pred = []
for uid in tqdm(val_id):
    pred_user = []
    for iid in all_artist:
        score = algo.predict(uid=uid, iid=iid).est
        pred_user.append((iid, score))
    pred.append(pred_user)
    
res = []
for user in tqdm(pred):
    ls = sorted(user, key=lambda x: x[1])
    temp = [i[0] for i in ls[-100:]]
    temp.reverse()
    res.append(temp)

result = [' '.join(map(str, i)) + '\n' for i in res]

    
with open('/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_pred', 'w') as f:
    f.writelines(result)

with open('/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_target', 'w') as f:
    for i in target_val:
        f.write(f'{i}\n')

calc_score('/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_target',
           "/Users/david/Documents/GitHub/YandexCup_RecSys/surprize_artist_pred")