In [93]:
%%time
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, coo_matrix
from sklearn.neighbors import NearestNeighbors
from lightfm import LightFM

CPU times: user 56 µs, sys: 1.22 ms, total: 1.28 ms
Wall time: 5.14 ms


In [94]:
def calc_mrr(predict, answer):
    for i in range(len(predict)):
        if predict[i] == answer:
            return 1. / (i + 1)
    return 0

max_prediction_len = 100

def calc_score(target_path, predict_path):
    with open(target_path) as f:
        y_true = [int(x.strip()) for x in f.readlines()]

    with open(predict_path) as f:
        y_pred = [[int(x) for x in line.strip().split(' ')] for line in f.readlines()]

    mrr_score = 0
    for (pred, answer) in zip(y_pred, y_true):
        if len(pred) > max_prediction_len:
            raise ValueError('$maximum prediction length is {}, got {}$'.format(max_prediction_len, len(y_pred[i])))
        mrr_score += calc_mrr(pred, answer)

    print(f"MRR@100 = {(mrr_score / len(y_true)):.4f}")

In [95]:
%%time
df = pd.read_csv('/Users/david/Documents/GitHub/YandexCup_RecSys/track_artists.csv')
arr_item = df['trackId'].values
arr_feature = df['artistId'].values
ones = np.ones(len(arr_item))
item_features = coo_matrix((ones, (arr_item, arr_feature)), dtype=np.int8)

CPU times: user 37.9 ms, sys: 57.5 ms, total: 95.5 ms
Wall time: 382 ms


In [96]:
%%time

list_coord = []
target_val = []
val_id = np.random.randint(0, 10000, size=1000)
val_id.sort()

with open('/Users/david/Documents/GitHub/YandexCup_RecSys/train') as f:
    lines = f.readlines()
    idx = 0 
    for line in lines:
        tracks = line.strip().split(' ')
        for i, track in enumerate(tracks):
            if idx in val_id and i+1 == len(tracks):
                target_val.append(track)
                continue
            list_coord.append((idx, track))
        idx += 1
        if idx > 10000:
            break

CPU times: user 1.97 s, sys: 4.26 s, total: 6.23 s
Wall time: 34 s


In [97]:
%%time
from scipy.sparse import coo_matrix
rows = [i[0] for i in list_coord]
cols = [i[1] for i in list_coord]
arr_rows = np.array(rows)
arr_cols = np.array(cols)
arr_cols = arr_cols.astype(np.int32)
ones = np.ones(len(list_coord))
user_item = coo_matrix((ones, (arr_rows, arr_cols)), dtype=np.int32)
#item_user = coo_matrix((ones, (arr_cols, arr_rows)), dtype=np.int32)

CPU times: user 320 ms, sys: 105 ms, total: 425 ms
Wall time: 1.04 s


In [100]:
from tqdm.auto import tqdm

def optimize(n_comp, n_epochs):
    start_time = time.time()
    model = LightFM(no_components=n_comp, loss='warp', random_state=42)
    model.fit(user_item, epochs=n_epochs)
    print("--- %s seconds ---" % (time.time() - start_time))

    result = []
    items = np.unique(arr_cols)
    for user_id in tqdm(val_id):
        scores = model.predict(int(user_id), items)
        arr_id = np.argsort(-scores)[:100]
        top_items = items[arr_id]
        result.append(' '.join(map(str, top_items)) + '\n')
        
    with open('/Users/david/Documents/GitHub/YandexCup_RecSys/first_lightfm', 'w') as f:
        f.writelines(result)
    
    with open('/Users/david/Documents/GitHub/YandexCup_RecSys/first_lightfm_target', 'w') as f:
        for i in target_val:
            f.write(f'{i}\n')
            
    calc_score('/Users/david/Documents/GitHub/YandexCup_RecSys/first_lightfm_target',
               "/Users/david/Documents/GitHub/YandexCup_RecSys/first_lightfm")

In [101]:
for n_comp in [3, 5, 10, 30, 50]:
    for n_epochs in [3, 5, 10, 30, 50]:
        print(f'\n{n_comp, n_epochs}')
        optimize(n_comp, n_epochs)


(3, 3)
--- 2.220673084259033 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0020

(3, 5)
--- 4.35873818397522 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0029

(3, 10)
--- 7.8040900230407715 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0028

(3, 30)
--- 23.352973222732544 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0028

(3, 50)
--- 33.625344038009644 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0027

(5, 3)
--- 5.086142301559448 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0029

(5, 5)
--- 4.764045238494873 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0019

(5, 10)
--- 8.541550159454346 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0026

(5, 30)
--- 23.835681915283203 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0019

(5, 50)
--- 40.67186737060547 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0024

(10, 3)
--- 4.12228798866272 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0021

(10, 5)
--- 5.526208877563477 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0025

(10, 10)
--- 10.198329210281372 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0015

(10, 30)
--- 29.362096071243286 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0011

(10, 50)
--- 46.25600600242615 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0007

(30, 3)
--- 6.721861839294434 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0012

(30, 5)
--- 10.388898134231567 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0019

(30, 10)
--- 22.140347957611084 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0014

(30, 30)
--- 54.83745098114014 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0008

(30, 50)
--- 88.95152401924133 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0010

(50, 3)
--- 9.850772380828857 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0011

(50, 5)
--- 15.050923824310303 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0012

(50, 10)
--- 30.53219175338745 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0010

(50, 30)
--- 79.54738521575928 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0010

(50, 50)
--- 127.10450720787048 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0004


In [58]:
%%time

list_coord = []
target_val = []
np.random.seed(42)
val_id = np.random.randint(0, 100000, size=10000)
val_id.sort()
val_tracks = {}

with open('/Users/david/Documents/GitHub/YandexCup_RecSys/train') as f:
    lines = f.readlines()
    idx = 0 
    for line in lines:
        tracks = line.strip().split(' ')
        for i, track in enumerate(tracks):
            if idx in val_id and i == 0:
                tracks_arr = np.array(tracks)
                tracks_arr = tracks_arr.astype(np.int32)
                val_tracks[idx] = tracks_arr
            if idx in val_id and i+1 == len(tracks):
                target_val.append(track)
                continue
            list_coord.append((idx, track))
        idx += 1
        if idx > 100000:
            break
 

CPU times: user 1min 28s, sys: 2.5 s, total: 1min 30s
Wall time: 1min 37s


In [59]:
%%time         
from scipy.sparse import coo_matrix
rows = [i[0] for i in list_coord]
cols = [i[1] for i in list_coord]
arr_rows = np.array(rows)
arr_cols = np.array(cols)
arr_cols = arr_cols.astype(np.int32)
ones = np.ones(len(list_coord))
user_item = coo_matrix((ones, (arr_rows, arr_cols)), dtype=np.int32)

CPU times: user 3.07 s, sys: 1.5 s, total: 4.57 s
Wall time: 6.92 s


In [60]:
import time

def optimize(n_comp, n_epochs, learning_rate=0.05):
    start_time = time.time()
    model = LightFM(no_components=n_comp, loss='warp', learning_rate=learning_rate, random_state=42)
    model.fit(user_item, epochs=n_epochs)
    print("--- %s seconds ---" % (time.time() - start_time))
    result = []
    items = np.unique(arr_cols)
    for user_id in tqdm(val_id):
        scores = model.predict(int(user_id), items)
        arr_id = np.argsort(-scores)[:350]
        top_items = items[arr_id]
        top_items = top_items[~np.isin(top_items, val_tracks[user_id])]
        top_items = top_items[:100]
        result.append(' '.join(map(str, top_items)) + '\n')
        
    with open('/Users/david/Documents/GitHub/YandexCup_RecSys/first_lightfm', 'w') as f:
        f.writelines(result)
    
    with open('/Users/david/Documents/GitHub/YandexCup_RecSys/first_lightfm_target', 'w') as f:
        for i in target_val:
            f.write(f'{i}\n')
            
    calc_score('/Users/david/Documents/GitHub/YandexCup_RecSys/first_lightfm_target',
               "/Users/david/Documents/GitHub/YandexCup_RecSys/first_lightfm")

In [62]:
for n_comp in [3, 30]:
    for n_epochs in [30, 50, 100]:
        print(f'\n{n_comp, n_epochs}')
        optimize(n_comp, n_epochs)


(3, 30)
--- 236.36713075637817 seconds ---


  0%|          | 0/10000 [00:00<?, ?it/s]

MRR@100 = 0.0018

(3, 50)
--- 416.19042587280273 seconds ---


  0%|          | 0/10000 [00:00<?, ?it/s]

MRR@100 = 0.0017

(3, 100)
--- 41720.86213898659 seconds ---


  0%|          | 0/10000 [00:00<?, ?it/s]

MRR@100 = 0.0017

(30, 30)
--- 525.3546690940857 seconds ---


  0%|          | 0/10000 [00:00<?, ?it/s]

MRR@100 = 0.0009

(30, 50)
--- 845.6940741539001 seconds ---


  0%|          | 0/10000 [00:00<?, ?it/s]

MRR@100 = 0.0008

(30, 100)
--- 3097.4497170448303 seconds ---


  0%|          | 0/10000 [00:00<?, ?it/s]

MRR@100 = 0.0007


In [None]:
for n_comp in [2, 3, 4]:
    for n_epochs in [30, 50, 100]:
        print(f'\n{n_comp, n_epochs}')
        optimize(n_comp, n_epochs)

In [None]:
for lr in [0.01, 0.05, 0.1]:
    for n_epochs in [30, 50, 100]:
        print(f'\n{3, n_epochs, lr}')
        optimize(3, n_epochs, lr)

## actors

In [104]:
%%time
df = pd.read_csv('/Users/david/Documents/GitHub/YandexCup_RecSys/track_artists.csv')
arr_item = df['trackId'].values
arr_feature = df['artistId'].values

CPU times: user 35.9 ms, sys: 38.5 ms, total: 74.5 ms
Wall time: 257 ms


In [105]:
%%time

dict_user_item = {}
target_val = []
val_id = np.random.randint(0, 10000, size=1000)
val_id.sort()
val_tracks = {}
all_tracks = []
with open('/Users/david/Documents/GitHub/YandexCup_RecSys/train') as f:
    lines = f.readlines()
    idx = 0 
    for line in lines:
        tracks = line.strip().split(' ')
        for i, track in enumerate(tracks):
            if idx in val_id and i == 0:
                tracks_arr = np.array(tracks)
                tracks_arr = tracks_arr.astype(np.int32)
                val_tracks[idx] = tracks_arr
            if idx in val_id and i+1 == len(tracks):
                target_val.append(track)
                continue
            if i == 0:
                dict_user_item[idx] = []
            dict_user_item[idx].append(int(track))
            all_tracks.append(int(track))
        idx += 1
        if idx > 10000:
            break
 

CPU times: user 3.39 s, sys: 2.39 s, total: 5.78 s
Wall time: 15.2 s


In [106]:
%%time
from scipy.sparse import coo_matrix

rows = []
cols = []
data = []
for idx, tracks in dict_user_item.items():
    temp = {}
    for track in tracks:
        artist = df[df['trackId'] == track]['artistId'].values[0]
        if artist not in temp:
            temp[artist] = 0
        temp[artist] += 1
    
    rows.extend([idx for i in temp])
    cols.extend([i for i in temp.keys()])
    data.extend([i / len(temp) for i in temp.values()])

arr_rows = np.array(rows)
arr_cols = np.array(cols)
arr_data = np.array(data)
user_item = coo_matrix((arr_data, (arr_rows, arr_cols)), dtype=np.int32)

CPU times: user 4min 23s, sys: 2.13 s, total: 4min 25s
Wall time: 4min 31s


In [107]:
user_item.get_shape()

(10001, 56134)

In [108]:
len(all_tracks)

812964

In [109]:
from collections import Counter
cnt = Counter(all_tracks)
dict(cnt)
pop = pd.Series(cnt).to_frame().reset_index()
pop.columns = ['trackId', 'count']
df_temp = df.merge(pop)
df_temp

Unnamed: 0,trackId,artistId,count
0,454079,38292,1
1,346334,24922,3
2,403921,18862,1
3,238166,4171,5
4,219040,27137,1
...,...,...,...
171262,12682,45792,1
171263,220317,943,2
171264,377199,37675,1
171265,55303,28180,5


In [110]:
arr_tracks_of_artisit = (df_temp[df_temp['artistId'] == 45792]
                                     .sort_values(by='count', ascending=False)['trackId']
                                     .head(5).values)
arr_tracks_of_artisit

array([482674, 141671, 427292,    716,  97717])

In [111]:
import time

def optimize(n_comp, n_epochs, learning_rate=0.05, n_top_art=20):
    n_top_tracks = 100 // n_top_art
    start_time = time.time()
    model = LightFM(no_components=n_comp, loss='warp', learning_rate=learning_rate, random_state=42)
    model.fit(user_item, epochs=n_epochs)
    print("--- %s seconds ---" % (time.time() - start_time))
    result = []
    items = np.unique(arr_cols)
    for user_id in tqdm(val_id):
        scores = model.predict(int(user_id), items)
        arr_id = np.argsort(-scores)[:350]
        top_items = items[arr_id]
        top_items = top_items[~np.isin(top_items, val_tracks[user_id])]
        top_items = top_items[:n_top_art]
        top_res = []
        for i in top_items:
            arr_tracks_of_artisit = (df_temp[df_temp['artistId'] == i]
                                     .sort_values(by='count', ascending=False)['trackId']
                                     .head(n_top_tracks).values)
            top_res.extend(list(arr_tracks_of_artisit))
        result.append(' '.join(map(str, top_items)) + '\n')
        
    with open('/Users/david/Documents/GitHub/YandexCup_RecSys/first_lightfm', 'w') as f:
        f.writelines(result)
    
    with open('/Users/david/Documents/GitHub/YandexCup_RecSys/first_lightfm_target', 'w') as f:
        for i in target_val:
            f.write(f'{i}\n')
            
    calc_score('/Users/david/Documents/GitHub/YandexCup_RecSys/first_lightfm_target',
               "/Users/david/Documents/GitHub/YandexCup_RecSys/first_lightfm")

In [112]:
optimize(3, 30, n_top_art=50)

--- 1.0414819717407227 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0000


In [113]:
optimize(3, 30, n_top_art=10)

--- 1.004601001739502 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0000


## Baseline

In [114]:
%%time
track_stats = {}

with open('/Users/david/Documents/GitHub/YandexCup_RecSys/train') as f:
    lines = f.readlines()
    for line in lines:
        tracks = line.strip().split(' ')
        for track in tracks:
            if track not in track_stats:
                track_stats[track] = 0
            track_stats[track] += 1

with open('/Users/david/Documents/GitHub/YandexCup_RecSys/test') as f:
    test = f.readlines()

popular_tracks = sorted(track_stats.items(), key=lambda item: item[1], reverse=True)[:100]
popular_tracks_list = [x[0] for x in popular_tracks]

top_tracks = sorted(track_stats.items(), key=lambda item: item[1], reverse=True)[:1000]
top_tracks_set = set([x[0] for x in top_tracks])

global_track_score = {}
for track in top_tracks:
    global_track_score[track[0]] = track_stats[track[0]] ** 0.5

track_count = {}
with open('/Users/david/Documents/GitHub/YandexCup_RecSys/train') as f:
    lines = f.readlines()
    for (i, line) in enumerate(lines):
        tracks = line.strip().split(' ')
        filtered_tracks = []
        for track in tracks:
            if track in top_tracks_set:
                filtered_tracks.append(track)
        for i in range(len(filtered_tracks)):
            track1 = filtered_tracks[i]
            for j in range(len(filtered_tracks)):
                if i != j:
                    track2 = filtered_tracks[j]
                    if track1 not in track_count:
                        track_count[track1] = {}
                    current_count = track_count[track1]
                    if track2 not in current_count:
                        current_count[track2] = 0
                    current_count[track2] += 1

result = []
empty_track_score = 0
for query in test:
    test_tracks = query.strip().split(' ')
    track_score = {}
    for track in test_tracks:
        if track in track_count:
            for track_id in track_count[track]:
                score = track_count[track][track_id]
                if track_id not in track_score:
                    track_score[track_id] = 0
                track_score[track_id] += score / global_track_score[track] / global_track_score[track_id]
    if len(track_score) == 0:
        result.append(' '.join(popular_tracks_list) + '\n')
        empty_track_score += 1
    else:
        best_tracks = sorted(track_score.items(), key=lambda item: item[1], reverse=True)[:100]
        result.append(' '.join([x[0] for x in best_tracks]) + '\n')

with open('/Users/david/Documents/GitHub/YandexCup_RecSys/result_baseline', 'w') as f:
    f.writelines(result)

CPU times: user 24min 59s, sys: 12.4 s, total: 25min 11s
Wall time: 25min 39s


## LightFM

In [1]:
%%time
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix
from lightfm import LightFM

CPU times: user 2.12 s, sys: 198 ms, total: 2.32 s
Wall time: 648 ms




In [2]:
%%time

list_coord = []
idx = 0
test_tracks = {}

with open('/Users/david/Documents/GitHub/YandexCup_RecSys/train') as f:
    lines = f.readlines()
    for line in lines:
        tracks = line.strip().split(' ')
        for track in tracks:
            list_coord.append((idx, track))
        idx += 1
        
first_id_test = idx

with open('/Users/david/Documents/GitHub/YandexCup_RecSys/test') as f:
    lines = f.readlines()
    for line in lines:
        tracks = line.strip().split(' ')
        tracks_arr = np.array(tracks).astype(np.int32)
        test_tracks[idx] = tracks_arr
        for track in tracks:
            list_coord.append((idx, track))
        idx += 1
        
last_id_test = idx - 1

CPU times: user 19.5 s, sys: 3.07 s, total: 22.5 s
Wall time: 27.1 s


In [3]:
first_id_test, last_id_test

(1160084, 1449997)

In [4]:
%%time
from scipy.sparse import coo_matrix
rows = [i[0] for i in list_coord]
cols = [i[1] for i in list_coord]
arr_rows = np.array(rows)
arr_cols = np.array(cols)
arr_cols = arr_cols.astype(np.int32)
ones = np.ones(len(list_coord))
user_item = coo_matrix((ones, (arr_rows, arr_cols)), dtype=np.int32)

CPU times: user 43.9 s, sys: 24 s, total: 1min 7s
Wall time: 2min 20s


In [5]:
%%time
model = LightFM(no_components=3, loss='warp', random_state=42)
model.fit(user_item, epochs=30)

CPU times: user 2h 3min 25s, sys: 3h 27min 46s, total: 5h 31min 12s
Wall time: 8h 21min 27s


<lightfm.lightfm.LightFM at 0x5bc5ed8e0>

In [6]:
from tqdm.auto import tqdm
result = []
items = np.unique(arr_cols)
for user_id in tqdm(range(first_id_test, last_id_test + 1)):
    scores = model.predict(int(user_id), items)
    arr_id = np.argsort(-scores)[:350]
    top_items = items[arr_id]
    top_items = top_items[~np.isin(top_items, test_tracks[user_id])]
    top_items = top_items[:100]
    result.append(' '.join(map(str, top_items)) + '\n')
        
with open('/Users/david/Documents/GitHub/YandexCup_RecSys/lightfm_result_3_30', 'w') as f:
    f.writelines(result)

  0%|          | 0/289914 [00:00<?, ?it/s]