In [11]:
%%time
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, coo_matrix
from sklearn.neighbors import NearestNeighbors
from lightfm import LightFM
from tqdm.auto import tqdm

CPU times: user 22.6 ms, sys: 17.7 ms, total: 40.3 ms
Wall time: 201 ms


In [2]:
def calc_mrr(predict, answer):
    for i in range(len(predict)):
        if predict[i] == answer:
            return 1. / (i + 1)
    return 0

max_prediction_len = 100

def calc_score(target_path, predict_path):
    with open(target_path) as f:
        y_true = [int(x.strip()) for x in f.readlines()]

    with open(predict_path) as f:
        y_pred = [[int(x) for x in line.strip().split(' ')] for line in f.readlines()]

    mrr_score = 0
    for (pred, answer) in zip(y_pred, y_true):
        if len(pred) > max_prediction_len:
            raise ValueError('$maximum prediction length is {}, got {}$'.format(max_prediction_len, len(y_pred[i])))
        mrr_score += calc_mrr(pred, answer)

    print(f"MRR@100 = {(mrr_score / len(y_true)):.4f}")

In [3]:
%%time
df = pd.read_csv('/Users/david/Documents/GitHub/YandexCup_RecSys/track_artists.csv')
arr_item = df['trackId'].values
arr_feature = df['artistId'].values

CPU times: user 32.3 ms, sys: 11.3 ms, total: 43.6 ms
Wall time: 475 ms


In [4]:
%%time

dict_user_item = {}
target_val = []
val_id = np.random.randint(0, 10000, size=1000)
val_id.sort()
val_tracks = {}
all_tracks = []
with open('/Users/david/Documents/GitHub/YandexCup_RecSys/train') as f:
    lines = f.readlines()
    idx = 0 
    for line in lines:
        tracks = line.strip().split(' ')
        for i, track in enumerate(tracks):
            if idx in val_id and i == 0:
                tracks_arr = np.array(tracks)
                tracks_arr = tracks_arr.astype(np.int32)
                val_tracks[idx] = tracks_arr[:-1]
            if idx in val_id and i+1 == len(tracks):
                target_val.append(track)
                continue
            if i == 0:
                dict_user_item[idx] = []
            dict_user_item[idx].append(int(track))
            all_tracks.append(int(track))
        idx += 1
        if idx > 10000:
            break
 

CPU times: user 3.26 s, sys: 396 ms, total: 3.65 s
Wall time: 4.29 s


In [5]:
%%time
from scipy.sparse import coo_matrix

rows = []
cols = []
data = []
for idx, tracks in dict_user_item.items():
    temp = {}
    for track in tracks:
        artist = df[df['trackId'] == track]['artistId'].values[0]
        if artist not in temp:
            temp[artist] = 0
        temp[artist] += 1
    
    rows.extend([idx for i in temp])
    cols.extend([i for i in temp.keys()])
    data.extend([i / max(temp.values()) for i in temp.values()])

arr_rows = np.array(rows)
arr_cols = np.array(cols)
arr_data = np.array(data)
user_item = coo_matrix((arr_data, (arr_rows, arr_cols)), dtype=np.int32)

CPU times: user 4min 38s, sys: 1.57 s, total: 4min 39s
Wall time: 4min 44s


In [6]:
user_item.get_shape()

(10001, 56134)

In [7]:
len(all_tracks)

812954

In [8]:
from collections import Counter
cnt = Counter(all_tracks)
dict(cnt)
pop = pd.Series(cnt).to_frame().reset_index()
pop.columns = ['trackId', 'count']
df_temp = df.merge(pop)
df_temp

Unnamed: 0,trackId,artistId,count
0,454079,38292,1
1,346334,24922,3
2,403921,18862,1
3,238166,4171,5
4,219040,27137,1
...,...,...,...
171279,12682,45792,1
171280,220317,943,2
171281,377199,37675,1
171282,55303,28180,5


In [102]:
import time

def optimize(n_comp, n_epochs, learning_rate=0.05, n_top_art=20):
    n_top_tracks = 100 // n_top_art
    start_time = time.time()
    model = LightFM(no_components=n_comp, loss='warp', learning_rate=learning_rate, random_state=42)
    model.fit(user_item, epochs=n_epochs)
    print("--- %s seconds ---" % (time.time() - start_time))
    result = []
    items = np.unique(arr_cols)
    for user_id in tqdm(val_id):
        scores = model.predict(int(user_id), items)
        arr_id = np.argsort(-scores)[:n_top_art]
        top_items = items[arr_id]
        top_res = []
        for i in top_items:
            arr_tracks_of_artisit = (df_temp[df_temp['artistId'] == i]
                                     .sort_values(by='count', ascending=False)['trackId']
                                     .head(n_top_tracks).values)
            arr_tracks_of_artisit = arr_tracks_of_artisit[~np.isin(arr_tracks_of_artisit, val_tracks[user_id])]
            top_res.extend(list(arr_tracks_of_artisit))
        result.append(' '.join(map(str, top_res)) + '\n')
        
    with open('/Users/david/Documents/GitHub/YandexCup_RecSys/first_lightfm', 'w') as f:
        f.writelines(result)
    
    with open('/Users/david/Documents/GitHub/YandexCup_RecSys/first_lightfm_target', 'w') as f:
        for i in target_val:
            f.write(f'{i}\n')
            
    calc_score('/Users/david/Documents/GitHub/YandexCup_RecSys/first_lightfm_target',
               "/Users/david/Documents/GitHub/YandexCup_RecSys/first_lightfm")

In [104]:
for n_comp in [3, 5, 10, 20]:
    for n_epochs in [10, 30, 50, 70]:
        for n_top_art in [50, 100]:
            print(f'\n{n_comp, n_epochs}')
            optimize(n_comp, n_epochs,  n_top_art=n_top_art)


(3, 10)
--- 0.42542290687561035 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0011

(3, 10)
--- 0.502410888671875 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0018

(3, 30)
--- 1.3595669269561768 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0010

(3, 30)
--- 1.2579419612884521 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0016

(3, 50)
--- 2.2775039672851562 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0010

(3, 50)
--- 2.166038990020752 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0017

(3, 70)
--- 3.628000020980835 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0009

(3, 70)
--- 3.184851884841919 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0015

(5, 10)
--- 0.5282201766967773 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0033

(5, 10)
--- 0.5303082466125488 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0033

(5, 30)
--- 1.3916559219360352 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0022

(5, 30)
--- 1.4734649658203125 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0025

(5, 50)
--- 2.2976691722869873 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0023

(5, 50)
--- 2.1755728721618652 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0028

(5, 70)
--- 3.059685230255127 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0019

(5, 70)
--- 2.9676589965820312 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0022

(10, 10)
--- 0.5626862049102783 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0015

(10, 10)
--- 0.6065049171447754 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0018

(10, 30)
--- 1.5055367946624756 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0015

(10, 30)
--- 1.524662971496582 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0019

(10, 50)
--- 2.408845901489258 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0018

(10, 50)
--- 3.045664072036743 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0021

(10, 70)
--- 3.3034420013427734 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0021

(10, 70)
--- 3.509676933288574 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0021

(20, 10)
--- 0.7066490650177002 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0005

(20, 10)
--- 0.8068680763244629 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0008

(20, 30)
--- 1.8375308513641357 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0005

(20, 30)
--- 2.033615827560425 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [85]:
import time

def optimize(n_comp, n_epochs, learning_rate=0.05, n_top_tracks=2, power=1):
    n_top_art = 100
    start_time = time.time()
    model = LightFM(no_components=n_comp, loss='warp', learning_rate=learning_rate, random_state=42)
    model.fit(user_item, epochs=n_epochs)
    print("--- %s seconds ---" % (time.time() - start_time))
    result = []
    items = np.unique(arr_cols)
    for user_id in tqdm(val_id):
        scores = model.predict(int(user_id), items)
        arr_id = np.argsort(-scores)[:n_top_art]
        scores.sort()
        best_scores = scores[-n_top_art:]
        top_items = items[arr_id]
        top_res = []
        for i, j in zip(top_items, best_scores):
            arr_tracks_of_artisit = (df_temp[df_temp['artistId'] == i]
                                     .sort_values(by='count', ascending=False)['trackId']
                                     .head(n_top_tracks).values)
            
            arr_scores_pop = (df_temp[df_temp['artistId'] == i]
                                     .sort_values(by='count', ascending=False)['count']
                                     .head(n_top_tracks).values)
            
            arr_scores_pop = arr_scores_pop / max(arr_scores_pop) * j**power
            ls_zip = list(zip(arr_tracks_of_artisit, arr_scores_pop))
            ls = [i for i in ls_zip if i[0] not in val_tracks[user_id]]
            top_res.extend(ls)
        top_res = sorted(top_res, key=lambda x: x[1])
        top_res = [i[0] for i in top_res[-100:]]
        result.append(' '.join(map(str, top_res)) + '\n')
        
    with open('/Users/david/Documents/GitHub/YandexCup_RecSys/first_lightfm', 'w') as f:
        f.writelines(result)
    
    with open('/Users/david/Documents/GitHub/YandexCup_RecSys/first_lightfm_target', 'w') as f:
        for i in target_val:
            f.write(f'{i}\n')
            
    calc_score('/Users/david/Documents/GitHub/YandexCup_RecSys/first_lightfm_target',
               "/Users/david/Documents/GitHub/YandexCup_RecSys/first_lightfm")

In [94]:
optimize(3, 30, n_top_tracks=2, power=10)

--- 1.21596097946167 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0020


In [100]:
optimize(5, 50, n_top_tracks=2, power=10)

--- 2.162593126296997 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0004


In [123]:
df_temp[df_temp['artistId'] == 24525].sort_values(by='count', ascending=False)

Unnamed: 0,trackId,artistId,count
121558,101675,24525,243
82374,142951,24525,193
86991,14714,24525,176
105632,331149,24525,121
134940,357432,24525,104
...,...,...,...
92907,352469,24525,1
93615,391119,24525,1
96048,17990,24525,1
96275,283822,24525,1


In [124]:
import time

def optimize(n_comp, n_epochs, learning_rate=0.05, n_top_art=100):
    n_top_tracks = 100 // n_top_art
    start_time = time.time()
    model = LightFM(no_components=n_comp, loss='warp', learning_rate=learning_rate, random_state=42)
    model.fit(user_item, epochs=n_epochs)
    print("--- %s seconds ---" % (time.time() - start_time))
    result = []
    items = np.unique(arr_cols)
    for user_id in tqdm(val_id):
        scores = model.predict(int(user_id), items)
        arr_id = np.argsort(-scores)[:n_top_art]
        top_items = items[arr_id]
        top_res = []
        for i in top_items:
            arr_tracks_of_artisit = (df_temp[df_temp['artistId'] == i]
                                     .sort_values(by='count', ascending=False)['trackId']
                                     .head(n_top_tracks).values)
            arr_tracks_of_artisit = arr_tracks_of_artisit[~np.isin(arr_tracks_of_artisit, val_tracks[user_id])]
            top_res.extend(list(arr_tracks_of_artisit))
        result.append(' '.join(map(str, top_res)) + '\n')
        
    with open('/Users/david/Documents/GitHub/YandexCup_RecSys/first_lightfm', 'w') as f:
        f.writelines(result)
    
    with open('/Users/david/Documents/GitHub/YandexCup_RecSys/first_lightfm_target', 'w') as f:
        for i in target_val:
            f.write(f'{i}\n')
            
    calc_score('/Users/david/Documents/GitHub/YandexCup_RecSys/first_lightfm_target',
               "/Users/david/Documents/GitHub/YandexCup_RecSys/first_lightfm")

In [128]:
for n_epochs in [9, 10, 11]:
        print(f'\n{5, n_epochs}')
        optimize(5, n_epochs)


(5, 9)
--- 0.42342305183410645 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0034

(5, 10)
--- 0.5157999992370605 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0033

(5, 11)
--- 0.5549161434173584 seconds ---


  0%|          | 0/1000 [00:00<?, ?it/s]

MRR@100 = 0.0028


## Valid 100k artist-based

In [1]:
%%time
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, coo_matrix
from sklearn.neighbors import NearestNeighbors
from lightfm import LightFM
from tqdm.auto import tqdm

CPU times: user 1.97 s, sys: 913 ms, total: 2.88 s
Wall time: 7.06 s




In [9]:
%%time
df = pd.read_csv('/Users/david/Documents/GitHub/YandexCup_RecSys/track_artists.csv')
arr_item = df['trackId'].values
arr_feature = df['artistId'].values

from collections import Counter
cnt = Counter(all_tracks)
dict(cnt)
pop = pd.Series(cnt).to_frame().reset_index()
pop.columns = ['trackId', 'count']
df_temp = df.merge(pop)

CPU times: user 2.53 s, sys: 488 ms, total: 3.02 s
Wall time: 4.31 s


In [7]:
%%time

dict_user_item = {}
target_val = []
val_id = np.random.randint(300_000, 400_000, size=10_000)
val_id.sort()
val_tracks = {}
all_tracks = []
with open('/Users/david/Documents/GitHub/YandexCup_RecSys/train') as f:
    lines = f.readlines()
    idx = 0 
    for line in tqdm(lines):
        idx += 1
        if idx < 300_000:
            continue
        tracks = line.strip().split(' ')
        for i, track in enumerate(tracks):
            if idx in val_id and i == 0:
                tracks_arr = np.array(tracks)
                tracks_arr = tracks_arr.astype(np.int32)
                val_tracks[idx] = tracks_arr[:-1]
            if idx in val_id and i+1 == len(tracks):
                target_val.append(track)
                continue
            if i == 0:
                dict_user_item[idx] = []
            dict_user_item[idx].append(int(track))
            all_tracks.append(int(track))
        if idx > 400_000:
            break
 

  0%|          | 0/1160084 [00:00<?, ?it/s]

CPU times: user 1min 37s, sys: 2.63 s, total: 1min 40s
Wall time: 1min 52s


In [10]:
%%time
from scipy.sparse import coo_matrix

rows = []
cols = []
data = []
for idx, tracks in tqdm(dict_user_item.items()):
    temp = {}
    for track in tracks:
        artist = df[df['trackId'] == track]['artistId'].values[0]
        if artist not in temp:
            temp[artist] = 0
        temp[artist] += 1
    
    rows.extend([idx for i in temp])
    cols.extend([i for i in temp.keys()])
    data.extend([i / max(temp.values()) for i in temp.values()])

arr_rows = np.array(rows)
arr_cols = np.array(cols)
arr_data = np.array(data)
user_item = coo_matrix((arr_data, (arr_rows, arr_cols)), dtype=np.int32)

  0%|          | 0/100002 [00:00<?, ?it/s]

CPU times: user 47min 37s, sys: 20.6 s, total: 47min 58s
Wall time: 1h 15min 48s


In [13]:
def calc_mrr(predict, answer):
    for i in range(len(predict)):
        if predict[i] == answer:
            return 1. / (i + 1)
    return 0

max_prediction_len = 100

def calc_score(target_path, predict_path):
    with open(target_path) as f:
        y_true = [int(x.strip()) for x in f.readlines()]

    with open(predict_path) as f:
        y_pred = [[int(x) for x in line.strip().split(' ')] for line in f.readlines()]

    mrr_score = 0
    for (pred, answer) in zip(y_pred, y_true):
        if len(pred) > max_prediction_len:
            raise ValueError('$maximum prediction length is {}, got {}$'.format(max_prediction_len, len(y_pred[i])))
        mrr_score += calc_mrr(pred, answer)

    print(f"MRR@100 = {(mrr_score / len(y_true)):.4f}")

In [14]:
import time

def optimize(n_comp=5, n_epochs=9, learning_rate=0.05):
    start_time = time.time()
    model = LightFM(no_components=n_comp, loss='warp', learning_rate=learning_rate, random_state=42)
    model.fit(user_item, epochs=n_epochs)
    print("--- %s seconds ---" % (time.time() - start_time))
    result = []
    items = np.unique(arr_cols)
    for user_id in tqdm(val_id):
        scores = model.predict(int(user_id), items)
        arr_id = np.argsort(-scores)[:350]
        top_items = items[arr_id]
        top_res = []
        for i in top_items:
            track_of_artisit = (df_temp[df_temp['artistId'] == i]
                                     .sort_values(by='count', ascending=False)['trackId']
                                     .head(1).values)[0]
            if not np.isin(track_of_artisit, val_tracks[int(user_id)]):
                top_res.append(track_of_artisit)
            if len(top_res) == 100:
                break
        result.append(' '.join(map(str, top_res)) + '\n')
        
    with open('/Users/david/Documents/GitHub/YandexCup_RecSys/first_lightfm', 'w') as f:
        f.writelines(result)
    
    with open('/Users/david/Documents/GitHub/YandexCup_RecSys/first_lightfm_target', 'w') as f:
        for i in target_val:
            f.write(f'{i}\n')
            
    calc_score('/Users/david/Documents/GitHub/YandexCup_RecSys/first_lightfm_target',
               "/Users/david/Documents/GitHub/YandexCup_RecSys/first_lightfm")

In [15]:
optimize()

--- 4.980743169784546 seconds ---


  0%|          | 0/10000 [00:00<?, ?it/s]

MRR@100 = 0.0019
