In [1]:
from typing import List, Optional

import numpy as np
import pandas as pd

from lightfm import LightFM
from lightfm.data import Dataset

from utils import ROOT_DIR



In [2]:
# Seed shared by the playlist sampling and the model initialisation.
SEED = 671993

# Number of playlists sampled to build the training interactions.
SAMPLED_PLAYLISTS = 300_000

# LightFM training settings: passes over the training data and worker threads.
N_EPOCHS = 20
N_THREADS = 6

In [3]:
# Raw inputs: the playlist/track interaction table and the playlist metadata.
interactions = pd.read_csv(f"{ROOT_DIR}/data/mdp_interactions.csv")
playlists = pd.read_csv(f"{ROOT_DIR}/data/mdp_playlists.csv")

In [4]:
# Quantiles of playlist length (num_tracks) in 5% steps from 0% to 95%.
# np.arange excludes its stop value, so the 100% quantile (the maximum) is not shown.
np.quantile(
    playlists.num_tracks,
    np.arange(0,1,.05)
)

array([  5.,  11.,  15.,  18.,  22.,  26.,  30.,  34.,  39.,  44.,  49.,
        55.,  62.,  70.,  80.,  92., 105., 123., 148., 184.])

In [5]:
# Draw a reproducible sample of SAMPLED_PLAYLISTS playlists among those with
# more than 25 tracks; only the playlist id is kept.
filtered_playlists = (
    playlists
    .loc[playlists["num_tracks"] > 25]
    .sample(n=SAMPLED_PLAYLISTS, random_state=SEED)
    .loc[:, ["pid"]]
)

# Inner join: keep only the interactions that belong to a sampled playlist.
filtered_interactions = interactions.merge(filtered_playlists, on="pid", how="inner")

In [6]:
# Number of (playlist, track) rows kept for training.
len(filtered_interactions.track_uri)

24813480

In [7]:
# Challenge set, reduced to its (playlist id, track) columns.
challenge_set = pd.read_csv(f"{ROOT_DIR}/data/mdp_challenge_set.csv").loc[:, ["pid", "track_uri"]]

# Stack the sampled training pairs and the challenge pairs for LightFM.
# NOTE: this rebinds `interactions`, which until here held the raw interactions
# table; re-running the sampling cell afterwards would merge against this
# combined frame instead.
interactions = pd.concat(
    [
        filtered_interactions.drop(columns=["pos"]),
        challenge_set,
    ]
)

In [8]:
# Release the playlist frames; they are not referenced again in the cells below.
del playlists
del filtered_playlists

## LightFM

In [9]:
# Register every playlist and track id found in the combined frame
# (sampled training playlists + challenge set) so that the two matrices
# below share a single index space.
data = Dataset()
data.fit(
    users=interactions["pid"].unique(),
    items=interactions["track_uri"].unique(),
)

# Training matrix: (pid, track_uri) pairs of the sampled playlists.
train, train_weights_matrix = data.build_interactions(
    list(map(tuple, filtered_interactions.drop(columns=["pos"]).values))
)

# Challenge matrix: (pid, track_uri) pairs of the challenge playlists.
test, test_weights_matrix = data.build_interactions(
    list(map(tuple, challenge_set.values))
)

In [10]:
# Build the (still untrained) recommender.
# LightFM loss options include:
#   'bpr'      - Bayesian Personalised Ranking (optimises ROC AUC)
#   'warp'     - Weighted Approximate-Rank Pairwise (optimises the top of the recommendation list)
#   'warp-kos' - k-th order statistic variant of WARP
model = LightFM(
    loss='warp',
    max_sampled=20,
    learning_rate=0.05,
    random_state=SEED
)

In [11]:
# Train from scratch on the sampled playlists for N_EPOCHS passes.
model.fit(train, num_threads=N_THREADS, epochs=N_EPOCHS)

<lightfm.lightfm.LightFM at 0x7ff23b46a970>

In [12]:
# Continue training from the current model state on the challenge-set pairs.
# No `epochs` argument is given, so fit_partial's default applies
# (presumably a single pass - confirm against the installed LightFM version).
model.fit_partial(test, num_threads=N_THREADS)

<lightfm.lightfm.LightFM at 0x7ff23b46a970>

## Prediction

In [13]:
# Interaction-matrix dimensions: rows are users (pid), columns are items (track_uri).
matrix_shape = train.shape
n_users = matrix_shape[0]
n_items = matrix_shape[1]
print(f"users: {n_users} - items: {n_items}")

users: 310000 - items: 1388000


In [14]:
from tqdm import tqdm

def batch_predict_recommended_tracks(
        model: "LightFM",
        sampled_playlists: int,
        tracks_array: List[str],
        n_items: int,
        n_users: int = 10000,
        batch_size: int = 1000,
        top_k: int = 750,
        num_threads: Optional[int] = None
):
    """
    Score every track for a contiguous range of users and keep each user's best ones.

    Users are processed in batches so that at most ``batch_size * n_items``
    scores are requested from the model at a time.

    :param model: fitted model exposing ``predict(user_ids, item_ids, num_threads=...)``
    :param sampled_playlists: internal index of the first user to score; users
        ``sampled_playlists`` to ``sampled_playlists + n_users - 1`` are processed
    :param tracks_array: track identifiers; position ``j`` must hold the
        identifier of internal item index ``j``
    :param n_items: number of items scored per user (item indices ``0`` to ``n_items - 1``)
    :param n_users: number of consecutive users to score
    :param batch_size: users scored per ``model.predict`` call; it does not need
        to divide ``n_users`` (the last batch is simply smaller)
    :param top_k: number of best-scored tracks kept per user, capped at ``n_items``
    :param num_threads: threads passed to ``model.predict``; ``None`` falls back
        to the notebook-level ``N_THREADS``
    :return: array of shape ``(n_users, min(top_k, n_items))`` holding track
        identifiers ordered from highest to lowest predicted score; it has zero
        rows when there are no users to score
    :raises ValueError: if ``batch_size`` is not positive or ``top_k`` is negative
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if top_k < 0:
        raise ValueError("top_k must be non-negative")
    if num_threads is None:
        num_threads = N_THREADS

    # Accept plain sequences as well as arrays: fancy indexing needs an ndarray.
    tracks_array = np.asarray(tracks_array)
    kept = min(top_k, n_items)

    stop = sampled_playlists + n_users
    batch_starts = range(sampled_playlists, stop, batch_size)

    # Item indices repeated once per user of a full batch. The block is the same
    # for every batch, so it is built once (as int32, the dtype LightFM works
    # with) and sliced for a shorter final batch.
    largest_batch = min(batch_size, max(n_users, 0))
    items_template = np.tile(np.arange(n_items, dtype=np.int32), largest_batch)

    chunks = []
    rows_done = 0
    for batch_start in tqdm(batch_starts, total=len(batch_starts)):
        batch_stop = min(batch_start + batch_size, stop)
        current = batch_stop - batch_start

        # Flat (user, item) pairs: each user id repeated n_items times, paired
        # with the full run of item indices.
        users_id = np.repeat(
            np.arange(batch_start, batch_stop, dtype=np.int32),
            n_items
        )
        items_id = items_template[:current * n_items]

        # Prediction is parallelised across num_threads.
        scores = model.predict(
            users_id,
            items_id,
            num_threads=num_threads
        )

        # One row of scores per user in the batch.
        scores = np.reshape(scores, (current, n_items))

        # Item indices sorted by descending score, truncated to the best ones.
        ranked = np.argsort(-scores)[:, :kept]

        # Translate internal item indices into track identifiers.
        chunks.append(tracks_array[ranked])

        rows_done += current
        print("Shape", (rows_done, kept))

    if not chunks:
        # No users requested: return an empty result with the expected width.
        return tracks_array[np.empty((0, kept), dtype=np.intp)]

    # Single concatenation instead of re-copying the accumulated result per batch.
    return np.concatenate(chunks)

In [15]:
# Lookup table from item position to track identifier.
# Positions are assumed to match LightFM's internal item indices because the
# Dataset was fitted on this same unique() sequence - TODO confirm.
unique_track_uris = interactions["track_uri"].unique()
tracks_array = np.array(unique_track_uris)

In [16]:
# Rank tracks for 10,000 users in batches of 500, starting at internal user
# index SAMPLED_PLAYLISTS (assumed to be where the challenge playlists begin,
# right after the sampled training playlists - TODO confirm).
all_recommended_tracks = batch_predict_recommended_tracks(
    model,
    SAMPLED_PLAYLISTS,
    tracks_array,
    n_items,
    n_users=10_000,
    batch_size=500,
)

  5%|▌         | 1/20 [02:36<49:32, 156.45s/it]

Shape (500, 750)


 10%|█         | 2/20 [05:13<47:00, 156.69s/it]

Shape (1000, 750)


 15%|█▌        | 3/20 [07:33<42:16, 149.22s/it]

Shape (1500, 750)


 20%|██        | 4/20 [09:58<39:20, 147.53s/it]

Shape (2000, 750)


 25%|██▌       | 5/20 [12:34<37:36, 150.42s/it]

Shape (2500, 750)


 30%|███       | 6/20 [15:05<35:10, 150.78s/it]

Shape (3000, 750)


 35%|███▌      | 7/20 [17:17<31:20, 144.69s/it]

Shape (3500, 750)


 40%|████      | 8/20 [19:31<28:15, 141.33s/it]

Shape (4000, 750)


 45%|████▌     | 9/20 [21:47<25:36, 139.66s/it]

Shape (4500, 750)


 50%|█████     | 10/20 [24:04<23:07, 138.75s/it]

Shape (5000, 750)


 55%|█████▌    | 11/20 [26:16<20:29, 136.56s/it]

Shape (5500, 750)


 60%|██████    | 12/20 [28:28<18:01, 135.24s/it]

Shape (6000, 750)


 65%|██████▌   | 13/20 [30:54<16:10, 138.66s/it]

Shape (6500, 750)


 70%|███████   | 14/20 [33:07<13:41, 136.86s/it]

Shape (7000, 750)


 75%|███████▌  | 15/20 [35:21<11:20, 136.08s/it]

Shape (7500, 750)


 80%|████████  | 16/20 [37:48<09:16, 139.21s/it]

Shape (8000, 750)


 85%|████████▌ | 17/20 [40:28<07:16, 145.57s/it]

Shape (8500, 750)


 90%|█████████ | 18/20 [42:44<04:45, 142.57s/it]

Shape (9000, 750)


 95%|█████████▌| 19/20 [45:25<02:28, 148.09s/it]

Shape (9500, 750)


100%|██████████| 20/20 [47:53<00:00, 143.68s/it]

Shape (10000, 750)





## Submission preparation

In [17]:
import pickle

# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.

# Mapping keyed by challenge playlist id (indexed by pid when the submission is written).
with open(f"{ROOT_DIR}/results/challenge_set.pickle", "rb") as challenge_file:
    pid_track_uris = pickle.load(challenge_file)

# Mapping from key to a sortable value - presumably track -> interaction count,
# judging by the file name; verify against the notebook that writes it.
with open(f"{ROOT_DIR}/results/tracks_interaction_count.pickle", "rb") as counts_file:
    top_all = pickle.load(counts_file)

# Keys ordered by descending value; equal values are ordered by descending key.
top_tracks = sorted(
    top_all,
    key=lambda track: (top_all[track], track),
    reverse=True
)

In [18]:
import gzip
from tqdm import tqdm

# Playlists with at most this many seed tracks get the popularity fallback
# (top_tracks) instead of model recommendations.
N_COLD_START = 5
# Number of tracks written per playlist.
N_SUBMITTED_TRACKS = 500
OS_PATH = f"{ROOT_DIR}/results/lightfm.csv.gz"

with gzip.open(OS_PATH, 'wt') as submit_file:
    # Header line of the submission file.
    submit_file.write("team_info,francotestori,franco.testori@hotmail.com\n")

    # Row i of all_recommended_tracks is assumed to belong to the i-th key of
    # pid_track_uris - TODO confirm both follow the challenge-set order.
    for i, pid in tqdm(enumerate(pid_track_uris), total=len(pid_track_uris)):
        seed_tracks = pid_track_uris[pid]

        if len(seed_tracks) > N_COLD_START:
            candidates = all_recommended_tracks[i]
        else:
            candidates = top_tracks

        recommended = []
        for track in candidates:
            # Skip tracks already in the playlist and float placeholders
            # (missing track ids). isinstance also matches float subclasses
            # such as np.float64, which would otherwise fail on concatenation.
            if isinstance(track, float) or track in seed_tracks:
                continue
            recommended.append("spotify:track:" + track)
            if len(recommended) == N_SUBMITTED_TRACKS:
                break

        submit_file.write(f"{pid},{','.join(recommended)}\n")



100%|██████████| 10000/10000 [00:15<00:00, 651.40it/s]
