In [1]:
""" An example of using this library to calculate related artists
from the last.fm dataset. More details can be found
at http://www.benfrederickson.com/matrix-factorization/

This code will automatically download an HDF5 version of the dataset from
GitHub when it is first run. The original dataset can also be found at
http://ocelma.net/MusicRecommendationDataset/lastfm-360K.html
"""
import argparse
import codecs
import logging
import time

import numpy as np
import tqdm
!pip install implicit

from implicit.als import AlternatingLeastSquares
from implicit.approximate_als import (
    AnnoyAlternatingLeastSquares,
    FaissAlternatingLeastSquares,
    NMSLibAlternatingLeastSquares,
)
from implicit.bpr import BayesianPersonalizedRanking
from implicit.datasets.lastfm import get_lastfm
from implicit.lmf import LogisticMatrixFactorization
from implicit.nearest_neighbours import (
    BM25Recommender,
    CosineRecommender,
    TFIDFRecommender,
    bm25_weight,
)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting implicit
  Downloading implicit-0.6.2-cp38-cp38-manylinux2014_x86_64.whl (18.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.6/18.6 MB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: implicit
Successfully installed implicit-0.6.2




In [2]:
# Registry of the supported models: maps the short model name (the
# `model_name` argument passed around in this notebook) to the recommender
# class that implements it. get_model() looks names up here and treats every
# key ending in "als" as an ALS variant when choosing default parameters.
MODELS = {
    "als": AlternatingLeastSquares,
    "nmslib_als": NMSLibAlternatingLeastSquares,
    "annoy_als": AnnoyAlternatingLeastSquares,
    "faiss_als": FaissAlternatingLeastSquares,
    "tfidf": TFIDFRecommender,
    "cosine": CosineRecommender,
    "bpr": BayesianPersonalizedRanking,
    "lmf": LogisticMatrixFactorization,
    "bm25": BM25Recommender,
}

In [3]:
def get_model(model_name):
    """Instantiate the model registered under ``model_name`` with its default parameters.

    Args:
        model_name: a key of ``MODELS`` (e.g. ``"als"``, ``"bm25"``).

    Returns:
        A new, untrained instance of the matching model class.

    Raises:
        ValueError: if ``model_name`` is not present in ``MODELS``.
    """
    print(f"getting model {model_name}")
    factory = MODELS.get(model_name)
    if not factory:
        raise ValueError(f"Unknown Model '{model_name}'")

    if model_name.endswith("als"):
        # every ALS flavour (plain and approximate) shares the same defaults
        defaults = {"factors": 128, "dtype": np.float32}
    else:
        # per-model defaults; models not listed here use their constructor defaults
        defaults = {
            "bm25": {"K1": 100, "B": 0.5},
            "bpr": {"factors": 63},
            "lmf": {"factors": 30, "iterations": 40, "regularization": 1.5},
        }.get(model_name, {})

    return factory(**defaults)


In [6]:
def calculate_similar_artists(output_filename, model_name="als"):
    """Train a model on the last.fm data and write similar artists to a TSV file.

    Uses the ``similar_items`` API of the model. Artists are processed in
    descending order of the number of stored entries in their row of the plays
    matrix (the "popularity" ordering), ties keeping their original order.

    Args:
        output_filename: path of the file to (over)write, UTF-8 encoded; each
            line is ``artist<TAB>other_artist<TAB>score``.
        model_name: key of ``MODELS`` selecting which model to train.

    Raises:
        ValueError: if ``model_name`` is unknown (raised by ``get_model``).
    """
    artists, _, plays = get_lastfm()

    # create a model from the input data
    model = get_model(model_name)

    # if we're training an ALS based model, weight input for last.fm
    # by bm25
    if model_name.endswith("als"):
        logging.debug("weighting matrix by bm25_weight")
        plays = bm25_weight(plays, K1=100, B=0.8)

        # also disable building approximate recommend index
        model.approximate_recommend = False

    # this is actually disturbingly expensive:
    plays = plays.tocsr()
    user_plays = plays.T.tocsr()

    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(user_plays)
    logging.debug("trained model '%s' in %0.2fs", model_name, time.time() - start)

    # write out similar artists by popularity
    start = time.time()
    logging.debug("calculating top artists")

    # entries per CSR row of `plays`, i.e. per artist
    user_count = np.ediff1d(plays.indptr)
    # A stable argsort on the negated counts gives the same order as a stable
    # Python sort keyed on -count, without a per-element Python callback.
    to_generate = np.argsort(-user_count, kind="stable")

    # write out as a TSV of artistid, otherartistid, score
    logging.debug("writing similar items")
    with tqdm.tqdm(total=len(to_generate)) as progress:
        with codecs.open(output_filename, "w", "utf8") as o:
            batch_size = 1000
            for startidx in range(0, len(to_generate), batch_size):
                batch = to_generate[startidx : startidx + batch_size]
                # 11 results per artist -- presumably the artist itself plus
                # its 10 nearest neighbours; confirm against the model API.
                ids, scores = model.similar_items(batch, 11)
                for i, artistid in enumerate(batch):
                    artist = artists[artistid]
                    for other, score in zip(ids[i], scores[i]):
                        o.write(f"{artist}\t{artists[other]}\t{score}\n")
                # Advance by the real batch length: the final batch is usually
                # shorter than batch_size, and over-counting pushes the bar
                # past its declared total.
                progress.update(len(batch))

    logging.debug("generated similar artists in %0.2fs", time.time() - start)


In [7]:
def calculate_recommendations(output_filename, model_name="als"):
    """Train a model on the last.fm data and write artist recommendations per user.

    Args:
        output_filename: path of the file to (over)write, UTF-8 encoded; each
            line is ``user<TAB>artist<TAB>score``.
        model_name: key of ``MODELS`` selecting which model to train.

    Raises:
        ValueError: if ``model_name`` is unknown (raised by ``get_model``).
    """
    # train the model based off input params
    artists, users, plays = get_lastfm()

    # create a model from the input data
    model = get_model(model_name)

    # if we're training an ALS based model, weight input for last.fm
    # by bm25
    if model_name.endswith("als"):
        logging.debug("weighting matrix by bm25_weight")
        plays = bm25_weight(plays, K1=100, B=0.8)

        # only recommend() is used here, so turn off the approximate
        # similar-items index
        model.approximate_similar_items = False

    # this is actually disturbingly expensive:
    plays = plays.tocsr()
    user_plays = plays.T.tocsr()

    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(user_plays)
    logging.debug("trained model '%s' in %0.2fs", model_name, time.time() - start)

    # generate recommendations for each user and write out to a file
    start = time.time()
    with tqdm.tqdm(total=len(users)) as progress:
        with codecs.open(output_filename, "w", "utf8") as o:
            batch_size = 1000
            to_generate = np.arange(len(users))
            for startidx in range(0, len(to_generate), batch_size):
                batch = to_generate[startidx : startidx + batch_size]
                # pass each user's own row so already-played artists can be filtered
                ids, scores = model.recommend(
                    batch, user_plays[batch], filter_already_liked_items=True
                )
                for i, userid in enumerate(batch):
                    username = users[userid]
                    for other, score in zip(ids[i], scores[i]):
                        o.write(f"{username}\t{artists[other]}\t{score}\n")
                # Advance by the real batch length: the final batch is usually
                # shorter than batch_size, and over-counting pushes the bar
                # past len(users).
                progress.update(len(batch))
    logging.debug("generated recommendations in %0.2fs", time.time() - start)

In [10]:

# Train ALS and dump per-user artist recommendations.
# NOTE(review): the output file is called "similar-artists.tsv" although it
# receives recommendations, not similar artists -- consider renaming it.
calculate_recommendations(
    output_filename="similar-artists.tsv",
    model_name="als",
)


0.00B [00:00, ?B/s]

getting model als


  0%|          | 0/15 [00:00<?, ?it/s]

359000it [17:40, 338.39it/s]


In [9]:
# Load the last.fm data and build the user x artist matrix, following the same
# preparation steps as calculate_recommendations(). Nothing is trained here.
artists, users, plays = get_lastfm()
model_name = "als"

# Build the model from model_name rather than a second hard-coded "als", so the
# model and the weighting branch below always agree.
model = get_model(model_name)

# if we're training an ALS based model, weight input for last.fm
# by bm25
if model_name.endswith("als"):
    logging.debug("weighting matrix by bm25_weight")
    plays = bm25_weight(plays, K1=100, B=0.8)

    # turn off the approximate similar-items index (presumably only meaningful
    # for the approximate ALS variants -- TODO confirm)
    model.approximate_similar_items = False

# this is actually disturbingly expensive:
plays = plays.tocsr()
user_plays = plays.T.tocsr()

0.00B [00:00, ?B/s]

getting model als


In [10]:
# user_plays is the transpose of plays, so this is expected to read
# (number of users, number of artists).
user_plays.shape

(358868, 292385)