<a href="https://colab.research.google.com/github/denisparra/pyreclab_tutorial/blob/master/implicit_als_vs_bpr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Práctico librería implicit - ALS y BPR

Autor: Manuel Cartagena, ayudante

Profesor: Denis Parra

Clase: IIC3633 Sistemas Recomendadores, PUC Chile

https://github.com/PUC-RecSys-Class/RecSysPUC-2023

(actualizado en agosto de 2023)


In [1]:
!curl -L -o "u2.base" "https://drive.google.com/uc?export=download&id=1bGweNw7NbOHoJz11v6ld7ymLR8MLvBsA"
!curl -L -o "u2.test" "https://drive.google.com/uc?export=download&id=1f_HwJWC_1HFzgAjKAWKwkuxgjkhkXrVg"
!curl -L -o "u.item" "https://drive.google.com/uc?export=download&id=10YLhxkO2-M_flQtyo9OYV4nT9IvSESuz"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
100 1546k  100 1546k    0     0   922k      0  0:00:01  0:00:01 --:--:--  922k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  385k  100  385k    0     0   289k      0  0:00:01  0:00:01 --:--:-- 1060k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
100  230k  100  230k    0     0  91044      0  0:00:02  0:00:02 --:--:--  112M


In [2]:
!pip install pandas --upgrade
!pip install implicit --upgrade

Collecting pandas
  Downloading pandas-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m48.2 MB/s[0m eta [36m0:00:00[0m
Collecting tzdata>=2022.1 (from pandas)
  Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m341.8/341.8 kB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tzdata, pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 1.5.3
    Uninstalling pandas-1.5.3:
      Successfully uninstalled pandas-1.5.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==1.5.3, but you have pandas 2.0.3 which is incompatible.[0m[31m
[0mSuccessfully installed pandas-2.0.3 tzdata-2023.3
Collecting im

In [3]:
import pandas as pd
import numpy as np
import implicit
import scipy.sparse as sparse



In [4]:
columns = ['movieid', 'title', 'release_date', 'video_release_date', \
           'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', \
           'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', \
           'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', \
           'Thriller', 'War', 'Western']

In [20]:
# Primero creamos el dataframe con los datos
df_train = pd.read_csv('u2.base',
                         sep='\t',
                         names=['userid', 'itemid', 'rating', 'timestamp'],
                         header=None)

In [21]:
# Cargamos el dataset con los items
df_items = pd.read_csv('u.item',
                        sep='|',
                        index_col=0,
                        names = columns,
                        header=None,
                        encoding='latin-1')

In [22]:
# Cargamos el dataset de testing
df_test = pd.read_csv('u2.test',
                      sep='\t',
                      names=['userid', 'itemid', 'rating', 'timestamp'],
                      header=None)

user_items_test = {}

for row in df_test.itertuples():
    if row[1] not in user_items_test:
        user_items_test[row[1]] = []

    user_items_test[row[1]].append(row[2])

### Métricas

In [8]:
# Definicion de métricas (No editar)
# Obtenido de https://gist.github.com/bwhite/3726239

def precision_at_k(r, k):
    assert k >= 1
    r = np.asarray(r)[:k] != 0
    if r.size != k:
        raise ValueError('Relevance score length < k')
    return np.mean(r)

def average_precision(r):
    r = np.asarray(r) != 0
    out = [precision_at_k(r, k + 1) for k in range(r.size) if r[k]]
    if not out:
        return 0.
    return np.mean(out)

def mean_average_precision(rs):
    return np.mean([average_precision(r) for r in rs])

def dcg_at_k(r, k):
    r = np.asfarray(r)[:k]
    if r.size:
        return np.sum(np.subtract(np.power(2, r), 1) / np.log2(np.arange(2, r.size + 2)))
    return 0.


def ndcg_at_k(r, k):
    idcg = dcg_at_k(sorted(r, reverse=True), k)

    if not idcg:
        return 0.
    return dcg_at_k(r, k) / idcg

### Preprocesamiento de los datos a formato sparse

In [23]:
user_items = {}
itemset = set()

for row in df_train.itertuples():
    if row[1] not in user_items:
        user_items[row[1]] = []

    user_items[row[1]].append(row[2])
    itemset.add(row[2])

itemset = np.sort(list(itemset))

sparse_matrix = np.zeros((len(user_items), len(itemset)))

for i, items in enumerate(user_items.values()):
    sparse_matrix[i] = np.isin(itemset, items, assume_unique=True).astype(int)

matrix = sparse.csr_matrix(sparse_matrix.T)

user_ids = {key: i for i, key in enumerate(user_items.keys())}
user_item_matrix = matrix.T.tocsr()

In [91]:
def evaluate_model(model, n):
  mean_map = 0.
  mean_ndcg = 0.
  for u in user_items_test.keys():
    rec = [ t for t in model.recommend(u, user_item_matrix[u], n, filter_already_liked_items= False)[0] ]
    rel_vector = [np.isin(user_items_test[u], rec, assume_unique=True).astype(int)]
    mean_map += mean_average_precision(rel_vector)
    mean_ndcg += ndcg_at_k(rel_vector, n)

  mean_map /= len(user_items_test)
  mean_ndcg /= len(user_items_test)

  return mean_map, mean_ndcg

In [92]:
def show_recommendations(model, user, n):
  recommendations = [ t for t in model.recommend(user, user_item_matrix[user], n, filter_already_liked_items= False)[0] ]
  return df_items.loc[recommendations]['title']

In [110]:
def show_similar_movies(model, item, n=10):
  sim_items = [t for t in model.similar_items(item, N=n)[0]]
  print(sim_items)
  return df_items.loc[sim_items]['title']

## ALS (Implicit Feedback)

In [66]:
# Definimos y entrenamos el modelo ALS
model_als = implicit.als.AlternatingLeastSquares(factors=100, iterations=10)
model_als.fit(matrix)

  0%|          | 0/10 [00:00<?, ?it/s]

In [93]:
show_recommendations(model_als, user=70, n=10)

movieid
886                         Life Less Ordinary, A (1997)
221                            Breaking the Waves (1996)
746                                   Real Genius (1985)
806                             Menace II Society (1993)
193                              Right Stuff, The (1983)
200                                  Shining, The (1980)
415                      Apple Dumpling Gang, The (1975)
915                                Primary Colors (1998)
129                                         Bound (1996)
863    Garden of Finzi-Contini, The (Giardino dei Fin...
Name: title, dtype: object

In [94]:
maprec, ndcg = evaluate_model(model_als, n=10)
print('map: {}\nndcg: {}'.format(maprec, ndcg))

map: 0.038277135929224965
ndcg: 0.24808575803981622


In [116]:
 show_similar_movies(model_als, 99, n=10)

[99, 751, 841, 178, 610, 862, 783, 205, 586, 190]


movieid
99     Snow White and the Seven Dwarfs (1937)
751                Tomorrow Never Dies (1997)
841                   Glimmer Man, The (1996)
178                       12 Angry Men (1957)
610                               Gigi (1958)
862                 Jingle All the Way (1996)
783                         Milk Money (1994)
205                             Patton (1970)
586                  Terminal Velocity (1994)
190                            Henry V (1989)
Name: title, dtype: object

## BPR

In [95]:
# Definimos y entrenamos el modelo BPR
model_bpr = implicit.bpr.BayesianPersonalizedRanking(factors = 450,iterations=35)
model_bpr.fit(matrix)

  0%|          | 0/35 [00:00<?, ?it/s]

In [114]:
show_recommendations(model_bpr, user=75, n=10)

movieid
845                            That Thing You Do! (1996)
803                                Heaven & Earth (1993)
649                   Once Upon a Time in America (1984)
404                                     Pinocchio (1940)
748                                    Saint, The (1997)
715                                    To Die For (1995)
795                                   Richie Rich (1994)
895                                      Scream 2 (1997)
863    Garden of Finzi-Contini, The (Giardino dei Fin...
773                                  Mute Witness (1994)
Name: title, dtype: object

In [97]:
maprec, ndcg = evaluate_model(model_bpr, n=10)
print('map: {}\nndcg: {}'.format(maprec, ndcg))

map: 0.023457071656302394
ndcg: 0.23736600306278713


In [117]:
 show_similar_movies(model_bpr, 99, n=10)

[99, 723, 240, 645, 111, 597, 125, 407, 2, 588]


movieid
99     Snow White and the Seven Dwarfs (1937)
723                   Boys on the Side (1995)
240    Beavis and Butt-head Do America (1996)
645                   Paris Is Burning (1990)
111       Truth About Cats & Dogs, The (1996)
597                             Eraser (1996)
125                         Phenomenon (1996)
407                           Spy Hard (1996)
2                            GoldenEye (1995)
588               Beauty and the Beast (1991)
Name: title, dtype: object