### Importar librerías

In [6]:
# imports
import random as rd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from collections import defaultdict
from scipy import stats

from surprise import AlgoBase
from surprise import NormalPredictor
from surprise import BaselineOnly
from surprise import KNNBasic
from surprise import KNNBaseline
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import NMF
from surprise import SVD
from surprise import SVDpp
from surprise import SlopeOne
from surprise import CoClustering
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import KFold

### Calcular Popularidad de ítems

In [7]:
def calculate_item_popularity(df):
    # Calcular la cantidad total de usuarios
    total_users = df['user_id'].nunique()
    
    # Contar la cantidad de usuarios únicos que compraron cada item
    item_user_count = df.groupby('item_id')['user_id'].nunique().reset_index()
    
    # Renombrar la columna user_id a popularity
    item_user_count = item_user_count.rename(columns={'user_id': 'popularity'})
    
    # Calcular la popularidad dividiendo por la cantidad total de usuarios
    item_user_count['popularity'] = item_user_count['popularity'] / total_users
    
    return item_user_count

### Calcular ítems más populares (top 20%)

In [8]:
def get_top_20_percent_items(popularity_df):
    # Ordenar los items por popularidad de mayor a menor
    sorted_popularity_df = popularity_df.sort_values(by='popularity', ascending=False)
    
    # Calcular el número de ítems que corresponde al 20%
    top_20_percent_count = int(len(sorted_popularity_df) * 0.20)
    
    # Obtener los ítems más populares que corresponden al 20%
    I_pop = sorted_popularity_df.head(top_20_percent_count)['item_id'].tolist()
    
    return I_pop

### Calcular popularidad de users

In [9]:
def calculate_user_popularity(df, I_pop):
    # Filtrar los items rateados que están en I_pop
    df['is_popular'] = df['item_id'].isin(I_pop)
    
    # Calcular la cantidad de items populares rateados por cada usuario
    user_popular_items_count = df[df['is_popular']].groupby('user_id')['item_id'].count().reset_index()
    user_popular_items_count = user_popular_items_count.rename(columns={'item_id': 'popular_items_count'})
    
    # Calcular la cantidad total de items rateados por cada usuario
    user_total_items_count = df.groupby('user_id')['item_id'].count().reset_index()
    user_total_items_count = user_total_items_count.rename(columns={'item_id': 'total_items_count'})

    # Asegurar que todos los usuarios están presentes en el resultado final
    user_popular_items_count = pd.merge(user_total_items_count[['user_id']], 
                                        user_popular_items_count, 
                                        on='user_id', 
                                        how='left').fillna(0)

    # Combinar los dos DataFrames
    user_popularity_df = pd.merge(user_popular_items_count, user_total_items_count, on='user_id')
    
    # Calcular user_pop dividiendo popular_items_count por total_items_count
    user_popularity_df['user_pop'] = user_popularity_df['popular_items_count'] / user_popularity_df['total_items_count']
    
    # Seleccionar solo las columnas user_id y user_pop
    user_popularity_df = user_popularity_df[['user_id', 'user_pop']]
    
    return user_popularity_df

### Prueba con dataset de amazon

In [10]:
df_amazon = pd.read_csv('data/myanime_600K.csv', sep=",")

In [11]:
#df_amazon = df_amazon[['User_id', 'Id', 'review/score']]
#df_amazon = df_amazon.rename(columns={'User_id' : 'user_id', 'Id': 'item_id', 'review/score' : 'rating'})
df_amazon = df_amazon.rename(columns={'anime_id': 'item_id'})
popularity_df = calculate_item_popularity(df_amazon)
I_pop = get_top_20_percent_items(popularity_df)
user_popularity_df = calculate_user_popularity(df_amazon, I_pop)
user_popularity_df = user_popularity_df.sort_values(by='user_pop', ascending=False)
top = user_popularity_df.head(len(user_popularity_df)//3)
bot = user_popularity_df.tail(len(user_popularity_df)//3)
# take the 100 users in the middle of the ranking, using the len of user_popularity_df
mid = user_popularity_df.iloc[len(user_popularity_df)//3: len(user_popularity_df) - len(user_popularity_df)//3]

top = top.rename(columns={'user_id': 'user', 'user_pop': 'mainstreaminess'})
#top = top.iloc[:1000]
bot = bot.rename(columns={'user_id': 'user', 'user_pop': 'mainstreaminess'})
#bot = bot.iloc[len(bot) - 1000:]
mid = mid.rename(columns={'user_id': 'user', 'user_pop': 'mainstreaminess'})
#mid = mid.iloc[len(mid)//2 - 500: len(mid)//2 + 500]
df_amazon = df_amazon.rename(columns={'user_id': 'user', 'product_id': 'item', 'rating': 'preference'})

In [10]:
# top.to_csv('./myanime/top.csv', index=False)
# bot.to_csv('./myanime/bot.csv', index=False)
# mid.to_csv('./myanime/mid.csv', index=False)

In [11]:
print("Top 10 users con high pop")
print(top.head(10))
print("Top 10 users con mid pop")
print(mid.head(10))
print("Top 10 users con low pop")
print(bot.head(10))

Top 10 users con high pop
      user  mainstreaminess
2077  2370              1.0
3581  4100              1.0
3583  4102              1.0
3568  4083              1.0
3569  4084              1.0
3592  4112              1.0
3593  4113              1.0
3595  4115              1.0
780    892              1.0
768    880              1.0
Top 10 users con mid pop
      user  mainstreaminess
3376  3862         0.968109
3373  3859         0.968085
1939  2212         0.968085
1775  2031         0.968085
1757  2010         0.968000
2901  3322         0.967949
3472  3967         0.967949
3490  3992         0.967890
2124  2425         0.967871
2663  3049         0.967836
Top 10 users con low pop
      user  mainstreaminess
1236  1417         0.892063
614    708         0.891892
535    622         0.891892
2041  2332         0.891753
507    590         0.891608
1761  2016         0.891566
1543  1764         0.891566
3209  3671         0.891304
1953  2227         0.891304
1461  1670         0.891089


In [12]:
# constants and initialization
dataset = 'book'#options:'lfm', anime', 'book', 'ml'
folds = 5
my_seed = 0
rd.seed(my_seed)
np.random.seed(my_seed)
top_fraction = 0.2
# user_events_file = 'data/' + dataset + '/user_events.txt'
# low_user_file = 'data/' + dataset + '/low_main_users.txt'
# medium_user_file = 'data/' + dataset + '/medium_main_users.txt'
# high_user_file = 'data/' + dataset + '/high_main_users.txt'
df_events = df_amazon.copy()
df_events = df_events[['user', 'item_id', 'preference']]
df_events = df_events.rename(columns={'item_id': 'item'})
low_users = bot.copy()
low_users.set_index('user', inplace=True)
medium_users = mid.copy()
medium_users.set_index('user', inplace=True)
high_users = top.copy()
high_users.set_index('user', inplace=True)

In [13]:
# read user events and users
cols = ['user', 'item', 'preference']
#df_events = pd.read_csv(user_events_file, sep=',', names=cols, skiprows=1)
print('No. of user events: ' + str(len(df_events)))
# read users
#low_users = pd.read_csv(low_user_file, sep=',').set_index('user')
#medium_users = pd.read_csv(medium_user_file, sep=',').set_index('user')
#high_users = pd.read_csv(high_user_file, sep=',').set_index('user')
no_users = len(low_users) + len(medium_users) + len(high_users)
print('No. of users: ' + str(no_users))
print('No. of events per user: ' + str(len(df_events) / no_users))

No. of user events: 633278
No. of users: 3614
No. of events per user: 175.22910902047593


In [14]:
# get item distribution
item_dist = df_events['item'].value_counts()
num_items = len(item_dist)
print('No. items: ' + str(num_items))
# create item dataframe with normalized item counts
df_item_dist = pd.DataFrame(item_dist)
df_item_dist.columns = ['count']
df_item_dist['count'] /= no_users
print('No. of events per item: ' + str(len(df_events) / num_items))

No. items: 11536
No. of events per item: 54.895804438280166


In [15]:
# sparsity
1 - len(df_events) / (no_users * num_items)

0.9848102367353956

In [16]:
# rating range
print('Min rating: ' + str(df_events['preference'].min()))
print('Max rating: ' + str(df_events['preference'].max()))

Min rating: 1
Max rating: 10


In [17]:
# # get fractions
# user_hist = [] # user history sizes
# pop_item_fraq = [] # average popularity of items in user profiles
# for u, df in df_events.groupby('user'):
#     no_user_items = len(set(df['item'])) # profile size
#     user_hist.append(no_user_items)
#     # get popularity (= fraction of users interacted with item) of user items and calculate average of it
#     user_pop_item_fraq = sum(item_dist[df['item']] / no_users) / no_user_items
#     pop_item_fraq.append(user_pop_item_fraq)

In [18]:
# plt.figure()
# slope, intercept, r_value, p_value, std_err = stats.linregress(user_hist, pop_item_fraq)
# print('R-value: ' + str(r_value))
# print('R2-value: ' + str(r_value**2))
# print('P-value: ' + str(p_value))
# print('Slope: ' + str(slope))
# print('Intercept: ' + str(intercept))
# print(stats.spearmanr(user_hist, pop_item_fraq))

# line = slope * np.array(user_hist) + intercept
# plt.plot(user_hist, pop_item_fraq, 'o', user_hist, line)
# plt.xlabel('User profile size', fontsize='15')
# plt.ylabel('Average popularity of items', fontsize='15')
# plt.xticks(fontsize='13')
# plt.yticks(fontsize='13')
# #plt.savefig('data/' + dataset + '/plots/corr_user_avg.png', dpi=300, bbox_inches='tight')

In [19]:
# start recommender

In [20]:
reader = Reader(rating_scale=(df_events['preference'].min(), df_events['preference'].max()))

In [21]:
# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df_events, reader)

In [22]:
def get_top_n(predictions, n=10):
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n

In [23]:
def get_mae_of_groups(predictions):
    #print('All: ')
    #all_mae = accuracy.mae(predictions)
    all_predictions = []
    low_predictions = []
    med_predictions = []
    high_predictions = []
    for uid, iid, true_r, est, details in predictions:
        prediction = [(uid, iid, true_r, est, details)]
        if uid in low_users.index:
            low_predictions.append(accuracy.mae(prediction, verbose=False))
        elif uid in medium_users.index:
            med_predictions.append(accuracy.mae(prediction, verbose=False))
        else:
            high_predictions.append(accuracy.mae(prediction, verbose=False))          
    low_mae = np.mean(low_predictions)
    #print('LowMS: ' + str(low_mae))
    med_mae = np.mean(med_predictions)
    #print('MedMS: ' + str(med_mae))
    high_mae = np.mean(high_predictions)
    #print('HighMS: ' + str(high_mae))
    all_mae = np.mean([low_mae, med_mae, high_mae])
    #print('All: ' + str(all_mae))
    print('Low vs. med: ' + str(stats.ttest_ind(low_predictions, med_predictions)))
    print('Low vs. high: ' + str(stats.ttest_ind(low_predictions, high_predictions)))
    
    return low_mae, med_mae, high_mae, all_mae

In [24]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Return precision and recall at k metrics for each user"""
    low_precisions = []
    med_precisions = []
    high_precisions = []
    low_recalls = []
    med_recalls = []
    high_recalls = []

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(
            ((true_r >= threshold) and (est >= threshold))
            for (est, true_r) in user_ratings[:k]
        )

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set it to 0.

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    for uid in precisions.keys():
        if uid in low_users.index:
            low_precisions.append(precisions[uid])
            low_recalls.append(recalls[uid])
        elif uid in medium_users.index:
            med_precisions.append(precisions[uid])
            med_recalls.append(recalls[uid])
        else:
            high_precisions.append(precisions[uid])
            high_recalls.append(recalls[uid])
    
    return np.mean(low_precisions), np.mean(med_precisions), np.mean(high_precisions), np.mean(low_recalls), np.mean(med_recalls), np.mean(high_recalls)

In [25]:
sim_users = {'name': 'cosine', 'user_based': True}  # compute cosine similarities between users
algos = []

algos.append(KNNBasic(sim_options = sim_users, k=40)) 
algos.append(KNNWithMeans(sim_options = sim_users, k=40))
algos.append(NMF(n_factors = 30, random_state=my_seed))
algos.append(CoClustering(n_cltr_u=3, n_cltr_i=3, random_state=my_seed))
algo_names = ['KNNBasic',
              'KNNWithMeans',
              'NMF',
              'CoClustering']

kf = KFold(n_splits=folds, random_state = my_seed)
for i in range(0, len(algo_names)):
    df_item_dist[algo_names[i]] = 0
    # low_maes = []
    # med_maes = []
    # high_maes = []
    # all_maes = []
    low_precisions = []
    med_precisions = []
    high_precisions = []
    all_precisions = []
    low_recalls = []
    med_recalls = []
    high_recalls = []
    all_recalls = []

    print(algo_names[i])
    fold_count = 0
    for trainset, testset in kf.split(data):
        # calculate and evaluate recommendations
        algos[i].fit(trainset)
        predictions = algos[i].test(testset)        
        # low_mae, med_mae, high_mae, all_mae = get_mae_of_groups(predictions)
        # low_maes.append(low_mae)
        # med_maes.append(med_mae)
        # high_maes.append(high_mae)
        # all_maes.append(all_mae)

        # calculate precision and recall
        low_precision, mid_precision, high_precision, low_recall, mid_recall, high_recall = precision_recall_at_k(predictions, k=10, threshold=3.5)
        low_precisions.append(low_precision)
        med_precisions.append(mid_precision)
        high_precisions.append(high_precision)
        all_precisions.append(np.mean([low_precision, mid_precision, high_precision]))

        low_recalls.append(low_recall)
        med_recalls.append(mid_recall)
        high_recalls.append(high_recall)
        all_recalls.append(np.mean([low_recall, mid_recall, high_recall]))

        # get top-n recommendation counts
        # top_n = get_top_n(predictions, n=10)
        # for uid, user_ratings in top_n.items():
        #     for (iid, _) in user_ratings:
        #         df_item_dist.loc[iid, algo_names[i]] += 1
        
    # print('LowMS: ' + str(np.mean(low_maes)))
    # print('MedMS: ' + str(np.mean(med_maes)))
    # print('HighMS: ' + str(np.mean(high_maes)))
    # print('All: ' + str(np.mean(all_maes)))
    print('Low Precision: ' + str(np.mean(low_precisions)))
    print('Med Precision: ' + str(np.mean(med_precisions)))
    print('High Precision: ' + str(np.mean(high_precisions)))
    print('All Precision: ' + str(np.mean(all_precisions)))
    print('\n')
    print('Low Recall: ' + str(np.mean(low_recalls)))
    print('Med Recall: ' + str(np.mean(med_recalls)))
    print('High Recall: ' + str(np.mean(high_recalls)))
    print('All Recall: ' + str(np.mean(all_recalls)))

KNNBasic


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Low Precision: 0.9938201267567784
Med Precision: 0.9955457707767221
High Precision: 0.994249268214902
All Precision: 0.9945383885828007


Low Recall: 0.39657141133273716
Med Recall: 0.4849721559767105
High Recall: 0.7247747820510398
All Recall: 0.5354394497868292
KNNWithMeans
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


KeyboardInterrupt: 

KNNBasic
* Low Precision: 0.9938201267567784
* Med Precision: 0.9955457707767221
* High Precision: 0.994249268214902
* All Precision: 0.9945383885828007

* Low Recall: 0.39657141133273716
* Med Recall: 0.4849721559767105
* High Recall: 0.7247747820510398
* All Recall: 0.5354394497868292

KNNWithMeans
* Low Precision: 0.9915877565683928
* Med Precision: 0.9955473982119647
* High Precision: 0.9942943911602035
* All Precision: 0.9938098486468536

* Low Recall: 0.39613806776898836
* Med Recall: 0.48478519778438856
* High Recall: 0.7247455270760168
* All Recall: 0.5352229308764646

NMF
* Low Precision: 0.9925776793395326
* Med Precision: 0.9952142907133863
* High Precision: 0.9939831473956422
* All Precision: 0.9939250391495204

* Low Recall: 0.3960976163465185
* Med Recall: 0.48452718107564297
* High Recall: 0.7244137077574455
* All Recall: 0.5350128350598691

CoClustering
* Low Precision: 0.9917593204269437
* Med Precision: 0.9956996481792876
* High Precision: 0.9943917621678924
* All Precision: 0.9939502435913747

* Low Recall: 0.3960974193955978
* Med Recall: 0.48478724212146407
* High Recall: 0.7246998161380069
* All Recall: 0.535194825885023


In [1]:
!pip3 install implicit --upgrade


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:

import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.sparse import coo_matrix
from implicit.als import AlternatingLeastSquares
import numpy as np

# Load the dataset
df = pd.read_csv('data/myanime_600K.csv')
df = df.rename(columns={'anime_id': 'item_id'})

# Preprocess the data
df['rating'] = df['rating'].apply(lambda x: 1 if x >= 6 else 0)

# Create a sparse matrix for ALS
sparse_item_user = coo_matrix((df['rating'].astype(float),
                               (df['item_id'], df['user_id'])))

# Split the data into training and test sets
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Create a sparse matrix for ALS training
sparse_item_user_train = coo_matrix((train_data['rating'].astype(float),
                                     (train_data['item_id'], train_data['user_id'])))

# Create a sparse matrix for ALS test
sparse_item_user_test = coo_matrix((test_data['rating'].astype(float),
                                    (test_data['item_id'], test_data['user_id'])))


  from .autonotebook import tqdm as notebook_tqdm


In [3]:

# Initialize the ALS model
model = AlternatingLeastSquares(factors=50, regularization=0.1, iterations=20)

# Train the ALS model
model.fit(sparse_item_user_train.T)


  check_blas_config()
100%|██████████| 20/20 [00:01<00:00, 19.02it/s]


Get MAE, Prevision and recall

In [19]:
def recommend(model, user_id, sparse_item_user, N=10):
    user_items = sparse_item_user.T.tocsr()
    recommendations = model.recommend(user_id, user_items[user_id], N=N)
    return recommendations[0], recommendations[1]


# Define evaluation metrics
def precision_at_k(r, k):
    assert k >= 1
    r = np.asarray(r)[:k] != 0
    if r.size != k:
        raise ValueError('Relevance score length < k')
    return np.mean(r)

def average_precision(r):
    r = np.asarray(r) != 0
    out = [precision_at_k(r, k + 1) for k in range(r.size) if r[k]]
    if not out:
        return 0.
    return np.mean(out)

def mean_average_precision(rs):
    return np.mean([average_precision(r) for r in rs])

def dcg_at_k(r, k):
    r = np.asfarray(r)[:k]
    if r.size:
        return np.sum(np.subtract(np.power(2, r), 1) / np.log2(np.arange(2, r.size + 2)))
    return 0.

def ndcg_at_k(r, k):
    idcg = dcg_at_k(sorted(r, reverse=True), k)
    if not idcg:
        return 0.
    return dcg_at_k(r, k) / idcg

# Create user-item interaction dictionaries for test data
user_items_test = test_data.groupby('user_id')['item_id'].apply(list).to_dict()

# Evaluate the model
def evaluate_model(model, user_items_test, sparse_item_user, n=10):
    mean_map_low = 0.
    mean_map_med = 0.
    mean_map_high = 0.
    mean_ndcg_low = 0.
    mean_ndcg_med = 0.
    mean_ndcg_high = 0.
    user_low = 0
    user_med = 0
    user_high = 0

    user_items = sparse_item_user.T.tocsr()
    
    for user in user_items_test.keys():
        rec_items, _ = recommend(model, user, user_items, N=n)
        rel_vector = [1 if item in user_items_test[user] else 0 for item in rec_items]
        if user in low_users.index:
            mean_map_low += mean_average_precision([rel_vector])
            mean_ndcg_low += ndcg_at_k(rel_vector, n)
            user_low += 1
        elif user in medium_users.index:
            mean_map_med += mean_average_precision([rel_vector])
            mean_ndcg_med += ndcg_at_k(rel_vector, n)
            user_med += 1
        else:
            mean_map_high += mean_average_precision([rel_vector])
            mean_ndcg_high += ndcg_at_k(rel_vector, n)
            user_high += 1
    mean_map_low /= user_low
    mean_map_med /= user_med
    mean_map_high /= user_high
    mean_ndcg_low /= user_low
    mean_ndcg_med /= user_med
    mean_ndcg_high /= user_high
    print(user_low, user_med, user_high)
    
    return mean_map_low, mean_map_med, mean_map_high, mean_ndcg_low, mean_ndcg_med, mean_ndcg_high

In [20]:

# Perform the evaluation
mean_map_low, mean_map_med, mean_map_high, mean_ndcg_low, mean_ndcg_med, mean_ndcg_high = evaluate_model(model, user_items_test, sparse_item_user_test, n=10)
print('MAP Low: ' + str(mean_map_low))
print('MAP Med: ' + str(mean_map_med))
print('MAP High: ' + str(mean_map_high))
print('NDCG Low: ' + str(mean_ndcg_low))
print('NDCG Med: ' + str(mean_ndcg_med))
print('NDCG High: ' + str(mean_ndcg_high))

1179 1203 1060
MAP Low: 0.134152199424676
MAP Med: 0.12857068637928976
MAP High: 0.11564153439153442
NDCG Low: 0.2258431700285121
NDCG Med: 0.22331426922678385
NDCG High: 0.20496220492654404
