### Importar librerías

In [1]:
# !pip3 install implicit --upgrade

In [2]:
# !pip3 install surprise --upgrade

In [3]:
# imports
import random as rd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from collections import defaultdict
from scipy import stats

from sklearn.model_selection import train_test_split

from scipy.sparse import coo_matrix
from implicit.als import AlternatingLeastSquares




### Calcular Popularidad de ítems

In [4]:
def calculate_item_popularity(df):
    # Calcular la cantidad total de usuarios
    total_users = df['user_id'].nunique()
    
    # Contar la cantidad de usuarios únicos que compraron cada item
    item_user_count = df.groupby('item_id')['user_id'].nunique().reset_index()
    
    # Renombrar la columna user_id a popularity
    item_user_count = item_user_count.rename(columns={'user_id': 'popularity'})
    
    # Calcular la popularidad dividiendo por la cantidad total de usuarios
    item_user_count['popularity'] = item_user_count['popularity'] / total_users
    
    return item_user_count

### Calcular ítems más populares (top 20%)

In [5]:
def get_top_20_percent_items(popularity_df):
    # Ordenar los items por popularidad de mayor a menor
    sorted_popularity_df = popularity_df.sort_values(by='popularity', ascending=False)
    
    # Calcular el número de ítems que corresponde al 20%
    top_20_percent_count = int(len(sorted_popularity_df) * 0.20)
    
    # Obtener los ítems más populares que corresponden al 20%
    I_pop = sorted_popularity_df.head(top_20_percent_count)['item_id'].tolist()
    
    return I_pop

### Calcular popularidad de users

In [6]:
def calculate_user_popularity(df, I_pop):
    # Filtrar los items rateados que están en I_pop
    df['is_popular'] = df['item_id'].isin(I_pop)
    
    # Calcular la cantidad de items populares rateados por cada usuario
    user_popular_items_count = df[df['is_popular']].groupby('user_id')['item_id'].count().reset_index()
    user_popular_items_count = user_popular_items_count.rename(columns={'item_id': 'popular_items_count'})
    
    # Calcular la cantidad total de items rateados por cada usuario
    user_total_items_count = df.groupby('user_id')['item_id'].count().reset_index()
    user_total_items_count = user_total_items_count.rename(columns={'item_id': 'total_items_count'})

    # Asegurar que todos los usuarios están presentes en el resultado final
    user_popular_items_count = pd.merge(user_total_items_count[['user_id']], 
                                        user_popular_items_count, 
                                        on='user_id', 
                                        how='left').fillna(0)

    # Combinar los dos DataFrames
    user_popularity_df = pd.merge(user_popular_items_count, user_total_items_count, on='user_id')
    
    # Calcular user_pop dividiendo popular_items_count por total_items_count
    user_popularity_df['user_pop'] = user_popularity_df['popular_items_count'] / user_popularity_df['total_items_count']
    
    # Seleccionar solo las columnas user_id y user_pop
    user_popularity_df = user_popularity_df[['user_id', 'user_pop']]
    
    return user_popularity_df

### Inicializar data original

In [7]:
# constants and initialization
dataset = 'book'  #options:'lfm', anime', 'book', 'ml'
folds = 5
my_seed = 0
rd.seed(my_seed)
np.random.seed(my_seed)
top_fraction = 0.2
user_events_file = dataset + '/user_events.txt'
low_user_file = dataset + '/low_main_users.txt'
medium_user_file = dataset + '/medium_main_users.txt'
high_user_file = dataset + '/high_main_users.txt'

In [8]:
# # read user events and users
# df_events = pd.read_csv(user_events_file, sep=',')
# df_events = df_events.rename(columns={'user': 'user_id', 'item': 'item_id', 'preference': 'rating'})
# print('No. of user events: ' + str(len(df_events)))
# # read users
# low_users = pd.read_csv(low_user_file, sep=',').set_index('user')
# medium_users = pd.read_csv(medium_user_file, sep=',').set_index('user')
# high_users = pd.read_csv(high_user_file, sep=',').set_index('user')
# no_users = len(low_users) + len(medium_users) + len(high_users)
# print('No. of users: ' + str(no_users))
# print('No. of events per user: ' + str(len(df_events) / no_users))

In [9]:
# # rating range
# print('Min rating: ' + str(df_events['rating'].min()))
# print('Max rating: ' + str(df_events['rating'].max()))

### Inicializar netflix

In [10]:
# read user events and users
df_events = pd.read_csv('./netflix/netflix.csv', sep=',')
# Preprocess the data
print('No. of user events: ' + str(len(df_events)))
# read users
low_users = pd.read_csv('./netflix/bot.csv', sep=',').set_index('user_id')
medium_users = pd.read_csv('./netflix/mid.csv', sep=',').set_index('user_id')
high_users = pd.read_csv('./netflix/top.csv', sep=',').set_index('user_id')
no_users = len(low_users) + len(medium_users) + len(high_users)
print('No. of users: ' + str(no_users))
print('No. of events per user: ' + str(len(df_events) / no_users))

No. of user events: 459514
No. of users: 3000
No. of events per user: 153.17133333333334


### Inicializar new anime

In [11]:
# # read user events and users
# df_events = pd.read_csv('./myanime_600K.csv', sep=',')
# df_events = df_events.rename(columns={'anime_id': 'item_id'})
# # Preprocess the data
# print('No. of user events: ' + str(len(df_events)))
# # read users
# low_users = pd.read_csv('./myanime/bot.csv', sep=',').set_index('user')
# medium_users = pd.read_csv('./myanime/mid.csv', sep=',').set_index('user')
# high_users = pd.read_csv('./myanime/top.csv', sep=',').set_index('user')
# no_users = len(low_users) + len(medium_users) + len(high_users)
# print('No. of users: ' + str(no_users))
# print('No. of events per user: ' + str(len(df_events) / no_users))

### Preprocesar data

In [12]:
df = df_events.copy()

# Ajustar segun el dataset

# Myanime y book crossing x = 1 si rating >= 6, x = 0 si rating < 6
#df_events['rating'] = df_events['rating'].apply(lambda x: 1 if x >= 6 else 0)

# Netflix y movieLens x = 1 si rating >= 3, x = 0 si rating < 3
df_events['rating'] = df_events['rating'].apply(lambda x: 1 if x >= 3 else 0)

# LFM x = 1 si rating >= 60, x = 0 si rating < 60
#df_events['rating'] = df_events['rating'].apply(lambda x: 1 if x >= 60 else 0)

# Reindexar usuarios e ítems
df['user_id'] = df['user_id'].astype('category')
user_cat_mapping = df['user_id'].cat.categories
df['user_id'] = df['user_id'].cat.codes
df['item_id'] = df['item_id'].astype('category')
item_cat_mapping = df['item_id'].cat.categories
df['item_id'] = df['item_id'].cat.codes

# Create a sparse matrix for ALS
sparse_item_user = coo_matrix((df['rating'].astype(float),
                               (df['item_id'], df['user_id'])))

# Split the data into training and test sets
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Create a sparse matrix for ALS training
sparse_item_user_train = coo_matrix((train_data['rating'].astype(float),
                                     (train_data['item_id'], train_data['user_id'])))

# Create a sparse matrix for ALS test
sparse_item_user_test = coo_matrix((test_data['rating'].astype(float),
                                    (test_data['item_id'], test_data['user_id'])))

In [13]:
# Initialize the ALS model
model = AlternatingLeastSquares(factors=50, regularization=0.1, iterations=20)

# Train the ALS model
model.fit(sparse_item_user_train.T)

  check_blas_config()


  0%|          | 0/20 [00:00<?, ?it/s]

In [14]:
low_users.index

Index([2598097,  847615, 1770582,  453497,   67188, 1638715, 1131687, 2251190,
        315988, 1602731,
       ...
        281628,  770204, 1541981, 1533580,  989589, 2263361, 1870457,  616769,
        844639, 1140339],
      dtype='int64', name='user_id', length=1000)

In [15]:
def recommend(model, user_id, sparse_item_user, N=10):
    user_items = sparse_item_user.T.tocsr()
    recommendations = model.recommend(user_id, user_items[user_id], N=N)
    return recommendations[0], recommendations[1]


# Define evaluation metrics
def precision_at_k(r, k):
    assert k >= 1
    r = np.asarray(r)[:k] != 0
    if r.size != k:
        raise ValueError('Relevance score length < k')
    return np.mean(r)

def average_precision(r):
    r = np.asarray(r) != 0
    out = [precision_at_k(r, k + 1) for k in range(r.size) if r[k]]
    if not out:
        return 0.
    return np.mean(out)

def mean_average_precision(rs):
    return np.mean([average_precision(r) for r in rs])

def dcg_at_k(r, k):
    r = np.asfarray(r)[:k]
    if r.size:
        return np.sum(np.subtract(np.power(2, r), 1) / np.log2(np.arange(2, r.size + 2)))
    return 0.

def ndcg_at_k(r, k):
    idcg = dcg_at_k(sorted(r, reverse=True), k)
    if not idcg:
        return 0.
    return dcg_at_k(r, k) / idcg

def recall_at_k(r, k):
    assert k >= 1
    r = np.asarray(r)[:k] != 0
    return np.sum(r) / len(r)


# Create user-item interaction dictionaries for test data
user_items_test = test_data.groupby('user_id')['item_id'].apply(list).to_dict()

# Evaluate the model
def evaluate_model(model, user_items_test, sparse_item_user, n=10):
    mean_pre_low = 0.
    mean_pre_med = 0.
    mean_pre_high = 0.
    mean_recall_low = 0.
    mean_recall_med = 0.
    mean_recall_high = 0.
    user_low = 0
    user_med = 0
    user_high = 0

    user_items = sparse_item_user.T.tocsr()
    
    for user in user_items_test.keys():
        rec_items, _ = recommend(model, user, user_items, N=n)
        rel_vector = [1 if item in user_items_test[user] else 0 for item in rec_items]
        user_original_id = user_cat_mapping[user]
        if user_original_id in low_users.index:
            mean_pre_low += precision_at_k(rel_vector, n)
            mean_recall_low += recall_at_k(rel_vector, n)
            user_low += 1
        elif user_original_id in medium_users.index:
            mean_pre_med += precision_at_k(rel_vector, n)
            mean_recall_med += recall_at_k(rel_vector, n)
            user_med += 1
        else:
            mean_pre_high += precision_at_k(rel_vector, n)
            mean_recall_high += recall_at_k(rel_vector, n)
            user_high += 1
    mean_pre_low /= user_low
    mean_pre_med /= user_med
    mean_pre_high /= user_high
    mean_recall_low /= user_low
    mean_recall_med /= user_med
    mean_recall_high /= user_high
    print(user_low, user_med, user_high)
    
    return mean_pre_low, mean_pre_med, mean_pre_high, mean_recall_low, mean_recall_med, mean_recall_high

In [16]:

# Perform the evaluation
mean_pre_low, mean_pre_med, mean_pre_high, mean_recall_low, mean_recall_med, mean_recall_high = evaluate_model(model, user_items_test, sparse_item_user_test, n=10)
print('Precision Low: ' + str(mean_pre_low))
print('Precision Med: ' + str(mean_pre_med))
print('Precision High: ' + str(mean_pre_high))
print('Recall Low: ' + str(mean_recall_low))
print('Recall Med: ' + str(mean_recall_med))
print('Recall High: ' + str(mean_recall_high))

967 995 939
Precision Low: 0.07776628748707362
Precision Med: 0.10140703517587903
Precision High: 0.0935037273695419
Recall Low: 0.07776628748707362
Recall Med: 0.10140703517587903
Recall High: 0.0935037273695419


### Netflix

Precision Low: 0.07776628748707362
Precision Med: 0.10140703517587903
Precision High: 0.0935037273695419
Recall Low: 0.07776628748707362
Recall Med: 0.10140703517587903
Recall High: 0.0935037273695419

### New anime

Precision Low: 0.144274809160304
Precision Med: 0.14181213632585082
Precision High: 0.12122641509433925
Recall Low: 0.144274809160304
Recall Med: 0.14181213632585082
Recall High: 0.12122641509433925

### LFM

Precision Low: 0.13009999999999955
Precision Med: 0.18159999999999885
Precision High: 0.1783999999999989
Recall Low: 0.13009999999999955
Recall Med: 0.18159999999999885
Recall High: 0.1783999999999989

### Anime

Precision Low: 0.15359999999999926
Precision Med: 0.15359999999999918
Precision High: 0.14369999999999913
Recall Low: 0.15359999999999926
Recall Med: 0.15359999999999918
Recall High: 0.14369999999999913

### Book

Precision Low: 0.020000000000000004
Precision Med: 0.03950000000000016
Precision High: 0.04550000000000023
Recall Low: 0.020000000000000004
Recall Med: 0.03950000000000016
Recall High: 0.04550000000000023

### MovieLens

Precision Low: 0.1236999999999996
Precision Med: 0.1258999999999995
Precision High: 0.12379999999999945
Recall Low: 0.1236999999999996
Recall Med: 0.1258999999999995
Recall High: 0.12379999999999945