In [1]:
# imports
import random as rd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from collections import defaultdict
from scipy import stats

import pyreclab

from surprise import AlgoBase
from surprise import NormalPredictor
from surprise import BaselineOnly
from surprise import KNNBasic
from surprise import KNNBaseline
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import NMF
from surprise import SVD
from surprise import SVDpp
from surprise import SlopeOne
from surprise import CoClustering
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import KFold

### Inicializar originales

In [2]:
# Experiment configuration: dataset choice, cross-validation folds and seeding.
dataset = 'ml'  # options: 'lfm', 'anime', 'book', 'ml'
folds = 5
top_fraction = 0.2

# Seed both the stdlib and the NumPy global generators with the same value.
my_seed = 0
rd.seed(my_seed)
np.random.seed(my_seed)

# Input files are looked up inside a directory named after the dataset.
user_events_file = f'{dataset}/user_events.txt'
low_user_file, medium_user_file, high_user_file = (
    f'{dataset}/{level}_main_users.txt' for level in ('low', 'medium', 'high')
)

In [3]:
# # read user events and users
# cols = ['user', 'item', 'preference']
# df_events = pd.read_csv(user_events_file, sep=',', names=cols, skiprows=1)
# print('No. of user events: ' + str(len(df_events)))
# # read users
# low_users = pd.read_csv(low_user_file, sep=',').set_index('user')
# medium_users = pd.read_csv(medium_user_file, sep=',').set_index('user')
# high_users = pd.read_csv(high_user_file, sep=',').set_index('user')
# no_users = len(low_users) + len(medium_users) + len(high_users)
# print('No. of users: ' + str(no_users))
# print('No. of events per user: ' + str(len(df_events) / no_users))

### Inicializar new anime

In [4]:
# # read user events and users
# cols = ['user', 'item', 'preference']
# df_events = pd.read_csv('./myanime_600K.csv', sep=',', names=cols)
# df_events = df_events.rename(columns={'user_id': 'user', 'anime_id': 'item', 'rating': 'preference'})
# print('No. of user events: ' + str(len(df_events)))
# # read users
# low_users = pd.read_csv('./myanime/bot.csv', sep=',').set_index('user')
# medium_users = pd.read_csv('./myanime/mid.csv', sep=',').set_index('user')
# high_users = pd.read_csv('./myanime/top.csv', sep=',').set_index('user')
# no_users = len(low_users) + len(medium_users) + len(high_users)
# print('No. of users: ' + str(no_users))
# print('No. of events per user: ' + str(len(df_events) / no_users))

### Inicializar netflix

In [5]:
# Load the Netflix interaction log and rename its columns to the
# user / item / preference names used by the rest of the notebook.
cols = ['user', 'item', 'preference']
df_events = pd.read_csv('./netflix/netflix.csv', sep=',').rename(
    columns={'user_id': 'user', 'item_id': 'item', 'rating': 'preference'}
)
print(f'No. of user events: {len(df_events)}')

# Load the three user groups (bot -> low, mid -> medium, top -> high),
# each indexed by its 'user_id' column.
low_users, medium_users, high_users = (
    pd.read_csv(path, sep=',').set_index('user_id')
    for path in ('./netflix/bot.csv', './netflix/mid.csv', './netflix/top.csv')
)
no_users = sum(len(group) for group in (low_users, medium_users, high_users))
print(f'No. of users: {no_users}')
print(f'No. of events per user: {len(df_events) / no_users}')

No. of user events: 459514
No. of users: 3000
No. of events per user: 153.17133333333334


### Most Popular

In [6]:
def calculate_precision(model, test_df, topN, relevance_threshold=0, user_groups=None):
    """Return the mean precision@topN of the low, medium and high user groups.

    Every distinct user in ``test_df['user']`` that belongs to one of the
    groups is scored with ``model.precision`` (with ``include_rated=False``)
    and the scores are averaged per group.

    Parameters
    ----------
    model : object
        Recommender exposing ``precision(user_id, topn=..., relevance_threshold=...,
        include_rated=...)``. User ids are passed to it as strings.
    test_df : pandas.DataFrame
        Test fold; must contain a ``'user'`` column.
    topN : int
        Length of the recommendation list to evaluate.
    relevance_threshold : numeric, optional
        Forwarded to ``model.precision``. Defaults to 0, the value that was
        previously hard-coded.
    user_groups : tuple, optional
        ``(low, medium, high)`` objects whose ``.index`` holds the user ids of
        each group. Defaults to the notebook-level ``low_users``,
        ``medium_users`` and ``high_users``.

    Returns
    -------
    tuple
        ``(low_mean, medium_mean, high_mean)``. A group with no user in
        ``test_df`` yields ``nan``.
    """
    if user_groups is None:
        # Backward-compatible default: the groups loaded earlier in the notebook.
        user_groups = (low_users, medium_users, high_users)
    low_group, medium_group, high_group = user_groups

    low_prec = []
    medium_prec = []
    high_prec = []
    for user_id in test_df['user'].unique():
        # Same low -> medium -> high priority as before, but the last branch is
        # now an explicit membership test: a user that belongs to no group is
        # skipped instead of being silently counted as "high".
        if user_id in low_group.index:
            bucket = low_prec
        elif user_id in medium_group.index:
            bucket = medium_prec
        elif user_id in high_group.index:
            bucket = high_prec
        else:
            continue
        bucket.append(model.precision(str(user_id),
                                      topn=topN,
                                      relevance_threshold=relevance_threshold,
                                      include_rated=False))

    # np.mean([]) also gives nan but emits a RuntimeWarning; return nan quietly.
    low_mean, medium_mean, high_mean = (
        np.mean(values) if values else np.nan
        for values in (low_prec, medium_prec, high_prec)
    )
    return low_mean, medium_mean, high_mean

In [7]:
def calculate_recall(model, test_df, topN, relevance_threshold=0, user_groups=None):
    """Return the mean recall@topN of the low, medium and high user groups.

    Every distinct user in ``test_df['user']`` that belongs to one of the
    groups is scored with ``model.recall`` (with ``include_rated=False``)
    and the scores are averaged per group.

    Parameters
    ----------
    model : object
        Recommender exposing ``recall(user_id, topn=..., relevance_threshold=...,
        include_rated=...)``. User ids are passed to it as strings.
    test_df : pandas.DataFrame
        Test fold; must contain a ``'user'`` column.
    topN : int
        Length of the recommendation list to evaluate.
    relevance_threshold : numeric, optional
        Forwarded to ``model.recall``. Defaults to 0, the value that was
        previously hard-coded.
    user_groups : tuple, optional
        ``(low, medium, high)`` objects whose ``.index`` holds the user ids of
        each group. Defaults to the notebook-level ``low_users``,
        ``medium_users`` and ``high_users``.

    Returns
    -------
    tuple
        ``(low_mean, medium_mean, high_mean)``. A group with no user in
        ``test_df`` yields ``nan``.
    """
    if user_groups is None:
        # Backward-compatible default: the groups loaded earlier in the notebook.
        user_groups = (low_users, medium_users, high_users)
    low_group, medium_group, high_group = user_groups

    low_recall = []
    medium_recall = []
    high_recall = []
    for user_id in test_df['user'].unique():
        # Same low -> medium -> high priority as before, but the last branch is
        # now an explicit membership test: a user that belongs to no group is
        # skipped instead of being silently counted as "high".
        if user_id in low_group.index:
            bucket = low_recall
        elif user_id in medium_group.index:
            bucket = medium_recall
        elif user_id in high_group.index:
            bucket = high_recall
        else:
            continue
        bucket.append(model.recall(str(user_id),
                                   topn=topN,
                                   relevance_threshold=relevance_threshold,
                                   include_rated=False))

    # np.mean([]) also gives nan but emits a RuntimeWarning; return nan quietly.
    low_mean, medium_mean, high_mean = (
        np.mean(values) if values else np.nan
        for values in (low_recall, medium_recall, high_recall)
    )
    return low_mean, medium_mean, high_mean

In [8]:
# K-fold evaluation of pyreclab's MostPopular recommender: for every fold the
# train/test split is written to CSV, the model is trained and tested from
# those files, and precision/recall are averaged per user group.
#
# This import rebinds `KFold` to scikit-learn's splitter; the first cell
# imported a class of the same name from surprise.model_selection.
import os

from sklearn.model_selection import KFold

# Directory holding the per-fold CSV files exchanged with pyreclab.
fold_dir = './most pop data'
os.makedirs(fold_dir, exist_ok=True)  # to_csv does not create missing directories

# Columns written to disk, in the order pyreclab is told to read them
# (usercol=0, itemcol=1, ratingcol=2).
fold_columns = ['user', 'item', 'preference']

kf = KFold(n_splits=folds, shuffle=True, random_state=my_seed)
kfold_data = kf.split(df_events)
kfold = 1  # not used in this cell
topN = 10
low_precisions = []
mid_precisions = []
high_precisions = []
low_recalls = []
mid_recalls = []
high_recalls = []
count = 1

for trainset, testset in kfold_data:
    train_df = df_events.iloc[trainset]
    test_df = df_events.iloc[testset]
    train_path = f'{fold_dir}/train{count}.csv'
    test_path = f'{fold_dir}/test{count}.csv'
    # Select the columns explicitly so the on-disk positions always match the
    # column indices passed to pyreclab, whatever the layout of df_events is.
    train_df[fold_columns].to_csv(train_path, index=False)
    test_df[fold_columns].to_csv(test_path, index=False)

    # Initialise and train the model from the training file.
    model = pyreclab.MostPopular(dataset=train_path,
                                 dlmchar=b',',
                                 header=True,
                                 usercol=0,
                                 itemcol=1,
                                 ratingcol=2)
    model.train()

    # Both files are written by to_csv with a header row, so the test file must
    # be read with header=True as well; with header=False the header line would
    # be parsed as an interaction.
    # NOTE(review): the returned values are not used in this cell; presumably
    # this call is what registers the test fold for model.precision/recall --
    # confirm against the pyreclab documentation.
    recommendList, maprec, ndcg = model.testrec(input_file=test_path,
                                                dlmchar=b',',
                                                header=True,
                                                usercol=0,
                                                itemcol=1,
                                                ratingcol=2,
                                                topn=topN,
                                                relevance_threshold=0,
                                                includeRated=False)

    # Per-group precision for this fold.
    low_precision, mid_precision, high_precision = calculate_precision(model, test_df, topN)
    low_precisions.append(low_precision)
    mid_precisions.append(mid_precision)
    high_precisions.append(high_precision)

    # Per-group recall for this fold.
    low_recall, mid_recall, high_recall = calculate_recall(model, test_df, topN)
    low_recalls.append(low_recall)
    mid_recalls.append(mid_recall)
    high_recalls.append(high_recall)

    count += 1

# "All" is the unweighted mean over every per-group, per-fold value; it is not
# a per-user average.
all_precision = np.mean([low_precisions, mid_precisions, high_precisions])
all_recall = np.mean([low_recalls, mid_recalls, high_recalls])

print('Low Precision: ', np.mean(low_precisions))
print('Medium Precision: ', np.mean(mid_precisions))
print('High Precision: ', np.mean(high_precisions))
print('All Precision: ', all_precision)
print('\n')
print('Low Recall: ', np.mean(low_recalls))
print('Medium Recall: ', np.mean(mid_recalls))
print('High Recall: ', np.mean(high_recalls))
print('All Recall: ', all_recall)
    

Low Precision:  0.0580030354074601
Medium Precision:  0.09284732714508113
High Precision:  0.07965375621723551
All Precision:  0.07683470625659224


Low Recall:  0.024346900975211902
Medium Recall:  0.04391453008944767
High Recall:  0.07611538560635847
All Recall:  0.04812560555700602


## Recall y Precision con MOST POPULAR
ANIME:
* Low Precision:  0.07180000134408475
* Medium Precision:  0.10962000215649606
* High Precision:  0.11118000219613314
* All Precision:  0.09753333523223798

* Low Recall:  0.012959075435390696
* Medium Recall:  0.03867494782991707
* High Recall:  0.06998966443855316
* All Recall:  0.04054122923462031

BOOKS:
* Low Precision:  0.011920000185072421
* Medium Precision:  0.02654000042229891
* High Precision:  0.030840000510215758
* All Precision:  0.02310000037252903

* Low Recall:  0.00416008519846946
* Medium Recall:  0.0074944833249785
* High Recall:  0.01176950469762087
* All Recall:  0.007808024407022943

MOVIELENS:
* Low Precision:  0.10162000198364259
* Medium Precision:  0.11240000228583813
* High Precision:  0.1223000024497509
* All Precision:  0.11210666890641052

* Low Recall:  0.0165079234149307
* Medium Recall:  0.033235900679510086
* High Recall:  0.061033416801132265
* All Recall:  0.03692574696519102

LASTFM:
* Low Precision:  0.059540001083910464
* Medium Precision:  0.09260000180304051
* High Precision:  0.09624000196456908
* All Precision:  0.0827933349505067

* Low Recall:  0.009334813251486048
* Medium Recall:  0.009003960465663114
* High Recall:  0.012973698530229737
* All Recall:  0.0104374907491263

NEW ANIME:
* Low Precision:  0.09143458506065552
* Medium Precision:  0.10682347217294035
* High Precision:  0.09647292604811222
* All Precision:  0.09824366109390271

* Low Recall:  0.025005798261108585
* Medium Recall:  0.04312398380231956
* High Recall:  0.060711176127519216
* All Recall:  0.04294698606364911

NETFLIX:
* Low Precision:  0.0580030354074601
* Medium Precision:  0.09284732714508113
* High Precision:  0.07965375621723551
* All Precision:  0.07683470625659224

* Low Recall:  0.024346900975211902
* Medium Recall:  0.04391453008944767
* High Recall:  0.07611538560635847
* All Recall:  0.04812560555700602