In [1]:
# imports
import random as rd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from collections import defaultdict
from scipy import stats

from surprise import AlgoBase
from surprise import NormalPredictor
from surprise import BaselineOnly
from surprise import KNNBasic
from surprise import KNNBaseline
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import NMF
from surprise import SVD
from surprise import SVDpp
from surprise import SlopeOne
from surprise import CoClustering
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import KFold

In [2]:
# constants and initialization
dataset = 'lfm'  # options: 'lfm', 'anime', 'book', 'ml'
folds = 5  # number of cross-validation splits passed to KFold
top_fraction = 0.2  # not referenced by any of the active cells visible here

# seed Python's and NumPy's global random generators with the same value
my_seed = 0
for seed_fn in (rd.seed, np.random.seed):
    seed_fn(my_seed)

# every input file lives in a folder named after the selected dataset
user_events_file = dataset + '/user_events.txt'
low_user_file = dataset + '/low_main_users.txt'
medium_user_file = dataset + '/medium_main_users.txt'
high_user_file = dataset + '/high_main_users.txt'

In [3]:
# read user events and users
cols = ['user', 'item', 'preference']
# the first line of the file is skipped and the column names are set explicitly
df_events = pd.read_csv(user_events_file, sep=',', names=cols, skiprows=1)
print('No. of user events: ' + str(len(df_events)))

# read users: one file per user group, each indexed by its 'user' column
low_users, medium_users, high_users = (
    pd.read_csv(group_file, sep=',').set_index('user')
    for group_file in (low_user_file, medium_user_file, high_user_file)
)
no_users = sum(len(group) for group in (low_users, medium_users, high_users))
print('No. of users: ' + str(no_users))
print('No. of events per user: ' + str(len(df_events) / no_users))

No. of user events: 1755361
No. of users: 3000
No. of events per user: 585.1203333333333


In [4]:
# get item distribution: number of events per item, most frequent first
item_dist = df_events['item'].value_counts()
num_items = item_dist.shape[0]
print('No. items: ' + str(num_items))
# create item dataframe with normalized item counts
# (event count divided by the number of users, stored in a single 'count' column)
df_item_dist = (item_dist / no_users).to_frame(name='count')
print('No. of events per item: ' + str(len(df_events) / num_items))

No. items: 352805
No. of events per item: 4.9754425249075265


In [5]:
# sparsity: 1 - (number of events) / (number of users * number of items)
# NOTE(review): this is the share of empty user-item cells only if every
# (user, item) pair occurs at most once in df_events - confirm.
1 - len(df_events) / (no_users * num_items)

0.9983415191583641

In [6]:
# rating range: smallest and largest value of the 'preference' column
preference_values = df_events['preference']
print('Min rating: ' + str(preference_values.min()))
print('Max rating: ' + str(preference_values.max()))

Min rating: 1.0
Max rating: 1000.0


In [7]:
# # get fractions
# user_hist = [] # user history sizes
# pop_item_fraq = [] # average popularity of items in user profiles
# for u, df in df_events.groupby('user'):
#     no_user_items = len(set(df['item'])) # profile size
#     user_hist.append(no_user_items)
#     # get popularity (= fraction of users interacted with item) of user items and calculate average of it
#     user_pop_item_fraq = sum(item_dist[df['item']] / no_users) / no_user_items
#     pop_item_fraq.append(user_pop_item_fraq)

In [8]:
# plt.figure()
# slope, intercept, r_value, p_value, std_err = stats.linregress(user_hist, pop_item_fraq)
# print('R-value: ' + str(r_value))
# print('R2-value: ' + str(r_value**2))
# print('P-value: ' + str(p_value))
# print('Slope: ' + str(slope))
# print('Intercept: ' + str(intercept))
# print(stats.spearmanr(user_hist, pop_item_fraq))

# line = slope * np.array(user_hist) + intercept
# plt.plot(user_hist, pop_item_fraq, 'o', user_hist, line)
# plt.xlabel('User profile size', fontsize='15')
# plt.ylabel('Average popularity of items', fontsize='15')
# plt.xticks(fontsize='13')
# plt.yticks(fontsize='13')
# #plt.savefig('data/' + dataset + '/plots/corr_user_avg.png', dpi=300, bbox_inches='tight')

In [9]:
# start recommender

In [10]:
# The rating scale handed to Surprise is taken from the data itself:
# (smallest, largest) value found in the 'preference' column.
observed_preferences = df_events['preference']
reader = Reader(rating_scale=(observed_preferences.min(), observed_preferences.max()))

In [11]:
# The columns must correspond to user id, item id and ratings (in that order).
# df_events was read with exactly these columns: 'user', 'item', 'preference'.
data = Dataset.load_from_df(df_events, reader)
# Printing shows only the object's repr, not the ratings it holds.
print(data)

<surprise.dataset.DatasetAutoFolds object at 0x7f980d4cefb0>


In [12]:
def get_top_n(predictions, n=10):
    """Return the n items with the highest estimated rating for every user.

    Args:
        predictions: iterable of 5-tuples ``(uid, iid, true_r, est, details)``.
        n: maximum number of items kept per user.

    Returns:
        A ``defaultdict(list)`` mapping each user id to a list of
        ``(iid, est)`` pairs ordered by descending estimate. Items with equal
        estimates keep the order in which they appear in ``predictions``.
    """
    ranked_by_user = defaultdict(list)

    # Group the (item, estimate) pairs by user.
    for user_id, item_id, _true_rating, estimate, _details in predictions:
        ranked_by_user[user_id].append((item_id, estimate))

    # Order each user's candidates by estimate and keep the best n.
    for user_id in list(ranked_by_user):
        candidates = sorted(ranked_by_user[user_id],
                            key=lambda pair: pair[1],
                            reverse=True)
        ranked_by_user[user_id] = candidates[:n]

    return ranked_by_user

In [13]:
def get_mae_of_groups(predictions):
    """Compute the mean absolute error (MAE) separately for each user group.

    Every prediction is assigned to a group by looking its user id up in the
    module-level ``low_users`` and ``medium_users`` frames; users found in
    neither index are counted as the high group.

    As a side effect, prints two independent two-sample t-tests on the
    per-prediction absolute errors: low vs. medium and low vs. high.

    Args:
        predictions: iterable of 5-tuples ``(uid, iid, true_r, est, details)``,
            e.g. the result of ``algo.test(testset)``.

    Returns:
        Tuple ``(low_mae, med_mae, high_mae, all_mae)``. ``all_mae`` is the
        unweighted mean of the three group MAEs, so each group counts equally
        regardless of how many predictions it holds. A group without any
        prediction yields ``nan``.
    """
    low_errors = []
    med_errors = []
    high_errors = []
    for uid, _iid, true_r, est, _details in predictions:
        # Absolute error of one prediction. This is what the MAE of a
        # one-element prediction list evaluates to, without paying for a
        # library call and a list allocation on every row.
        abs_error = float(abs(true_r - est))
        if uid in low_users.index:
            low_errors.append(abs_error)
        elif uid in medium_users.index:
            med_errors.append(abs_error)
        else:
            high_errors.append(abs_error)

    low_mae = np.mean(low_errors)
    med_mae = np.mean(med_errors)
    high_mae = np.mean(high_errors)
    # Macro average over the three groups (not weighted by group size).
    all_mae = np.mean([low_mae, med_mae, high_mae])

    print('Low vs. med: ' + str(stats.ttest_ind(low_errors, med_errors)))
    print('Low vs. high: ' + str(stats.ttest_ind(low_errors, high_errors)))

    return low_mae, med_mae, high_mae, all_mae

In [14]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Return mean precision@k and recall@k for the low, medium and high user groups.

    Precision and recall are first computed per user and then averaged within
    each group. Group membership is decided with the module-level
    ``low_users`` and ``medium_users`` frames; users found in neither index
    are counted as the high group.

    Args:
        predictions: iterable of 5-tuples ``(uid, iid, true_r, est, details)``.
        k: number of top-ranked (by estimate) items considered per user.
        threshold: an item is relevant when ``true_r >= threshold`` and
            recommended when ``est >= threshold``.

    Returns:
        Tuple ``(low_precision, med_precision, high_precision,
        low_recall, med_recall, high_recall)`` of group means.

    NOTE(review): the default threshold of 3.5 looks tailored to a small
    rating scale; confirm it suits every dataset this is run on.
    """
    # Collect (estimate, true rating) pairs per user.
    pairs_by_user = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        pairs_by_user[uid].append((est, true_r))

    # Per group: list of (precision, recall) tuples, one per user.
    low_scores, med_scores, high_scores = [], [], []

    for uid, pairs in pairs_by_user.items():
        # Rank the user's items by estimated rating, best first.
        ranked = sorted(pairs, key=lambda pair: pair[0], reverse=True)
        top_k = ranked[:k]

        # Relevant items overall, recommended items in the top k, and both.
        n_rel = sum(1 for _, true_r in ranked if true_r >= threshold)
        n_rec_k = sum(1 for est, _ in top_k if est >= threshold)
        n_rel_and_rec_k = sum(
            1 for est, true_r in top_k
            if true_r >= threshold and est >= threshold
        )

        # Undefined precision (nothing recommended) is reported as 0.
        if n_rec_k != 0:
            precision = n_rel_and_rec_k / n_rec_k
        else:
            precision = 0

        # Undefined recall (nothing relevant) is reported as 0.
        if n_rel != 0:
            recall = n_rel_and_rec_k / n_rel
        else:
            recall = 0

        if uid in low_users.index:
            low_scores.append((precision, recall))
        elif uid in medium_users.index:
            med_scores.append((precision, recall))
        else:
            high_scores.append((precision, recall))

    groups = (low_scores, med_scores, high_scores)
    mean_precisions = [np.mean([p for p, _ in scores]) for scores in groups]
    mean_recalls = [np.mean([r for _, r in scores]) for scores in groups]

    return (mean_precisions[0], mean_precisions[1], mean_precisions[2],
            mean_recalls[0], mean_recalls[1], mean_recalls[2])

In [15]:
# Cross-validate each recommender and print precision@10 / recall@10 per user group.
sim_users = {'name': 'cosine', 'user_based': True}  # compute cosine similarities between users
algo_names = ['KNNBasic',
              'KNNWithMeans',
              'NMF',
              'CoClustering']
# Same order as algo_names: the two lists are matched by position.
algos = [
    KNNBasic(sim_options=sim_users, k=40),
    KNNWithMeans(sim_options=sim_users, k=40),
    NMF(n_factors=30, random_state=my_seed),
    CoClustering(n_cltr_u=3, n_cltr_i=3, random_state=my_seed),
]

kf = KFold(n_splits=folds, random_state=my_seed)
for i, algo_name in enumerate(algo_names):
    algo = algos[i]
    # Per-algorithm column in the item frame; only the disabled top-n code
    # further down would increment it.
    df_item_dist[algo_name] = 0
    # low_maes, med_maes, high_maes, all_maes = [], [], [], []
    low_precisions, med_precisions, high_precisions, all_precisions = [], [], [], []
    low_recalls, med_recalls, high_recalls, all_recalls = [], [], [], []

    print(algo_name)
    fold_count = 0  # never incremented or read in this cell
    for trainset, testset in kf.split(data):
        # calculate and evaluate recommendations
        algo.fit(trainset)
        predictions = algo.test(testset)
        # low_mae, med_mae, high_mae, all_mae = get_mae_of_groups(predictions)
        # low_maes.append(low_mae)
        # med_maes.append(med_mae)
        # high_maes.append(high_mae)
        # all_maes.append(all_mae)

        # calculate precision and recall
        # NOTE(review): threshold=3.5 is used for every dataset, while the
        # rating range printed above for this run is 1.0 to 1000.0 - confirm.
        fold_scores = precision_recall_at_k(predictions, k=10, threshold=3.5)
        low_precision, mid_precision, high_precision = fold_scores[:3]
        low_recall, mid_recall, high_recall = fold_scores[3:]

        low_precisions.append(low_precision)
        med_precisions.append(mid_precision)
        high_precisions.append(high_precision)
        # "All" is the unweighted mean of the three group values.
        all_precisions.append(np.mean([low_precision, mid_precision, high_precision]))

        low_recalls.append(low_recall)
        med_recalls.append(mid_recall)
        high_recalls.append(high_recall)
        all_recalls.append(np.mean([low_recall, mid_recall, high_recall]))

        # get top-n recommendation counts
        # top_n = get_top_n(predictions, n=10)
        # for uid, user_ratings in top_n.items():
        #     for (iid, _) in user_ratings:
        #         df_item_dist.loc[iid, algo_name] += 1

    # print('LowMS: ' + str(np.mean(low_maes)))
    # print('MedMS: ' + str(np.mean(med_maes)))
    # print('HighMS: ' + str(np.mean(high_maes)))
    # print('All: ' + str(np.mean(all_maes)))

    # Report each metric averaged over the folds.
    for label, fold_values in (('Low Precision: ', low_precisions),
                               ('Med Precision: ', med_precisions),
                               ('High Precision: ', high_precisions),
                               ('All Precision: ', all_precisions)):
        print(label + str(np.mean(fold_values)))
    print('\n')
    for label, fold_values in (('Low Recall: ', low_recalls),
                               ('Med Recall: ', med_recalls),
                               ('High Recall: ', high_recalls),
                               ('All Recall: ', all_recalls)):
        print(label + str(np.mean(fold_values)))

KNNBasic
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Low Precision: 0.6717803968253968
Med Precision: 0.6926784920634921
High Precision: 0.7676726984126985
All Precision: 0.7107105291005291


Low Recall: 0.21364471295990822
Med Recall: 0.15192751591900525
High Recall: 0.23994027398395787
All Recall: 0.2018375009542904
KNNWithMeans
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine simila

In [16]:
# for i in range(0, len(algo_names)):
#     plt.figure()
#     x = df_item_dist['count']
#     y = df_item_dist[algo_names[i]]
#     #plt.gca().set_ylim(0, 300)
#     slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
#     line = slope * np.array(x) + intercept
#     print(algo_names[i])
#     print(stats.spearmanr(x, y))
#     plt.plot(x, y, 'o', x, line)
#     plt.xlabel('Item popularity', fontsize='15')
#     plt.ylabel('Recommendation frequency', fontsize='15')
#     plt.xticks(fontsize='13')
#     plt.yticks(fontsize='13')
#     #plt.savefig('data/' + dataset + '/plots/rec_' + algo_names[i] + '.png', dpi=300, bbox_inches='tight')

# Results


### MovieLens

KNNBasic

* Low Precision: 0.7704684920634921
* Med Precision: 0.798344126984127
* High Precision: 0.7915609523809524
* All Precision: 0.7867911904761905

* Low Recall: 0.33841475348709904
* Med Recall: 0.4577492550791713
* High Recall: 0.6210483597199806
* All Recall: 0.47240412276208354

KNNWithMeans
* Low Precision: 0.6965993650793652
* Med Precision: 0.7756833333333335
* High Precision: 0.786331507936508
* All Precision: 0.7528714021164022

* Low Recall: 0.2538119181328916
* Med Recall: 0.3918997939115019
* High Recall: 0.5819585612029813
* All Recall: 0.4092234244157916

NMF
* Low Precision: 0.7527957936507936
* Med Precision: 0.78965
* High Precision: 0.7886530952380953
* All Precision: 0.7770329629629631

* Low Recall: 0.3281243813860379
* Med Recall: 0.455435270841933
* High Recall: 0.6178257154555379
* All Recall: 0.4671284558945029

CoClustering
* Low Precision: 0.7465554761904762
* Med Precision: 0.7995379365079366
* High Precision: 0.7972762698412699
* All Precision: 0.7811232275132276

* Low Recall: 0.28726400365063925
* Med Recall: 0.42888337935572823
* High Recall: 0.6050122296094746
* All Recall: 0.44038653753861406


### Anime

KNNBasic

* Low Precision: 0.9945884126984128
* Med Precision: 0.9972578571428571
* High Precision: 0.9962325396825398
* All Precision: 0.9960262698412699

* Low Recall: 0.3010750950781106
* Med Recall: 0.42410904956695106
* High Recall: 0.6665163599731005
* All Recall: 0.463900168206054

KNNWithMeans
* Low Precision: 0.9929084126984126
* Med Precision: 0.9974978571428572
* High Precision: 0.9965325396825397
* All Precision: 0.9956462698412698

* Low Recall: 0.3008656489549565
* Med Recall: 0.4241893817579811
* High Recall: 0.6666790484474276
* All Recall: 0.46391135972012176

NMF
* Low Precision: 0.993325634920635
* Med Precision: 0.9971578571428571
* High Precision: 0.9962125396825396
* All Precision: 0.9955653439153439

* Low Recall: 0.3007827932626696
* Med Recall: 0.42412373934078645
* High Recall: 0.6665027390613784
* All Recall: 0.46380309055494473

CoClustering
* Low Precision: 0.9930684126984127
* Med Precision: 0.9970778571428571
* High Precision: 0.9965725396825397
* All Precision: 0.9955729365079365

* Low Recall: 0.30067954029541677
* Med Recall: 0.4240182979573649
* High Recall: 0.6667154636596091
* All Recall: 0.4638044339707969


### Book

KNNBasic
* Low Precision: 0.9909996031746031
* Med Precision: 0.99301126984127
* High Precision: 0.9934638095238094
* All Precision: 0.9924915608465609

* Low Recall: 0.5092605070850196
* Med Recall: 0.4620812954025634
* High Recall: 0.5173442636697978
* All Recall: 0.49622868871912684

KNNWithMeans
* Low Precision: 0.9916887301587302
* Med Precision: 0.9932912698412698
* High Precision: 0.9936838095238094
* All Precision: 0.9928879365079364

* Low Recall: 0.5093677488973936
* Med Recall: 0.4622402678622658
* High Recall: 0.5174850764919627
* All Recall: 0.4963643644172073

NMF
* Low Precision: 0.9917019047619048
* Med Precision: 0.99313126984127
* High Precision: 0.9937546031746031
* All Precision: 0.9928625925925927

* Low Recall: 0.5085475026055173
* Med Recall: 0.46179779789501013
* High Recall: 0.5172949705413175
* All Recall: 0.49588009034728164

CoClustering
* Low Precision: 0.9915996031746033
* Med Precision: 0.9929512698412697
* High Precision: 0.993372380952381
* All Precision: 0.9926410846560847

* Low Recall: 0.5095295354945317
* Med Recall: 0.46226876594519695
* High Recall: 0.5173457822555484
* All Recall: 0.496381361231759

### LFM

KNNBasic

* Low Precision: 0.6717803968253968
* Med Precision: 0.6926784920634921
* High Precision: 0.7676726984126985
* All Precision: 0.7107105291005291

* Low Recall: 0.21364471295990822
* Med Recall: 0.15192751591900525
* High Recall: 0.23994027398395787
* All Recall: 0.2018375009542904

KNNWithMeans
* Low Precision: 0.6731023809523811
* Med Precision: 0.6880788095238095
* High Precision: 0.7559572222222223
* All Precision: 0.7057128042328042

* Low Recall: 0.21294207683498362
* Med Recall: 0.14992896860078572
* High Recall: 0.23615352781749904
* All Recall: 0.19967485775108945

NMF
* Low Precision: 0.6312755555555556
* Med Precision: 0.598235238095238
* High Precision: 0.6549547619047619
* All Precision: 0.6281551851851852

* Low Recall: 0.1965284343490094
* Med Recall: 0.1295440782539194
* High Recall: 0.20650777629230252
* All Recall: 0.1775267629650771

CoClustering
* Low Precision: 0.672306507936508
* Med Precision: 0.6898487301587302
* High Precision: 0.7660446031746032
* All Precision: 0.709399947089947

* Low Recall: 0.21368047819309094
* Med Recall: 0.15228889489132955
* High Recall: 0.2421086986639204
* All Recall: 0.20269269058278033