# Baseline - Most popular
Use max count of user ratings in test set for precision and recall calculation

In [1]:
import importlib
from collections import defaultdict

import pandas as pd
from sklearn.model_selection import train_test_split

import bibrec.server.Utils as Utils
import bibrec.server.evaluation as eval
from bibrec.server.Utils import assign_popular_based_score

importlib.reload(eval)
importlib.reload(Utils)

books, users, ratings = Utils.get_normalized_data(books_path="../data/normalized_books.csv",
                                                  users_path="../data/normalized_users.csv",
                                                  ratings_path="../data/normalized_ratings.csv")

  ratings = pd.read_csv(ratings_path, sep=",", encoding="utf-8", na_filter=False)


In [2]:
users_with_ratings = ratings["user_id"].unique()

# normal splitting since user specific splitting doesn't make that much of a difference and takes a long time
train, test = train_test_split(ratings, test_size=0.25)

In [3]:
print("Total Ratings Count:", len(train) + len(test))
print("Train Data Ratings Count:", len(train))
print("Test Data Ratings Count:", len(test))

Total Ratings Count: 383962
Train Data Ratings Count: 287971
Test Data Ratings Count: 95991


In [4]:
book_ratings_dict = {
    'user_id': train['user_id'].values,
    'isbn13': train['isbn13'].values,
    'book_rating': train['book_rating'].values
}

book_rating_df = train[['user_id', 'isbn13', 'book_rating']]

In [5]:
train["isbn13"] = train["isbn13"].astype("int")
books["isbn13"] = books["isbn13"].astype("int")
popular_books = assign_popular_based_score(train, books, "user_id", "isbn13", "book_rating")
popular_books = popular_books.sort_values("vote_count", ascending=False)

print(popular_books[:20])

              isbn13  vote_count  avg_rating  weighted_rating
9052   9780316666343         547    8.170018         8.167693
33525  9780971880108         409    4.435208         4.450276
14199  9780385504201         373    8.388740         8.384169
7943   9780312195519         282    8.170213         8.165717
4608   9780060928339         240    7.920833         7.917618
1725    978044667227         231    8.125541         8.120444
23405  9780590353427         228    8.942982         8.930711
7007   9780142001745         224    8.366071         8.358688
24603  9780671027360         210    8.119048         8.113507
20298  9780452282155         206    8.058252         8.053190
9033   9780316601955         204    7.421569         7.422638
16938  9780440237228         199    7.326633         7.328674
26726  9780679764021         188    7.851064         7.847703
29789  9780786868711         184    7.885870         7.882062
9091   9780316769488         179    7.597765         7.597036
28372  9

In [6]:
pop_dict = {
    'item_id': popular_books['isbn13'].values,
    'est_r': popular_books['avg_rating'].values,
}

pop_df = pd.DataFrame.from_dict(pop_dict)

ratings_dict = {
    'user_id': test['user_id'].values,
    'item_id': test['isbn13'].values,
    'rating': test['book_rating'].values
}

ratings_df = pd.DataFrame.from_dict(ratings_dict)

In [7]:
test_uids = ratings_df["user_id"].unique()

print("User Count:", len(test_uids))

User Count: 29857


In [8]:
def def_value():
    return "Not Present"


top_n = defaultdict(def_value)
for uid in test_uids:
    top_n[uid] = pop_df

In [9]:
import time

importlib.reload(eval)
importlib.reload(Utils)
start_time = time.time()
# use_max_val specifies whether the calculation should use the max count of user ratings as denominator if the recommendation count is higher
avg_precision = eval.get_avg_precision(ratings_df, top_n, k=50, use_max_val=True)

print("Average Precision:", avg_precision)
print("--- Calculation time: %s seconds ---" % (time.time() - start_time))

Average Precision: 0.018975924835369744
--- Calculation time: 52.31079387664795 seconds ---


In [10]:
importlib.reload(eval)
importlib.reload(Utils)
# use_max_val specifies whether the calculation should use the max count of user ratings as denominator if the relevant item count is higher
avg_recall = eval.get_avg_recall(ratings_df, top_n, k=50, use_max_val=True)

print("Average Recall:", avg_recall)
print("--- Calculation time: %s seconds ---" % (time.time() - start_time))

Average Recall: 0.022382074528103858
--- Calculation time: 102.24447202682495 seconds ---
