# Baseline - Most popular
Most popular books are a combination of vote count and the weighted rating

In [1]:
from bibrec.server.Utils import assign_popular_based_score
import pandas as pd
import numpy as np
import bibrec.server.evaluation as eval
from collections import defaultdict
import bibrec.server.Utils as Utils
import importlib
from sklearn.model_selection import train_test_split

importlib.reload(eval)
importlib.reload(Utils)

books, users, ratings = Utils.get_normalized_data(books_path="../data/normalized_books.csv",
                                                  users_path="../data/normalized_users.csv",
                                                  ratings_path="../data/normalized_ratings.csv")

  ratings = pd.read_csv(ratings_path, sep=",", encoding="utf-8", na_filter=False)


In [2]:
train, test = train_test_split(ratings, test_size=0.25)

In [3]:
print("Total Ratings Count:", len(ratings))
print("Train Data Ratings Count:", len(train))
print("Test Data Ratings Count:", len(test))

Total Ratings Count: 383962
Train Data Ratings Count: 287971
Test Data Ratings Count: 95991


In [4]:
book_ratings_dict = {
    'user_id': train['user_id'].values,
    'isbn13': train['isbn13'].values,
    'book_rating': train['book_rating'].values
}

book_rating_df = train[['user_id', 'isbn13', 'book_rating']]

In [5]:
train["isbn13"] = train["isbn13"].astype("int")
books["isbn13"] = books["isbn13"].astype("int")
popular_books = assign_popular_based_score(train, books, "user_id", "isbn13", "book_rating")
popular_books = popular_books.sort_values("vote_count", ascending=False)
popular_books = popular_books[:200]
popular_books = popular_books.sort_values("weighted_rating", ascending=False)

print("Most popular books are:")
print(popular_books[:20])

Most popular books are:
              isbn13  vote_count  avg_rating  weighted_rating
16349  9780439139595         108    9.333333         9.300542
16411  9780439358064         150    9.126667         9.105655
16346  9780439136365          94    9.063830         9.031871
23545  9780590353403          84    9.011905         8.977437
23546  9780590353427         240    8.954167         8.942395
16345  9780439136358         109    8.954128         8.928465
1543    978043913961          82    8.939024         8.905471
9828   9780345339706          96    8.864583         8.837343
17684  9780446310789         163    8.852761         8.836725
16309  9780439064866          94    8.808511         8.781871
31038  9780812550702          94    8.787234         8.761038
16310  9780439064873         148    8.736486         8.720397
9827   9780345339683         130    8.707692         8.689845
20132  9780451524935          71    8.718310         8.685748
16764  9780440219071          77    8.701299  

In [6]:
pop_dict = {
    'item_id': popular_books['isbn13'].values,
    'est_r': popular_books['weighted_rating'].values,
}

pop_df = pd.DataFrame.from_dict(pop_dict)

ratings_dict = {
    'user_id': test['user_id'].values,
    'item_id': test['isbn13'].values,
    'rating': test['book_rating'].values
}

ratings_df = pd.DataFrame.from_dict(ratings_dict)

In [7]:
test_uids = ratings_df["user_id"].unique()

print("User Count:", len(test_uids))

User Count: 29804


In [8]:
def def_value():
    return "Not Present"


top_n = defaultdict(def_value)
for uid in test_uids:
    top_n[uid] = pop_df

In [9]:
import time

start_time = time.time()
avg_precision = eval.get_avg_precision(ratings_df, top_n, k=50)

print("Average Precision:", avg_precision)
print("--- Calculation time: %s seconds ---" % (time.time() - start_time))

Average Precision: 0.001025060421410738
--- Calculation time: 72.16734409332275 seconds ---


In [10]:
avg_recall = eval.get_avg_recall(ratings_df, top_n, k=50)

print("Average Recall:", avg_recall)
print("--- Calculation time: %s seconds ---" % (time.time() - start_time))

Average Recall: 0.017260116906959715
--- Calculation time: 129.41158509254456 seconds ---
