# Baseline - Most popular
Most popular books are a combination of vote count and the average rating

In [1]:
import importlib
from collections import defaultdict

import pandas as pd
from sklearn.model_selection import train_test_split

import bibrec.server.Utils as Utils
import bibrec.server.evaluation as eval
from bibrec.server.Utils import assign_popular_based_score

importlib.reload(eval)
importlib.reload(Utils)

books, users, ratings = Utils.get_normalized_data(books_path="../data/normalized_books.csv",
                                                  users_path="../data/normalized_users.csv",
                                                  ratings_path="../data/normalized_ratings.csv")

  ratings = pd.read_csv(ratings_path, sep=",", encoding="utf-8", na_filter=False)


In [2]:
train, test = train_test_split(ratings, test_size=0.25)

In [3]:
print("Total Ratings Count:", len(ratings))
print("Train Data Ratings Count:", len(train))
print("Test Data Ratings Count:", len(test))

Total Ratings Count: 383962
Train Data Ratings Count: 287971
Test Data Ratings Count: 95991


In [4]:
book_ratings_dict = {
    'user_id': train['user_id'].values,
    'isbn13': train['isbn13'].values,
    'book_rating': train['book_rating'].values
}

book_rating_df = train[['user_id', 'isbn13', 'book_rating']]

In [5]:
train["isbn13"] = train["isbn13"].astype("int")
books["isbn13"] = books["isbn13"].astype("int")
popular_books = assign_popular_based_score(train, books, "user_id", "isbn13", "book_rating")
popular_books = popular_books.sort_values("vote_count", ascending=False)
popular_books = popular_books[:200]
popular_books = popular_books.sort_values("avg_rating", ascending=False)

print("Most popular books are:")
print(popular_books[:20])

Most popular books are:
              isbn13  vote_count  avg_rating  weighted_rating
16382  9780439139595         104    9.259615         9.227056
16379  9780439136365         109    9.137615         9.108720
16447  9780439358064         154    9.045455         9.026077
1577    978043913961          80    9.037500         9.000829
23596  9780590353403          91    8.978022         8.946967
17737  9780446310789         162    8.969136         8.951634
23597  9780590353427         244    8.954918         8.943366
9873   9780345339706          94    8.914894         8.886125
16341  9780439064866          92    8.891304         8.862425
16378  9780439136358         100    8.890000         8.863411
20190  9780451524935          72    8.847222         8.811729
16342  9780439064873         147    8.823129         8.805825
31044  9780812550702          96    8.791667         8.766000
9872   9780345339683         124    8.701613         8.683079
16799  9780440219071          72    8.652778  

In [6]:
pop_dict = {
    'item_id': popular_books['isbn13'].values,
    'est_r': popular_books['avg_rating'].values,
}

pop_df = pd.DataFrame.from_dict(pop_dict)

ratings_dict = {
    'user_id': test['user_id'].values,
    'item_id': test['isbn13'].values,
    'rating': test['book_rating'].values
}

ratings_df = pd.DataFrame.from_dict(ratings_dict)

In [7]:
test_uids = ratings_df["user_id"].unique()

print("User Count:", len(test_uids))

User Count: 29778


In [8]:
def def_value():
    return "Not Present"


top_n = defaultdict(def_value)
for uid in test_uids:
    top_n[uid] = pop_df

In [9]:
import time

start_time = time.time()
avg_precision = eval.get_avg_precision(ratings_df, top_n, k=50)

print("Average Precision:", avg_precision)
print("--- Calculation time: %s seconds ---" % (time.time() - start_time))

Average Precision: 0.001052158578504444
--- Calculation time: 68.47326898574829 seconds ---


In [10]:
avg_recall = eval.get_avg_recall(ratings_df, top_n, k=50)

print("Average Recall:", avg_recall)
print("--- Calculation time: %s seconds ---" % (time.time() - start_time))

Average Recall: 0.015872159384655613
--- Calculation time: 129.53651404380798 seconds ---
