# Baseline - Most popular
User individual splitting of ratings

In [1]:
from bibrec.server.Utils import assign_popular_based_score
import pandas as pd
import numpy as np
import bibrec.server.evaluation as eval
from collections import defaultdict
import bibrec.server.Utils as Utils
import importlib
from sklearn.model_selection import train_test_split

importlib.reload(eval)
importlib.reload(Utils)

books, users, ratings = Utils.get_normalized_data(books_path="../data/normalized_books.csv",
                                                  users_path="../data/normalized_users.csv",
                                                  ratings_path="../data/normalized_ratings.csv")

  ratings = pd.read_csv(ratings_path, sep=",", encoding="utf-8", na_filter=False)


In [2]:
ratings = ratings[:150000]

In [3]:
users_with_ratings = ratings["user_id"].unique()

train = pd.DataFrame(columns=ratings.columns)
test = pd.DataFrame(columns=ratings.columns)

# split data for each user individually
for user_id in users_with_ratings:
    mask = ratings["user_id"] == user_id
    user_ratings = ratings[mask]
    if len(user_ratings) < 3:
        continue
    train_user, test_user = train_test_split(user_ratings, test_size=0.25)
    train = pd.concat([train, train_user])
    test = pd.concat([test, test_user])



In [4]:
print("Total Ratings Count:", len(train) + len(test))
print("Train Data Ratings Count:", len(train))
print("Test Data Ratings Count:", len(test))

Total Ratings Count: 129154
Train Data Ratings Count: 94371
Test Data Ratings Count: 34783


In [5]:
book_ratings_dict = {
    'user_id': train['user_id'].values,
    'isbn13': train['isbn13'].values,
    'book_rating': train['book_rating'].values
}

book_rating_df = train[['user_id', 'isbn13', 'book_rating']]

In [6]:
train["isbn13"] = train["isbn13"].astype("int")
books["isbn13"] = books["isbn13"].astype("int")
popular_books = assign_popular_based_score(train, books, "user_id", "isbn13", "book_rating")
popular_books = popular_books.sort_values("vote_count", ascending=False)

print(popular_books[:20])

              isbn13  vote_count  avg_rating  weighted_rating
13480  9780316666343         123    8.097561         8.094197
19862  9780385504201         110    8.554545         8.546671
47952  9780971880108          83    4.301205         4.341434
31790  9780590353427          73    9.123288         9.103790
6979   9780060928339          70    7.714286         7.713809
11769  9780312195519          64    8.390625         8.379699
10034  9780142001745          61    8.360656         8.349685
2393    978044667227          59    8.067797         8.061341
33383  9780671027360          59    8.237288         8.228007
38463  9780743418171          59    7.915254         7.911341
27500  9780452282155          56    7.839286         7.836499
10394  9780156027328          55    8.454545         8.440722
24305  9780446310789          54    8.962963         8.939644
35978  9780679764021          52    7.576923         7.578876
40905  9780786868711          51    7.823529         7.820778
23482  9

In [7]:
pop_dict = {
    'item_id': popular_books['isbn13'].values,
    'est_r': popular_books['avg_rating'].values,
}

pop_df = pd.DataFrame.from_dict(pop_dict)

ratings_dict = {
    'user_id': test['user_id'].values,
    'item_id': test['isbn13'].values,
    'rating': test['book_rating'].values
}

ratings_df = pd.DataFrame.from_dict(ratings_dict)

In [8]:
test_uids = ratings_df["user_id"].unique()

print("User Count:", len(test_uids))

User Count: 7547


In [9]:
def def_value():
    return "Not Present"


top_n = defaultdict(def_value)
for uid in test_uids:
    top_n[uid] = pop_df

In [10]:
import time

importlib.reload(eval)
importlib.reload(Utils)
start_time = time.time()
avg_precision = eval.get_avg_precision(ratings_df, top_n, k=50)

print("Average Precision:", avg_precision)
print("--- Calculation time: %s seconds ---" % (time.time() - start_time))

Average Precision: 0.001784418757095977
--- Calculation time: 31.682682991027832 seconds ---


In [11]:
avg_recall = eval.get_avg_recall(ratings_df, top_n, k=50)

print("Average Recall:", avg_recall)
print("--- Calculation time: %s seconds ---" % (time.time() - start_time))

Average Recall: 0.021852577433061477
--- Calculation time: 62.42608594894409 seconds ---
