# Baseline - Most popular

## Case Recommender Implementation

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import bibrec.server.data_exploration as data_exploration


def prepare_string(string):
    """Return a normalised text form of ``string``.

    The value is converted with ``str()``, surrounding whitespace is removed,
    the text is lower-cased, and every hyphen is replaced by an underscore.
    """
    text = str(string)
    text = text.strip()
    text = text.lower()
    return text.replace('-', '_')


# Work on the first 100,000 filtered ratings only
# (presumably to keep the Case Recommender run time manageable - TODO confirm).
book_ratings = data_exploration.filtered_ratings[:100000]
books = data_exploration.books

# Keep just the three columns that are exported: user, item, rating.
ratings_dict = {
    'userId': book_ratings['user_id'].values,
    'isbn13': book_ratings['isbn13'].values,
    'book_rating': book_ratings['book_rating'].values
}

df = pd.DataFrame.from_dict(ratings_dict)

# A fixed seed makes the 80/20 split, and therefore the exported files and
# every metric computed from them, reproducible under Restart & Run All.
train_bx, test_bx = train_test_split(df, test_size=0.2, random_state=42)

# Written without index and without header row; the cells below pass these
# paths to Case Recommender with sep=",".
train_bx.to_csv("./data/bx_ratings_train.csv", index=False, header=False)
test_bx.to_csv("./data/bx_rating_test.csv", index=False, header=False)

With NaN values 34.862889904962536
used mean values 34.870185865015245


  location_seperated = users.location.str.split(',', 2, expand=True)
  books = pd.read_csv("./data/BX-Books.csv", sep=";", encoding="latin-1")


In [2]:
from caserec.recommenders.item_recommendation.most_popular import MostPopular

# Run the Case Recommender "Most Popular" baseline on the exported train/test
# files; the ranking is written to the given output file.
most_popular = MostPopular(
    "./data/bx_ratings_train.csv",
    "./data/bx_rating_test.csv",
    output_file="./data/bx_rating_mp.csv",
    sep=",",
)
most_popular.compute()

[Case Recommender: Item Recommendation > Most Popular]

train data:: 15452 users and 48517 items (80000 interactions) | sparsity:: 99.99%
test data:: 6679 users and 15554 items (20000 interactions) | sparsity:: 99.98%

prediction_time:: 1312.260658 sec


Eval:: PREC@1: 0.004342 PREC@3: 0.002845 PREC@5: 0.003174 PREC@10: 0.00271 RECALL@1: 0.001193 RECALL@3: 0.002994 RECALL@5: 0.006785 RECALL@10: 0.011249 MAP@1: 0.004342 MAP@3: 0.006214 MAP@5: 0.007801 MAP@10: 0.009065 NDCG@1: 0.004342 NDCG@3: 0.008037 NDCG@5: 0.011302 NDCG@10: 0.014704 


In [6]:
# Load the ranking file produced by the Most Popular run.
# NOTE(review): read_csv treats the first row as a header by default - confirm
# that the file actually starts with a header row.
mp = pd.read_csv(
    "./data/bx_rating_mp.csv",
    sep=",",
    encoding="latin-1",
)

In [7]:
from caserec.evaluation.item_recommendation import ItemRecommendationEvaluation

# Re-evaluate the stored ranking against the test file, reporting precision
# and recall at ranks 1, 3, 5 and 10 as a table.
evaluator = ItemRecommendationEvaluation(
    n_ranks=[1, 3, 5, 10],
    metrics=['PREC', 'RECALL'],
    sep=",",
    as_table=True,
)
evaluation = evaluator.evaluate_with_files(
    "./data/bx_rating_mp.csv",
    "./data/bx_rating_test.csv",
)

PREC@1	PREC@3	PREC@5	PREC@10	RECALL@1	RECALL@3	RECALL@5	RECALL@10	
0.004342	0.002845	0.003174	0.00271	0.001193	0.002994	0.006785	0.011249	


## Own Implementation

In [8]:
from bibrec.server.Utils import assign_popular_based_score
import pandas as pd
import numpy as np
import bibrec.server.evaluation as eval
from collections import defaultdict
import bibrec.server.data_exploration as data_exploration
import importlib
from sklearn.model_selection import train_test_split

# Reload the project modules so edits made to them while the kernel is
# running are picked up.
importlib.reload(eval)
importlib.reload(data_exploration)

books = data_exploration.books
ratings = data_exploration.filtered_ratings

# A fixed seed makes the 80/20 split reproducible; without it every metric
# computed below changes from run to run.
train, test = train_test_split(ratings, test_size=0.20, random_state=42)

With NaN values 34.862889904962536
used mean values 34.881596148928175


  location_seperated = users.location.str.split(',', 2, expand=True)
  books = pd.read_csv("./data/BX-Books.csv", sep=";", encoding="latin-1")


In [9]:
# Report the size of the full rating set and of each split.
for label, frame in (
    ("Ratings Count:", ratings),
    ("Train Data Ratings Count:", train),
    ("Test Data Ratings Count:", test),
):
    print(label, len(frame))

Ratings Count: 383963
Train Data Ratings Count: 307170
Test Data Ratings Count: 76793


In [10]:
# Copy the user, item and rating columns of the training split into a
# stand-alone frame.
book_ratings_dict = {
    column: train[column].values
    for column in ('user_id', 'isbn13', 'book_rating')
}

book_rating_df = pd.DataFrame(book_ratings_dict)

In [11]:
# Score the books from the training split with the project helper, then order
# them by number of votes, most-voted first.
popular_books = (
    assign_popular_based_score(train, books, "user_id", "isbn13", "book_rating")
    .sort_values("vote_count", ascending=False)
)

print(popular_books.head(20))

              isbn13  vote_count  avg_rating  weighted_rating
6493   9780316666343         566    8.173145         8.170880
35163  9780971880108         460    4.360870         4.374588
12560  9780385504201         379    8.395778         8.391233
5146   9780312195519         302    8.178808         8.174539
23418  9780590353427         251    8.872510         8.861896
23419  9780590353427         251    8.872510         8.861896
17456   978044667227         244    8.147541         8.142519
1254   9780060928339         243    7.909465         7.906366
4022   9780142001745         239    8.401674         8.394439
15833  9780440237228         229    7.318777         7.320605
15834  9780440237228         229    7.318777         7.320605
24828  9780671027360         226    8.110619         8.105525
19765  9780452282155         226    8.000000         7.995876
6472   9780316601955         224    7.562500         7.562211
30794  9780786868711         197    8.071066         8.065627
27232  9

In [12]:
# Popularity list: each book with its average rating as the estimated rating.
pop_dict = {
    'itemId': popular_books['isbn13'].values,
    'est_r': popular_books['avg_rating'].values,
}

# Test-split ratings, renamed to the userId / itemId / rating columns.
ratings_dict = {
    'userId': test['user_id'].values,
    'itemId': test['isbn13'].values,
    'rating': test['book_rating'].values
}

pop_df = pd.DataFrame(pop_dict)
ratings_df = pd.DataFrame(ratings_dict)

In [13]:
# Distinct users that appear in the test ratings.
user_id_column = ratings_df["userId"]
uids = user_id_column.unique()

print("User Count:", len(uids))

User Count: 25881


In [14]:
def def_value():
    """Return the placeholder string used as the ``defaultdict`` fallback."""
    placeholder = "Not Present"
    return placeholder


# Every test user gets the same recommendation list (the popularity frame);
# users not in `uids` fall back to the def_value() placeholder.
top_n = defaultdict(def_value)
top_n.update((uid, pop_df) for uid in uids)

In [15]:
# Mean rating over the test split; used below as the relevance threshold.
test_ratings = ratings_df["rating"]
mean_rating_test = test_ratings.mean()

print("Average Rating:", mean_rating_test)

Average Rating: 7.630435065696092


In [16]:
import time

# Time the precision evaluation of the popularity list (k=20, with the mean
# test rating as threshold).
start_time = time.time()
avg_precision = eval.get_avg_precision(
    ratings_df,
    top_n,
    k=20,
    threshold=mean_rating_test,
)

print("Average Precision:", avg_precision)
elapsed_seconds = time.time() - start_time
print("--- Calculation time: %s seconds ---" % elapsed_seconds)

Average Precision: 0.00190294038097446
--- Calculation time: 49.03213095664978 seconds ---


In [17]:
# Restart the timer for this cell. Reusing `start_time` from the precision
# cell would make the reported duration include the precision run and any
# idle time between the two cell executions.
start_time = time.time()
avg_recall = eval.get_avg_recall(ratings_df, top_n, k=20, threshold=mean_rating_test)

print("Average Recall:", avg_recall)
print("--- Calculation time: %s seconds ---" % (time.time() - start_time))

Average Recall: 0.016136531546655578
--- Calculation time: 44.69033885002136 seconds ---
