# Baseline - Most popular

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split


def prepare_string(string):
    return str(string).strip().lower().replace('-', '_')


movie_ratings = pd.read_csv("./data/ratings.csv", sep=",", encoding="latin-1")
movies = pd.read_csv("./data/movies.csv", sep=",", encoding="latin-1")
#ratings.columns = ratings.columns.map(prepare_string)

ratings_dict = {
    'userId': movie_ratings['userId'].values,
    'movieId': movie_ratings['movieId'].values,
    'rating': movie_ratings['rating'].values
}

df = pd.DataFrame.from_dict(ratings_dict)

train_ml, test_ml = train_test_split(df, test_size=0.2)

pd.DataFrame(train_ml).to_csv("./data/ml_ratings_train.csv", index=False, header=False)
pd.DataFrame(test_ml).to_csv("./data/ml_rating_test.csv", index=False, header=False)

In [9]:
from caserec.recommenders.item_recommendation.most_popular import MostPopular

MostPopular("./data/ml_ratings_train.csv", "./data/ml_rating_test.csv", output_file="./data/ml_rating_mp.csv",
            sep=",").compute()

[Case Recommender: Item Recommendation > Most Popular]

train data:: 610 users and 8974 items (80668 interactions) | sparsity:: 98.53%
test data:: 610 users and 5173 items (20168 interactions) | sparsity:: 99.36%

prediction_time:: 19.990337 sec


Eval:: PREC@1: 0.265574 PREC@3: 0.214208 PREC@5: 0.191475 PREC@10: 0.161311 RECALL@1: 0.016724 RECALL@3: 0.033105 RECALL@5: 0.04809 RECALL@10: 0.076972 MAP@1: 0.265574 MAP@3: 0.335792 MAP@5: 0.339133 MAP@10: 0.326335 NDCG@1: 0.265574 NDCG@3: 0.398905 NDCG@5: 0.417667 NDCG@10: 0.426227 


In [10]:
mp = pd.read_csv("./data/ml_rating_mp.csv", sep=",", encoding="latin-1")
mp

Unnamed: 0,1,318,1104.000000
0,1,356,1087.0
1,1,593,885.5
2,1,589,727.5
3,1,1198,701.5
4,1,858,658.0
...,...,...,...
6094,610,364,556.5
6095,610,4226,556.5
6096,610,4306,528.0
6097,610,590,503.0


In [11]:
from caserec.evaluation.item_recommendation import ItemRecommendationEvaluation

ItemRecommendationEvaluation(n_ranks=[1, 3, 5, 10], metrics=['PREC', 'RECALL'], sep=",",
                             as_table=True).evaluate_with_files("./data/ml_rating_mp.csv",
                                                                "./data/ml_rating_test.csv")

PREC@1	PREC@3	PREC@5	PREC@10	RECALL@1	RECALL@3	RECALL@5	RECALL@10	
0.265574	0.214208	0.191475	0.161311	0.016724	0.033105	0.04809	0.076972	


{'PREC@1': 0.265574,
 'RECALL@1': 0.016724,
 'NDCG@1': 0.265574,
 'MAP@1': 0.265574,
 'MAP': 0.326335,
 'PREC@3': 0.214208,
 'RECALL@3': 0.033105,
 'NDCG@3': 0.398905,
 'MAP@3': 0.335792,
 'PREC@5': 0.191475,
 'RECALL@5': 0.04809,
 'NDCG@5': 0.417667,
 'MAP@5': 0.339133,
 'PREC@10': 0.161311,
 'RECALL@10': 0.076972,
 'NDCG@10': 0.426227,
 'MAP@10': 0.326335}

In [1]:
import numpy as np


def weighted_rating(v, m, R, C):
    """
    Calculate the weighted rating

    Args:
    v -> average rating for each item (float)
    m -> minimum votes required to be classified as popular (float)
    R -> average rating for the item (pd.Series)
    C -> average rating for the whole dataset (pd.Series)

    Returns:
    pd.Series
    """
    return ((v / (v + m)) * R) + ((m / (v + m)) * C)


def assign_popular_based_score(rating_df, item_df, user_col, item_col, rating_col):
    """

    Assigned popular based score based on the IMDB weighted average.

    Args:
    rating -> pd.DataFrame contains ['item_id', 'rating'] for each user.

    Returns
    popular_items -> pd.DataFrame contains item and IMDB weighted score.
    """

    # pre processing
    vote_count = rating_df.groupby(item_col, as_index=False).agg(
        {user_col: "count", rating_col: "mean"}
    )
    vote_count.columns = [item_col, "vote_count", "avg_rating"]

    # calcuate input parameters
    C = np.mean(vote_count["avg_rating"])
    m = np.percentile(vote_count["vote_count"], 70)
    vote_count = vote_count[vote_count["vote_count"] >= m]
    R = vote_count["avg_rating"]
    v = vote_count["vote_count"]
    vote_count["weighted_rating"] = weighted_rating(v, m, R, C)

    # post processing
    vote_count = vote_count.merge(item_df, on=[item_col], how="left")
    popular_items = vote_count.loc[
                    :, [item_col, "vote_count", "avg_rating", "weighted_rating"]
                    ]

    return popular_items

In [None]:
# init constant
USER_COL = "userId"
ITEM_COL = "movieId"
RATING_COL = "rating"

# print(pd.DataFrame(train))

# calcualte popularity based
pop_items = assign_popular_based_score(train_ml, movies, USER_COL, ITEM_COL, RATING_COL)
pop_items = pop_items.sort_values("weighted_rating", ascending=False)
pop_10_items = pop_items[:10]

print(pop_items[:10])


In [13]:
import bibrec.server.evaluation as eval
from collections import defaultdict
import importlib

importlib.reload(eval)

pop_dict = {
    'itemId': pop_items['movieId'].values,
    'est_r': pop_items['weighted_rating'].values,
}

pop_df = pd.DataFrame.from_dict(pop_dict)

train_dict = {
    'userId': movie_ratings['userId'].values,
    'itemId': movie_ratings['movieId'].values,
    'rating': movie_ratings['rating'].values
}

train_df = pd.DataFrame.from_dict(train_dict)

uids = train_df["userId"].unique()


def def_value():
    return "Not Present"


top_n = defaultdict(def_value)
for uid in uids:
    top_n[uid] = pop_df

avg_precision_ml = eval.get_avg_precision(train_df, top_n, k=10, threshold=3.5)

avg_recall_ml = eval.get_avg_recall(train_df, top_n, k=10, threshold=3.5)

print("Average Precision Movielens:", avg_precision_ml)
print("Average Recall Movielens:", avg_recall_ml)

Average Precision Movielens: 0.24672131147540952
Average Recall Movielens: 0.0021269078575466274


## Book Crossing Dataset

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
import bibrec.server.evaluation as eval
from collections import defaultdict
import bibrec.server.data_exploration as data_exploration
import importlib

importlib.reload(eval)
importlib.reload(data_exploration)

books = data_exploration.books
ratings = data_exploration.filtered_ratings

print(books.dtypes)
print(ratings.dtypes)

With NaN values 34.862889904962536
used mean values 34.866379950089126


  location_seperated = users.location.str.split(',', 2, expand=True)
  books = pd.read_csv("./data/BX-Books.csv", sep=";", encoding="latin-1")


isbn                    object
book_title              object
book_author             object
year_of_publication    float64
publisher               object
image_url_s             object
image_url_m             object
image_url_l             object
isbn13                  object
rating_mean            float64
rating_count           float64
age                    float64
dtype: object
user_id         int64
isbn           object
book_rating     int64
isbn13         object
dtype: object


In [4]:
print(len(ratings['book_rating'].values))


384081


In [5]:
book_ratings_dict = {
    'user_id': ratings['user_id'].values,
    'isbn13': ratings['isbn13'].values,
    'book_rating': ratings['book_rating'].values
}

book_rating_df = pd.DataFrame.from_dict(book_ratings_dict)

        user_id         isbn13  book_rating
108892    75875  9780671677640           10
48451     30610  9780446602730            5
200068   137918  9780684801469           10
259638   182838  9780345428691            8
34391     21252  9780898154900           10
...         ...            ...          ...
207337   143175  9780787118556            7
123581    86243  9780807282595            8
349903   249695  9780440237228            7
177621   121517  9780345417626            7
179919   123517  9783499137907            7

[307264 rows x 3 columns]


In [6]:
popular_books = assign_popular_based_score(book_rating_df, books, "user_id", "isbn13", "book_rating")
popular_books = popular_books.sort_values("weighted_rating", ascending=False)
pop_10_books = popular_books[:10]

print(pop_10_books)

              isbn13  vote_count  avg_rating  weighted_rating
18249  9780439425223          23    9.869565         9.682201
46483  9781888054552          11   10.000000         9.619616
39688  9780836213317          13    9.923077         9.603668
28975  9780618002238          25    9.720000         9.557593
899    9780060256654          20    9.750000         9.547955
15963  9780394800899           8   10.000000         9.505501
15959  9780394800387          14    9.785714         9.503438
41634  9780894718380           7   10.000000         9.450557
44698   978157145698           7   10.000000         9.450557
39749  9780836220889          24    9.583333         9.425193


In [7]:
pop_dict = {
    'itemId': popular_books['isbn13'].values,
    'est_r': popular_books['weighted_rating'].values,
}

pop_df = pd.DataFrame.from_dict(pop_dict)

ratings_dict = {
    'userId': ratings['user_id'].values,
    'itemId': ratings['isbn13'].values,
    'rating': ratings['book_rating'].values
}

ratings_df = pd.DataFrame.from_dict(ratings_dict)

print(pop_df)
print(ratings_df)


              itemId     est_r
0      9780439425223  9.682201
1      9781888054552  9.619616
2      9780836213317  9.603668
3      9780618002238  9.557593
4      9780060256654  9.547955
...              ...       ...
50712  9780971880108  4.401467
50713  9780689855511  4.263753
50714  9780060616595  4.263753
50715  9780345361882  4.263753
50716   978188098507  3.823183

[50717 rows x 2 columns]
        userId         itemId  rating
0       276726  9780155061224       5
1       276729  9780521656153       3
2       276729  9780521795029       6
3       276744  9780385501200       7
4       276747  9780060517793       9
...        ...            ...     ...
384076  276704  9780743211383       7
384077  276704  9780806917696       5
384078  276704  9781563526299       9
384079  276709  9780515107661      10
384080  276721  9780590442442      10

[384081 rows x 3 columns]


In [8]:
uids = ratings_df["userId"].unique()

print("User Count", len(uids))
print(uids)

User Count 68175
[276726 276729 276744 ... 276704 276709 276721]


In [9]:
def def_value():
    return "Not Present"


top_n = defaultdict(def_value)
for uid in uids:
    top_n[uid] = pop_df

In [10]:
import time

start_time = time.time()
avg_precision = eval.get_avg_precision(ratings_df, top_n, k=10, threshold=5)
print("--- %s seconds ---" % (time.time() - start_time))

print("Average Precision:", avg_precision)

--- 185.1832571029663 seconds ---
Average Precision: 0.00022295562889622245


In [11]:
importlib.reload(eval)
start_time = time.time()

avg_recall = eval.get_avg_recall(ratings_df, top_n, k=10, threshold=5)
print("--- %s seconds ---" % (time.time() - start_time))

print("Average Recall:", avg_recall)



--- 193.45407891273499 seconds ---
Average Recall: 0.00024626284494808765
