# Baseline - Most popular

In [1]:
from caserec.recommenders.item_recommendation.most_popular import MostPopular
from sklearn.model_selection import train_test_split
import pandas as pd

def prepare_string(string):
    return str(string).strip().lower().replace('-', '_')

ratings = pd.read_csv("./data/ratings.csv", sep=",", encoding="latin-1")
items = pd.read_csv("./data/movies.csv", sep=",", encoding="latin-1")
#ratings.columns = ratings.columns.map(prepare_string)

ratings_dict = {
    'userId': ratings['userId'].values,
    'movieId': ratings['movieId'].values,
    'rating': ratings['rating'].values
}

df = pd.DataFrame.from_dict(ratings_dict)

train, test = train_test_split(df, test_size=0.2)

pd.DataFrame(train).to_csv("./data/ml_ratings_train.csv", index=False, header=False)
pd.DataFrame(test).to_csv("./data/ml_rating_test.csv", index=False, header=False)

In [2]:
from caserec.recommenders.item_recommendation.most_popular import MostPopular

MostPopular("./data/ml_ratings_train.csv", "./data/ml_rating_test.csv", output_file="./data/ml_rating_mp.csv", sep=",").compute()

[Case Recommender: Item Recommendation > Most Popular]

train data:: 610 users and 9005 items (80668 interactions) | sparsity:: 98.53%
test data:: 609 users and 5120 items (20168 interactions) | sparsity:: 99.35%

prediction_time:: 25.402501 sec


Eval:: PREC@1: 0.261084 PREC@3: 0.219485 PREC@5: 0.194417 PREC@10: 0.156158 RECALL@1: 0.013956 RECALL@3: 0.031315 RECALL@5: 0.045606 RECALL@10: 0.067141 MAP@1: 0.261084 MAP@3: 0.334291 MAP@5: 0.343493 MAP@10: 0.330071 NDCG@1: 0.261084 NDCG@3: 0.39956 NDCG@5: 0.422841 NDCG@10: 0.424179 


In [3]:
mp = pd.read_csv("./data/ml_rating_mp.csv", sep=",", encoding="latin-1")
mp

Unnamed: 0,1,318,1136.000000
0,1,1198,707.5
1,1,1210,688.0
2,1,589,679.5
3,1,4993,633.5
4,1,7153,632.5
...,...,...,...
6094,610,150,593.0
6095,610,3578,570.5
6096,610,2762,560.5
6097,610,364,552.0


In [4]:
from caserec.evaluation.item_recommendation import ItemRecommendationEvaluation

eval = ItemRecommendationEvaluation(n_ranks=[1, 3, 5, 10],metrics=['PREC', 'RECALL'],sep=",", as_table=True).evaluate_with_files("./data/ml_rating_mp.csv", "./data/ml_rating_test.csv")

PREC@1	PREC@3	PREC@5	PREC@10	RECALL@1	RECALL@3	RECALL@5	RECALL@10	
0.261084	0.219485	0.194417	0.156158	0.013956	0.031315	0.045606	0.067141	


In [5]:
import numpy as np


def weighted_rating(v, m, R, C):
    """
    Calculate the weighted rating

    Args:
    v -> average rating for each item (float)
    m -> minimum votes required to be classified as popular (float)
    R -> average rating for the item (pd.Series)
    C -> average rating for the whole dataset (pd.Series)

    Returns:
    pd.Series
    """
    return ((v / (v + m)) * R) + ((m / (v + m)) * C)


def assign_popular_based_score(rating_df, item_df, user_col, item_col, rating_col):
    """

    Assigned popular based score based on the IMDB weighted average.

    Args:
    rating -> pd.DataFrame contains ['item_id', 'rating'] for each user.

    Returns
    popular_items -> pd.DataFrame contains item and IMDB weighted score.
    """

    # pre processing
    vote_count = rating_df.groupby(item_col, as_index=False).agg(
        {user_col: "count", rating_col: "mean"}
    )
    vote_count.columns = [item_col, "vote_count", "avg_rating"]

    # calcuate input parameters
    C = np.mean(vote_count["avg_rating"])
    m = np.percentile(vote_count["vote_count"], 70)
    vote_count = vote_count[vote_count["vote_count"] >= m]
    R = vote_count["avg_rating"]
    v = vote_count["vote_count"]
    vote_count["weighted_rating"] = weighted_rating(v, m, R, C)

    # post processing
    vote_count = vote_count.merge(item_df, on=[item_col], how="left")
    popular_items = vote_count.loc[
        :, [item_col, "vote_count", "avg_rating", "weighted_rating"]
    ]

    return popular_items


# init constant
USER_COL = "userId"
ITEM_COL = "movieId"
RATING_COL = "rating"

# print(pd.DataFrame(train))

# calcualte popularity based
pop_items = assign_popular_based_score(train, items, USER_COL, ITEM_COL, RATING_COL)
pop_items = pop_items.sort_values("weighted_rating", ascending=False)
pop_10_items = pop_items[:10]

print(pop_items[:10])


      movieId  vote_count  avg_rating  weighted_rating
178       318         256    4.437500         4.410653
394       858         142    4.292254         4.250616
549      1213         100    4.295000         4.236709
362       750          69    4.318841         4.234548
286       527         176    4.250000         4.217534
2382    58559         119    4.260504         4.212729
541      1204          37    4.364865         4.211422
557      1221          99    4.262626         4.205630
2264    48516          88    4.267045         4.203097
536      1198         167    4.236527         4.202839


In [6]:
# Precision

def get_precision_for_uid(merged_set, threshold=3.5):

    recommendation_count = sum((weighted_rating >= threshold) for weighted_rating in merged_set["weighted_rating"])
    
    true_positive = sum((weighted_rating >= threshold and rating >= threshold) for rating, weighted_rating in zip(merged_set["rating"],merged_set["weighted_rating"]))
    
    return true_positive / recommendation_count if recommendation_count != 0 else 0

In [7]:
# Recall

def get_recall(merged_set, user_predictions, threshold=3.5):
    
    rel_items_count = sum((weighted_rating >= threshold) for weighted_rating in user_predictions["weighted_rating"])
    
    true_positive = sum((weighted_rating >= threshold and rating >= threshold) for rating, weighted_rating in zip(merged_set["rating"],merged_set["weighted_rating"]))
    
    return true_positive / rel_items_count if rel_items_count != 0 else 0

In [8]:
def calc_avg_precision(threshold=3.5):

    uids = train["userId"].unique()
    sum = 0

    for uid in uids:
        user_ratings = train[train["userId"] == uid]

        merged = pop_10_items.merge(user_ratings, on="movieId", how="left")

        merged = merged.drop(["vote_count", "userId"], axis=1)

        precision = get_precision_for_uid(merged, threshold=threshold)

        sum += precision

    return sum / len(uids)


def get_avg_recall(threshold=3.5):
    uids = train["userId"].unique()

    sum = 0
    for uid in uids:
        user_ratings = train[train["userId"] == uid]
        merged = pop_10_items.merge(user_ratings, on="movieId", how="left")
        merged = merged.drop(["vote_count", "userId"], axis=1)
        recall = get_recall(merged, pop_items, threshold=threshold)

        sum += recall
    return sum / len(uids)


print("Average Precision:", calc_avg_precision(3.5))
print("Average Recall:", get_avg_recall(3.5))


' def calc_avg_precision(threshold=3.5):\n\n    uids = train["userId"].unique()\n    sum = 0\n\n    for uid in uids:\n        user_ratings = train[train["userId"] == uid]\n\n        merged = pop_10_items.merge(user_ratings, on="movieId", how="left")\n\n        merged = merged.drop(["vote_count", "userId"], axis=1)\n\n        precision = get_precision_for_uid(merged, threshold=threshold)\n\n        sum += precision\n\n    return sum / len(uids)\n\n\ndef get_avg_recall(threshold=3.5):\n    uids = train["userId"].unique()\n\n    sum = 0\n    for uid in uids:\n        user_ratings = train[train["userId"] == uid]\n        merged = pop_10_items.merge(user_ratings, on="movieId", how="left")\n        merged = merged.drop(["vote_count", "userId"], axis=1)\n        recall = get_recall(merged, pop_items, threshold=threshold)\n\n        sum += recall\n    return sum / len(uids)\n\n\nprint("Average Precision:", calc_avg_precision(3.5))\nprint("Average Recall:", get_avg_recall(3.5))\n '

In [35]:
import bibrec.server.evaluation as eval
from collections import defaultdict
import importlib
importlib.reload(eval)

pop_dict = {
    'itemId': pop_items['movieId'].values,
    'est_r': pop_items['weighted_rating'].values,
}

pop_df = pd.DataFrame.from_dict(pop_dict)

train_dict = {
	'userId': ratings['userId'].values,
    'itemId': ratings['movieId'].values,
    'rating': ratings['rating'].values
}

train_df = pd.DataFrame.from_dict(train_dict)

uids = train_df["userId"].unique()


def def_value():
    return "Not Present"


top_n = defaultdict(def_value)
for uid in uids:
    top_n[uid] = pop_df

avg_precision = eval.get_avg_precision(train_df, top_n, k=10, threshold=3.5)

avg_recall = eval.get_avg_recall(train_df, top_n, k=10, threshold=3.5)

print("Average Precision:", avg_precision)
print("Averagte Recall:", avg_recall)


Popular items       itemId     est_r
0        318  4.410653
1        858  4.250616
2       1213  4.236709
3        750  4.234548
4        527  4.217534
...      ...       ...
2828    1882  2.190348
2829    2643  2.171006
2830    1499  2.123832
2831    1760  2.072742
2832    1556  2.034773

[2833 rows x 2 columns]
Popular items       itemId     est_r
0        318  4.410653
1        858  4.250616
2       1213  4.236709
3        750  4.234548
4        527  4.217534
...      ...       ...
2828    1882  2.190348
2829    2643  2.171006
2830    1499  2.123832
2831    1760  2.072742
2832    1556  2.034773

[2833 rows x 2 columns]
Popular items       itemId     est_r
0        318  4.410653
1        858  4.250616
2       1213  4.236709
3        750  4.234548
4        527  4.217534
...      ...       ...
2828    1882  2.190348
2829    2643  2.171006
2830    1499  2.123832
2831    1760  2.072742
2832    1556  2.034773

[2833 rows x 2 columns]
Popular items       itemId     est_r
0        318  4.41