# Baseline - Most popular

In [1]:
from caserec.recommenders.item_recommendation.most_popular import MostPopular
from sklearn.model_selection import train_test_split
import pandas as pd

def prepare_string(string):
    return str(string).strip().lower().replace('-', '_')

ratings = pd.read_csv("./data/ratings.csv", sep=",", encoding="latin-1")
items = pd.read_csv("./data/movies.csv", sep=",", encoding="latin-1")
#ratings.columns = ratings.columns.map(prepare_string)

ratings_dict = {
    'userId': ratings['userId'].values,
    'movieId': ratings['movieId'].values,
    'rating': ratings['rating'].values
}

df = pd.DataFrame.from_dict(ratings_dict)

#print(df)

train, test = train_test_split(df, test_size=0.2)

#print(pd.DataFrame(train))

pd.DataFrame(train).to_csv("./data/ml_ratings_train.csv", index=False, header=False)
pd.DataFrame(test).to_csv("./data/ml_rating_test.csv", index=False, header=False)

In [2]:
from caserec.recommenders.item_recommendation.most_popular import MostPopular

MostPopular("./data/ml_ratings_train.csv", "./data/ml_rating_test.csv", output_file="./data/ml_rating_mp.csv", sep=",").compute()

[Case Recommender: Item Recommendation > Most Popular]

train data:: 610 users and 8954 items (80668 interactions) | sparsity:: 98.52%
test data:: 608 users and 5196 items (20168 interactions) | sparsity:: 99.36%

prediction_time:: 22.435605 sec


Eval:: PREC@1: 0.258224 PREC@3: 0.213268 PREC@5: 0.191447 PREC@10: 0.159704 RECALL@1: 0.014495 RECALL@3: 0.030138 RECALL@5: 0.045124 RECALL@10: 0.069913 MAP@1: 0.258224 MAP@3: 0.337308 MAP@5: 0.346779 MAP@10: 0.333337 NDCG@1: 0.258224 NDCG@3: 0.410438 NDCG@5: 0.429273 NDCG@10: 0.429976 


In [3]:
mp = pd.read_csv("./data/ml_rating_mp.csv", sep=",", encoding="latin-1")
mp

Unnamed: 0,1,318,1145.500000
0,1,589,734.0
1,1,1,708.5
2,1,858,691.0
3,1,4993,665.0
4,1,5952,613.5
...,...,...,...
6094,610,150,586.5
6095,610,608,577.5
6096,610,588,575.0
6097,610,4226,534.0


In [4]:
from caserec.evaluation.item_recommendation import ItemRecommendationEvaluation

eval = ItemRecommendationEvaluation(n_ranks=[1, 3, 5, 10],metrics=['PREC', 'RECALL'],sep=",", as_table=True).evaluate_with_files("./data/ml_rating_mp.csv", "./data/ml_rating_test.csv")

PREC@1	PREC@3	PREC@5	PREC@10	RECALL@1	RECALL@3	RECALL@5	RECALL@10	
0.258224	0.213268	0.191447	0.159704	0.014495	0.030138	0.045124	0.069913	


In [5]:
import numpy as np


def weighted_rating(v, m, R, C):
    """
    Calculate the weighted rating

    Args:
    v -> average rating for each item (float)
    m -> minimum votes required to be classified as popular (float)
    R -> average rating for the item (pd.Series)
    C -> average rating for the whole dataset (pd.Series)

    Returns:
    pd.Series
    """
    return ((v / (v + m)) * R) + ((m / (v + m)) * C)


def assign_popular_based_score(rating_df, item_df, user_col, item_col, rating_col):
    """

    Assigned popular based score based on the IMDB weighted average.

    Args:
    rating -> pd.DataFrame contains ['item_id', 'rating'] for each user.

    Returns
    popular_items -> pd.DataFrame contains item and IMDB weighted score.
    """

    # pre processing
    vote_count = rating_df.groupby(item_col, as_index=False).agg(
        {user_col: "count", rating_col: "mean"}
    )
    vote_count.columns = [item_col, "vote_count", "avg_rating"]

    # calcuate input parameters
    C = np.mean(vote_count["avg_rating"])
    m = np.percentile(vote_count["vote_count"], 70)
    vote_count = vote_count[vote_count["vote_count"] >= m]
    R = vote_count["avg_rating"]
    v = vote_count["vote_count"]
    vote_count["weighted_rating"] = weighted_rating(v, m, R, C)

    # post processing
    vote_count = vote_count.merge(item_df, on=[item_col], how="left")
    popular_items = vote_count.loc[
        :, [item_col, "genres", "vote_count", "avg_rating", "weighted_rating"]
    ]

    return popular_items


# init constant
USER_COL = "userId"
ITEM_COL = "movieId"
RATING_COL = "rating"

# print(pd.DataFrame(train))

# calcualte popularity based
pop_items = assign_popular_based_score(train, items, USER_COL, ITEM_COL, RATING_COL)
pop_items = pop_items.sort_values("weighted_rating", ascending=False)

print(pop_items[:10])


      movieId                       genres  vote_count  avg_rating  \
179       318                  Crime|Drama         260    4.405769   
396       858                  Crime|Drama         161    4.291925   
2391    58559      Action|Crime|Drama|IMAX         106    4.306604   
1240     2959  Action|Crime|Drama|Thriller         180    4.275000   
550      1213                  Crime|Drama         109    4.288991   
282       527                    Drama|War         175    4.251429   
992      2329                  Crime|Drama         105    4.261905   
148       260      Action|Adventure|Sci-Fi         199    4.231156   
363       750                   Comedy|War          77    4.272727   
496      1089       Crime|Mystery|Thriller          99    4.252525   

      weighted_rating  
179          4.380002  
396          4.254974  
2391         4.250720  
1240         4.242369  
550          4.235484  
282          4.218677  
992          4.207934  
148          4.202832  
363          

In [6]:
from caserec.evaluation.rating_prediction import RatingPredictionEvaluation

pop_items_user_1 = pop_items[:10]

#print(test)

user_id = 1;

user_ratings = train[train["userId"] == user_id]

#user_items = list(filter(lambda user_rating: user_rating["userId"] == user_id, train.items()))

#for (user_id, movie_id) in train.items():
	#print(user_id, movie_id)

print(user_ratings)

def get_precision_for_uid(user_predictions, threshold=3.5):

    recommendation_count = sum((weighted_rating >= threshold) for (_, _, _, _, weighted_rating) in pop_items_user_1)
    
    true_positive = sum((est >= threshold and true_r >= threshold) for (_, est, true_r) in user_predictions)
    
    return true_positive / recommendation_count if recommendation_count != 0 else 0

     userId  movieId  rating
12        1      223     3.0
3         1       47     5.0
163       1     2528     3.0
139       1     2143     4.0
36        1      608     5.0
..      ...      ...     ...
191       1     2949     5.0
67        1     1136     5.0
183       1     2826     4.0
178       1     2692     5.0
101       1     1580     3.0

[193 rows x 3 columns]
