# Baseline - Most popular

## Movie Lens Dataset

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split


def prepare_string(string):
    return str(string).strip().lower().replace('-', '_')


movie_ratings = pd.read_csv("./data/ratings.csv", sep=",", encoding="latin-1")
movies = pd.read_csv("./data/movies.csv", sep=",", encoding="latin-1")

ratings_dict = {
    'userId': movie_ratings['userId'].values,
    'movieId': movie_ratings['movieId'].values,
    'rating': movie_ratings['rating'].values
}

df = pd.DataFrame.from_dict(ratings_dict)

train_ml, test_ml = train_test_split(df, test_size=0.2)

pd.DataFrame(train_ml).to_csv("./data/ml_ratings_train.csv", index=False, header=False)
pd.DataFrame(test_ml).to_csv("./data/ml_rating_test.csv", index=False, header=False)

In [2]:
from caserec.recommenders.item_recommendation.most_popular import MostPopular

MostPopular("./data/ml_ratings_train.csv", "./data/ml_rating_test.csv", output_file="./data/ml_rating_mp.csv",
            sep=",").compute()

[Case Recommender: Item Recommendation > Most Popular]

train data:: 610 users and 8963 items (80668 interactions) | sparsity:: 98.52%
test data:: 610 users and 5157 items (20168 interactions) | sparsity:: 99.36%

prediction_time:: 23.169007 sec


Eval:: PREC@1: 0.272131 PREC@3: 0.222951 PREC@5: 0.198689 PREC@10: 0.168197 RECALL@1: 0.014961 RECALL@3: 0.035595 RECALL@5: 0.049174 RECALL@10: 0.08018 MAP@1: 0.272131 MAP@3: 0.336612 MAP@5: 0.344362 MAP@10: 0.336662 NDCG@1: 0.272131 NDCG@3: 0.396456 NDCG@5: 0.41846 NDCG@10: 0.435217 


In [3]:
mp = pd.read_csv("./data/ml_rating_mp.csv", sep=",", encoding="latin-1")
mp

Unnamed: 0,1,318,1082.500000
0,1,593,905.0
1,1,480,710.5
2,1,589,697.0
3,1,50,678.5
4,1,4993,654.5
...,...,...,...
6094,610,150,640.5
6095,610,2028,639.5
6096,610,608,612.5
6097,610,588,565.5


In [4]:
from caserec.evaluation.item_recommendation import ItemRecommendationEvaluation

ItemRecommendationEvaluation(n_ranks=[1, 3, 5, 10], metrics=['PREC', 'RECALL'], sep=",",
                             as_table=True).evaluate_with_files("./data/ml_rating_mp.csv",
                                                                "./data/ml_rating_test.csv")

PREC@1	PREC@3	PREC@5	PREC@10	RECALL@1	RECALL@3	RECALL@5	RECALL@10	
0.272131	0.222951	0.198689	0.168197	0.014961	0.035595	0.049174	0.08018	


{'PREC@1': 0.272131,
 'RECALL@1': 0.014961,
 'NDCG@1': 0.272131,
 'MAP@1': 0.272131,
 'MAP': 0.336662,
 'PREC@3': 0.222951,
 'RECALL@3': 0.035595,
 'NDCG@3': 0.396456,
 'MAP@3': 0.336612,
 'PREC@5': 0.198689,
 'RECALL@5': 0.049174,
 'NDCG@5': 0.41846,
 'MAP@5': 0.344362,
 'PREC@10': 0.168197,
 'RECALL@10': 0.08018,
 'NDCG@10': 0.435217,
 'MAP@10': 0.336662}

In [6]:
from bibrec.server.Utils import assign_popular_based_score

In [6]:
# init constant
USER_COL = "userId"
ITEM_COL = "movieId"
RATING_COL = "rating"

# print(pd.DataFrame(train))

# calcualte popularity based
pop_items = assign_popular_based_score(train_ml, movies, USER_COL, ITEM_COL, RATING_COL)
pop_items = pop_items.sort_values("weighted_rating", ascending=False)
pop_10_items = pop_items[:10]

print(pop_items[:10])


      movieId  vote_count  avg_rating  weighted_rating
184       318         244    4.436475         4.408244
2259    48516          86    4.331395         4.261534
399       858         150    4.280000         4.240776
551      1213         103    4.296117         4.239093
1104     2571         220    4.250000         4.223722
543      1204          37    4.378378         4.222352
1239     2959         182    4.252747         4.221070
152       260         202    4.242574         4.214236
585      1252          45    4.333333         4.207081
607      1276          48    4.322917         4.204835


In [7]:
import bibrec.server.evaluation as eval
from collections import defaultdict
import importlib

importlib.reload(eval)

pop_dict = {
    'itemId': pop_items['movieId'].values,
    'est_r': pop_items['weighted_rating'].values,
}

pop_df = pd.DataFrame.from_dict(pop_dict)

train_dict = {
    'userId': movie_ratings['userId'].values,
    'itemId': movie_ratings['movieId'].values,
    'rating': movie_ratings['rating'].values
}

train_df = pd.DataFrame.from_dict(train_dict)

uids = train_df["userId"].unique()


def def_value():
    return "Not Present"


top_n = defaultdict(def_value)
for uid in uids:
    top_n[uid] = pop_df

avg_precision_ml = eval.get_avg_precision(train_df, top_n, k=10, threshold=3.5)

avg_recall_ml = eval.get_avg_recall(train_df, top_n, k=10, threshold=3.5)

print("Average Precision Movielens:", avg_precision_ml)
print("Average Recall Movielens:", avg_recall_ml)

Average Precision Movielens: 0.2413114754098358
Average Recall Movielens: 0.03754385981640022


## Book Crossing Dataset

In [59]:
import pandas as pd
import bibrec.server.evaluation as eval
from collections import defaultdict
import bibrec.server.data_exploration as data_exploration
import importlib
from sklearn.model_selection import train_test_split

importlib.reload(eval)
importlib.reload(data_exploration)

books = data_exploration.books
ratings = data_exploration.filtered_ratings

train, test = train_test_split(ratings, test_size=0.20)

print(books.dtypes)
print(train.dtypes)

With NaN values 34.862889904962536
used mean values 34.900867433406724


  location_seperated = users.location.str.split(',', 2, expand=True)
  books = pd.read_csv("./data/BX-Books.csv", sep=";", encoding="latin-1")


isbn                    object
book_title              object
book_author             object
year_of_publication    float64
publisher               object
image_url_s             object
image_url_m             object
image_url_l             object
isbn13                  object
rating_mean            float64
rating_count           float64
age                    float64
dtype: object
user_id         int64
isbn           object
book_rating     int64
isbn13         object
dtype: object


In [60]:
print(len(train))

307170


In [61]:
book_ratings_dict = {
    'user_id': train['user_id'].values,
    'isbn13': train['isbn13'].values,
    'book_rating': train['book_rating'].values
}

book_rating_df = pd.DataFrame.from_dict(book_ratings_dict)

In [62]:
popular_books = assign_popular_based_score(train, books, "user_id", "isbn13", "book_rating")
popular_books = popular_books.sort_values("vote_count", ascending=False)
pop_10_books = popular_books[:10]

print(pop_10_books)

              isbn13  vote_count  avg_rating  weighted_rating
6520   9780316666343         568    8.198944         8.196623
35191  9780971880108         478    4.372385         4.385573
12556  9780385504201         391    8.450128         8.445484
5184   9780312195519         295    8.091525         8.087795
1243   9780060928339         260    7.896154         7.893417
23380  9780590353427         253    8.928854         8.917942
23379  9780590353427         253    8.928854         8.917942
4065   9780142001745         247    8.534413         8.526407
17415   978044667227         238    8.176471         8.171147
19733  9780452282155         226    7.991150         7.987172


In [63]:
pop_dict = {
    'itemId': popular_books['isbn13'].values,
    'est_r': popular_books['avg_rating'].values,
}

pop_df = pd.DataFrame.from_dict(pop_dict)

ratings_dict = {
    'userId': test['user_id'].values,
    'itemId': test['isbn13'].values,
    'rating': test['book_rating'].values
}

ratings_df = pd.DataFrame.from_dict(ratings_dict)

print(pop_df)
print(ratings_df)


              itemId      est_r
0      9780316666343   8.198944
1      9780971880108   4.372385
2      9780385504201   8.450128
3      9780312195519   8.091525
4      9780060928339   7.896154
...              ...        ...
41477  9780451628817  10.000000
41478  9780451628497   9.000000
41479   978045162826   6.500000
41480   978045162817   8.000000
41481  9789879601709   6.500000

[41482 rows x 2 columns]
       userId         itemId  rating
0      255099  9783150000472       7
1      127384  9780425155417       9
2      257024  9780425181102      10
3      127200  9780380757497       3
4      109574  9780679452355       8
...       ...            ...     ...
76788   66789  9781400031351       5
76789   44190  9781879181212       9
76790   58612  9780380016990      10
76791   95359   978044920472       8
76792   30276  9780373027576       6

[76793 rows x 3 columns]


In [64]:
uids = ratings_df["userId"].unique()

print("User Count", len(uids))
print(uids)

User Count 26104
[255099 127384 257024 ...  57379 193017  93820]


In [65]:
def def_value():
    return "Not Present"


top_n = defaultdict(def_value)
for uid in uids:
    top_n[uid] = pop_df

In [66]:
import numpy as np

mean_rating_test = np.mean(ratings_df["rating"])

print("Average Rating over whole test set:", mean_rating_test)

Average Rating over whole test set: 7.614300782623416


In [67]:
import time

start_time = time.time()
avg_precision = eval.get_avg_precision(ratings_df, top_n, k=20, threshold=mean_rating_test)
print("--- %s seconds ---" % (time.time() - start_time))

print("Average Precision:", avg_precision)

--- 51.14799380302429 seconds ---
Average Precision: 0.0017672898150985963


In [68]:
importlib.reload(eval)
start_time = time.time()

avg_recall = eval.get_avg_recall(ratings_df, top_n, k=20, threshold=mean_rating_test)
print("--- %s seconds ---" % (time.time() - start_time))

print("Average Recall:", avg_recall)



--- 53.551602840423584 seconds ---
Average Recall: 0.014365815322288186
