# use surprise to build recsys engine

In [73]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV
%matplotlib inline

## load source data

In [74]:
src_folder = '../dataset/dataset1'
movie_file_name = 'movie.csv'
user_file_name = 'user.csv'

In [75]:
movie_df = pd.read_csv(os.path.join(src_folder, movie_file_name))
user_df = pd.read_csv(os.path.join(src_folder, user_file_name))

## simple eda on user rating and movie data

In [76]:
# analyse data
print(('-'*10 + ' {} ' + '-'*10).format("is movie contains nan"))
print(movie_df.isna().any())
print(('-'*10 + ' {} ' + '-'*10).format("is user contains nan"))
print(user_df.isna().any())
print(('-'*10 + ' {} ' + '-'*10).format("movie data set info"))
print(movie_df.info())
print(('-'*10 + ' {} ' + '-'*10).format("user data set info"))
print(user_df.info())

---------- is movie contains nan ----------
类型     False
主演     False
地区     False
导演     False
特色     False
评分     False
电影名    False
dtype: bool
---------- is user contains nan ----------
评分      False
用户名     False
评论时间    False
用户ID    False
电影名     False
类型      False
dtype: bool
---------- movie data set info ----------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93160 entries, 0 to 93159
Data columns (total 7 columns):
类型     93160 non-null object
主演     93160 non-null object
地区     93160 non-null object
导演     93160 non-null object
特色     93160 non-null object
评分     93160 non-null float64
电影名    93160 non-null object
dtypes: float64(1), object(6)
memory usage: 5.0+ MB
None
---------- user data set info ----------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199813 entries, 0 to 199812
Data columns (total 6 columns):
评分      199813 non-null int64
用户名     199813 non-null object
评论时间    199813 non-null object
用户ID    199813 non-null int64
电影名     199813 non-null object

In [77]:
user_df_movie_set = set(np.unique(user_df['电影名']))
movie_df_movie_set = set(np.unique(movie_df['电影名']))
print("movie in user_df length is {} and movie in movie_df length is {}".format(len(user_df_movie_set), len(movie_df_movie_set)))
print("difference")
print(movie_df_movie_set - user_df_movie_set)
print("number of users is {}".format(len(np.unique(user_df['用户ID']))))
print("number of movies is {}".format(len(np.unique(user_df['电影名']))))

movie in user_df length is 23031 and movie in movie_df length is 23034
difference
{'粉骚大联盟', '新雪国', '当狗狗在停车场'}
number of users is 13545
number of movies is 23031


# generate surprise data format

In [78]:
surprise_df = user_df[['用户ID', '电影名', '评分']]
user_m = user_df[['用户ID', '用户名']].drop_duplicates
rating_scale = (np.unique(surprise_df['评分']).min(), np.unique(surprise_df['评分']).max())
reader = Reader(rating_scale = rating_scale)
surprise_data =  Dataset.load_from_df(surprise_df, reader)

# model selection

In [80]:
from surprise import SVD
import time
alg = SVD(random_state=0)
# fit on the whole data set
# define the parameters

param_grid = {
              'lr_all': [0.001, 0.005],
              'reg_all': [0.01, 0.1]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

print(('-'*10 + "{}" + '-'*10).format("start training"))
start = time.time()
gs.fit(surprise_data)
duration = round(time.time() - start, 3)
print(('-'*10 + "{}" + '-'*10).format("duration " + str(duration) + " s"))

----------start training----------
----------duration 116.931 s----------


In [81]:
#best params
print('best rmse params')
print(gs.best_params['rmse'])
print('best maae params')
print(gs.best_params['mae'])
print('best score rmse')
print(gs.best_score['rmse'])
print('best score mae')
print(gs.best_score['mae'])
algo = gs.best_estimator['rmse']

best rmse params
{'lr_all': 0.005, 'reg_all': 0.01}
best maae params
{'lr_all': 0.005, 'reg_all': 0.01}
best score rmse
2.3292249281154214
best score mae
1.9202335468921257


In [82]:
# retrain the model with all the data
all_train_set=surprise_data.build_full_trainset()
algo.fit(all_train_set)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fa5bbb4c828>

# predict and recommendation

In [83]:
DEFAULT_USER_COL = "用户ID"
DEFAULT_ITEM_COL = "电影名"
DEFAULT_RATING_COL = "评分"
DEFAULT_LABEL_COL = "label"
DEFAULT_TIMESTAMP_COL = "timestamp"
DEFAULT_PREDICTION_COL = "预估评分"
COL_DICT = {
    "col_user": DEFAULT_USER_COL,
    "col_item": DEFAULT_ITEM_COL,
    "col_rating": DEFAULT_RATING_COL,
    "col_prediction": DEFAULT_PREDICTION_COL,
}

# Filtering variables
DEFAULT_K = 10
DEFAULT_THRESHOLD = 10

# Other
SEED = 42

In [84]:
#utils
def predict(algo,
    data,
    usercol=DEFAULT_USER_COL,
    itemcol=DEFAULT_ITEM_COL,
    predcol=DEFAULT_PREDICTION_COL
):
    '''
    given the dataframe and predict the r_ui
    '''
    # get the predictions of current data frame
    # prediction: uid iid r_ui estimated rating
    predictions = [
        algo.predict(getattr(row, usercol), getattr(row, itemcol))
        for row in data.itertuples()
    ]
    predictions = pd.DataFrame(predictions)
    predictions = predictions.rename(
        index=str, columns={"uid": usercol, "iid": itemcol, "est": predcol}
    )
    return predictions.drop(["details", "r_ui"], axis=1)

In [85]:
def compute_ranking(algo,
    data,
    usercol=DEFAULT_USER_COL,
    itemcol=DEFAULT_ITEM_COL,
    predcol=DEFAULT_PREDICTION_COL,
    remove_seen=False
):
    '''
    calculate the score and rank
    '''
    preds_lst = []
    users = data[usercol].unique()
    items = data[itemcol].unique()

    for user in users:
        for item in items:
            preds_lst.append([user, item, algo.predict(user, item).est])

    all_predictions = pd.DataFrame(data=preds_lst, columns=[usercol, itemcol, predcol])

    if remove_seen:
        tempdf = pd.concat(
            [
                data[[usercol, itemcol]],
                pd.DataFrame(
                    data=np.ones(data.shape[0]), columns=["dummycol"], index=data.index
                ),
            ],
            axis=1,
        )
        merged = pd.merge(tempdf, all_predictions, on=[usercol, itemcol], how="outer")
        return merged[merged["dummycol"].isnull()].drop("dummycol", axis=1)
    else:
        return all_predictions

In [86]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(surprise_df, test_size=0.25, random_state=0)
test_predictions = predict(algo, test_set)
print(test_predictions.head())
test_ranking = compute_ranking(algo, test_set)
print(test_ranking.head())

    用户ID   电影名      预估评分
0  11279  前路漫漫  6.137563
1  19564   麦尔斯  7.982444
2  13273  青春罩杯  9.088771
3   4549  甘泉玛侬  8.689167
4  12644  一个勺子  4.862729
    用户ID   电影名      预估评分
0  11279  前路漫漫  6.137563
1  11279   麦尔斯  7.998595
2  11279  青春罩杯  7.989352
3  11279  甘泉玛侬  7.712024
4  11279  一个勺子  6.523887


# evaluate the SVD model

In [None]:
def get_top_k_items(
    dataframe, col_user=DEFAULT_USER_COL, col_rating=DEFAULT_RATING_COL, k=DEFAULT_K
):
    if k is None:
        top_k_items = dataframe
    else:
        top_k_items = (
            dataframe.groupby(col_user, as_index=False)
            .apply(lambda x: x.nlargest(k, col_rating))
            .reset_index(drop=True)
        )
    top_k_items["rank"] = top_k_items.groupby(col_user, sort=False).cumcount() + 1
    return top_k_items

def merge_ranking_true_pred(
    rating_true,
    rating_pred,
    col_user,
    col_item,
    col_rating,
    col_prediction,
    relevancy_method='top_k',
    k=DEFAULT_K,
    threshold=DEFAULT_THRESHOLD
):
    common_users = set(rating_true[col_user]).intersection(set(rating_pred[col_user]))
    rating_true_common = rating_true[rating_true[col_user].isin(common_users)]
    rating_pred_common = rating_pred[rating_pred[col_user].isin(common_users)]
    n_users = len(common_users)
    
    if relevancy_method == "top_k":
        top_k = k
    elif relevancy_method == "by_threshold":
        top_k = threshold
    elif relevancy_method is None:
        top_k = None
    else:
        raise NotImplementedError("Invalid relevancy_method")
    df_hit = get_top_k_items(
        dataframe=rating_pred_common,
        col_user=col_user,
        col_rating=col_prediction,
        k=top_k,
    )
    df_hit = pd.merge(df_hit, rating_true_common, on=[col_user, col_item])[
        [col_user, col_item, "rank"]
    ]

    # count the number of hits vs actual relevant items per user
    df_hit_count = pd.merge(
        df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
        rating_true_common.groupby(col_user, as_index=False)[col_user].agg(
            {"actual": "count"}
        ),
        on=col_user,
    )

    return df_hit, df_hit_count, n_users

def map_eval(
    rating_true,
    rating_pred,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_rating=DEFAULT_RATING_COL,
    col_prediction=DEFAULT_PREDICTION_COL,
    relevancy_method="top_k",
    k=DEFAULT_K,
    threshold=DEFAULT_THRESHOLD,
):
    df_hit, df_hit_count, n_users = merge_ranking_true_pred(
        rating_true=rating_true,
        rating_pred=rating_pred,
        col_user=col_user,
        col_item=col_item,
        col_rating=col_rating,
        col_prediction=col_prediction,
        relevancy_method=relevancy_method,
        k=k,
        threshold=threshold,
    )

    if df_hit.shape[0] == 0:
        return 0.0

    # calculate reciprocal rank of items for each user and sum them up
    df_hit_sorted = df_hit.copy()
    df_hit_sorted["rr"] = (
        df_hit_sorted.groupby(col_user).cumcount() + 1
    ) / df_hit_sorted["rank"]
    df_hit_sorted = df_hit_sorted.groupby(col_user).agg({"rr": "sum"}).reset_index()

    df_merge = pd.merge(df_hit_sorted, df_hit_count, on=col_user)
    return (df_merge["rr"] / df_merge["actual"]).sum() / n_users

def ndcg_eval(rating_true,
    rating_pred,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_rating=DEFAULT_RATING_COL,
    col_prediction=DEFAULT_PREDICTION_COL,
    relevancy_method="top_k",
    k=DEFAULT_K,
    threshold=DEFAULT_THRESHOLD
):
    df_hit, df_hit_count, n_users = merge_ranking_true_pred(
        rating_true=rating_true,
        rating_pred=rating_pred,
        col_user=col_user,
        col_item=col_item,
        col_rating=col_rating,
        col_prediction=col_prediction,
        relevancy_method=relevancy_method,
        k=k,
        threshold=threshold,
    )

    if df_hit.shape[0] == 0:
        return 0.0

    # calculate discounted gain for hit items
    df_dcg = df_hit.copy()
    # relevance in this case is always 1
    df_dcg["dcg"] = 1 / np.log1p(df_dcg["rank"])
    # sum up discount gained to get discount cumulative gain
    df_dcg = df_dcg.groupby(col_user, as_index=False, sort=False).agg({"dcg": "sum"})
    # calculate ideal discounted cumulative gain
    df_ndcg = pd.merge(df_dcg, df_hit_count, on=[col_user])
    df_ndcg["idcg"] = df_ndcg["actual"].apply(
        lambda x: sum(1 / np.log1p(range(1, min(x, k) + 1)))
    )

    # DCG over IDCG is the normalized DCG
    return (df_ndcg["dcg"] / df_ndcg["idcg"]).sum() / n_users

def precision_eval(rating_true,
    rating_pred,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_rating=DEFAULT_RATING_COL,
    col_prediction=DEFAULT_PREDICTION_COL,
    relevancy_method="top_k",
    k=DEFAULT_K,
    threshold=DEFAULT_THRESHOLD
):
    df_hit, df_hit_count, n_users = merge_ranking_true_pred(
        rating_true=rating_true,
        rating_pred=rating_pred,
        col_user=col_user,
        col_item=col_item,
        col_rating=col_rating,
        col_prediction=col_prediction,
        relevancy_method=relevancy_method,
        k=k,
        threshold=threshold,
    )

    if df_hit.shape[0] == 0:
        return 0.0

    return (df_hit_count["hit"] / k).sum() / n_users


def recall_eval(rating_true,
    rating_pred,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_rating=DEFAULT_RATING_COL,
    col_prediction=DEFAULT_PREDICTION_COL,
    relevancy_method="top_k",
    k=DEFAULT_K,
    threshold=DEFAULT_THRESHOLD
):
    df_hit, df_hit_count, n_users = merge_ranking_true_pred(
        rating_true=rating_true,
        rating_pred=rating_pred,
        col_user=col_user,
        col_item=col_item,
        col_rating=col_rating,
        col_prediction=col_prediction,
        relevancy_method=relevancy_method,
        k=k,
        threshold=threshold,
    )

    if df_hit.shape[0] == 0:
        return 0.0
    return (df_hit_count["hit"] / df_hit_count["actual"]).sum() / n_users

In [None]:
all_predictions = compute_ranking(algo, train_set, remove_seen=True)

In [None]:
k = 10
eval_map = map_eval(test, all_predictions, col_prediction='prediction', k=k)
eval_ndcg = ndcg_eval(test, all_predictions, col_prediction='prediction', k=k)
eval_precision = precision_eval(test, all_predictions, col_prediction='prediction', k=k)
eval_recall = recall_eval(test, all_predictions, col_prediction='prediction', k=k)

In [None]:
print("map: {}".format(eval_map))
print("ndcg: {}".format(eval_ndcg))
print("precision: {}".format(eval_precision))
print("recall: {}".format(eval_recall))