In [4]:
import random
import numpy as np
import pandas as pd
my_seed = 1337
random.seed(my_seed)
np.random.seed(my_seed)

from typing import *
from IPython.display import display, HTML, Markdown

import warnings
warnings.filterwarnings('ignore')

#### Step one 加载数据

In [70]:
def name_to_id():
    """Build two lookup tables between movie titles and integer positions.

    Reads ``dataset1/movie.csv`` below the module-level ``data_dir``, relabels
    its columns, and enumerates the distinct values of the ``moviename`` column
    in the order ``unique()`` returns them.

    Only positions 0..22971 are stored; for every later position the function
    prints ``error`` and stores nothing.

    Returns:
        tuple: ``(title -> position dict, position -> title dict)``.
    """
    frame = pd.read_csv(data_dir + './dataset1/movie.csv')
    frame.columns = [
        'type', 'actor', 'region', 'director', 'characteristic',
        'score', 'moviename',
    ]
    titles = frame['moviename'].unique()

    # Highest position that still gets an entry in the lookup tables.
    last_position = 22971
    movie_name_to_uninque_index = {}
    movie_index_to_uninque_name = {}
    for position, title in enumerate(titles):
        if position > last_position:
            print('error')
            continue
        movie_name_to_uninque_index[title] = position
        movie_index_to_uninque_name[position] = title
    return movie_name_to_uninque_index, movie_index_to_uninque_name

def load_movies_dataset():
    """Read the movie table and relabel its rating column.

    The file is ``dataset1/movie.csv`` below the module-level ``data_dir``.
    Its ``评分`` column is renamed to ``豆瓣网评分`` (presumably so it does not
    collide with the user table's ``评分`` column in later merges — confirm).

    Returns:
        pd.DataFrame: the movie table with the renamed column.
    """
    csv_path = data_dir + './dataset1/movie.csv'
    return pd.read_csv(csv_path).rename(columns={'评分': "豆瓣网评分"})

def load_user_and_ratings():
    """Read the user rating table and parse its review timestamps.

    The file is ``dataset1/user.csv`` below the module-level ``data_dir``.
    The shape of the raw table is printed before the ``评论时间`` column is
    converted with ``pd.to_datetime``.

    Returns:
        pd.DataFrame: the rating table with ``评论时间`` as datetimes.
    """
    ratings = pd.read_csv(data_dir + './dataset1/user.csv')
    print('user_data\n', ratings.shape)
    parsed_time = pd.to_datetime(ratings['评论时间'])
    ratings['评论时间'] = parsed_time
    return ratings



# Root folder of the dataset; the loader functions above read this name as a global.
# NOTE(review): absolute, machine-specific path — must be adjusted to run elsewhere.
data_dir = 'D:/python/Jupyter_Last_project/dataset/'
user_rating_data = load_user_and_ratings()
print(user_rating_data.head())
# Keep only the (user, movie, rating) columns; this is the frame the SVD cells split and train on.
u_data = user_rating_data[['用户ID', '电影名', '评分']]

user_data
 (188843, 6)
   评分     用户名                评论时间  用户ID    电影名  类型
0   2      身似 2018-01-05 15:05:06     1   心雨花露  爱情
1   4  有意识的贱民 2018-01-05 15:05:06     3  战争的恐怖  战争
2   2    亿万露电 2018-01-05 15:05:06     4  豪勇七蛟龙  战争
3   2   Marni 2018-01-05 15:05:06     5   无序之主  犯罪
4   4   马西嘻嘻嘻 2018-01-05 15:05:06     6  时装店风波  同性


In [42]:
# Number of rating rows per user; after size() the count sits in a column named '电影名'.
u_temp = u_data.groupby('用户ID').电影名.size().reset_index()

# Give the count column a descriptive name.
u_temp = u_temp.rename(columns = {'电影名': "用户观看电影个数"})
u_temp.head()

Unnamed: 0,用户ID,用户观看电影个数
0,1,5
1,3,132
2,4,15
3,5,27
4,6,87


In [43]:
# Largest per-user row count.
u_temp['用户观看电影个数'].max()

2625

In [48]:
# Average per-user row count.
u_temp['用户观看电影个数'].mean()

13.941897379106681

In [44]:
# Smallest per-user row count.
u_temp['用户观看电影个数'].min()

2

#### 一些必要的函数

In [35]:
from timeit import default_timer
from datetime import timedelta
from sklearn.model_selection import train_test_split as sk_split

class Timer(object):
    """Wall-clock stopwatch usable directly or as a context manager.

    `Original code <https://github.com/miguelgfierro/pybase/blob/2298172a13fb4a243754acbc6029a4a2dcf72c20/log_base/timer.py>`_.

    The elapsed time is measured with ``timeit.default_timer`` between
    ``start()`` and ``stop()`` and exposed through ``interval``. Formatting
    the timer with ``str()`` renders the interval with four decimals.

    Examples:
        >>> import time
        >>> t = Timer()
        >>> t.start()
        >>> time.sleep(1)
        >>> t.stop()
        >>> t.interval >= 1
        True
        >>> with Timer() as t:
        ...   time.sleep(1)
        >>> t.interval >= 1
        True
    """

    def __init__(self):
        self.running = False
        self._interval = 0
        self._timer = default_timer

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        self.stop()

    def __str__(self):
        return format(self.interval, "0.4f")

    def start(self):
        """Record the start time and mark the timer as running."""
        self.init = self._timer()
        self.running = True

    def stop(self):
        """Record the end time and store the elapsed seconds.

        Raises:
            ValueError: if ``start()`` was never called on this instance.
        """
        self.end = self._timer()
        try:
            started_at = self.init
        except AttributeError:
            # ``init`` only exists once start() has run.
            raise ValueError(
                "Timer has not been initialized: use start() or the contextual form with Timer() as t:"
            )
        self._interval = self.end - started_at
        self.running = False

    @property
    def interval(self):
        """Elapsed time of the last start/stop cycle.

        Returns:
            float: Seconds (0 if the timer was never started).

        Raises:
            ValueError: if the timer is still running.
        """
        if not self.running:
            return self._interval
        raise ValueError("Timer has not been stopped, please use stop().")
        
def process_split_ratio(ratio):
    """Validate a split ratio and normalise it when it is a list.

    Args:
        ratio (float or list): a float number that indicates split ratio or a list of float
        numbers that indicate split ratios (if it is a multi-split).

    Returns:
        tuple: a tuple containing
            bool: A boolean variable multi that indicates if the splitting is multi or single.
            list: A list of normalized split ratios (or the unchanged float for a single split).

    Raises:
        ValueError: if a float ratio is not strictly between 0 and 1, or if any
            entry of a ratio list is not larger than 0.
        TypeError: if ``ratio`` is neither a float nor a list.
    """
    # Local import: no cell of the notebook imports ``math``, so the list
    # branch below used to fail with NameError.
    import math

    if isinstance(ratio, float):
        if ratio <= 0 or ratio >= 1:
            raise ValueError("Split ratio has to be between 0 and 1")
        return False, ratio

    if isinstance(ratio, list):
        if any(x <= 0 for x in ratio):
            raise ValueError(
                "All split ratios in the ratio list should be larger than 0."
            )

        # normalize split ratios if they are not summed to 1
        total = math.fsum(ratio)
        if total != 1.0:
            ratio = [x / total for x in ratio]
        return True, ratio

    raise TypeError("Split ratio should be either float or a list of floats.")

def _split_frame_by_ratios(data, ratios, seed):
    """Shuffle ``data`` and cut it into consecutive chunks sized by ``ratios``."""
    shuffled = data.sample(frac=1, random_state=seed)
    n_rows = len(shuffled)
    # Cumulative ratios give the interior cut points as row offsets.
    cuts = [int(round(c * n_rows)) for c in np.cumsum(ratios).tolist()[:-1]]
    edges = [0] + cuts + [n_rows]
    return [shuffled.iloc[lo:hi] for lo, hi in zip(edges[:-1], edges[1:])]


def python_random_split(data, ratio=0.75, seed=42):
    """Randomly split a DataFrame into two or more parts.

    The multi-split branch previously called ``split_pandas_data_with_ratios``,
    which is not defined in this notebook; the shuffling and slicing is now done
    by the private helper above.

    Args:
        data (pd.DataFrame): the frame to split.
        ratio (float or list): a float gives the share of rows in the first
            (train) part; a list gives one share per part and is normalised by
            ``process_split_ratio``.
        seed (int): random seed used for shuffling.

    Returns:
        list: the parts, in the order of the ratios (two parts for a float ratio).
    """
    multi_split, ratio = process_split_ratio(ratio)

    if multi_split:
        return _split_frame_by_ratios(data, ratio, seed)
    return sk_split(data, test_size=None, train_size=ratio, random_state=seed)
    
def compute_ranking_predictions(
    algo,
    data,
    usercol=None,
    itemcol=None,
    predcol=None,
    remove_seen=False,
):
    """Computes predictions of an algorithm from Surprise on all users and items in data. It can be used for computing
    ranking metrics like NDCG.

    Every distinct user in ``data`` is paired with every distinct item, so the
    function makes ``n_users * n_items`` calls to ``algo.predict``.

    The column-name defaults are resolved when the function is called, not when
    it is defined: the ``DEFAULT_*_COL`` constants live in a later notebook cell,
    so reading them in the signature raised NameError on a fresh kernel.

    Args:
        algo (surprise.prediction_algorithms.algo_base.AlgoBase): an algorithm from Surprise
        data (pd.DataFrame): the data from which to get the users and items
        usercol (str): name of the user column; ``None`` means ``DEFAULT_USER_COL``
        itemcol (str): name of the item column; ``None`` means ``DEFAULT_ITEM_COL``
        predcol (str): name of the prediction column in the result; ``None`` means
            ``DEFAULT_PREDICTION_COL``
        remove_seen (bool): flag to remove (user, item) pairs that appear in ``data``

    Returns:
        pd.DataFrame: dataframe with usercol, itemcol, predcol
    """
    if usercol is None:
        usercol = DEFAULT_USER_COL
    if itemcol is None:
        itemcol = DEFAULT_ITEM_COL
    if predcol is None:
        predcol = DEFAULT_PREDICTION_COL

    users = data[usercol].unique()
    items = data[itemcol].unique()

    # One row per (user, item) pair: user id, item id, estimated rating.
    preds_lst = [
        [user, item, algo.predict(user, item).est]
        for user in users
        for item in items
    ]
    all_predictions = pd.DataFrame(data=preds_lst, columns=[usercol, itemcol, predcol])

    if not remove_seen:
        return all_predictions

    # Mark the pairs present in ``data`` with a helper column ...
    seen = data[[usercol, itemcol]].assign(dummycol=1.0)
    # ... so that after the outer merge only unseen pairs have it missing.
    merged = pd.merge(seen, all_predictions, on=[usercol, itemcol], how="outer")
    return merged[merged["dummycol"].isnull()].drop("dummycol", axis=1)

#### step two SVD建模

In [20]:
from surprise import SVD
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate, train_test_split
from surprise import accuracy
import os

def Svd_train():
    """Fit an SVD model on a random 75% split of the module-level ``u_data``.

    Prints the shapes of the train and test parts, the per-epoch progress of
    Surprise (``verbose=True``) and the training time.

    The fitted model and both parts are returned (mirroring ``sampledata_train``)
    so later cells can use them as ``svd, train, test = Svd_train()`` instead of
    relying on names left over in the kernel.

    Returns:
        tuple: ``(svd, train, test)`` — the fitted ``SVD`` model, the training
        frame and the held-out frame.
    """
    # The Reader tells Surprise the lower and upper bound of the ratings;
    # it is configured here for a 2-10 scale.
    reader = Reader(rating_scale=(2, 10))

    # load_from_df expects exactly three columns in this order: user, item, rating.
    train, test = python_random_split(u_data, 0.75)
    print(train.shape)
    print(test.shape)
    train_set = Dataset.load_from_df(train, reader=reader).build_full_trainset()
    svd = SVD(random_state=0, n_factors=200, n_epochs=20, verbose=True)

    with Timer() as train_time:
        svd.fit(train_set)

    print("Took {} seconds for training.".format(train_time.interval))
    return svd, train, test

In [21]:
# Run the 75/25 split and SVD training; prints the split shapes, epoch progress and training time.
Svd_train()

(141632, 3)
(47211, 3)
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Took 19.9701002999999 seconds for training.


In [13]:
# Default column names shared by the prediction helpers.
DEFAULT_USER_COL = '用户ID'
DEFAULT_ITEM_COL =  '电影名'
DEFAULT_PREDICTION_COL = '评分'

def Svd_predict(
    algo,
    data,
    usercol=DEFAULT_USER_COL,
    itemcol=DEFAULT_ITEM_COL,
    predcol=DEFAULT_PREDICTION_COL,
):
    """Computes predictions of an algorithm from Surprise on the data. Can be used for computing rating metrics like RMSE.

    Args:
        algo (surprise.prediction_algorithms.algo_base.AlgoBase): an algorithm from Surprise
        data (pd.DataFrame): the data on which to predict
        usercol (str): name of the user column
        itemcol (str): name of the item column
        predcol (str): name given to the estimated-rating column in the result

    Returns:
        pd.DataFrame: dataframe with usercol, itemcol, predcol (string index)
    """
    # Iterate the two columns directly instead of getattr() on itertuples():
    # namedtuple fields are renamed when a column name is not a valid
    # identifier (e.g. contains a space), which made the attribute lookup fail.
    predictions = [
        algo.predict(user, item)
        for user, item in zip(data[usercol], data[itemcol])
    ]
    if not predictions:
        # Nothing to predict: return an empty frame with the documented columns
        # rather than failing on the drop() below.
        return pd.DataFrame(columns=[usercol, itemcol, predcol])

    predictions = pd.DataFrame(predictions)
    predictions = predictions.rename(
        index=str, columns={"uid": usercol, "iid": itemcol, "est": predcol}
    )
    return predictions.drop(["details", "r_ui"], axis="columns")

In [16]:
def get_predictions(algo=None, data=None):
    """Predict ratings for a frame of (user, movie) rows and print a preview.

    No cell of the notebook defines ``svd`` or ``test`` at the top level, so the
    original zero-argument form only worked with names left over in the kernel.
    Both can now be passed explicitly; when omitted, the function still falls
    back to the kernel-level ``svd`` / ``test`` names.

    Args:
        algo: fitted Surprise algorithm; ``None`` means the global ``svd``.
        data (pd.DataFrame): rows to score, with ``用户ID`` and ``电影名`` columns;
            ``None`` means the global ``test``.
    """
    if algo is None:
        algo = svd
    if data is None:
        data = test
    predictions = Svd_predict(algo, data, usercol='用户ID', itemcol='电影名')
    print(predictions.head())
    print(predictions.shape)

In [17]:
# Predict ratings for `test` with `svd` (both read from the kernel namespace) and print the head and shape.
get_predictions()

    用户ID    电影名        评分
0   3352    猪之日  6.751689
1   7303  拉撒路报告  6.630192
2   8946   一丝偶然  6.924515
3  11592  再见艳阳天  7.116095
4   9885  夏天的故事  7.474689
(47211, 3)


#### step three 从总数据集中采样一部分样本参加训练

In [31]:
def sampledata_train():
    """Train an SVD model on a small random sample of ``u_data``.

    The module-level ``u_data`` is split 99.9% / 0.1%; from the large part a 2%
    sample is drawn with replacement (``random_state=1``) and used as the
    training set. The sample shape, the epoch progress and the training time
    are printed.

    Returns:
        tuple: ``(svd, train_all)`` — the fitted model and the sampled frame.
    """
    # Only the large part of the split is used; the small part is discarded.
    pool, _ = python_random_split(u_data, 0.999)

    train_all = pool.sample(frac=0.02, replace=True, random_state=1)
    print(train_all.shape)

    rating_reader = Reader(rating_scale=(2, 10))
    loaded = Dataset.load_from_df(train_all, reader=rating_reader)
    train_all_set = loaded.build_full_trainset()

    svd = SVD(random_state=0, n_factors=200, n_epochs=30, verbose=True)
    with Timer() as train_time:
        svd.fit(train_all_set)

    elapsed = train_time.interval
    print("Took {} seconds for training.".format(elapsed))
    return svd, train_all

#### step four 根据用户平均观看的电影数(约 13 部),给每个用户推荐其最有可能观看的 10 部电影

In [77]:
def get_all_predictions(top_k=11, out_path="./dfrecall.csv"):
    """Build the recall set: the highest-scored unseen movies for every user.

    Trains on the sampled data (``sampledata_train``), scores every unseen
    (user, movie) pair, keeps the ``top_k`` best-scored rows per user and writes
    them to ``out_path`` as CSV.

    The per-user selection uses ``groupby(...).head(top_k)`` on the sorted frame
    instead of concatenating one slice per user in a loop; the rows and their
    order are the same, but the cost no longer grows quadratically with the
    number of users, and an empty prediction frame no longer ends in a NameError.

    Args:
        top_k (int): rows kept per user. The default of 11 is the value the
            original code used. NOTE(review): the section heading speaks of 10
            and the old comment of 20 — confirm the intended cut-off.
        out_path (str): destination of the recall CSV.
    """
    svd, train_all = sampledata_train()
    with Timer() as test_time:
        all_predictions = compute_ranking_predictions(
            svd, train_all, usercol='用户ID', itemcol='电影名', remove_seen=True
        )

    print("Took {} seconds for prediction.".format(test_time.interval))

    # Sort by user, then by predicted score, both descending.
    all_predictions = all_predictions.sort_values(by=['用户ID', '评分'], ascending=False)

    # Recall: keep the first top_k rows of every user in the sorted order.
    dfret = (
        all_predictions
        .groupby('用户ID', sort=False)
        .head(top_k)
        .reset_index(drop=True)
    )
    print('dfret shape', dfret.shape)

    # Persist the recall result.
    dfret.to_csv(out_path, index=False)

In [78]:
# Train on the sample, score all unseen pairs and write the per-user recall set to ./dfrecall.csv.
get_all_predictions()

(3773, 3)
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Took 0.5960879000012937 seconds for training.
Took 76.36160069999823 seconds for prediction.
dfret shape (32241, 3)


#### step five 召回集数据特征处理

In [81]:
def call_data_process(dfuserin, dfmoviein):
    """Attach per-user rating statistics and movie metadata to each input row.

    Changes against the original implementation:
      * the movie table is taken from the ``dfmoviein`` argument; the original
        ignored it and read the global ``movies_df``;
      * the statistics are computed with a list of aggregations plus a rename,
        because the dict-renaming form of ``SeriesGroupBy.agg`` is rejected by
        pandas 1.0 and later;
      * ``dfuserin`` is no longer modified in place.

    Args:
        dfuserin (pd.DataFrame): rows with at least ``用户ID``, ``电影名`` and ``评分``.
        dfmoviein (pd.DataFrame): movie table with a ``电影名`` column.

    Returns:
        pd.DataFrame: ``用户ID``, the four ``user_rating_*`` statistics, the
        remaining columns of ``dfuserin``, ``用户评论次数_观看电影个数`` (rows per
        user) and the movie columns, inner-joined on ``电影名``.
    """
    stat_names = {
        'mean': 'user_rating_avg',
        'max': 'user_rating_max',
        'min': 'user_rating_min',
        'median': 'user_rating_median',
    }
    user_stats = (
        dfuserin.groupby(['用户ID'])['评分']
        .agg(['mean', 'max', 'min', 'median'])
        .rename(columns=stat_names)
        .reset_index()
    )

    # assign() returns a new frame, leaving the caller's object untouched.
    rows_per_user = dfuserin.groupby(['用户ID'])['评分'].transform('count')
    rated = dfuserin.assign(**{'用户评论次数_观看电影个数': rows_per_user})

    user_features = pd.merge(user_stats, rated, on='用户ID', how='inner')
    user_and_movies_df = pd.merge(user_features, dfmoviein, on='电影名', how='inner')

    return user_and_movies_df
    

In [82]:
# Load the movie table and the recall set written by get_all_predictions().
movies_df = load_movies_dataset()
dfdata = pd.read_csv('./dfrecall.csv')
# Join per-user statistics and movie columns onto every recall row; keep an independent copy.
dfa = call_data_process(dfdata,movies_df).copy(deep=True)

In [83]:
# Rows and columns of the feature table.
dfa.shape

(138660, 14)

In [85]:
# Column names of the feature table.
dfa.columns

Index(['用户ID', 'user_rating_avg', 'user_rating_max', 'user_rating_min',
       'user_rating_median', '电影名', '评分', '用户评论次数_观看电影个数', '类型', '主演', '地区',
       '导演', '特色', '豆瓣网评分'],
      dtype='object')

In [87]:
# Persist the feature table (without the index) next to the notebook.
dfa.to_csv('./dfrecallfeature.csv',index=False)