In [3]:
import random
import numpy as np

my_seed = 1337
random.seed(my_seed)
np.random.seed(my_seed)

from typing import *
from IPython.display import display, HTML, Markdown

import warnings
warnings.filterwarnings('ignore')

### 数据读取与合并

In [4]:
def name_to_id():
    movie_data_columns = [
    'type', 'actor', 'region', 'director', 'characteristic',
    'score', 'moviename']
    movie_data = pd.read_csv(data_dir + './dataset1/movie.csv')
    movie_data.columns = movie_data_columns
    movie_unique = movie_data['moviename'].unique()
    movie_name_to_uninque_index = dict()
    movie_index_to_uninque_name = dict()
    for  i,j in enumerate(movie_unique):
        if i <=22971:
            movie_name_to_uninque_index[j] = i
            movie_index_to_uninque_name[i] = j
        elif i > 22971:
            print('error')
    return movie_name_to_uninque_index,movie_index_to_uninque_name

def load_movies_dataset():
    movie_data_columns = [
    'type', 'actor', 'region', 'director', 'characteristic',
    'score_douban', 'moviename']
    movie_data = pd.read_csv(data_dir + './dataset1/movie.csv')
    print('movie_data\n',movie_data.shape)
    movie_data.columns = movie_data_columns    
    movie_unique = movie_data['moviename'].unique()   
    dict_name_to_id,dict_id_to_name = name_to_id( )    
    movie_data['movie_id'] = movie_data['moviename'].map(lambda x:dict_name_to_id[x])
    
    return movie_data 

def load_user_and_ratings() :
    user_data_columns = ['score_user','user_name','comment_time','user_id','moviename','type']
    user_data = pd.read_csv(data_dir + './dataset1/user.csv')
    print('user_data\n',user_data.shape)
    user_data.columns = user_data_columns
    user_data['comment_time'] = pd.to_datetime(user_data['comment_time'])
    dict_name_to_id,dict_id_to_name = name_to_id()   
    user_data['movie_id'] = user_data['moviename'].map(lambda x:dict_name_to_id[x] if x in dict_name_to_id.keys() else '-1')
    
    return user_data 


def load_movielens() :  
    user_data = load_user_and_ratings()
    user_data['user_id'] = user_data['user_id'].map(lambda k: f"User {k}")
    movies_data = load_movies_dataset()
    ratings_and_movies  = pd.merge( user_data, movies_data ,on=['moviename','movie_id','type'])
    ratings_and_movies['rating'] = ratings_and_movies['score_user']
    del ratings_and_movies['score_user']
    return ratings_and_movies.sample(frac=1).reset_index(drop=True)


data_dir = 'D:/python/Jupyter_Last_project/dataset/'
all_data_df = load_movielens()
print('all_data\n',all_data_df.shape)
print('data info',all_data_df.info())
print('data head\n',all_data_df.head())

user_data
 (188843, 6)
movie_data
 (89524, 7)
all_data
 (253054, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253054 entries, 0 to 253053
Data columns (total 12 columns):
user_name         253054 non-null object
comment_time      253054 non-null datetime64[ns]
user_id           253054 non-null object
moviename         253054 non-null object
type              253054 non-null object
movie_id          253054 non-null object
actor             253054 non-null object
region            253054 non-null object
director          253054 non-null object
characteristic    253054 non-null object
score_douban      253054 non-null float64
rating            253054 non-null int64
dtypes: datetime64[ns](1), float64(1), int64(1), object(9)
memory usage: 23.2+ MB
data info None
data head
   user_name        comment_time     user_id moviename type movie_id  \
0        佳宋 2018-01-05 17:24:27  User 14564     极限特工2   冒险     2284   
1       萌妹纸 2018-01-05 15:40:40   User 8760      真爱无尽   奇幻     2617   

In [5]:
#用户的评分是 2 4 6 8 10

print(all_data_df.rating.value_counts())
#豆瓣的评分范围是7.3~10  暂不清楚豆瓣评分机制
print(all_data_df.score_douban.value_counts())

8     100946
4      67663
10     66404
2      18023
6         18
Name: rating, dtype: int64
7.5    9998
7.3    9992
7.6    9533
7.4    9352
7.7    8917
       ... 
2.3     114
2.2     109
9.7      80
2.1      52
9.8       3
Name: score_douban, Length: 78, dtype: int64


Exploring the  dataset 
Douban dataset
user_data 用户评论数据
 (188843, 6)
movie_data 电影数据 
 (89524, 7)
all_data 拼接后的数据
 (741343, 13)
  
 unique len 不重复电影个数
 21722

Training a SVD using Surprise in 4 simple steps

In [6]:
from surprise import SVD
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate, train_test_split
from surprise import accuracy
# Step 1: create a Reader.
# A reader tells our SVD what the lower and upper bound of our ratings is.
# MovieLens ratings are from 1 to 5
reader = Reader(rating_scale=(1, 10))
# Step 2: create a new Dataset instance with a DataFrame and the reader
# The DataFrame needs to have 3 columns in this specific order: [user_id, product_id, rating]
data = Dataset.load_from_df(all_data_df[['user_id','moviename','rating']], reader)
# Step 3: keep 25% of your trainset for testing
trainset, testset = train_test_split(data, test_size=.25)
# Step 4: train a new SVD with 100 latent features (number was chosen arbitrarily)
model = SVD(n_factors=200)
model.fit(trainset)
model.qi /= np.linalg.norm(model.qi, ord=2, axis=1).reshape(-1, 1)
predictions = model.test(testset)

# Then compute RMSE
print(accuracy.rmse(predictions))

RMSE: 2.0068
2.0068417022626788


In [7]:
print(trainset.n_users) # 
print(trainset.n_items) # 

13537
20750


Inspecting our Product Matrix 
Surprise SVD stores the product matrix under the model.qi attribute.

In [8]:
print('model.qi.shape\n',model.qi.shape)

model.qi.shape
 (20750, 200)


Mapping every vector back to it's movie 
Every row is mapped to a movie. How do we map every movie to it's vector?

In [9]:
#返回的结果是 movie name 和内部的id
def display(df ):
    print('in display')
    #print('list item_to_row_idx.items \n',list(item_to_row_idx.items()))
    #[('怪物大乱斗', 0), ('我想戴上戒指', 1), ('乌云背后的幸福线', 2), ('法官老爹', 3), ('千年追凶', 4), ('日出日落', 5)
    item_to_row_idx_df = pd.DataFrame(
        list(item_to_row_idx.items()),
        columns=['Movie name', 'model.qi row idx'],
    ).set_index('Movie name')
    return item_to_row_idx_df.head(5)

#print('model.trainset._raw2inner_id_items\n',model.trainset._raw2inner_id_items)
#{'怪物大乱斗': 0, '我想戴上戒指': 1, '乌云背后的幸福线': 2, '法官老爹': 3, '千年追凶': 4, '日出日落': 5,
item_to_row_idx  = model.trainset._raw2inner_id_items

# `display()` is a utility function to make `item_to_row_idx` more readable
display(item_to_row_idx)

in display


Unnamed: 0_level_0,model.qi row idx
Movie name,Unnamed: 1_level_1
中国最后一个太监,0
母亲！,1
活尸的城堡,2
忍者乱太郎 暑假作业大作战！之段,3
关于恶女,4


Identifying One Movie

In [10]:
#通过电影名得到内部id
toy_story_row_idx  = item_to_row_idx['千年追凶']
#得到内部id 对应的向量
print(model.qi[toy_story_row_idx])
print(f"Every product has {model.qi[toy_story_row_idx].shape[0]} features")

[ 0.14144005 -0.00410393  0.07910609  0.05002832  0.10511502  0.07465865
 -0.0243615   0.02741672  0.01787008  0.10048001  0.02520749  0.02591682
  0.16779199 -0.00647828  0.04335985  0.03416477 -0.03607353 -0.02452203
 -0.05548903 -0.02594348  0.08805245 -0.01024728 -0.05666609 -0.08392388
 -0.04942402 -0.07051616 -0.10146397  0.09167373 -0.01748571 -0.02919874
  0.08560237 -0.10585175 -0.14235175  0.03215912  0.01640436  0.07924654
 -0.04800492 -0.00124989 -0.04758321  0.00338165 -0.00527807  0.02206229
  0.0174934  -0.08959726  0.12011462 -0.10501012 -0.00404565 -0.00560562
  0.01869081 -0.043857   -0.04461003 -0.01520467  0.0455597  -0.10625512
 -0.06621415 -0.05306128 -0.11781278  0.0915248   0.07275846 -0.0089562
  0.12468439 -0.03818936  0.02936205 -0.08812495  0.12443781 -0.11379569
  0.06802886  0.05327685 -0.06503029  0.11422278 -0.09767897 -0.06017931
 -0.14784159 -0.02400457 -0.03283592 -0.06803498 -0.04181446  0.02357049
 -0.04612731  0.07911536  0.00550872  0.07745877  0.

Recommendations via Product based CF: Finding similarity between vectors 
2 products are "similar" when the cosine distance is close to 0
#一个向量空间中两个向量夹角间的余弦值作为衡量两个个体之间差异的大小，
#余弦值接近1，夹角趋于0，表明两个向量越相似，
#余弦值接近于0，夹角趋于90度，表明两个向量越不相似。
#scipy.spatial.distance.cosine(u，v) 中
#余弦值接近0，夹角趋于0，表明两个向量越相似，
#余弦值接近于1，夹角趋于90度，表明两个向量越不相似。
#u和v之间的余弦距离定义为：1 - u .v / \\u\\ \\v\\

### 通过矩阵重构进行推荐

#### Recommendations via Matrix Reconstruction

Use cases:
Predict a score between any combination of user and a product

Recommendations via Matrix Reconstruction: Using the predict() API inside of Surprise
    
Computes the rating prediction for given user and movie with model.predict(). 
Pick a random user and movie, and calculate the score between them


In [11]:
# Refresher: ratings data-frame.
print(all_data_df.columns)
a1 = all_data_df[['user_id','moviename','rating']]
a1.head(10)

Index(['user_name', 'comment_time', 'user_id', 'moviename', 'type', 'movie_id',
       'actor', 'region', 'director', 'characteristic', 'score_douban',
       'rating'],
      dtype='object')


Unnamed: 0,user_id,moviename,rating
0,User 14564,极限特工2,10
1,User 8760,真爱无尽,8
2,User 12681,小野猫吃大老虎,10
3,User 9498,20 30 40,8
4,User 5825,心的羽毛,4
5,User 30140,去年冬天,10
6,User 615,索女罗拉,4
7,User 63873,十分钟年华老去：大提琴篇,4
8,User 38237,听见天堂,8
9,User 4279,1985年中央电视台春节联欢晚会,8


In [12]:
a_user = "User 15668"
a_product = "梁祝"
model.predict(a_user, a_product)

Prediction(uid='User 15668', iid='梁祝', r_ui=None, est=6.294317759553696, details={'was_impossible': False})

### 通过基础CF 进行推荐：查找向量之间的相似度

Item-based collaborative filtering
Recommendations via Item Similarity: Finding similarity between vectors
2 products are "similar" when the cosine distance is close to 0

In [13]:
#这里的cosine是余弦距离，也就是1-余弦相似度
from scipy.spatial.distance import cosine

#通过电影名得到内部id
#返回值：内部id 对应的向量
def get_vector_by_movie_title(movie_title ,trained_model )  :
    """Returns the latent features of a movie in the form of a numpy array"""
    movie_row_idx = trained_model.trainset._raw2inner_id_items[movie_title]
    return trained_model.qi[movie_row_idx]


def cosine_distance(vector_a , vector_b ) :
    """Returns a float indicating the similarity between two vectors"""
    return cosine(vector_a, vector_b)

In [14]:
from scipy.spatial.distance import cosine as cosine_distance

# Fetch indices for Toy Story and Wizard of Oz
djs_idx = model.trainset._raw2inner_id_items['独角兽']
hy_idx = model.trainset._raw2inner_id_items['后裔']
westar_idx = model.trainset._raw2inner_id_items['我们的追星之路']

# Get vectors for both movies
starwars_vector = model.qi[djs_idx]
return_of_jedi_vector = model.qi[hy_idx]
aladdin_vector = model.qi[westar_idx]
# Distance 1
print(cosine_distance(starwars_vector, return_of_jedi_vector))
# Distance 2
print(cosine_distance(starwars_vector, aladdin_vector))

1.0432872133000166
1.081955409844456


### Finding similar movies by ranking

In [15]:
def display_similarity(similarity_table):
    similarity_table = pd.DataFrame(
        similarity_table,
        columns=['vector cosine distance', 'movie title']
    ).sort_values('vector cosine distance', ascending=True)
    return similarity_table.iloc[:6]

In [16]:
def get_top_similarities(movie_title , model ):
    """Returns the top 5 most similar movies to a specified movie
    
    This function iterates over every possible movie in MovieLens and calculates
    distance between `movie_title` vector and that movie's vector.
    """
    
    # Get the first movie vector
    #通过电影名得到内部id
    #返回值：内部id 对应的向量
    movie_vector  = get_vector_by_movie_title(movie_title, model)
    similarity_table = []
    
    # Iterate over every possible movie and calculate similarity
    #print('model.trainset._raw2inner_id_items\n',model.trainset._raw2inner_id_items)
    #dict '怪物大乱斗': 0, '我想戴上戒指': 1, '乌云背后的幸福线': 2, '法官老爹': 3, '千年追凶': 4, 
    #key 电影名  value  内部id（inner id）
    for other_movie_title in model.trainset._raw2inner_id_items.keys():
        other_movie_vector = get_vector_by_movie_title(other_movie_title, model)   
        # Get the second movie vector, and calculate distance
        similarity_score = cosine_distance(other_movie_vector, movie_vector)
        similarity_table.append((similarity_score, other_movie_title))

    #列表的每个元素是元组 （相似度，电影名）
    print('similarity_table len\n',len(similarity_table))
    
    # sort movies by ascending similarity
    return display_similarity(sorted(similarity_table))

In [17]:
get_top_similarities('王牌对王牌', model)

similarity_table len
 20750


Unnamed: 0,vector cosine distance,movie title
0,0.0,王牌对王牌
1,0.718639,魔法科高校的劣等生 呼唤星辰的少女
2,0.750206,麦兜
3,0.755203,选秀日
4,0.756501,深谷尸变
5,0.757208,拳击烈女
