## 10.基于协同过滤、矩阵分解推荐算法的电影的评分预测


In [1]:
import pandas as pd
from surprise import SVD
from surprise import KNNBasic 
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV

In [2]:
#导入并查看数据集
data_df=pd.read_csv('data/data_u.csv',encoding='gbk')
data_df

Unnamed: 0,用户原始ID,电影原始ID,评分,时间戳
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [3]:
#显示用户原始ID为192打分的电影原始ID，显示前十个
data_df[data_df['用户原始ID']==192]['电影原始ID'][0:10]

269       476
555      1061
725       948
4333     1160
7210      127
8995      301
10129     252
11609     340
12119    1405
12917     302
Name: 电影原始ID, dtype: int64

In [4]:
data_df[data_df['电影原始ID']==242]['用户原始ID'][0:10]

0       196
253      63
629     226
1232    154
2159    306
2335    296
2600     34
3484    271
4082    201
5548    209
Name: 用户原始ID, dtype: int64

### 从数据框中删除时间戳

In [5]:
#删除时间戳，转为dataset数据集备用
data_df=data_df.drop(['时间戳'],axis=1)
reader = Reader()
dataset = Dataset.load_from_df(data_df,reader)

## 1.2建立GridSearch模型优化机制

In [6]:
#用SVD算法创建矩阵分解模型
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.02,0.005],'reg_all': [0.2,0.4]}
model_svdGridSearch = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3,refit='rmse')

## 1.3训练模型，评估算法效果

In [7]:
model_svdGridSearch.fit(dataset)#训练模型
results_df = pd.DataFrame.from_dict(model_svdGridSearch.cv_results)#从from_dict函数可以从字典中创建DataFrame的对象
results_df[['mean_test_rmse','rank_test_rmse','mean_test_mae','rank_test_mae','param_n_epochs','param_lr_all','param_reg_all']]#展示结果

Unnamed: 0,mean_test_rmse,rank_test_rmse,mean_test_mae,rank_test_mae,param_n_epochs,param_lr_all,param_reg_all
0,0.952073,2,0.757533,2,5,0.02,0.2
1,0.961698,5,0.769631,5,5,0.02,0.4
2,0.965819,7,0.770992,6,5,0.005,0.2
3,0.973686,8,0.781479,8,5,0.005,0.4
4,0.948956,1,0.754516,1,10,0.02,0.2
5,0.959637,4,0.767698,4,10,0.02,0.4
6,0.954978,3,0.760869,3,10,0.005,0.2
7,0.963772,6,0.77215,7,10,0.005,0.4


## 1.4根据训练结果，预测某用户对某电影的评分
### 对用户已经评分过的电影进行评分预测

In [8]:
uid=196#设置用户原始ID
iid=242#设置电影原始ID
r_ui=r_ui=data_df[(data_df['用户原始ID']==uid)&( data_df['电影原始ID']==iid)]['评分']#查询原始得分
y_pred = model_svdGridSearch.predict(uid,iid,r_ui[0])#预测值
y_pred#打印

Prediction(uid=196, iid=242, r_ui=3, est=3.884172846127149, details={'was_impossible': False})

### 对用户没有评分过的电影进行预测

In [9]:
uid=196#设置用户原始ID
iid=302#设置电影原始ID
y_pred = model_svdGridSearch.predict(uid,iid)#预测值
y_pred#打印

Prediction(uid=196, iid=302, r_ui=None, est=4.021196171625454, details={'was_impossible': False})

## 基于用户的协同过滤模型的评分预测

### 2.1建立KNNBasic基于用户的协同过滤模型

In [10]:
model_userCF = KNNBasic()

### 2.2 训练模型，评估算法效果

In [11]:
cross_validate(model_userCF, dataset, measures=['RMSE','MAE'],cv=3,verbose=True)
#训练模型，verbose=True代表打印交叉算法的过程

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9941  0.9853  0.9868  0.9887  0.0038  
MAE (testset)     0.7864  0.7783  0.7796  0.7814  0.0035  
Fit time          0.30    0.31    0.31    0.31    0.01    
Test time         4.13    4.42    3.95    4.16    0.19    


{'test_rmse': array([0.9940848 , 0.98532802, 0.98676819]),
 'test_mae': array([0.78636561, 0.77833435, 0.77964308]),
 'fit_time': (0.29720211029052734, 0.30714917182922363, 0.31186580657958984),
 'test_time': (4.126725673675537, 4.415165185928345, 3.9460952281951904)}

In [12]:
#预测评分与真实评分对比
uid=196
iid=242
r_ui=r_ui=data_df[(data_df['用户原始ID']==uid)&( data_df['电影原始ID']==iid)]['评分']
y_pred = model_userCF.predict(uid,iid,r_ui[0])
y_pred

Prediction(uid=196, iid=242, r_ui=3, est=3.697768654629064, details={'actual_k': 40, 'was_impossible': False})

In [13]:
#预测此用户还没有预测过的电影
uid=196
iid=302
y_pred = model_userCF.predict(uid,iid)#预测值
y_pred#打印 

Prediction(uid=196, iid=302, r_ui=None, est=4.139915664767948, details={'actual_k': 40, 'was_impossible': False})

In [14]:
sim_options = {'name': 'pearson_baseline', 'user_based': False}
#相似度计算方法为pearson_baseline，计算失误之间的相似度
model_itemCF = KNNBasic(sim_options=sim_options)#实例化算法对象

#### 建立trainset、,testset 对象，训练模型，评估算法效果

In [15]:
trainset,testset = train_test_split(dataset)#将原始数据转换成trainset评分矩阵的形式
model_itemCF.fit(trainset)#带入数据训练模型
predictions = model_itemCF.test(testset)#模型预测
accuracy.rmse(predictions)#模型评估
accuracy.mae(predictions)#模型评估

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.9943
MAE:  0.7800


0.7800139914875311

### 对用户已评分过的电影进行评分

In [16]:
uid=196#设置用户原始ID
iid=242#设置电影原始ID
r_ui=r_ui=data_df[(data_df['用户原始ID']==uid)&( data_df['电影原始ID']==iid)]['评分']
y_pred = model_userCF.predict(uid,iid,r_ui[0])
y_pred#打印

Prediction(uid=196, iid=242, r_ui=3, est=3.697768654629064, details={'actual_k': 40, 'was_impossible': False})

In [17]:
uid=196
iid=302
y_pred = model_userCF.predict(uid,iid)
y_pred

Prediction(uid=196, iid=302, r_ui=None, est=4.139915664767948, details={'actual_k': 40, 'was_impossible': False})

## 查找某电影的相似电影

### 导入电影名称数据文件，存储成数据框的形式,查看数据

In [18]:
name_rid_df=pd.read_csv('data/data_item.csv')
name_rid_df

Unnamed: 0,电影名称,电影原始ID
0,Toy Story (1995),1
1,GoldenEye (1995),2
2,Four Rooms (1995),3
3,Get Shorty (1995),4
4,Copycat (1995),5
...,...,...
1677,Mat' i syn (1997),1678
1678,B. Monkey (1998),1679
1679,Sliding Doors (1998),1680
1680,You So Crazy (1994),1681


## 根据电影名称和电影原始ID，构建互相查找的字典

In [19]:
rid_to_name = {}#创建字典，存放电影原始ID和电影名称的键值对
name_to_rid = {}#创建字典，存放电影原始ID和电影原始ID的键值对

for i in range(len(name_rid_df)):#建立字典
    rid_to_name[name_rid_df['电影原始ID'][i]]=name_rid_df['电影名称'][i]
    name_to_rid[name_rid_df['电影名称'][i]]=name_rid_df['电影原始ID'][i]

### 查看上述俩个字典中的5个元素

In [20]:
print("字典rid_to_name中的前五个键值对如下所示:")
for i, (k, v) in enumerate(rid_to_name.items()):#遍历字典中的键值对
    if i in range(5):                           #打印前5各键值对
        print(k, v)
print("字典name_to_rid中的前五个键值对如下所示:")        
for i, (k, v) in enumerate(name_to_rid.items()):
    if i in range(5):
        print(k, v)

字典rid_to_name中的前五个键值对如下所示:
1 Toy Story (1995)
2 GoldenEye (1995)
3 Four Rooms (1995)
4 Get Shorty (1995)
5 Copycat (1995)
字典name_to_rid中的前五个键值对如下所示:
Toy Story (1995) 1
GoldenEye (1995) 2
Four Rooms (1995) 3
Get Shorty (1995) 4
Copycat (1995) 5


### 利用字典，可以查询电影原始ID和电影名称

In [21]:
raw_id = name_to_rid['GoldenEye (1995)']#利用字典查询某部电影的原始ID
print("电影原始ID为：",raw_id)#打印

name = rid_to_name[raw_id]#查看raw_id对应的电影名称
print("电影名称为：",name)#打印

电影原始ID为： 2
电影名称为： GoldenEye (1995)


## 使用电影的原始ID和to_inner_iid函数，查找对应的电影内部ID

In [22]:
inner_id = model_itemCF.trainset.to_inner_iid(raw_id)#查看GoldenEye的内部ID
print("GoldenEye的内部ID为：",inner_id)

GoldenEye的内部ID为： 398


#### 利用get_neighbors函数和上述电影的内部ID，可以找到与之最相似的k部电影的内部ID

In [23]:
neighbors_iid_temp = model_itemCF.get_neighbors(inner_id, k=10)
print("与GoldenEye最相似的10部电影的内部ID为：",neighbors_iid_temp)

与GoldenEye最相似的10部电影的内部ID为： [91, 165, 204, 237, 322, 120, 629, 95, 488, 341]


### 通过to_raw_iid函数可以查询到内部ID对应的原始ID

In [5]:
neighbors_rid_temp= (model_itemCF.trainset.to_raw_iid(inner_id)for inner_id in neighbors_iid_temp)
#neighbors_rid_temp的类型是generator,调用一次生成器将失效
type(neighbors_rid_temp)

NameError: name 'neighbors_iid_temp' is not defined

In [4]:
#再创建一个neighbors_rid_temp_print生成器用于打印原始ID
neighbors_rid_temp_print= (model_itemCF.trainset.to_raw_iid(inner_id)for inner_id in neighbors_iid_temp)

print("最相似的10部电影原始ID为：",list(neighbors_rid_temp_print))

NameError: name 'neighbors_iid_temp' is not defined

### 通过电影原始ID和上述建立的rid_to_name字典，获取电影名称

In [26]:
neighbors_name_temp = (rid_to_name[rid] for rid in neighbors_rid_temp)
print('The 10 nearest neighbors of GoldenEye are:')
for movie in neighbors_name_temp:
    print(movie)

The 10 nearest neighbors of GoldenEye are:
True Lies (1994)
Clear and Present Danger (1994)
Mission: Impossible (1996)
Sabrina (1995)
Cliffhanger (1993)
Nutty Professor, The (1996)
Demolition Man (1993)
Searching for Bobby Fischer (1993)
Down Periscope (1996)
Sneakers (1992)
