# Goodbooks-10k数据集，10000本图书，53424个用户，NCF模型

In [2]:
import numpy as np
import pandas as pd
import random

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import os

In [4]:
seed = 2022
np.random.seed(seed)
random.seed(seed)
BATCH_SIZE = 512

hidden_dim = 16
epochs = 1
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cpu


In [5]:
df = pd.read_csv("./goodbooks-10k/ratings.csv")

In [6]:
df.head()

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


In [11]:
print('共{}个用户，{}本图书，{}条记录'.format(max(df['user_id']), max(df['book_id']), len(df)))

共53424个用户，10000本图书，5976479条记录


In [15]:
df.describe()

Unnamed: 0,user_id,book_id,rating
count,5976479.0,5976479.0,5976479.0
mean,26224.46,2006.477,3.919866
std,15413.23,2468.499,0.9910868
min,1.0,1.0,1.0
25%,12813.0,198.0,3.0
50%,25938.0,885.0,4.0
75%,39509.0,2973.0,5.0
max,53424.0,10000.0,5.0


In [16]:
for index, row in df.iterrows():
    if index == 2:
        print(row)

user_id      2
book_id    260
rating       5
Name: 2, dtype: int64


In [34]:
import tqdm
class Goodbooks(Dataset):
    def __init__(self, df, mode='training', negs=99):
        super().__init__()
        
        self.df = df
        self.mode = mode
        
        self.book_nums = max(df['book_id'])+1
        self.user_nums = max(df['user_id'])+1
        
        self._init_dataset()
    
    def _init_dataset(self):
        self.Xs = []
        
        self.user_book_map = {}
        for i in range(self.user_nums):
            self.user_book_map[i] = []
        
        for index, row in self.df.iterrows():
            user_id, book_id, rating = row
            self.user_book_map[user_id].append(book_id)
        
        if self.mode == 'training':
            for user, items in tqdm.tqdm(self.user_book_map.items()):
                for item in items[:-1]:
                    self.Xs.append((user, item, 1))
                    for _ in range(3):
                        while True:
                            neg_sample = random.randint(0, self.book_nums - 1)
                            if neg_sample not in self.user_book_map[user]:
                                self.Xs.append((user, neg_sample, 0))
                                break
        elif self.mode == 'validation':
            for user, items in tqdm.tqdm(self.user_book_map.items()):
                if len(items) == 0:
                    continue
                self.Xs.append((user, items[-1]))
    
    def __getitem__(self, index):
        '''
            __getitem__方法也是必须的，该方法支持从0到len(self)的索引
        '''
        if self.mode == 'training':
            user_id, book_id, label = self.Xs[index]
            return user_id, book_id, label
        elif self.mode == 'validation':
            user_id, book_id = self.Xs[index]
            negs = list(random.sample(
                list(set(range(self.book_nums)) - set(self.user_book_map[user_id])),
                k = 99
            ))
            return user_id, book_id, torch.LongTensor(negs)
    
    def __len__(self):
        '''
            继承Dataset类时，需要重写__len__方法，该方法提供了dataset的大小
        '''
        return len(self.Xs)

In [35]:
traindataset = Goodbooks(df, 'training')
validdataset = Goodbooks(df, 'validation')

trainloader = DataLoader(traindataset, batch_size = BATCH_SIZE, shuffle = True, drop_last = False, num_workers = 0)
validloader = DataLoader(validdataset, batch_size = BATCH_SIZE, shuffle = True, drop_last = False, num_workers = 0)

100%|██████████| 53425/53425 [01:05<00:00, 821.07it/s] 
100%|██████████| 53425/53425 [00:13<00:00, 3945.48it/s]


In [36]:
len(traindataset.user_book_map)

53425

NCF模型由GMF和MLP部分组成
![model](./model.png)
Embedding layer: 嵌入层，将稀疏的one-hot用户/物品向量转化为稠密的低维向量。
GMF layer: 通过传统的矩阵分解算法，将以用户和物品的嵌入向量做内积，有效地提取浅层特征。
MLP layer: 通过n层全连接层，提取深层特征。
Concatenation layer: 将GMF和MLP输出的结果做concat，结合其中的深层和浅层信息。
Output layer: 输出层，输出用户-物品对的最终评分。

In [37]:
class NCFModel(torch.nn.Module):
    def __init__(self, hidden_dim, user_num, item_num, mlp_layer_num=4, weight_decay=1e-5, dropout=0.5):
        super().__init__()
        
        self.hidden_dim = hidden_dim
        self.user_num = user_num
        self.item_num = item_num
        self.mlp_layer_num = mlp_layer_num
        self.weight_decay = weight_decay
        self.dropout = dropout
        
        self.mlp_user_embedding = torch.nn.Embedding(user_num, hidden_dim * (2 ** (self.mlp_layer_num - 1)))
        self.mlp_item_embedding = torch.nn.Embedding(item_num, hidden_dim * (2 ** (self.mlp_layer_num - 1)))
        
        self.gmf_user_embedding = torch.nn.Embedding(user_num, hidden_dim)
        self.gmf_item_embedding = torch.nn.Embedding(item_num, hidden_dim)
        
        mlp_layers = []
        input_size = int(hidden_dim * (2 ** self.mlp_layer_num))
        for i in range(self.mlp_layer_num):
            mlp_layers.append(torch.nn.Linear(int(input_size), int(input_size / 2)))
            mlp_layers.append(torch.nn.Dropout(self.dropout))
            mlp_layers.append(torch.nn.ReLU())
            input_size /= 2
        self.mlp_layers = torch.nn.Sequential(*mlp_layers)
        
        self.output_layer = torch.nn.Linear(2 * self.hidden_dim, 1)
        
    def forward(self, user, item):
        user_gmf_embedding = self.gmf_user_embedding(user)
        item_gmf_embedding = self.gmf_item_embedding(item)
        
        user_mlp_embedding = self.mlp_user_embedding(user)
        item_mlp_embedding = self.mlp_item_embedding(item)
        
        gmf_output = user_gmf_embedding * item_gmf_embedding
        
        mlp_input = torch.cat([user_mlp_embedding, item_mlp_embedding], dim=-1)
        mlp_output = self.mlp_layers(mlp_input)
        
        output = torch.sigmoid(self.output_layer(torch.cat([gmf_output, mlp_output], dim=-1))).squeeze(-1)
        
        return output
    
    def predict(self, user, item):
        self.eval() # 评估模式，和torch.no_grad()配合使用
        with torch.no_grad():
            user_gmf_embedding = self.gmf_user_embedding(user)
            item_gmf_embedding = self.gmf_item_embedding(item)
            
            user_mlp_embedding = self.mlp_user_embedding(user)
            item_mlp_embedding = self.mlp_item_embedding(item)
        
            gmf_output = user_gmf_embedding.unsqueeze(1) * item_gmf_embedding
            
            user_mlp_embedding = user_mlp_embedding.unsqueeze(1).expand(-1, item_mlp_embedding.shape[1], -1)
            mlp_input = torch.cat([user_mlp_embedding, item_mlp_embedding], dim=-1)
            mlp_output = self.mlp_layers(mlp_input)
        
        output = torch.sigmoid(self.output_layer(torch.cat([gmf_output, mlp_output], dim=-1))).squeeze(-1)
        return output

训练模型，固定步数会计算准确率
模型保存
可视化训练过程，对比训练机和验证集的准确率

In [39]:
model = NCFModel(hidden_dim, traindataset.user_nums, traindataset.book_nums).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
crit = torch.nn.BCELoss()

loss_for_plot = []
hits_for_plot = []

for epoch in range(epochs):
    
    losses = []
    for index, data in enumerate(trainloader):
        user, item, label = data
        user, item, label = user.to(device), item.to(device), label.to(device).float()
        y_ = model(user, item).squeeze()
        
        loss = crit(y_, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        losses.append(loss.detach().cpu().item())
        
    hits = []
    for index, data in enumerate(validloader):
        user, pos, neg = data
        pos = pos.unsqueeze(1)
        all_data = torch.cat([pos, neg], dim=-1)
        output = model.predict(user.to(device), all_data.to(device)).detach().cpu()
        
        for batch in output:
            if 0 not in (-batch).argsort()[:10]:
                hits.append(0)
            else:
                hits.append(1)
    print('Epoch {} finished, average loss {}, hits@20 {}'.format(epoch, sum(losses)/len(losses), sum(hits)/len(hits)))
    loss_for_plot.append(sum(losses)/len(losses))
    hits_for_plot.append(sum(hits)/len(hits))

Epoch 0 finished, average loss 0.4613660700745131, hits@20 0.36268343815513626


In [40]:
# 模型保存
torch.save(model.state_dict(), './model.h5')

In [41]:
import matplotlib.pyplot as plt

x = list(range(1, len(hits_for_plot) + 1))
plt.subplot(1,2,1)
plt.xlabel('epochs')
plt.ylabel('loss')
plt.plot(x, loss_for_plot, 'r')

plt.subplot(1,2,2)
plt.xlabel('epochs')
plt.ylabel('acc')
plt.plot(x, hits_for_plot, 'r')

plt.show()

ModuleNotFoundError: No module named 'matplotlib'