In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from sample_dataset import SampleDataset

# Seed the RNGs this notebook relies on (embedding initialisation, DataLoader
# shuffling, the random demo tensors) so that Restart & Run All is repeatable.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Train / test splits come from the project-local SampleDataset class.
train_ds = SampleDataset("train")
test_ds = SampleDataset("test")

# Show the column names of the training items table.
train_ds.items.columns

Index(['movieId', 'userId', 'rating', 'timestamp', 'label', 'releaseYear',
       'movieGenre1', 'movieGenre2', 'movieGenre3', 'movieRatingCount',
       'movieAvgRating', 'movieRatingStddev', 'userRatedMovie1',
       'userRatedMovie2', 'userRatedMovie3', 'userRatedMovie4',
       'userRatedMovie5', 'userRatingCount', 'userAvgReleaseYear',
       'userReleaseYearStddev', 'userAvgRating', 'userRatingStddev',
       'userGenre1', 'userGenre2', 'userGenre3', 'userGenre4', 'userGenre5'],
      dtype='object')

In [2]:
# Numeric columns that are stacked into the float32 feature matrix.
numeric_float_cols = [
    "releaseYear", "movieAvgRating", "movieRatingStddev",
    "userAvgRating", "userRatingStddev",
]

# Numeric columns that are stacked into the int64 feature matrix.
numeric_int_cols = ["movieRatingCount", "userRatingCount"]

# Categorical genre columns: five per user, three per movie.
genre_cols = [
    'userGenre1',
    'userGenre2',
    'userGenre3',
    'userGenre4',
    'userGenre5',
    'movieGenre1',
    'movieGenre2',
    'movieGenre3',
]
# Vocabulary giving every genre name a unique integer id, numbered in listing
# order; 'N/A' is listed first and therefore gets id 0.
genre_vocab = dict(
    (name, position)
    for position, name in enumerate((
        'N/A',
        'Film-Noir', 'Action', 'Adventure',
        'Horror', 'Romance', 'War', 'Comedy',
        'Western', 'Documentary', 'Sci-Fi',
        'Drama', 'Thriller', 'Crime', 'Fantasy',
        'Animation', 'IMAX', 'Mystery', 'Children', 'Musical',
    ))
)


def genre2idx(gener):
    """Map an iterable of genre names to a NumPy array of vocabulary ids.

    Args:
        gener: iterable of genre names; every name must be a key of
            ``genre_vocab``.

    Returns:
        np.ndarray holding one id per input name, in input order.

    Raises:
        KeyError: if a name is not present in ``genre_vocab``.
    """
    return np.array(list(map(genre_vocab.__getitem__, gener)))

In [3]:
# Display the genre-name -> index mapping to check the vocabulary.
genre_vocab

{'N/A': 0,
 'Film-Noir': 1,
 'Action': 2,
 'Adventure': 3,
 'Horror': 4,
 'Romance': 5,
 'War': 6,
 'Comedy': 7,
 'Western': 8,
 'Documentary': 9,
 'Sci-Fi': 10,
 'Drama': 11,
 'Thriller': 12,
 'Crime': 13,
 'Fantasy': 14,
 'Animation': 15,
 'IMAX': 16,
 'Mystery': 17,
 'Children': 18,
 'Musical': 19}

In [4]:
# np.stack demo: stacking three length-3 vectors along a new trailing axis
# turns each input vector into one column of the 3x3 result.
a, b, c = (np.array(values) for values in ([1, 2, 3], [4, 5, 6], [7, 8, 9]))
d = np.stack((a, b, c), axis=-1)
d

array([[1, 4, 7],
       [2, 5, 8],
       [3, 6, 9]])

In [5]:
def collate_fn(batch):
    """Merge a list of samples into a dict of tensors placed on ``device``.

    Args:
        batch: sequence of samples, each indexable by column name.

    Returns:
        dict with the keys
          * ``"label"`` (float tensor), ``"userId"`` and ``"movieId"`` (long
            tensors);
          * ``"int_features"`` / ``"float_features"``: the columns named in
            ``numeric_int_cols`` / ``numeric_float_cols`` stacked along the
            last axis (one column per name, assuming scalar fields);
          * one long tensor of genre ids for every name in ``genre_cols``.
    """
    def column(key):
        # Gather a single field across every sample of the batch.
        return np.array([sample[key] for sample in batch])

    def stacked(keys, dtype):
        # Stack the requested fields along a new last axis and cast.
        return np.stack([column(key) for key in keys], axis=-1).astype(dtype)

    # Identifiers and the target.
    userId = torch.LongTensor(column("userId")).to(device)
    movieId = torch.LongTensor(column("movieId")).to(device)
    label = torch.FloatTensor(column("label")).to(device)
    data_dict = {"label": label, "userId": userId, "movieId": movieId}

    # Numeric features, grouped by dtype.
    int_block = stacked(numeric_int_cols, np.int64)
    float_block = stacked(numeric_float_cols, np.float32)
    data_dict["int_features"] = torch.LongTensor(int_block).to(device)
    data_dict["float_features"] = torch.FloatTensor(float_block).to(device)

    # Categorical features: genre names -> vocabulary ids -> long tensors.
    genre_tensors = {}
    for key in genre_cols:
        ids = genre2idx(column(key).tolist())
        genre_tensors[key] = torch.LongTensor(ids).to(device)
    data_dict.update(genre_tensors)

    return data_dict
    

# Mini-batch size shared by both loaders.
batch_size = 12

# Training loader: reshuffled each epoch, batches assembled by collate_fn.
train_dl = DataLoader(
    train_ds,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)

# Test loader: fixed order, same collate function.
test_dl = DataLoader(
    test_ds,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn,
)

In [6]:
# torch.bmm demo: a batch of row vectors times a batch of column vectors
# yields one dot product per batch element.
# The left operand is named `lhs` rather than `input` so that the builtin
# input() is not shadowed for the rest of the kernel session.
lhs = torch.randn(32, 10)
lhs = lhs.unsqueeze(-2)  # (32, 10) -> (32, 1, 10)
print(lhs.shape)
mat2 = torch.randn(32, 10)
mat2 = mat2.unsqueeze(-1)  # (32, 10) -> (32, 10, 1)
print(mat2.shape)
res = torch.bmm(lhs, mat2)  # batched matrix product -> (32, 1, 1)
res.size()

torch.Size([32, 1, 10])
torch.Size([32, 10, 1])


torch.Size([32, 1, 1])

In [7]:
class DeepFM(nn.Module):
    """DeepFM-style binary predictor combining FM terms with an MLP.

    The final linear layer receives three groups of features, concatenated in
    this order:
      1. first-order terms: one-hot encodings of the movie id, the user id,
         the user's first genre and the movie's first genre;
      2. second-order terms: four dot products between the id and genre
         embeddings;
      3. the output of a two-layer MLP applied to the numeric features
         together with the movie and user embeddings.

    Args:
        config: dict with the required keys ``num_users``, ``num_items``,
            ``latent_dim`` and ``num_genre``. Two optional keys are honoured:
            ``hidden_dim`` (width of both MLP layers, default 64) and
            ``num_numeric_features`` (combined number of int and float
            numeric columns; when absent it is derived from the module-level
            ``numeric_float_cols`` and ``numeric_int_cols`` lists).
    """

    # Number of pairwise embedding dot products produced in forward().
    NUM_FM_INTERACTIONS = 4

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.num_users = config['num_users']    # rows of the user embedding table
        self.num_items = config['num_items']    # rows of the item embedding table
        self.latent_dim = config['latent_dim']  # embedding width
        self.num_geners = config['num_genre']   # genre vocabulary size
        self.hidden_dim = config.get('hidden_dim', 64)

        # Resolve the numeric feature count lazily so that the module-level
        # column lists are only needed when the config does not supply it.
        if 'num_numeric_features' in config:
            self.num_numeric_features = config['num_numeric_features']
        else:
            self.num_numeric_features = len(numeric_float_cols) + len(numeric_int_cols)

        self.embedding_user = nn.Embedding(num_embeddings=self.num_users, embedding_dim=self.latent_dim)
        self.embedding_item = nn.Embedding(num_embeddings=self.num_items, embedding_dim=self.latent_dim)
        # Separate tables for "genre of the movie" and "genre the user likes".
        self.embedding_item_genre = nn.Embedding(num_embeddings=self.num_geners, embedding_dim=self.latent_dim)
        self.embedding_user_genre = nn.Embedding(num_embeddings=self.num_geners, embedding_dim=self.latent_dim)

        # Deep part: numeric features + item embedding + user embedding.
        self.deep = nn.Sequential(
            nn.Linear(
                in_features=self.num_numeric_features + self.latent_dim * 2,
                out_features=self.hidden_dim),
            nn.ReLU(),
            nn.Linear(in_features=self.hidden_dim, out_features=self.hidden_dim),
            nn.ReLU(),
        )

        # Output layer over [one-hots | FM dot products | deep features].
        first_order_dim = self.num_users + self.num_items + self.num_geners * 2
        self.wide = nn.Sequential(
            nn.Linear(
                in_features=self.hidden_dim + self.NUM_FM_INTERACTIONS + first_order_dim,
                out_features=1),
            nn.Sigmoid(),  # squashes the output into [0, 1]
        )

    def dot_layer(self, x, y):
        """Row-wise dot product of two (batch, dim) tensors -> (batch, 1)."""
        # (batch, 1, dim) @ (batch, dim, 1) -> (batch, 1, 1); drop the last axis.
        z = torch.bmm(x.unsqueeze(-2), y.unsqueeze(-1)).squeeze(-1)
        return z

    def forward(self, batch):
        """Return one value in [0, 1] per sample, shaped (batch, 1).

        Args:
            batch: dict holding the index tensors ``movieId``, ``userId``,
                ``movieGenre1`` and ``userGenre1`` plus the 2-D tensors
                ``int_features`` and ``float_features``.
        """
        # Embedding lookups; only the first genre column of each side is used.
        item_embeddings = self.embedding_item(batch["movieId"])
        user_embeddings = self.embedding_user(batch["userId"])
        item_genre_embeddings = self.embedding_item_genre(batch["movieGenre1"])
        user_genre_embeddings = self.embedding_user_genre(batch["userGenre1"])

        # FM first-order terms. Width = num_items + num_users + 2 * num_genre
        # (1001 + 30001 + 20 + 20 = 31042 with the notebook's config).
        fm_first_order = torch.cat([
            F.one_hot(batch["movieId"], self.num_items).float(),
            F.one_hot(batch["userId"], self.num_users).float(),
            F.one_hot(batch["userGenre1"], self.num_geners).float(),
            F.one_hot(batch["movieGenre1"], self.num_geners).float(),
            ], dim=-1)

        # FM second-order terms: pairwise interactions between embeddings.
        product_item_user = self.dot_layer(item_embeddings, user_embeddings)
        product_item_genre_user_genre = self.dot_layer(item_genre_embeddings, user_genre_embeddings)
        product_item_genre_user = self.dot_layer(item_genre_embeddings, user_embeddings)
        product_user_genre_item = self.dot_layer(item_embeddings, user_genre_embeddings)

        # Deep part. The integer features are cast explicitly instead of
        # relying on dtype promotion inside the concatenation.
        deep_inputs = torch.cat([
            batch["int_features"].float(),
            batch["float_features"],
            item_embeddings,
            user_embeddings,
            ], dim=1)
        deep_features = self.deep(deep_inputs)

        # Concatenate everything in the order the output layer expects
        # (31042 + 4 + 64 = 31110 inputs with the notebook's config).
        concated_features = torch.cat([
            fm_first_order,
            product_item_user,
            product_item_genre_user_genre,
            product_item_genre_user,
            product_user_genre_item,
            deep_features,
            ], dim=1)

        output = self.wide(concated_features)

        return output
    
# Hyperparameters read by DeepFM.__init__.
config = dict(
    num_users=30001,             # size of the user embedding table
    num_items=1001,              # size of the item embedding table
    latent_dim=10,               # embedding width
    num_genre=len(genre_vocab),  # genre vocabulary size
)

model = DeepFM(config)

# (Saving the model to disk is intentionally left disabled here.)
# Print the module tree to check the layer sizes.
print(model)

DeepFM(
  (embedding_user): Embedding(30001, 10)
  (embedding_item): Embedding(1001, 10)
  (embedding_item_genre): Embedding(20, 10)
  (embedding_user_genre): Embedding(20, 10)
  (deep): Sequential(
    (0): Linear(in_features=27, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ReLU()
  )
  (wide): Sequential(
    (0): Linear(in_features=31110, out_features=1, bias=True)
    (1): Sigmoid()
  )
)


In [8]:
# Scratch check of one-hot widths: users (30001) + items (1001) + one genre
# vocabulary (20). NOTE(review): the model concatenates two genre one-hots,
# so this sum leaves out the second 20.
30001+1001+20

31022

In [9]:
# Input width of the final linear layer: user, item and two genre one-hots
# (30001 + 1001 + 20 + 20), the 64-unit deep output, and 4 FM dot products.
30001+1001+20+20+64+4

31110

In [10]:
from tqdm.auto import tqdm
from sklearn.metrics import average_precision_score, roc_auc_score, accuracy_score

# Optimisation hyperparameters: passes over the training data and step size.
num_epochs, lr = 5, 1e-3

# Binary cross-entropy on probabilities (the model ends with a Sigmoid).
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)

@torch.no_grad()
def evaluate(model, dl, criterion):
    """Compute the loss and classification metrics of ``model`` over ``dl``.

    Args:
        model: callable taking a batch dict and returning one score per sample.
        dl: iterable of batch dicts, each with a ``"label"`` tensor.
        criterion: loss taking (scores, labels). It is assumed to return the
            mean over the batch, as the default ``nn.BCELoss()`` does.

    Returns:
        Tuple ``(loss, accuracy, auc_roc, auc_pr)``. ``loss`` is the mean loss
        per sample and accuracy thresholds the scores at 0.5.

    Raises:
        ValueError: if ``dl`` yields no samples.
    """
    model.eval()
    total_loss = 0.0
    num_samples = 0
    truth = []
    prediction = []
    for batch in dl:
        labels = batch["label"]
        # Flatten to 1-D so scores line up with the 1-D label vector.
        scores = model(batch).reshape(-1)
        batch_len = labels.numel()
        # Weight each batch mean by its size; a plain mean of batch means
        # would overweight a shorter final batch.
        total_loss += criterion(scores, labels).item() * batch_len
        num_samples += batch_len
        truth.append(labels.cpu().numpy())
        prediction.append(scores.cpu().numpy())

    if num_samples == 0:
        raise ValueError("evaluate() received a data loader with no samples")

    # Aggregate over the whole data set.
    loss = total_loss / num_samples
    truth = np.concatenate(truth)
    prediction = np.concatenate(prediction)

    accuracy = accuracy_score(truth, prediction > 0.5)
    auc_roc = roc_auc_score(truth, prediction)
    auc_pr = average_precision_score(truth, prediction)
    return loss, accuracy, auc_roc, auc_pr

def train(model, train_dl, val_dl, criterion, optimizer, epochs=None):
    """Train ``model`` and print validation metrics after every epoch.

    Args:
        model: module to optimise; it is moved to ``device`` first.
        train_dl: iterable of training batch dicts with a ``"label"`` tensor.
        val_dl: iterable of batches passed to ``evaluate`` after each epoch.
        criterion: loss taking (scores, labels).
        optimizer: optimizer already bound to ``model``'s parameters.
        epochs: number of passes over ``train_dl``. When None (the default),
            the module-level ``num_epochs`` is used.

    Returns:
        None. Metrics are reported through ``print``.
    """
    total_epochs = num_epochs if epochs is None else epochs

    model.to(device)
    for epoch in tqdm(range(total_epochs)):
        model.train()  # evaluate() switches to eval mode, so reset each epoch
        for batch in train_dl:
            labels = batch["label"]

            outputs = model(batch)
            # Model output is (batch, 1); flatten to match the 1-D labels.
            loss = criterion(outputs.reshape(-1), labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        val_loss, val_acc, val_auc_roc, val_auc_pr = evaluate(model, val_dl, criterion)
        print(f"Epoch: {epoch}, val_loss: {val_loss:.4f}, val_acc: {val_acc:.4f}, val_auc_roc: {val_auc_roc:.4f}, val_auc_pr: {val_auc_pr:.4f}")


In [11]:
# Run training; the test split is passed as val_dl, so the per-epoch
# "val_*" metrics printed below are computed on the test data.
train(model, train_dl, test_dl, criterion, optimizer)

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch: 0, val_loss: 0.6121, val_acc: 0.6684, val_auc_roc: 0.7285, val_auc_pr: 0.7609
Epoch: 1, val_loss: 0.5965, val_acc: 0.6789, val_auc_roc: 0.7445, val_auc_pr: 0.7762
Epoch: 2, val_loss: 0.5863, val_acc: 0.6865, val_auc_roc: 0.7483, val_auc_pr: 0.7787
Epoch: 3, val_loss: 0.5864, val_acc: 0.6857, val_auc_roc: 0.7483, val_auc_pr: 0.7781
Epoch: 4, val_loss: 0.6004, val_acc: 0.6807, val_auc_roc: 0.7403, val_auc_pr: 0.7701


In [12]:
# Spot check: score only the first test batch and compare with its labels.
model.eval()
with torch.no_grad():
    for batch in test_dl:
        labels, outputs = batch["label"], model(batch)
        break  # a single batch is enough here

# outputs is (batch, 1); column 0 holds the predicted probability.
for prob, target in zip(outputs[:12, 0], labels[:12]):
    if bool(target):
        verdict = "Good Rating"
    else:
        verdict = "Bad Rating"
    print("Predicted good rating: {:.2%}".format(prob),
          " | Actual rating label: ",
          verdict)


Predicted good rating: 81.25%  | Actual rating label:  Bad Rating
Predicted good rating: 60.10%  | Actual rating label:  Bad Rating
Predicted good rating: 57.03%  | Actual rating label:  Good Rating
Predicted good rating: 28.08%  | Actual rating label:  Good Rating
Predicted good rating: 8.40%  | Actual rating label:  Good Rating
Predicted good rating: 79.06%  | Actual rating label:  Good Rating
Predicted good rating: 63.24%  | Actual rating label:  Good Rating
Predicted good rating: 77.32%  | Actual rating label:  Good Rating
Predicted good rating: 72.07%  | Actual rating label:  Good Rating
Predicted good rating: 69.66%  | Actual rating label:  Bad Rating
Predicted good rating: 13.54%  | Actual rating label:  Bad Rating
Predicted good rating: 74.21%  | Actual rating label:  Bad Rating
