In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

from sample_dataset import SampleDataset

    

# Build one SampleDataset per split (project-local class; the split name is
# passed through unchanged), then preview the first rows of the training items.
train_ds, test_ds = (SampleDataset(split) for split in ("train", "test"))
train_ds.items.head()

Unnamed: 0,movieId,userId,rating,timestamp,label,releaseYear,movieGenre1,movieGenre2,movieGenre3,movieRatingCount,...,userRatingCount,userAvgReleaseYear,userReleaseYearStddev,userAvgRating,userRatingStddev,userGenre1,userGenre2,userGenre3,userGenre4,userGenre5
0,1,15555,3.0,900953740,0,1995,Adventure,Animation,Children,10759,...,92,1992,8.98,3.86,0.74,Drama,Comedy,Thriller,Action,Crime
1,1,25912,3.5,1111631768,1,1995,Adventure,Animation,Children,10759,...,21,1988,14.09,3.48,1.28,Action,Comedy,Romance,Adventure,Thriller
2,1,29912,3.0,866820360,0,1995,Adventure,Animation,Children,10759,...,4,1995,0.5,3.0,0.0,,,,,
3,10,17686,0.5,1195555011,0,1995,Action,Adventure,Thriller,6330,...,35,1992,8.35,2.97,1.48,Comedy,Drama,Adventure,Action,Thriller
4,104,20158,4.0,1155357691,1,1996,Comedy,,,3954,...,81,1991,8.7,3.6,0.72,Thriller,Drama,Action,Crime,Adventure


In [2]:
# Summary statistics (count / mean / std / quantiles) for the numeric columns
# of the training split; used below to size the embedding tables.
train_ds.items.describe()

Unnamed: 0,movieId,userId,rating,timestamp,label,releaseYear,movieRatingCount,movieAvgRating,movieRatingStddev,userRatedMovie1,userRatedMovie2,userRatedMovie3,userRatedMovie4,userRatedMovie5,userRatingCount,userAvgReleaseYear,userReleaseYearStddev,userAvgRating,userRatingStddev
count,88827.0,88827.0,88827.0,88827.0,88827.0,88827.0,88827.0,88827.0,88827.0,88827.0,88827.0,88827.0,88827.0,88827.0,88827.0,88827.0,88827.0,88827.0,88827.0
mean,413.595709,14862.569117,3.526152,1001914000.0,0.563027,1990.03723,4986.582368,3.522074,0.945078,413.104461,399.370574,381.512401,364.856935,349.494681,40.132561,1989.395848,8.648859,3.565731,0.922317
std,270.106033,8667.174783,1.06105,163833900.0,0.496014,12.821209,3900.254323,0.477437,0.096474,278.296721,283.112315,288.241854,291.743071,295.629331,32.331057,6.584858,7.507843,0.531259,0.289116
min,1.0,1.0,0.5,824133900.0,0.0,1926.0,2.0,1.33,0.5,0.0,0.0,0.0,0.0,0.0,2.0,1936.0,0.0,0.5,0.0
25%,193.0,7390.0,3.0,847748500.0,0.0,1993.0,1737.0,3.2,0.88,175.0,156.0,111.0,70.0,47.0,13.0,1988.0,1.76,3.26,0.75
50%,370.0,14836.0,3.5,956707700.0,1.0,1994.0,3954.0,3.52,0.94,367.0,357.0,353.0,339.0,318.0,30.0,1992.0,6.44,3.59,0.9
75%,593.0,22411.0,4.0,1120375000.0,1.0,1995.0,7392.0,3.91,1.0,593.0,592.0,590.0,589.0,587.0,62.0,1993.0,14.29,3.91,1.08
max,1000.0,30000.0,5.0,1427740000.0,1.0,1998.0,14616.0,4.45,1.89,1000.0,1000.0,1000.0,1000.0,1000.0,100.0,1996.0,43.13,5.0,3.18


In [3]:
def collate_fn(batch):
    """Collate a list of samples into a dict of batched tensors.

    Args:
        batch: Sequence of samples as handed over by ``DataLoader``. Each
            sample must be indexable by the keys ``"userId"``, ``"movieId"``
            and ``"label"``.

    Returns:
        dict with the same three keys: ``"userId"`` and ``"movieId"`` are
        ``torch.LongTensor`` (int64) and ``"label"`` is a
        ``torch.FloatTensor`` (float32), each holding one entry per sample.
    """
    # Target tensor type per field; ids must be integer for nn.Embedding
    # lookups, labels are float for the BCE loss.
    field_types = (
        ("userId", torch.LongTensor),
        ("movieId", torch.LongTensor),
        ("label", torch.FloatTensor),
    )
    collated = {}
    for field, tensor_type in field_types:
        collated[field] = tensor_type([sample[field] for sample in batch])
    return collated


batch_size = 32
# Both loaders share the batch size and collate function; only the training
# loader reshuffles its samples.
train_dl, test_dl = (
    DataLoader(dataset, batch_size=batch_size, shuffle=reshuffle, collate_fn=collate_fn)
    for dataset, reshuffle in ((train_ds, True), (test_ds, False))
)

In [4]:
class MLP(nn.Module):
    """MLP-based collaborative filtering scorer for (user, item) pairs.

    User and item ids are looked up in separate embedding tables, the two
    embeddings are concatenated, passed through two ReLU-activated linear
    layers of width 10, and projected to a single sigmoid-squashed score.

    Args:
        config: Mapping providing ``'num_users'`` and ``'num_items'`` (sizes of
            the embedding tables) and ``'latent_dim'`` (embedding width).
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.num_users = config['num_users']    # rows in the user embedding table
        self.num_items = config['num_items']    # rows in the item embedding table
        self.latent_dim = config['latent_dim']  # width of each embedding vector

        hidden_dim = 10  # width of both hidden layers

        # Dense lookup tables mapping integer ids to latent vectors.
        self.embedding_user = nn.Embedding(self.num_users, self.latent_dim)
        self.embedding_item = nn.Embedding(self.num_items, self.latent_dim)

        # The first layer consumes the concatenated user+item embedding, hence
        # 2 * latent_dim inputs; the second keeps the hidden width.
        tower = [
            nn.Linear(2 * self.latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
        ]
        self.interact_layers = nn.Sequential(*tower)

        # Single-logit head followed by a sigmoid, giving a value in [0, 1].
        self.output_layer = nn.Linear(hidden_dim, 1)
        self.logistic = torch.nn.Sigmoid()

    def forward(self, user_indices, item_indices):
        """Return the sigmoid score for each (user, item) index pair.

        Args:
            user_indices: Integer tensor of user ids.
            item_indices: Integer tensor of item ids, same shape as
                ``user_indices``.

        Returns:
            Tensor whose last dimension has size 1, holding scores in [0, 1].
        """
        pair = torch.cat(
            (self.embedding_user(user_indices), self.embedding_item(item_indices)),
            dim=-1,
        )
        hidden = self.interact_layers(pair)
        return self.logistic(self.output_layer(hidden))



In [5]:
class GMF(nn.Module):
    """Generalized matrix factorization scorer for (user, item) pairs.

    User and item embeddings are multiplied element-wise, and the resulting
    vector is mapped to a single sigmoid-squashed score by a linear layer.

    Args:
        config: Mapping providing ``'num_users'``, ``'num_items'`` and
            ``'latent_dim'``.
    """

    def __init__(self, config):
        super().__init__()
        self.num_users = config['num_users']
        self.num_items = config['num_items']
        self.latent_dim = config['latent_dim']

        # One embedding table per id space, both of width latent_dim so the
        # vectors can be multiplied element-wise.
        self.embedding_user = nn.Embedding(self.num_users, self.latent_dim)
        self.embedding_item = nn.Embedding(self.num_items, self.latent_dim)

        # Learned weighting of the element-wise product, then a sigmoid.
        self.affine_output = nn.Linear(self.latent_dim, 1)
        self.logistic = nn.Sigmoid()

    def forward(self, user_indices, item_indices):
        """Return the sigmoid score for each (user, item) index pair.

        Returns:
            Tensor whose last dimension has size 1, holding scores in [0, 1].
        """
        users = self.embedding_user(user_indices)
        items = self.embedding_item(item_indices)
        # Element-wise (Hadamard) product; the linear layer below does the
        # weighted reduction to a scalar.
        interaction = users * items
        return self.logistic(self.affine_output(interaction))

In [6]:
class NeuMF(nn.Module):
    """Neural matrix factorization: an MLP branch fused with a GMF branch.

    Each branch has its own user/item embedding tables. The MLP branch
    concatenates its embeddings and runs them through two ReLU-activated linear
    layers of width 10; the MF branch multiplies its embeddings element-wise.
    Both branch outputs are concatenated and mapped to one sigmoid score.

    Args:
        config: Mapping providing ``'num_users'``, ``'num_items'``,
            ``'latent_dim_mf'`` (MF embedding width) and ``'latent_dim_mlp'``
            (MLP embedding width).
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.num_users = config['num_users']
        self.num_items = config['num_items']
        self.latent_dim_mf = config['latent_dim_mf']
        self.latent_dim_mlp = config['latent_dim_mlp']

        hidden_dim = 10  # width of both hidden layers in the MLP branch

        # Separate embedding tables per branch so each can learn its own space.
        self.embedding_user_mlp = nn.Embedding(self.num_users, self.latent_dim_mlp)
        self.embedding_item_mlp = nn.Embedding(self.num_items, self.latent_dim_mlp)
        self.embedding_user_mf = nn.Embedding(self.num_users, self.latent_dim_mf)
        self.embedding_item_mf = nn.Embedding(self.num_items, self.latent_dim_mf)

        # MLP branch: input is the concatenated user+item MLP embedding.
        tower = [
            nn.Linear(2 * self.latent_dim_mlp, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
        ]
        self.interact_layers = nn.Sequential(*tower)

        # The fusion head sees the MLP hidden vector next to the MF product vector.
        self.affine_output = torch.nn.Linear(hidden_dim + self.latent_dim_mf, 1)
        self.logistic = torch.nn.Sigmoid()

    def forward(self, user_indices, item_indices):
        """Return the sigmoid score for each (user, item) index pair.

        Returns:
            Tensor whose last dimension has size 1, holding scores in [0, 1].
        """
        users_mlp = self.embedding_user_mlp(user_indices)
        items_mlp = self.embedding_item_mlp(item_indices)
        users_mf = self.embedding_user_mf(user_indices)
        items_mf = self.embedding_item_mf(item_indices)

        # MLP branch: concatenate, then pass through the hidden layers.
        mlp_hidden = self.interact_layers(torch.cat((users_mlp, items_mlp), dim=-1))
        # MF branch: element-wise product of the two embeddings.
        mf_product = users_mf * items_mf

        fused = torch.cat((mlp_hidden, mf_product), dim=-1)
        return self.logistic(self.affine_output(fused))

In [7]:
# Seed torch's global RNG before any weights are created so that, on a fresh
# "Restart & Run All", parameter initialisation and later draws from the
# global generator are repeatable.
SEED = 42
torch.manual_seed(SEED)

config = {
    "num_users": 30001,  # train userId max is 30000; +1 so ids index the table directly
    "num_items": 1001,   # train movieId max is 1000; same +1 reasoning
    "latent_dim": 1,
}

# MLP and GMF both read exactly the keys above and can be swapped freely.
# NeuMF reads 'latent_dim_mf' and 'latent_dim_mlp' instead of 'latent_dim',
# so those two keys must be added to `config` before switching to it.
mlp_model = MLP(config)

# Smoke test: one forward pass on two dummy (user 0, item 0) pairs.
# Embedding lookups need integer indices, so build int64 tensors directly.
mlp_model(torch.zeros(2, dtype=torch.long), torch.zeros(2, dtype=torch.long))

# torch.save(mlp_model, "./NCF_mlp_model.pth")

tensor([[0.4959],
        [0.4959]], grad_fn=<SigmoidBackward0>)

In [8]:
from tqdm.auto import tqdm
from sklearn.metrics import average_precision_score, roc_auc_score, accuracy_score

# Training hyper-parameters.
num_epochs = 5
lr = 0.01

# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Binary cross-entropy on probabilities (the models already apply a sigmoid).
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(params=mlp_model.parameters(), lr=lr)

@torch.no_grad()
def evaluate(model, dl, criterion):
    """Score ``model`` on every batch of ``dl`` without tracking gradients.

    Args:
        model: Module called as ``model(user_indices, item_indices)``. Its
            output is flattened to 1-D and treated as the predicted
            probability of label 1.
        dl: Iterable of batches, each a mapping holding ``"userId"``,
            ``"movieId"`` and ``"label"`` tensors.
        criterion: Loss callable invoked as ``criterion(predictions, labels)``.

    Returns:
        Tuple ``(loss, accuracy, auc_roc, auc_pr)`` where ``loss`` is the
        unweighted mean of the per-batch losses, ``accuracy`` uses a 0.5
        decision threshold, ``auc_roc`` is the area under the ROC curve and
        ``auc_pr`` is the average precision.

    Raises:
        ValueError: If ``dl`` yields no batches.

    Note:
        The model is switched to eval mode and left there; callers that keep
        training must call ``model.train()`` again.
    """
    model.eval()

    # Run on whatever device the model already lives on rather than a global,
    # so the function also works before the model has been moved.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    losses = []
    truth = []
    prediction = []
    for batch in dl:
        user_indices = batch["userId"].to(model_device)
        item_indices = batch["movieId"].to(model_device)
        labels = batch["label"].to(model_device)
        # Flatten once so the loss and the metrics all see predictions with
        # the same 1-D shape as the labels.
        outputs = model(user_indices, item_indices).reshape(-1)
        loss = criterion(outputs, labels)
        losses.append(loss.item())
        truth.append(labels.cpu().numpy())
        prediction.append(outputs.cpu().numpy())

    if not losses:
        # Fail early with a clear message instead of a NaN mean followed by
        # an opaque concatenate error.
        raise ValueError("evaluate() received a dataloader that yielded no batches")

    # Aggregate over the whole loader.
    loss = np.mean(losses)
    truth = np.concatenate(truth)
    prediction = np.concatenate(prediction)

    accuracy = accuracy_score(truth, prediction > 0.5)   # thresholded accuracy
    auc_roc = roc_auc_score(truth, prediction)           # area under the ROC curve
    auc_pr = average_precision_score(truth, prediction)  # average precision (PR-curve summary)
    return loss, accuracy, auc_roc, auc_pr

def train(model, train_dl, val_dl, criterion, optimizer):
    """Train ``model`` for ``num_epochs`` epochs, validating after each one.

    The model is moved to the module-level ``device``. Every epoch runs one
    optimisation pass over ``train_dl`` and then prints the mean training loss
    together with the metrics returned by ``evaluate`` on ``val_dl``.

    Args:
        model: Module called as ``model(user_indices, item_indices)``.
        train_dl: Iterable of training batches with ``"userId"``,
            ``"movieId"`` and ``"label"`` tensors.
        val_dl: Iterable of validation batches in the same format.
        criterion: Loss callable invoked as ``criterion(predictions, labels)``.
        optimizer: Optimizer already bound to ``model``'s parameters.

    Returns:
        None. Progress is reported through ``print``.
    """
    model.to(device)
    batch_fields = ("userId", "movieId", "label")
    for epoch in tqdm(range(num_epochs)):
        # evaluate() leaves the model in eval mode, so re-enable training
        # behaviour at the start of every epoch.
        model.train()
        epoch_losses = []
        for batch in train_dl:
            users, items, targets = (batch[field].to(device) for field in batch_fields)
            # Model output has a trailing singleton dim; flatten to match targets.
            batch_loss = criterion(model(users, items).reshape(-1), targets)
            optimizer.zero_grad()   # clear gradients from the previous step
            batch_loss.backward()   # back-propagate
            optimizer.step()        # apply the parameter update
            epoch_losses.append(batch_loss.item())
        mean_train_loss = np.mean(epoch_losses)
        # Validate once per epoch and report all metrics on a single line.
        val_loss, val_acc, val_auc_roc, val_auc_pr = evaluate(model, val_dl, criterion)
        print(f"Epoch: {epoch},train_loss: {mean_train_loss:.4f}, val_loss: {val_loss:.4f}, val_acc: {val_acc:.4f}, val_auc_roc: {val_auc_roc:.4f}, val_auc_pr: {val_auc_pr:.4f}")
        

In [9]:
# Fit the model. Note that the test loader is passed as the validation set, so
# the per-epoch "val_*" numbers printed below are measured on the test split.
train(mlp_model, train_dl, test_dl, criterion, optimizer)

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch: 0,train_loss: 0.6269, val_loss: 0.6092, val_acc: 0.6674, val_auc_roc: 0.7247, val_auc_pr: 0.7616
Epoch: 1,train_loss: 0.5843, val_loss: 0.5961, val_acc: 0.6810, val_auc_roc: 0.7396, val_auc_pr: 0.7702
Epoch: 2,train_loss: 0.5357, val_loss: 0.5991, val_acc: 0.6873, val_auc_roc: 0.7460, val_auc_pr: 0.7753
Epoch: 3,train_loss: 0.5009, val_loss: 0.6197, val_acc: 0.6829, val_auc_roc: 0.7374, val_auc_pr: 0.7629
Epoch: 4,train_loss: 0.4819, val_loss: 0.6200, val_acc: 0.6750, val_auc_roc: 0.7335, val_auc_pr: 0.7575


In [10]:
# with torch.no_grad():
#     # predict
#     mlp_model.eval()
#     for batch in test_dl:
#         user_indices = batch["userId"].to(device)
#         item_indices = batch["movieId"].to(device)
#         labels = batch["label"].to(device)
#         outputs = mlp_model(user_indices, item_indices)
#         break
#
# for output, label in zip(outputs[:12, 0], labels[:12]):
#     print("Predicted good rating: {:.2%}".format(output),
#           " | Actual rating label: ",
#           ("Good Rating" if bool(label) else "Bad Rating"))
