In [8]:
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from sample_dataset import SampleDataset

# Run on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load the two dataset splits by name.
train_ds = SampleDataset("train")
test_ds = SampleDataset("test")
# Columns that are combined into the "crossed_features" field of each sample.
crossed_cols = ["movieId", "userRatedMovie1"]
# The second argument (10_000) equals the model's `crossed_dim`; presumably it is
# the width of the crossed-feature vector -- TODO confirm against SampleDataset.
train_ds.set_crossed_features(crossed_cols, 10_000)
test_ds.set_crossed_features(crossed_cols, 10_000)
# Display the available columns (last expression of the cell).
train_ds.items.columns

Index(['movieId', 'userId', 'rating', 'timestamp', 'label', 'releaseYear',
       'movieGenre1', 'movieGenre2', 'movieGenre3', 'movieRatingCount',
       'movieAvgRating', 'movieRatingStddev', 'userRatedMovie1',
       'userRatedMovie2', 'userRatedMovie3', 'userRatedMovie4',
       'userRatedMovie5', 'userRatingCount', 'userAvgReleaseYear',
       'userReleaseYearStddev', 'userAvgRating', 'userRatingStddev',
       'userGenre1', 'userGenre2', 'userGenre3', 'userGenre4', 'userGenre5',
       'crossed_features'],
      dtype='object')

In [9]:
# Real-valued feature columns; collate_fn batches each one as a float tensor.
numeric_float_cols = [
    "releaseYear", "movieAvgRating", "movieRatingStddev",
    "userAvgRating", "userRatingStddev",
]

# Count-valued feature columns; collate_fn batches each one as an integer tensor.
numeric_int_cols = ["movieRatingCount", "userRatingCount"]

# Genre columns (string-valued): the user's preferred genres followed by the
# movie's own genres.
genre_cols = [
    "userGenre1",
    "userGenre2",
    "userGenre3",
    "userGenre4",
    "userGenre5",
    "movieGenre1",
    "movieGenre2",
    "movieGenre3",
]

# Genre name -> integer id. Ids follow the listing order, so "N/A" is id 0.
genre_vocab = dict(
    (name, position)
    for position, name in enumerate((
        "N/A",
        "Film-Noir", "Action", "Adventure",
        "Horror", "Romance", "War", "Comedy",
        "Western", "Documentary", "Sci-Fi",
        "Drama", "Thriller", "Crime", "Fantasy",
        "Animation", "IMAX", "Mystery", "Children", "Musical",
    ))
)

def genre2idx(genre):
    """Translate an iterable of genre names into a NumPy array of vocabulary ids.

    Each name is looked up in the module-level ``genre_vocab``; a name that is
    not in the vocabulary raises ``KeyError``.
    """
    indices = []
    for name in genre:
        indices.append(genre_vocab[name])
    return np.array(indices)

In [10]:
genre_vocab

{'N/A': 0,
 'Film-Noir': 1,
 'Action': 2,
 'Adventure': 3,
 'Horror': 4,
 'Romance': 5,
 'War': 6,
 'Comedy': 7,
 'Western': 8,
 'Documentary': 9,
 'Sci-Fi': 10,
 'Drama': 11,
 'Thriller': 12,
 'Crime': 13,
 'Fantasy': 14,
 'Animation': 15,
 'IMAX': 16,
 'Mystery': 17,
 'Children': 18,
 'Musical': 19}

In [11]:
def collate_fn(batch):
    """Merge a list of per-sample records into one dict of tensors on ``device``.

    Args:
        batch: sequence of samples, each indexable by column name.

    Returns:
        dict mapping column name -> tensor, inserted in this order:
        ``label`` (float), ``userId`` / ``movieId`` (long), every column of
        ``numeric_int_cols`` (long, reshaped to ``(-1, 1)``), every column of
        ``numeric_float_cols`` (float, reshaped to ``(-1, 1)``), every column
        of ``genre_cols`` (long genre ids) and ``crossed_features`` (long).
    """
    def column(key):
        # Gather one field across all samples of the batch.
        return np.array([sample[key] for sample in batch])

    def as_long(values):
        return torch.LongTensor(values).to(device)

    def as_float(values):
        return torch.FloatTensor(values).to(device)

    # Identifiers and the training target.
    user_ids = as_long(column("userId"))
    movie_ids = as_long(column("movieId"))
    targets = as_float(column("label"))
    data_dict = {"label": targets, "userId": user_ids, "movieId": movie_ids}

    # Numeric features: one column vector per feature.
    for key in numeric_int_cols:
        data_dict[key] = as_long(column(key)).reshape(-1, 1)
    for key in numeric_float_cols:
        data_dict[key] = as_float(column(key)).reshape(-1, 1)

    # Genre names (user preferences and movie genres) become vocabulary ids.
    for key in genre_cols:
        data_dict[key] = as_long(genre2idx(column(key)))

    # Crossed feature built by the dataset from its crossed columns.
    data_dict["crossed_features"] = as_long(column("crossed_features"))

    return data_dict



# Number of samples per mini-batch, shared by both loaders.
batch_size = 12
# Only the training loader shuffles; the test loader keeps the dataset order.
# Both build their batches with collate_fn, which also moves tensors to `device`.
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [12]:
# 模型
class WideDeep(nn.Module):
    """Wide & Deep model that predicts the probability of a positive label.

    Deep part: user, movie and genre embeddings are concatenated with the
    numeric features and passed through a two-layer ReLU MLP (128 units each).
    Wide part: the MLP output is concatenated with the crossed-feature vector
    and mapped to a single sigmoid output.

    Args:
        config: dict with the required keys ``num_users``, ``num_items``,
            ``latent_dim`` (embedding width), ``num_genre`` (genre vocabulary
            size) and ``crossed_dim`` (width of the crossed-feature vector).
            Optional keys ``numeric_float_cols``, ``numeric_int_cols`` and
            ``genre_cols`` override the feature-column lists; when a key is
            absent the notebook-level list of the same name is used.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.num_users = config['num_users']
        self.num_items = config['num_items']
        self.latent_dim = config['latent_dim']  # embedding width (hyperparameter)
        self.num_genres = config['num_genre']
        self.crossed_dim = config['crossed_dim']

        # Resolve the feature-column lists once. The notebook-level lists are
        # only touched when the config does not provide an override, so the
        # class can also be built without those globals being defined.
        float_cols = config.get("numeric_float_cols")
        int_cols = config.get("numeric_int_cols")
        genres = config.get("genre_cols")
        self.numeric_float_cols = list(numeric_float_cols if float_cols is None else float_cols)
        self.numeric_int_cols = list(numeric_int_cols if int_cols is None else int_cols)
        self.genre_cols = list(genre_cols if genres is None else genres)

        self.embedding_user = nn.Embedding(num_embeddings=self.num_users, embedding_dim=self.latent_dim)
        self.embedding_item = nn.Embedding(num_embeddings=self.num_items, embedding_dim=self.latent_dim)
        # One embedding table per genre column, keyed by column name.
        self.embedding_genre = nn.ModuleDict({
            key: nn.Embedding(num_embeddings=self.num_genres, embedding_dim=self.latent_dim)
            for key in self.genre_cols
        })

        # One scalar per numeric column, one embedding per genre column, plus
        # the user and movie embeddings.
        deep_in_features = (
            len(self.numeric_float_cols)
            + len(self.numeric_int_cols)
            + len(self.genre_cols) * self.latent_dim
            + self.latent_dim * 2
        )
        self.deep = nn.Sequential(
            nn.Linear(in_features=deep_in_features, out_features=128),
            nn.ReLU(),
            nn.Linear(in_features=128, out_features=128),
            nn.ReLU(),
        )
        # Final layer over [deep output, crossed features] -> probability.
        self.wide = nn.Sequential(
            nn.Linear(in_features=128 + self.crossed_dim, out_features=1),
            nn.Sigmoid(),
        )

    def forward(self, batch):
        """Compute predictions for one collated batch.

        Args:
            batch: dict of tensors keyed by column name. Must contain
                ``userId``, ``movieId``, ``crossed_features`` and every
                numeric and genre column the model was built with. Numeric
                columns are expected as ``(batch, 1)`` and
                ``crossed_features`` as ``(batch, crossed_dim)`` so that the
                concatenations along ``dim=1`` match the layer widths.

        Returns:
            Tensor of shape ``(batch, 1)`` with sigmoid outputs.
        """
        # deep part
        # Count features arrive as integer tensors; cast explicitly so the
        # concatenation does not depend on implicit dtype promotion.
        numerical_features = [
            batch[col].float() for col in self.numeric_float_cols + self.numeric_int_cols
        ]
        user_embeddings = self.embedding_user(batch["userId"])
        movie_embeddings = self.embedding_item(batch["movieId"])
        genre_features = [self.embedding_genre[col](batch[col]) for col in self.genre_cols]
        deep_features = torch.cat(
            numerical_features + genre_features + [user_embeddings, movie_embeddings], dim=1)
        deep_features = self.deep(deep_features)

        # wide part
        # The crossed features are collated as an integer tensor; the linear
        # layer needs floating-point input.
        crossed_features = batch["crossed_features"].float()
        rating = self.wide(torch.cat([deep_features, crossed_features], dim=1))

        return rating

# Model hyperparameters consumed by WideDeep.__init__.
config = {
    "num_users": 30001,  # rows in the user embedding table
    "num_items": 1001,  # rows in the movie embedding table
    "latent_dim": 10,  # width of every embedding
    "num_genre": len(genre_vocab),  # genre vocabulary size
    "crossed_dim": 10_000  # width of the crossed-feature vector
}

model = WideDeep(config)
# NOTE: stale smoke test -- forward() takes a batch dict, not two id tensors.
# model(torch.Tensor([0, 0]).long(), torch.Tensor([0, 0]).long())
print(model)  # show the model structure
# torch.save(model, "./WideDeep_model.pth")

WideDeep(
  (embedding_user): Embedding(30001, 10)
  (embedding_item): Embedding(1001, 10)
  (embedding_genre): ModuleDict(
    (userGenre1): Embedding(20, 10)
    (userGenre2): Embedding(20, 10)
    (userGenre3): Embedding(20, 10)
    (userGenre4): Embedding(20, 10)
    (userGenre5): Embedding(20, 10)
    (movieGenre1): Embedding(20, 10)
    (movieGenre2): Embedding(20, 10)
    (movieGenre3): Embedding(20, 10)
  )
  (deep): Sequential(
    (0): Linear(in_features=107, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=128, bias=True)
    (3): ReLU()
  )
  (wide): Sequential(
    (0): Linear(in_features=10128, out_features=1, bias=True)
    (1): Sigmoid()
  )
)


In [13]:
from tqdm.auto import tqdm
from sklearn.metrics import average_precision_score, roc_auc_score, accuracy_score

# Training hyperparameters.
num_epochs = 5  # full passes over the training loader (read by train())
lr = 0.001  # Adam learning rate

# Binary cross-entropy on probabilities: the model's last layer is a Sigmoid.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

@torch.no_grad()
def evaluate(model, dl, criterion):
    """Score ``model`` on every batch of ``dl`` without tracking gradients.

    Args:
        model: module called as ``model(batch)``; its output is flattened to
            one prediction per sample.
        dl: iterable of batches, each a dict holding the targets under
            ``"label"``.
        criterion: loss called as ``criterion(predictions, labels)``. It is
            assumed to return the mean loss of the batch (the default
            reduction of ``nn.BCELoss``).

    Returns:
        Tuple ``(loss, accuracy, auc_roc, auc_pr)`` where ``loss`` is the mean
        loss per sample, ``accuracy`` uses a 0.5 decision threshold and the
        last two are ROC-AUC and average precision.

    Raises:
        ValueError: if ``dl`` yields no samples.
    """
    model.eval()
    total_loss = 0.0
    total_samples = 0
    truth = []
    prediction = []
    for batch in dl:
        labels = batch["label"]
        # Flatten (batch, 1) -> (batch,) once, for both the loss and the metrics.
        outputs = model(batch).reshape(-1)
        batch_samples = labels.numel()
        # Weight each batch mean by its size so that a smaller final batch
        # does not get the same influence as a full one.
        total_loss += criterion(outputs, labels).item() * batch_samples
        total_samples += batch_samples
        truth.append(labels.cpu().numpy())
        prediction.append(outputs.cpu().numpy())

    if total_samples == 0:
        raise ValueError("evaluate() received no samples from the data loader")

    # compute metrics
    loss = total_loss / total_samples
    truth = np.concatenate(truth)
    prediction = np.concatenate(prediction)

    accuracy = accuracy_score(truth, prediction > 0.5)
    auc_roc = roc_auc_score(truth, prediction)
    auc_pr = average_precision_score(truth, prediction)
    return loss, accuracy, auc_roc, auc_pr

def train(model, train_dl, val_dl, criterion, optimizer):
    """Fit ``model`` for ``num_epochs`` epochs, printing metrics after each one.

    Args:
        model: module called as ``model(batch)``; moved to ``device`` first.
        train_dl: iterable of training batches (dicts with a ``"label"`` entry).
        val_dl: batches handed to ``evaluate`` at the end of every epoch.
        criterion: loss called as ``criterion(predictions, labels)``.
        optimizer: optimizer that updates the parameters of ``model``.

    Returns:
        None. The mean training loss and the validation metrics are printed.
    """
    model.to(device)
    for epoch in tqdm(range(num_epochs)):
        model.train()
        batch_losses = []
        for batch in train_dl:
            targets = batch["label"]

            # Clear gradients from the previous step, then forward / backward / update.
            optimizer.zero_grad()
            predictions = model(batch).reshape(-1)
            batch_loss = criterion(predictions, targets)
            batch_loss.backward()
            optimizer.step()

            batch_losses.append(batch_loss.item())

        train_loss = np.mean(batch_losses)
        val_loss, val_acc, val_auc_roc, val_auc_pr = evaluate(model, val_dl, criterion)
        print(f"Epoch: {epoch},train_loss: {train_loss:.4f}, val_loss: {val_loss:.4f}, val_acc: {val_acc:.4f}, val_auc_roc: {val_auc_roc:.4f}, val_auc_pr: {val_auc_pr:.4f}")


In [14]:
# NOTE: the test loader is passed as `val_dl`, so the printed val_* metrics are
# computed on the test split.
train(model, train_dl, test_dl, criterion, optimizer)

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch: 0,train_loss: 0.7146, val_loss: 0.6374, val_acc: 0.6449, val_auc_roc: 0.6837, val_auc_pr: 0.7170
Epoch: 1,train_loss: 0.6082, val_loss: 0.6030, val_acc: 0.6762, val_auc_roc: 0.7303, val_auc_pr: 0.7608
Epoch: 2,train_loss: 0.5727, val_loss: 0.5910, val_acc: 0.6873, val_auc_roc: 0.7431, val_auc_pr: 0.7689
Epoch: 3,train_loss: 0.5462, val_loss: 0.5903, val_acc: 0.6882, val_auc_roc: 0.7458, val_auc_pr: 0.7713
Epoch: 4,train_loss: 0.5245, val_loss: 0.5944, val_acc: 0.6857, val_auc_roc: 0.7488, val_auc_pr: 0.7759


In [15]:
# with torch.no_grad():
#     # predict
#     model.eval()
#     for batch in test_dl:
#         labels = batch["label"]
#         outputs = model(batch)
#         break
#
# for output, label in zip(outputs[:12, 0], labels[:12]):
#     print("Predicted good rating: {:.2%}".format(output),
#           " | Actual rating label: ",
#           ("Good Rating" if bool(label) else "Bad Rating"))
