In [1]:
import pandas as pd

data1 = pd.read_csv('./sampledata/trainingSamples.csv')
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88827 entries, 0 to 88826
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   movieId                88827 non-null  int64  
 1   userId                 88827 non-null  int64  
 2   rating                 88827 non-null  float64
 3   timestamp              88827 non-null  int64  
 4   label                  88827 non-null  int64  
 5   releaseYear            88827 non-null  int64  
 6   movieGenre1            88827 non-null  object 
 7   movieGenre2            75109 non-null  object 
 8   movieGenre3            48071 non-null  object 
 9   movieRatingCount       88827 non-null  int64  
 10  movieAvgRating         88827 non-null  float64
 11  movieRatingStddev      88827 non-null  float64
 12  userRatedMovie1        87600 non-null  float64
 13  userRatedMovie2        84810 non-null  float64
 14  userRatedMovie3        81027 non-null  float64
 15  us

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

# 配置
RECENT_MOVIES = 5  # userRatedMovie{1-5}, 最近看过的5部 喜欢的电影
EMBEDDING_SIZE = 10


def fill_missing_values(series):
    # 如果列的数据类型为浮点型或整型，则填充0
    if series.dtype == float or series.dtype == int:
        return series.fillna(0)
    # 如果列的数据类型为object（字符串），则填充字符串"0"
    elif series.dtype == object:
        return series.fillna("0")
    else:
        return series


# 数据集类
class MovieDataset(Dataset):
    def __init__(self, file_path):
        data = pd.read_csv(file_path)
        self.data = data.apply(fill_missing_values, axis=0)
        self.label_encoder = LabelEncoder()  # 标签编码器
        genre_cols = ['userGenre1', 'userGenre2', 'userGenre3', 'userGenre4', 'userGenre5',
                      'movieGenre1', 'movieGenre2', 'movieGenre3']
        for col in genre_cols:
            self.data[col] = self.label_encoder.fit_transform(self.data[col].fillna('N/A'))  # 填充空值，并进行标签编码,剧情会转为id

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        inputs = {col: torch.tensor(row[col], dtype=torch.float if 'Rating' in col or 'Stddev' in col else torch.long)
                  for col in self.data.columns if col != 'label'}  #包含Rating、Stddev的列转为float，其余列转为long
        label = torch.tensor(row['label'], dtype=torch.float)
        return inputs, label


# 数据加载器
train_dataset = MovieDataset('./sampledata/trainingSamples.csv')
test_dataset = MovieDataset('./sampledata/testSamples.csv')
train_loader = DataLoader(train_dataset, batch_size=12, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=12, shuffle=False)



In [3]:
for inputs, label in train_loader:
    print(inputs)
    print(label)
    break

{'movieId': tensor([500,   5, 588, 153, 223, 799, 908, 353, 356, 350, 429,  21]), 'userId': tensor([16175,   900, 26809, 19044, 29499, 23452,  4801, 13732,  1723, 11254,
        27473, 15478]), 'rating': tensor([3, 3, 2, 4, 4, 3, 4, 4, 3, 3, 2, 2]), 'timestamp': tensor([1076899799, 1101207229, 1155062848,  842517126, 1061184798, 1379656294,
        1109133312,  884987876,  835521761,  923162819,  846347273, 1097824144]), 'releaseYear': tensor([1993, 1995, 1992, 1995, 1994, 1996, 1959, 1994, 1994, 1994, 1994, 1995]), 'movieGenre1': tensor([4, 4, 1, 0, 4, 4, 0, 0, 4, 7, 4, 4]), 'movieGenre2': tensor([ 7,  0,  2,  1,  0, 10,  1,  5,  7, 13,  0,  5]), 'movieGenre3': tensor([ 0,  0,  2,  3,  0, 13, 10,  6, 11, 13,  0, 13]), 'movieRatingCount': tensor([ 7349.,  2629.,  8980.,  7100.,  5102.,  1358.,  3429.,  3640., 14426.,
         3475.,   320.,  5164.]), 'movieAvgRating': tensor([3.3900, 3.0600, 3.6700, 2.8900, 3.8600, 3.3700, 4.2300, 3.5000, 4.0300,
        3.4800, 2.3300, 3.5800]), 'movi

In [4]:
inputs

{'movieId': tensor([500,   5, 588, 153, 223, 799, 908, 353, 356, 350, 429,  21]),
 'userId': tensor([16175,   900, 26809, 19044, 29499, 23452,  4801, 13732,  1723, 11254,
         27473, 15478]),
 'rating': tensor([3, 3, 2, 4, 4, 3, 4, 4, 3, 3, 2, 2]),
 'timestamp': tensor([1076899799, 1101207229, 1155062848,  842517126, 1061184798, 1379656294,
         1109133312,  884987876,  835521761,  923162819,  846347273, 1097824144]),
 'releaseYear': tensor([1993, 1995, 1992, 1995, 1994, 1996, 1959, 1994, 1994, 1994, 1994, 1995]),
 'movieGenre1': tensor([4, 4, 1, 0, 4, 4, 0, 0, 4, 7, 4, 4]),
 'movieGenre2': tensor([ 7,  0,  2,  1,  0, 10,  1,  5,  7, 13,  0,  5]),
 'movieGenre3': tensor([ 0,  0,  2,  3,  0, 13, 10,  6, 11, 13,  0, 13]),
 'movieRatingCount': tensor([ 7349.,  2629.,  8980.,  7100.,  5102.,  1358.,  3429.,  3640., 14426.,
          3475.,   320.,  5164.]),
 'movieAvgRating': tensor([3.3900, 3.0600, 3.6700, 2.8900, 3.8600, 3.3700, 4.2300, 3.5000, 4.0300,
         3.4800, 2.3300, 3.

In [5]:
x = torch.tensor([1, 2, 3, 4])
torch.unsqueeze(x, 1)

tensor([[1],
        [2],
        [3],
        [4]])

In [6]:
# DIN模型类
class DIN(nn.Module):
    def __init__(self, num_users, num_items, num_genres, embedding_dim):
        super(DIN, self).__init__()
        self.embedding_user = nn.Embedding(num_users, embedding_dim)  #30001*10
        self.embedding_item = nn.Embedding(num_items, embedding_dim)  #1001*10
        self.embedding_genre = nn.Embedding(num_genres, embedding_dim)  #20*10

        self.user_profile = nn.Sequential(
            nn.Linear(embedding_dim * 2 + 3, 128),
            nn.PReLU(),
            nn.Linear(128, 64),
            nn.PReLU(),
            nn.Linear(64, embedding_dim)
        )

        self.context_features = nn.Sequential(
            nn.Linear(embedding_dim + 4, 128),
            nn.PReLU(),
            nn.Linear(128, 64),
            nn.PReLU(),
            nn.Linear(64, embedding_dim)
        )

        self.activation_unit = nn.Sequential(
            nn.Linear(embedding_dim * 4, 32),
            nn.PReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )

        self.output_layer = nn.Sequential(
            nn.Linear(embedding_dim * 4, 128),
            nn.PReLU(),
            nn.Linear(128, 64),
            nn.PReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, inputs):
        user_emb = self.embedding_user(inputs['userId']) #(batch_size, 1)-->(batch_size, embedding_dim)
        user_genre_emb = self.embedding_genre(inputs['userGenre1']) # (batch_size, 1)-->(batch_size,embedding_dim)
        # inputs['userRatingCount'],inputs['userAvgRating'], inputs['userRatingStddev']原本都是1维的，这里unsqueeze(1)变成2维的
        user_profile_emb = torch.cat([user_emb, user_genre_emb, inputs['userRatingCount'].unsqueeze(1),
                                      inputs['userAvgRating'].unsqueeze(1), inputs['userRatingStddev'].unsqueeze(1)],
                                     axis=1)
        # print(f'user_profile_emb.shape: {user_profile_emb.shape}') 值 [12, 23]
        user_profile_emb = self.user_profile(user_profile_emb) #做全连接层处理

        item_emb = self.embedding_item(inputs['movieId'])
        item_genre_emb = self.embedding_genre(inputs['movieGenre1']) # (batch_size, 1)-->(batch_size,embedding_dim)
        #context_features_emb输出形状为(batch_size, embedding_dim+4)
        context_features_emb = torch.cat([item_genre_emb, inputs['releaseYear'].unsqueeze(1),
                                          inputs['movieRatingCount'].unsqueeze(1),
                                          inputs['movieAvgRating'].unsqueeze(1),
                                          inputs['movieRatingStddev'].unsqueeze(1)], axis=1)
        context_features_emb = self.context_features(context_features_emb)

        recent_items_emb = torch.stack(
            [self.embedding_item(inputs[f'userRatedMovie{i}']) for i in range(1, RECENT_MOVIES + 1)], dim=1)
        print(f'recent_items_emb.shape: {recent_items_emb.shape}') #值 [12, 5, 10]
        repeated_item_emb = item_emb.unsqueeze(1).repeat(1, RECENT_MOVIES,
                                                         1)  #重复item_emb，5次形状为(batch_size, recent_movies, embedding_dim)

        activation_sub = recent_items_emb - recent_items_emb
        activation_product = recent_items_emb * repeated_item_emb  #每一个评分电影的embedding与最近的5个用户行为的embedding的相乘
        activation_input = torch.cat([activation_sub, recent_items_emb, repeated_item_emb, activation_product],
                                     dim=-1)  #相减，各自，相乘，进行拼接
        print(f'activation_input.shape: {activation_input.shape}')
        activation_output = self.activation_unit(activation_input).squeeze(-1)  #激活函数输出，形状为(batch_size, recent_movies)
        print(f'activation_output.shape: {activation_output.shape}')
        activation_output = activation_output.unsqueeze(-1).repeat(1, 1, EMBEDDING_SIZE)
        print(f'activation_output repeated.shape: {activation_output.shape}')
        activation_output = activation_output * recent_items_emb
        print(f'activation_output.shape: {activation_output.shape}')
        user_behaviors_pooled = torch.sum(activation_output, dim=1)
        print(f'user_behaviors_pooled.shape: {user_behaviors_pooled.shape}')
        #4个分别是 用户属性特征（user_profile_emb）、用户行为特征（user_behaviors_pooled）、候选广告特征（item_emb）和场景特征（context_features_emb）。
        concat_input = torch.cat([user_profile_emb, user_behaviors_pooled, item_emb, context_features_emb],
                                 axis=-1)  #每一个都是长度为10的密集向量
        output = self.output_layer(concat_input)

        return output.squeeze(-1)


# 定义超参数和设备
num_users = 30001
num_items = 1001
num_genres = 20
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 实例化模型
model = DIN(num_users, num_items, num_genres, EMBEDDING_SIZE).to(device)
#随机一个输入，测试DIN模型
outputs = model(inputs)
print(outputs.shape)

recent_items_emb.shape: torch.Size([12, 5, 10])
activation_input.shape: torch.Size([12, 5, 40])
activation_output.shape: torch.Size([12, 5])
activation_output repeated.shape: torch.Size([12, 5, 10])
activation_output.shape: torch.Size([12, 5, 10])
user_behaviors_pooled.shape: torch.Size([12, 10])
torch.Size([12])


In [None]:
model

In [None]:
# 损失函数和优化器
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

from sklearn.metrics import average_precision_score, roc_auc_score, accuracy_score
def evaluate(model, test_loader, criterion):
    model.eval()
    test_loss = 0.0
    correct = 0
    total = 0
    all_labels = []
    all_outputs = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = {key: value.to(device) for key, value in inputs.items()}
            labels = labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            test_loss += loss.item()
            predictions = (outputs > 0.5).float()
            correct += (predictions == labels).sum().item()
            total += labels.size(0)
            all_labels.extend(labels.cpu().numpy())
            all_outputs.extend(predictions.cpu().numpy())
    accuracy = correct / total
    auc_roc = roc_auc_score(all_labels, all_outputs)
    auc_pr = average_precision_score(all_labels, all_outputs)
    return test_loss / len(test_loader), accuracy, auc_roc, auc_pr


# 训练和评估函数
def train(model, train_loader, test_loader,criterion, optimizer, epochs):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        for inputs, labels in tqdm(train_loader):
            inputs = {key: value.to(device) for key, value in inputs.items()}
            labels = labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        val_loss, val_acc, val_auc_roc, val_auc_pr = evaluate(model, test_loader, criterion)
        print(f"Epoch: {epoch+1}, train_Loss: {running_loss / len(train_loader)},val_loss: {val_loss:.4f}, val_acc: {val_acc:.4f}, val_auc_roc: {val_auc_roc:.4f}, val_auc_pr: {val_auc_pr:.4f}")
        # print(f"Epoch {epoch + 1}, Loss: {running_loss / len(train_loader)}")

In [2]:
# 训练模型
train(model, train_loader,test_loader, criterion, optimizer, epochs=5)

  inputs = {col: torch.tensor(row[col], dtype=torch.float if 'Rating' in col or 'Stddev' in col else torch.long) for col in self.data.columns if col != 'label'}
  8%|▊         | 563/7403 [00:11<02:16, 49.94it/s]


KeyboardInterrupt: 

In [None]:



# 评估模型
test_loss, test_accuracy = evaluate(model, test_loader, criterion)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

# 打印一些预测结果
model.eval()
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = {key: value.to(device) for key, value in inputs.items()}
        outputs = model(inputs)
        predictions = (outputs > 0.5).float()
        for prediction, label in zip(predictions[:12], labels[:12]):
            print(f"Predicted: {prediction.item():.2f}, Actual: {label.item()}")
        break