In [None]:
# Train | [15/50], Loss: 0.6643809300285202
# Valid | [15/50], Loss: 1.0953459154095566, Pre@5: 0.6538192349463015, Rec@5: 0.23049696680932164, NDCG@5: 0.72794158555819

In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import math
import numpy as np
import pandas as pd
import random
from torch.utils.data import DataLoader, TensorDataset

In [2]:
# Sizes of the user / item embedding tables built by the MF model below.
# Every uid must be < NUM_USERS and every iid < NUM_ITEMS, because ids are
# used directly as row indices into those tables.
NUM_USERS = 8_287
NUM_ITEMS = 113_613

### MF Model

In [10]:
class MF(nn.Module):
    """Plain matrix-factorisation rating model.

    A rating is predicted as the dot product between a user's latent vector
    (a row of ``W``) and an item's latent vector (a row of ``H``).

    Args:
        k: Number of latent factors (width of both embedding tables).
        num_users: Number of rows in the user table. Defaults to the
            notebook-level ``NUM_USERS`` so that ``MF(k)`` keeps working.
        num_items: Number of rows in the item table. Defaults to the
            notebook-level ``NUM_ITEMS``.
    """

    def __init__(self, k, num_users=None, num_items=None):
        super(MF, self).__init__()
        # Fall back to the notebook constants only when no explicit size is
        # given; passing sizes makes the model usable on other datasets.
        if num_users is None:
            num_users = NUM_USERS
        if num_items is None:
            num_items = NUM_ITEMS
        # torch.rand draws from U[0, 1); W is created before H, as before.
        self.W = nn.Parameter(torch.rand(num_users, k))
        self.H = nn.Parameter(torch.rand(num_items, k))

    def forward(self, user_ids, item_ids):
        """Return the predicted rating for each (user, item) pair.

        ``user_ids`` and ``item_ids`` are used as row indices into ``W`` and
        ``H``; the two lookups are multiplied with broadcasting, so a single
        user id can be scored against an array of item ids.
        """
        user_latent = self.W[user_ids]
        item_latent = self.H[item_ids]

        # Reduce over the latent axis, which is always the last one. For the
        # 2-D products used in this notebook that is the same axis as dim=1,
        # and it additionally supports a scalar user with a scalar item.
        predicted_ratings = torch.sum(user_latent * item_latent, dim=-1)
        return predicted_ratings

### Necessary Functions

In [4]:
def like_generator(df, ratio=0.5):
    """Keep each user's highest-rated share of rows.

    For every ``uid`` the rows are ordered by ``rating`` (highest first) and
    the first ``ceil(group_size * ratio)`` of them are kept.

    Args:
        df: DataFrame with at least the columns ``uid`` and ``rating``.
        ratio: Fraction of each user's rows to keep.

    Returns:
        A new DataFrame with the same columns as ``df``, ordered by ``uid``
        and then by descending ``rating``, with a fresh 0..N-1 index.

    Notes:
        The selection is done with stable sorts and ``cumcount`` instead of
        ``groupby(...).apply(...)``. pandas 2.2 deprecates ``apply`` operating
        on the grouping column; without that column ``uid`` would disappear
        from the result after ``reset_index(drop=True)``. Stable sorting also
        means rows with equal ratings keep their original relative order.
    """
    # Two stable passes: order by rating first, then group users together
    # without disturbing the rating order inside each user.
    ranked = (
        df.sort_values('rating', ascending=False, kind='mergesort')
          .sort_values('uid', kind='mergesort')
    )

    # Number of rows each row's user has, and how many of them to keep.
    group_sizes = ranked['uid'].map(ranked['uid'].value_counts())
    quota = np.ceil(group_sizes * ratio)

    # 0-based position of each row inside its user's ranked list.
    rank_in_user = ranked.groupby('uid', sort=False).cumcount()

    # Compare as plain arrays so the mask never depends on index alignment.
    keep = rank_in_user.to_numpy() < quota.to_numpy()
    return ranked[keep].reset_index(drop=True)

def _predict(uid, items, n, model):
    with torch.no_grad():
        scores = model(uid, items)
    if n > scores.shape[0]: 
        n = scores.shape[0]
    top_N_val, top_N_idx = torch.topk(scores, k=n)
    
    if n == 1:
        return [(top_N_idx.cpu().item(), top_N_val.cpu().item())]
    
    return list(zip(items[top_N_idx.cpu()], top_N_val.cpu()))

def NDCG(uid, n, test_df):         # DCG from the model's ranking with true ratings; iDCG from the ideal re-ordering
    """NDCG@n for one user over the items that user rated in ``test_df``.

    The user's test items are ranked by the model, the true ratings are read
    back in that order to form the DCG, and the same ratings sorted in
    decreasing order give the ideal DCG.

    Args:
        uid: User id, matched against the first column of ``test_df``.
        n: Ranking cut-off; reduced to the number of rated items if larger.
        test_df: DataFrame whose first three columns are user id, item id and
            rating (read positionally), and which also exposes the item id and
            rating under the column names ``iid`` and ``rating``.

    Returns:
        ``dcg / idcg``, or ``0.0`` when the user has no test rows or the ideal
        DCG is zero (previously that division produced NaN).

    Note:
        Scoring uses the notebook-level ``model`` variable; it is not a
        parameter of this function.
    """
    # Items this user rated in the test set
    test_user = test_df[test_df.iloc[:, 0] == uid]

    # Nothing to rank for this user.
    if len(test_user) == 0:
        return 0.0

    # Model's top-n among those items
    rating = _predict(uid, test_user.iloc[:, 1].values, n, model)

    # True ratings in ideal (descending) order
    irating =sorted(test_user.iloc[:, 2].values, reverse=True)
    irating = np.asarray(irating)

    if n > len(irating): n = len(irating)

    # True ratings in the order chosen by the model (inner merge on iid keeps
    # the left frame's row order)
    rating_df = pd.DataFrame(rating, columns=['iid', 'pred_rating'])
    merged_df = pd.merge(rating_df, test_user, on='iid')
    r = np.array(merged_df['rating'])

    # Natural-log discount terms ln(2), ln(3), ..., ln(n + 1)
    log = np.log(np.arange(2, n + 2))

    # Multiplying by ln(2) turns the natural-log discount into a log2 discount
    dcg = np.log(2) * np.sum((2**r[:n] - 1) / log)
    idcg = np.log(2) * np.sum((2**irating[:n] - 1) / log)

    # All-zero ratings give idcg == 0; avoid returning NaN in that case.
    if idcg == 0:
        return 0.0

    return dcg / idcg

def performance(n, model, user_items, like_user_items, test_df):      # Output recall@n, precision@n, NDCG@n
    """Compute recall@n, precision@n and mean NDCG@n over all users.

    Iterates over user ids ``0 .. NUM_USERS - 1`` (notebook-level constant),
    so both dictionaries must contain an entry for every one of those ids.

    Args:
        n: Ranking cut-off.
        model: Model passed to ``_predict`` to score each user's candidates.
        user_items: Mapping uid -> array of candidate item ids to rank.
        like_user_items: Mapping uid -> array of item ids counted as hits.
        test_df: Test DataFrame forwarded to ``NDCG``.

    Returns:
        ``(recall, precision, ndcg)`` where recall is hits over the total
        number of liked items, precision is hits over ``n`` per user, and
        ndcg is the per-user NDCG averaged over ``NUM_USERS``.
    """
    hit = 0
    n_recall = 0
    n_precision = 0
    ndcg = 0
    for i in range(NUM_USERS):
        # Candidate items for user i (the items the user interacted with in the test set)
        unknown_items = user_items[i]

        # Items of user i that count as relevant
        known_items = like_user_items[i]

        # Goal: take the top-n of the candidates; a recommended item that is
        # also in the relevant set counts as a hit
        ru = _predict(i, unknown_items, n, model)

        hit += sum(1 for item, pui in ru if item in known_items)
        n_recall += len(known_items)
        # Precision always divides by n, even if fewer than n items were ranked
        n_precision += n
        ndcg += NDCG(i, n, test_df)

    recall = hit / (1.0 * n_recall)
    precision = hit / (1.0 * n_precision)
    ndcg /= NUM_USERS
    return recall, precision, ndcg

### Hyper parameters

In [11]:
# Compute device: the first CUDA GPU when one is available, otherwise the CPU
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Optimisation settings: Adam learning rate, number of passes over the
# training data, and mini-batch size of both data loaders
lr = 1e-3
epoch = 50
batch_size = 4096

# Latent factor dim
k = 20
# Cut-off used for the ranking metrics (Pre@n, Rec@n, NDCG@n)
n = 5

### Data Preparation

In [6]:
data_dir = './data'


def _items_by_user(frame):
    """Map every uid in ``frame`` to a NumPy array of the iids in its rows."""
    return frame.groupby('uid')['iid'].apply(lambda x: np.array(x)).to_dict()


def _as_tensors(frame):
    """Return ((uid, iid) long tensor, rating float tensor), both on ``device``."""
    pairs = torch.tensor(frame[['uid', 'iid']].values, dtype=torch.long).to(device)
    targets = torch.tensor(frame['rating'].values, dtype=torch.float).to(device)
    return pairs, targets


# Rating tables; test_like is the subset of test rows selected by like_generator
train = pd.read_csv(os.path.join(data_dir, "train.csv"))
test = pd.read_csv(os.path.join(data_dir, "test.csv"))
test_like = like_generator(test)

# For each user, the set of items appearing in that user's rows
train_user_items = _items_by_user(train)
test_user_items = _items_by_user(test)
test_like_user_items = _items_by_user(test_like)

# Index / target tensors for the training and test splits
train_data, train_targets = _as_tensors(train)
test_data, test_targets = _as_tensors(test)

# Wrap the tensors so each sample is ((uid, iid), rating)
train_dataset = TensorDataset(train_data, train_targets)
test_dataset = TensorDataset(test_data, test_targets)

# Mini-batch iterators: training is reshuffled, test keeps its order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

### Training

In [12]:
# Build the factorisation model on the selected device
model = MF(k).to(device)
model.device = device

# Mean squared error between predicted and observed ratings
criterion = nn.MSELoss()

# Adam over all model parameters
optimizer = optim.Adam(model.parameters(), lr=lr)


def _batch_loss(batch, rating):
    """Loss of the current model on one batch; column 0 is uid, column 1 is iid."""
    return criterion(model(batch[:, 0], batch[:, 1]), rating)


for x in range(epoch):
    # ---- optimisation pass over the training loader ----
    model.train()
    train_loss = 0
    for batch, rating in train_loader:
        loss = _batch_loss(batch, rating)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Reported value is the mean of the per-batch losses
    print(f'Train | [{x+1}/{epoch}], Loss: {train_loss/len(train_loader)}')

    # ---- evaluation pass over the test loader (no gradients) ----
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch, rating in test_loader:
            loss = _batch_loss(batch, rating)
            val_loss += loss.item()

    # Ranking metrics at cut-off n
    rec, pre, ndcg = performance(n, model, test_user_items, test_like_user_items, test)

    print(f'Valid | [{x+1}/{epoch}], Loss: {val_loss/len(test_loader)}, Pre@{n}: {pre}, Rec@{n}: {rec}, NDCG@{n}: {ndcg}')

    print('---------------------------------------------------')

Train | [1/50], Loss: 2.3521375763523684
Valid | [1/50], Loss: 1.7304647408033673, Pre@5: 0.5338723301556655, Rec@5: 0.18821097053593458, NDCG@5: 0.6173944021608259
---------------------------------------------------
Train | [2/50], Loss: 1.4587412422841735
Valid | [2/50], Loss: 1.3630202234837048, Pre@5: 0.5656570532158803, Rec@5: 0.1994163341359448, NDCG@5: 0.6458328354431132
---------------------------------------------------
Train | [3/50], Loss: 1.1906393207945265
Valid | [3/50], Loss: 1.2343238529406095, Pre@5: 0.5921081211536141, Rec@5: 0.20874137476283255, NDCG@5: 0.6701747165998612
---------------------------------------------------
Train | [4/50], Loss: 1.0622745346915614
Valid | [4/50], Loss: 1.1747482032106633, Pre@5: 0.610667310244962, Rec@5: 0.2152842180493989, NDCG@5: 0.688012102832278
---------------------------------------------------
Train | [5/50], Loss: 0.9821927713918256
Valid | [5/50], Loss: 1.1422260497745715, Pre@5: 0.6237480390973814, Rec@5: 0.21989568887035982

Train | [39/50], Loss: 0.3479383009242582
Valid | [39/50], Loss: 1.2609573414451198, Pre@5: 0.634270544225896, Rec@5: 0.22360528532412174, NDCG@5: 0.7112882297815295
---------------------------------------------------
Train | [40/50], Loss: 0.34044711748221973
Valid | [40/50], Loss: 1.2698929979090106, Pre@5: 0.6323639435260046, Rec@5: 0.22293313367309606, NDCG@5: 0.7098840140470357
---------------------------------------------------
Train | [41/50], Loss: 0.3333852116589074
Valid | [41/50], Loss: 1.2783331431840594, Pre@5: 0.6313503077108724, Rec@5: 0.22257578722571533, NDCG@5: 0.7091396226688037
---------------------------------------------------
Train | [42/50], Loss: 0.3266485412647058
Valid | [42/50], Loss: 1.287014741646616, Pre@5: 0.6302401351514421, Rec@5: 0.22218440778334594, NDCG@5: 0.7081017103853415
---------------------------------------------------
Train | [43/50], Loss: 0.3201957585306855
Valid | [43/50], Loss: 1.295099751991138, Pre@5: 0.6289851574755642, Rec@5: 0.22174