In [3]:
!pip install xgboost lightgbm



In [4]:
import os
import zipfile
import urllib.request

import pandas as pd
import numpy as np
import scipy.sparse as sp

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

In [5]:
#######################################
# 1. 下載並解壓 MovieLens 100K 資料
#######################################

data_url = "https://files.grouplens.org/datasets/movielens/ml-100k.zip"
zip_path = "ml-100k.zip"
if not os.path.exists(zip_path):
    print("Downloading dataset...")
    urllib.request.urlretrieve(data_url, zip_path)

if not os.path.exists("ml-100k"):
    print("Extracting dataset...")
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(".")

Downloading dataset...
Extracting dataset...


In [6]:
#######################################
# 2. 載入並預處理資料 (u.data)
#######################################

data_path = "ml-100k/u.data"  # 解壓後路徑
df = pd.read_csv(data_path, sep='\t', header=None, names=['user', 'item', 'rating', 'timestamp'])
# 二值化評分 (rating>=4 為正樣本)
df['label'] = (df['rating'] >= 4).astype(np.float32)

# 建立使用者與物品索引（連續整數）
user_ids = df['user'].unique()
item_ids = df['item'].unique()
num_users = len(user_ids)
num_items = len(item_ids)

user2idx = {uid: idx for idx, uid in enumerate(user_ids)}
item2idx = {iid: idx for idx, iid in enumerate(item_ids)}
df['user_idx'] = df['user'].apply(lambda x: user2idx[x])
df['item_idx'] = df['item'].apply(lambda x: item2idx[x])

# 切分訓練與測試資料
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [7]:
#######################################
# 3. 定義 Dataset 與 DataLoader (PyTorch)
#######################################

class RecDataset(Dataset):
    def __init__(self, df):
        self.users = df['user_idx'].values
        self.items = df['item_idx'].values
        self.labels = df['label'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'user': torch.tensor(self.users[idx], dtype=torch.long),
            'item': torch.tensor(self.items[idx], dtype=torch.long),
            'label': torch.tensor(self.labels[idx], dtype=torch.float)
        }

batch_size = 256
train_dataset = RecDataset(train_df)
test_dataset = RecDataset(test_df)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [8]:
#######################################
# 4. 為 LightGCN 建構歸一化稀疏鄰接矩陣
#######################################

def build_norm_adj(train_data, num_users, num_items):
    # 建立 user-item 交互的 bipartite graph
    rows = np.array(train_data['user_idx'].tolist() + (train_data['item_idx'] + num_users).tolist())
    cols = np.array(train_data['item_idx'] + num_users).tolist() + train_data['user_idx'].tolist()
    data = np.ones(len(rows))
    adj = sp.coo_matrix((data, (rows, cols)), shape=(num_users + num_items, num_users + num_items))

    # 計算 D^(-1/2)
    rowsum = np.array(adj.sum(1)).flatten()
    d_inv_sqrt = np.power(rowsum, -0.5)
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)

    norm_adj = d_mat_inv_sqrt.dot(adj).dot(d_mat_inv_sqrt).tocoo()
    # 轉成 torch.sparse.FloatTensor
    indices = torch.from_numpy(np.vstack((norm_adj.row, norm_adj.col)).astype(np.int64))
    values = torch.from_numpy(norm_adj.data.astype(np.float32))
    shape = torch.Size(norm_adj.shape)
    return torch.sparse.FloatTensor(indices, values, shape)

norm_adj = build_norm_adj(train_df, num_users, num_items)

  d_inv_sqrt = np.power(rowsum, -0.5)
  return torch.sparse.FloatTensor(indices, values, shape)


In [9]:
#######################################
# 5. 定義模型
#######################################

# 5-1. NCF 模型
class NCF(nn.Module):
    def __init__(self, num_users, num_items, embed_dim=32, hidden_dims=[64, 32, 16, 8]):
        super(NCF, self).__init__()
        self.user_embed = nn.Embedding(num_users, embed_dim)
        self.item_embed = nn.Embedding(num_items, embed_dim)

        layers = []
        input_size = embed_dim * 2
        for hidden in hidden_dims:
            layers.append(nn.Linear(input_size, hidden))
            layers.append(nn.ReLU())
            input_size = hidden
        self.mlp = nn.Sequential(*layers)
        self.output_layer = nn.Linear(input_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, user, item):
        user_emb = self.user_embed(user)
        item_emb = self.item_embed(item)
        x = torch.cat([user_emb, item_emb], dim=-1)
        x = self.mlp(x)
        x = self.output_layer(x)
        return self.sigmoid(x).squeeze()

# 5-2. LightGCN 模型
class LightGCN(nn.Module):
    def __init__(self, num_users, num_items, embed_dim=32, n_layers=2, norm_adj=None):
        super(LightGCN, self).__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.embed_dim = embed_dim
        self.n_layers = n_layers
        self.norm_adj = norm_adj  # 預先計算好的歸一化鄰接矩陣

        # 初始化使用者與物品 embedding
        self.user_embed = nn.Embedding(num_users, embed_dim)
        self.item_embed = nn.Embedding(num_items, embed_dim)
        nn.init.xavier_uniform_(self.user_embed.weight)
        nn.init.xavier_uniform_(self.item_embed.weight)

    def computer(self):
        # 將使用者與物品的 embedding 串接
        all_embeddings = torch.cat([self.user_embed.weight, self.item_embed.weight], dim=0)
        embeddings_list = [all_embeddings]
        for layer in range(self.n_layers):
            all_embeddings = torch.sparse.mm(self.norm_adj, all_embeddings)
            embeddings_list.append(all_embeddings)
        # 取各層平均
        final_embedding = torch.stack(embeddings_list, dim=1).mean(dim=1)
        user_final, item_final = torch.split(final_embedding, [self.num_users, self.num_items], dim=0)
        return user_final, item_final

    def forward(self, users, items):
        user_emb, item_emb = self.computer()
        u_e = user_emb[users]
        i_e = item_emb[items]
        logits = (u_e * i_e).sum(dim=1)
        return torch.sigmoid(logits)

In [10]:
#######################################
# 6. 定義訓練與評估函式（供深度模型使用）
#######################################

def train_model(model, train_loader, optimizer, criterion, device):
    model.train()
    epoch_loss = 0.0
    for batch in train_loader:
        user = batch['user'].to(device)
        item = batch['item'].to(device)
        label = batch['label'].to(device)

        optimizer.zero_grad()
        output = model(user, item)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item() * user.size(0)
    return epoch_loss / len(train_loader.dataset)

def evaluate_model(model, test_loader, device):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in test_loader:
            user = batch['user'].to(device)
            item = batch['item'].to(device)
            label = batch['label'].to(device)
            output = model(user, item)
            preds = (output >= 0.5).float()
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(label.cpu().numpy())
    accuracy = np.mean(np.array(all_preds) == np.array(all_labels))
    return accuracy

In [20]:
#######################################
# 7. 設定裝置與超參數，訓練深度模型
#######################################

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_epochs = 10
lr = 0.001
criterion = nn.BCELoss()

# 7-1. 訓練 NCF
ncf_model = NCF(num_users, num_items).to(device)
optimizer_ncf = torch.optim.Adam(ncf_model.parameters(), lr=lr)
print("Training NCF...")
for epoch in range(n_epochs):
    loss = train_model(ncf_model, train_loader, optimizer_ncf, criterion, device)
    acc = evaluate_model(ncf_model, test_loader, device)
    print(f"NCF Epoch {epoch+1}/{n_epochs} - Loss: {loss:.4f}, Test Accuracy: {acc:.4f}")
ncf_acc = evaluate_model(ncf_model, test_loader, device)

# 7-2. 訓練 LightGCN
lightgcn_model = LightGCN(num_users, num_items, norm_adj=norm_adj.to(device)).to(device)
optimizer_lg = torch.optim.Adam(lightgcn_model.parameters(), lr=lr)
print("\nTraining LightGCN...")
for epoch in range(n_epochs):
    loss = train_model(lightgcn_model, train_loader, optimizer_lg, criterion, device)
    acc = evaluate_model(lightgcn_model, test_loader, device)
    print(f"LightGCN Epoch {epoch+1}/{n_epochs} - Loss: {loss:.4f}, Test Accuracy: {acc:.4f}")
lg_acc = evaluate_model(lightgcn_model, test_loader, device)

Training NCF...
NCF Epoch 1/10 - Loss: 0.6705, Test Accuracy: 0.6260
NCF Epoch 2/10 - Loss: 0.6167, Test Accuracy: 0.6613
NCF Epoch 3/10 - Loss: 0.5846, Test Accuracy: 0.6798
NCF Epoch 4/10 - Loss: 0.5653, Test Accuracy: 0.6858
NCF Epoch 5/10 - Loss: 0.5520, Test Accuracy: 0.6891
NCF Epoch 6/10 - Loss: 0.5406, Test Accuracy: 0.6909
NCF Epoch 7/10 - Loss: 0.5320, Test Accuracy: 0.6932
NCF Epoch 8/10 - Loss: 0.5239, Test Accuracy: 0.6912
NCF Epoch 9/10 - Loss: 0.5167, Test Accuracy: 0.6919
NCF Epoch 10/10 - Loss: 0.5089, Test Accuracy: 0.6908

Training LightGCN...
LightGCN Epoch 1/10 - Loss: 0.6782, Test Accuracy: 0.5622
LightGCN Epoch 2/10 - Loss: 0.6389, Test Accuracy: 0.6085
LightGCN Epoch 3/10 - Loss: 0.6240, Test Accuracy: 0.6246
LightGCN Epoch 4/10 - Loss: 0.6158, Test Accuracy: 0.6349
LightGCN Epoch 5/10 - Loss: 0.6101, Test Accuracy: 0.6390
LightGCN Epoch 6/10 - Loss: 0.6058, Test Accuracy: 0.6424
LightGCN Epoch 7/10 - Loss: 0.6023, Test Accuracy: 0.6469
LightGCN Epoch 8/10 - Los

In [17]:
#######################################
# 8. 準備 XGBoost 與 LightGBM 的資料
#######################################
# 對於樹模型，採用 one-hot encoding 方式產生特徵 (使用 user_idx 與 item_idx)
def prepare_tree_features(df):
    df_feat = df.copy()
    user_dummies = pd.get_dummies(df_feat['user_idx'], prefix='user')
    item_dummies = pd.get_dummies(df_feat['item_idx'], prefix='item')
    df_feat = pd.concat([df_feat, user_dummies, item_dummies], axis=1)
    # 特徵欄位為所有 one-hot 的欄位
    feature_cols = list(user_dummies.columns) + list(item_dummies.columns)
    return df_feat[feature_cols], df_feat['label']

X_train, y_train = prepare_tree_features(train_df)
X_test, y_test = prepare_tree_features(test_df)
# reindex 測試資料，確保與訓練資料的欄位一致
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)


In [18]:
#######################################
# 9. 訓練 XGBoost 與 LightGBM
#######################################

print("\nTraining XGBoost...")
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)
xgb_acc = accuracy_score(y_test, xgb_preds)
print(f"XGBoost Test Accuracy: {xgb_acc:.4f}")

print("\nTraining LightGBM...")
lgb_model = LGBMClassifier()
lgb_model.fit(X_train, y_train)
lgb_preds = lgb_model.predict(X_test)
lgb_acc = accuracy_score(y_test, lgb_preds)
print(f"LightGBM Test Accuracy: {lgb_acc:.4f}")



Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



XGBoost Test Accuracy: 0.6483

Training LightGBM...
[LightGBM] [Info] Number of positive: 44385, number of negative: 35615
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.671113 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3400
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 1700
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.554813 -> initscore=0.220135
[LightGBM] [Info] Start training from score 0.220135
LightGBM Test Accuracy: 0.6633


In [21]:
#######################################
# 10. 統一比較各模型結果
#######################################
print("\n=== Quantitative Comparison ===")
print(f"NCF Test Accuracy: {ncf_acc:.4f}")
print(f"LightGCN Test Accuracy: {lg_acc:.4f}")
print(f"XGBoost Test Accuracy: {xgb_acc:.4f}")
print(f"LightGBM Test Accuracy: {lgb_acc:.4f}")


=== Quantitative Comparison ===
NCF Test Accuracy: 0.6908
LightGCN Test Accuracy: 0.6501
XGBoost Test Accuracy: 0.6483
LightGBM Test Accuracy: 0.6633
