In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, SAGEConv, GATConv

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score

from tqdm.auto import tqdm
import json
from datetime import datetime

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)
torch.set_num_threads(10)
# Verify the number of threads
print("Number of threads:", torch.get_num_threads())
# Load your preprocessed BRAD data
df = pd.read_csv("../../data/processed/brad_final.csv")
print(df.columns)

Device: cpu
Number of threads: 10
Index(['rating', 'review_id', 'book_id', 'user_id', 'review', 'review_clean',
       'camel_sentiment', 'camel_score', 'gt_sentiment', 'rating_normalized',
       'sentiment_score', 'final_score'],
      dtype='object')


In [3]:
# Map user_id and book_id to consecutive node indices
user_ids = df["user_id"].unique()
book_ids = df["book_id"].unique()

user_id2idx = {u: i for i, u in enumerate(user_ids)}
offset = len(user_ids)
book_id2idx = {b: offset + i for i, b in enumerate(book_ids)}

num_nodes = offset + len(book_ids)
print("Num users:", len(user_ids), "Num books:", len(book_ids), "Num nodes:", num_nodes)

# Edge index: user -> book
user_idx = df["user_id"].map(user_id2idx).to_numpy()
book_idx = df["book_id"].map(book_id2idx).to_numpy()

edge_index = torch.tensor(
    np.vstack([user_idx, book_idx]),
    dtype=torch.long
)

# Labels: map gt_sentiment {-1,0,1} -> {0,1,2}
sent_map = {-1: 0, 0: 1, 1: 2}
y = torch.tensor(df["gt_sentiment"].map(sent_map).to_numpy(), dtype=torch.long)

num_edges = y.size(0)
num_classes = 3
print("Num edges:", num_edges)

data = Data(edge_index=edge_index, y=y, num_nodes=num_nodes)

Num users: 4993 Num books: 75215 Num nodes: 80208
Num edges: 503516


In [4]:
class_counts = np.bincount(y.numpy(), minlength=num_classes)
class_weights = 1.0 / np.maximum(class_counts, 1)
class_weights = class_weights / class_weights.sum() * num_classes
class_weights = torch.tensor(class_weights, dtype=torch.float32)
print("Class counts:", class_counts)
print("Class weights:", class_weights)

Class counts: [ 77039 105654 320823]
Class weights: tensor([1.5234, 1.1108, 0.3658])


In [5]:
class GCNModel(nn.Module):
    def __init__(self, num_nodes, hidden_dim, num_layers, dropout, num_classes):
        super().__init__()
        self.emb = nn.Embedding(num_nodes, hidden_dim)
        self.convs = nn.ModuleList()
        self.convs.append(GCNConv(hidden_dim, hidden_dim))
        for _ in range(num_layers - 1):
            self.convs.append(GCNConv(hidden_dim, hidden_dim))
        self.lin = nn.Linear(hidden_dim, num_classes)
        self.dropout = dropout

    def forward(self, edge_index):
        x = self.emb.weight
        for conv in self.convs:
            x = conv(x, edge_index)
            x = F.relu(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
        return self.lin(x)


class SAGEModel(nn.Module):
    def __init__(self, num_nodes, hidden_dim, num_layers, dropout, num_classes):
        super().__init__()
        self.emb = nn.Embedding(num_nodes, hidden_dim)
        self.convs = nn.ModuleList()
        self.convs.append(SAGEConv(hidden_dim, hidden_dim))
        for _ in range(num_layers - 1):
            self.convs.append(SAGEConv(hidden_dim, hidden_dim))
        self.lin = nn.Linear(hidden_dim, num_classes)
        self.dropout = dropout

    def forward(self, edge_index):
        x = self.emb.weight
        for conv in self.convs:
            x = conv(x, edge_index)
            x = F.relu(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
        return self.lin(x)


class GATModel(nn.Module):
    def __init__(self, num_nodes, hidden_dim, num_layers, heads, dropout, num_classes):
        super().__init__()
        self.emb = nn.Embedding(num_nodes, hidden_dim)
        self.convs = nn.ModuleList()
        self.convs.append(GATConv(hidden_dim, hidden_dim, heads=heads, concat=False))
        for _ in range(num_layers - 1):
            self.convs.append(GATConv(hidden_dim, hidden_dim, heads=heads, concat=False))
        self.lin = nn.Linear(hidden_dim, num_classes)
        self.dropout = dropout

    def forward(self, edge_index):
        x = self.emb.weight
        for conv in self.convs:
            x = conv(x, edge_index)
            x = F.elu(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
        return self.lin(x)

In [6]:
def train_one_config_fold(model_type, cfg, fold_idx,
                          train_idx, val_idx, test_idx,
                          device, max_epochs=30, patience=5):
    if model_type == "GCN":
        model = GCNModel(data.num_nodes, cfg["hidden_dim"], cfg["num_layers"],
                         cfg["dropout"], num_classes)
    elif model_type == "SAGE":
        model = SAGEModel(data.num_nodes, cfg["hidden_dim"], cfg["num_layers"],
                          cfg["dropout"], num_classes)
    elif model_type == "GAT":
        model = GATModel(data.num_nodes, cfg["hidden_dim"], cfg["num_layers"],
                         cfg["heads"], cfg["dropout"], num_classes)
    else:
        raise ValueError("Unknown model type")

    model = model.to(device)
    edge_index = data.edge_index.to(device)
    y_all = data.y.to(device)

    criterion = nn.CrossEntropyLoss(weight=class_weights.to(device))
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=cfg["lr"],
                                 weight_decay=cfg["weight_decay"])

    train_idx_t = torch.tensor(train_idx, dtype=torch.long, device=device)
    val_idx_t   = torch.tensor(val_idx, dtype=torch.long, device=device)
    test_idx_t  = torch.tensor(test_idx, dtype=torch.long, device=device)

    dst_nodes = edge_index[1]  # classify based on book node embedding

    best_state = None
    best_val_f1 = -1.0
    no_improve = 0

    for epoch in range(1, max_epochs + 1):
        model.train()
        optimizer.zero_grad()

        logits_nodes = model(edge_index)
        train_logits = logits_nodes[dst_nodes[train_idx_t]]
        train_labels = y_all[train_idx_t]

        loss = criterion(train_logits, train_labels)
        loss.backward()
        optimizer.step()

        # validation
        model.eval()
        with torch.no_grad():
            logits_nodes = model(edge_index)
            def eval_split(idx_t):
                preds = logits_nodes[dst_nodes[idx_t]].argmax(dim=-1).cpu().numpy()
                true  = y_all[idx_t].cpu().numpy()
                acc = accuracy_score(true, preds)
                f1  = f1_score(true, preds, average="macro")
                return acc, f1

            val_acc, val_f1 = eval_split(val_idx_t)

        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            best_state = model.state_dict()
            no_improve = 0
        else:
            no_improve += 1
            if no_improve >= patience:
                break

    # Load best weights and compute final metrics
    model.load_state_dict(best_state)
    model.eval()
    with torch.no_grad():
        logits_nodes = model(edge_index)
        def eval_split(idx_t):
            preds = logits_nodes[dst_nodes[idx_t]].argmax(dim=-1).cpu().numpy()
            true  = y_all[idx_t].cpu().numpy()
            acc = accuracy_score(true, preds)
            f1  = f1_score(true, preds, average="macro")
            return acc, f1

        train_acc, train_f1 = eval_split(train_idx_t)
        val_acc, val_f1     = eval_split(val_idx_t)
        test_acc, test_f1   = eval_split(test_idx_t)

    return {
        "train_acc": train_acc, "train_f1": train_f1,
        "val_acc": val_acc, "val_f1": val_f1,
        "test_acc": test_acc, "test_f1": test_f1,
    }

In [7]:
def run_grid_search(configs, model_type, device, n_splits=3,
                    max_epochs=30, patience=5,
                    jsonl_path="gnn_results.jsonl"):
    """
    configs: list of dicts with hyperparameters
    model_type: "GCN" / "SAGE" / "GAT"
    jsonl_path: JSONL file where every round (config x fold) is appended
    """
    results = []

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    indices = np.arange(len(y))
    splits = list(skf.split(indices, y.numpy()))

    with open(jsonl_path, "a", encoding="utf-8") as jsonl_f:
        for cfg_id, cfg in enumerate(tqdm(configs, desc=f"{model_type} configs")):
            for fold_id, (train_val_idx, test_idx) in enumerate(splits):

                inner_skf = StratifiedKFold(n_splits=5, shuffle=True,
                                            random_state=fold_id)
                inner_train, inner_val = next(
                    inner_skf.split(train_val_idx, y[train_val_idx].numpy())
                )
                train_idx = train_val_idx[inner_train]
                val_idx   = train_val_idx[inner_val]

                metrics = train_one_config_fold(
                    model_type, cfg, fold_id,
                    train_idx, val_idx, test_idx,
                    device, max_epochs=max_epochs, patience=patience
                )

                row = {
                    "timestamp": datetime.utcnow().isoformat(),
                    "model": model_type,
                    "config_id": cfg_id,
                    "fold": fold_id,
                    **cfg,
                    **metrics,
                }

                jsonl_f.write(json.dumps(row, ensure_ascii=False) + "\n")
                jsonl_f.flush()

                results.append(row)

    return pd.DataFrame(results)

# Kaggle GPU – GAT (richer grid)

In [8]:
configs_gat = []
for hidden_dim in [64, 128, 256]:
    for num_layers in [2, 3]:
        for heads in [2, 4, 8]:
            for dropout in [0.0, 0.3]:
                for lr in [1e-3, 3e-4]:
                    for wd in [0.0, 5e-4]:
                        configs_gat.append({
                            "hidden_dim": hidden_dim,
                            "num_layers": num_layers,
                            "heads": heads,
                            "dropout": dropout,
                            "lr": lr,
                            "weight_decay": wd,
                        })

gat_results = run_grid_search(
    configs_gat, "GAT", device,
    n_splits=3, max_epochs=25, patience=5,
    jsonl_path="gat_results_kaggle.jsonl"
)

GAT configs:   0%|          | 0/144 [00:00<?, ?it/s]

  "timestamp": datetime.utcnow().isoformat(),
  "timestamp": datetime.utcnow().isoformat(),
  "timestamp": datetime.utcnow().isoformat(),
  "timestamp": datetime.utcnow().isoformat(),
  "timestamp": datetime.utcnow().isoformat(),
  "timestamp": datetime.utcnow().isoformat(),
  "timestamp": datetime.utcnow().isoformat(),
  "timestamp": datetime.utcnow().isoformat(),
  "timestamp": datetime.utcnow().isoformat(),
  "timestamp": datetime.utcnow().isoformat(),
  "timestamp": datetime.utcnow().isoformat(),
  "timestamp": datetime.utcnow().isoformat(),
  "timestamp": datetime.utcnow().isoformat(),
  "timestamp": datetime.utcnow().isoformat(),
  "timestamp": datetime.utcnow().isoformat(),
  "timestamp": datetime.utcnow().isoformat(),
  "timestamp": datetime.utcnow().isoformat(),
  "timestamp": datetime.utcnow().isoformat(),
  "timestamp": datetime.utcnow().isoformat(),
  "timestamp": datetime.utcnow().isoformat(),
  "timestamp": datetime.utcnow().isoformat(),
  "timestamp": datetime.utcnow().i

KeyboardInterrupt: 

# Google Colab GPU – GraphSAGE

In [None]:
configs_sage = []
for hidden_dim in [64, 128]:
    for num_layers in [2, 3]:
        for dropout in [0.0, 0.3, 0.5]:
            for lr in [1e-3, 3e-4]:
                for wd in [0.0, 5e-4]:
                    configs_sage.append({
                        "hidden_dim": hidden_dim,
                        "num_layers": num_layers,
                        "heads": 1,
                        "dropout": dropout,
                        "lr": lr,
                        "weight_decay": wd,
                    })

sage_results = run_grid_search(
    configs_sage, "SAGE", device,
    n_splits=3, max_epochs=25, patience=5,
    jsonl_path="sage_results_colab.jsonl"
)

# Your PC (CPU) – GCN (lighter grid)

In [None]:
device_pc = torch.device("cpu")

configs_gcn = []
for hidden_dim in [64, 128]:
    for num_layers in [2, 3]:
        for dropout in [0.0, 0.5]:
            for lr in [1e-3, 3e-4]:
                for wd in [0.0, 5e-4]:
                    configs_gcn.append({
                        "hidden_dim": hidden_dim,
                        "num_layers": num_layers,
                        "heads": 1,
                        "dropout": dropout,
                        "lr": lr,
                        "weight_decay": wd,
                    })

gcn_results = run_grid_search(
    configs_gcn, "GCN", device_pc,
    n_splits=3, max_epochs=20, patience=4,
    jsonl_path="gcn_results_pc.jsonl"
)