In [None]:
import pandas as pd
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, SAGEConv, GATConv
from sklearn.model_selection import KFold
import numpy as np

In [None]:
# Load the review dataset. Each row is one (user, book) review and becomes
# one edge of a bipartite user-book graph.
df = pd.read_csv('labr_20_samples.csv')  # Replace with your dataset path

# Map user and book IDs to contiguous indices (in order of first appearance)
user_ids = df['user_id'].unique()
book_ids = df['book_id'].unique()

uid_map = {uid: i for i, uid in enumerate(user_ids)}
bid_map = {bid: i for i, bid in enumerate(book_ids)}

num_users = len(user_ids)
num_books = len(book_ids)

# Columns used as per-edge features. The model classifiers in this notebook
# hard-code 6 edge features, so this list must keep exactly 6 entries.
EDGE_FEATURE_COLUMNS = [
    'final_score',
    'rating',
    'text_length',
    'sentiment_label',
    'sentiment_score_normalized',
    'score_normalized',
]
NODE_FEATURE_DIM = 64   # matches the default input_dim of the models
NODE_FEATURE_SEED = 42  # fixed so the node features are identical on every run

# Prepare edge_index, edge_y, edge_attr with column-wise operations instead of
# a Python loop over df.iterrows().
# Users occupy node ids [0, num_users); books are shifted to
# [num_users, num_users + num_books) so both node types share one id space.
user_node_idx = df['user_id'].map(uid_map).to_numpy()
book_node_idx = df['book_id'].map(bid_map).to_numpy() + num_users

edge_index = torch.tensor(np.stack([user_node_idx, book_node_idx]), dtype=torch.long)
edge_y = torch.tensor(df['ground_truth'].to_numpy(), dtype=torch.long)
edge_attr = torch.tensor(
    df[EDGE_FEATURE_COLUMNS].to_numpy(dtype=np.float32), dtype=torch.float
)

assert edge_index.shape == (2, len(df)), "expected one edge per review row"
assert edge_attr.shape == (len(df), len(EDGE_FEATURE_COLUMNS))

# Node features: fixed random vectors, one per node.
# The previous all-zero tensor gave every node the same input, so the
# convolution layers had no per-node signal to propagate; it was also a plain
# tensor, i.e. never updated by the optimizer despite being described as a
# learnable embedding. Seeded random features give each node a distinct,
# reproducible identity. They are still NOT trained.
_node_feature_rng = torch.Generator().manual_seed(NODE_FEATURE_SEED)
x = torch.randn(num_users + num_books, NODE_FEATURE_DIM, generator=_node_feature_rng)


In [None]:
class GCNModel(torch.nn.Module):
    """Edge classifier built on two stacked GCNConv layers.

    Node embeddings are produced by the two graph convolutions; every edge is
    then described by the embeddings of its two endpoints concatenated with
    the edge's own attribute vector, and a linear layer maps that description
    to ``num_classes`` logits.

    Args:
        input_dim: Width of the node feature vectors passed as ``x``.
        hidden_dim: Output width of both convolution layers.
        num_classes: Number of logits produced per edge.
    """

    def __init__(self, input_dim=64, hidden_dim=64, num_classes=3):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        # Two endpoint embeddings plus the 6 edge attributes.
        edge_repr_dim = 2 * hidden_dim + 6
        self.classifier = torch.nn.Linear(edge_repr_dim, num_classes)

    def forward(self, x, edge_index, edge_attr):
        """Return one row of class logits per edge in ``edge_index``.

        ``edge_index`` is unpacked into two index rows (one per endpoint) and
        ``edge_attr`` must supply 6 values per edge to match the classifier.
        """
        hidden = self.conv1(x, edge_index).tanh()
        # Second layer output is limited to the range [0, 6].
        node_emb = self.conv2(hidden, edge_index).clamp(min=0, max=6)
        src, dst = edge_index
        edge_repr = torch.cat((node_emb[src], node_emb[dst], edge_attr), dim=-1)
        return self.classifier(edge_repr)

class SAGEModel(torch.nn.Module):
    """Edge classifier built on two stacked SAGEConv layers.

    Each edge is represented by the embeddings of its two endpoint nodes
    concatenated with the edge's attribute vector; a linear layer turns that
    representation into ``num_classes`` logits.

    Args:
        input_dim: Width of the node feature vectors passed as ``x``.
        hidden_dim: Output width of both SAGEConv layers.
        num_classes: Number of logits produced per edge.
    """

    def __init__(self, input_dim=64, hidden_dim=64, num_classes=3):
        super().__init__()
        self.sage1 = SAGEConv(input_dim, hidden_dim)
        self.sage2 = SAGEConv(hidden_dim, hidden_dim)
        # Two endpoint embeddings plus the 6 edge attributes.
        edge_repr_dim = 2 * hidden_dim + 6
        self.classifier = torch.nn.Linear(edge_repr_dim, num_classes)

    def forward(self, x, edge_index, edge_attr):
        """Return one row of class logits per edge in ``edge_index``.

        ``edge_index`` is unpacked into two index rows (one per endpoint) and
        ``edge_attr`` must supply 6 values per edge to match the classifier.
        """
        # NOTE(review): the first layer is passed through ReLU and then tanh,
        # whereas the GCN and GAT models in this notebook apply tanh only.
        # Kept as-is; confirm whether the double activation is intentional.
        hidden = torch.relu(self.sage1(x, edge_index)).tanh()
        # Second layer output is limited to the range [0, 6].
        node_emb = self.sage2(hidden, edge_index).clamp(min=0, max=6)
        src, dst = edge_index
        edge_repr = torch.cat((node_emb[src], node_emb[dst], edge_attr), dim=-1)
        return self.classifier(edge_repr)

class GATModel(torch.nn.Module):
    """Edge classifier built on two stacked GATConv layers.

    The first attention layer uses 4 heads with concatenation; the second is
    constructed with ``hidden_dim * 4`` input channels to consume that output
    and uses a single, non-concatenated head. Each edge is represented by the
    embeddings of its two endpoint nodes concatenated with the edge's
    attribute vector, and a linear layer maps that to ``num_classes`` logits.

    Args:
        input_dim: Width of the node feature vectors passed as ``x``.
        hidden_dim: Per-head output width of the attention layers.
        num_classes: Number of logits produced per edge.
    """

    def __init__(self, input_dim=64, hidden_dim=32, num_classes=3):
        super().__init__()
        self.gat1 = GATConv(input_dim, hidden_dim, heads=4, concat=True, dropout=0.1)
        self.gat2 = GATConv(hidden_dim * 4, hidden_dim, heads=1, concat=False, dropout=0.1)
        # Two endpoint embeddings plus the 6 edge attributes.
        edge_repr_dim = 2 * hidden_dim + 6
        self.classifier = torch.nn.Linear(edge_repr_dim, num_classes)

    def forward(self, x, edge_index, edge_attr):
        """Return one row of class logits per edge in ``edge_index``.

        ``edge_index`` is unpacked into two index rows (one per endpoint) and
        ``edge_attr`` must supply 6 values per edge to match the classifier.
        """
        hidden = self.gat1(x, edge_index).tanh()
        # Second layer output is limited to the range [0, 6].
        node_emb = self.gat2(hidden, edge_index).clamp(min=0, max=6)
        src, dst = edge_index
        edge_repr = torch.cat((node_emb[src], node_emb[dst], edge_attr), dim=-1)
        return self.classifier(edge_repr)


In [None]:
def train_and_validate(model, x, edge_index, edge_attr, edge_y, k_folds=5, model_name="",
                       epochs=100, lr=0.01, seed=None):
    """Run k-fold cross-validation of an edge classifier and collect per-fold metrics.

    The edges are split into ``k_folds`` shuffled folds. For every fold the
    model is reset to the weights it had when this function was called,
    trained full-batch on the training edges, and scored on the held-out
    edges. Without the reset, weights learned in earlier folds would carry
    over and later folds would be validated on edges the model had already
    been trained on.

    Args:
        model: Module called as ``model(x, edge_index, edge_attr)`` returning
            one row of class logits per edge. It is trained in place; on
            return it holds the weights from the last fold.
        x: Node feature tensor passed unchanged to the model.
        edge_index: Tensor indexed as ``edge_index[:, idx]`` to select edges.
        edge_attr: Per-edge feature tensor, indexed along its first dimension.
        edge_y: Per-edge integer class labels (used with CrossEntropyLoss).
        k_folds: Number of cross-validation folds.
        model_name: Label stored in every result row.
        epochs: Full-batch optimisation steps per fold.
        lr: Adam learning rate.
        seed: ``random_state`` for the fold shuffle; ``None`` keeps the
            previous unseeded behaviour.

    Returns:
        A list with one dict per fold containing ``model``, ``fold``,
        ``accuracy``, ``mae``, ``mse``, ``rmse``, ``hidden_dim``, ``lr`` and
        ``num_layers``. MAE/MSE/RMSE are computed on the predicted vs. true
        class indices.
    """
    import copy  # stdlib; imported locally so this cell needs no extra top-level import

    num_edges = edge_y.shape[0]
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=seed)

    # Snapshot of the starting weights, restored at the beginning of each fold.
    initial_state = copy.deepcopy(model.state_dict())

    # Report the output width of the first graph layer, whichever attribute
    # name the model uses for it; None if the model has none of them.
    first_layer = next(
        (getattr(model, attr) for attr in ("conv1", "sage1", "gat1") if hasattr(model, attr)),
        None,
    )
    hidden_dim = getattr(first_layer, "out_channels", None)

    results = []

    # Split over plain edge positions rather than handing the label tensor to scikit-learn.
    for fold, (train_idx, val_idx) in enumerate(kf.split(np.arange(num_edges))):
        # Start every fold from the same untrained weights (no cross-fold leakage).
        model.load_state_dict(initial_state)

        train_idx = torch.as_tensor(train_idx, dtype=torch.long, device=edge_index.device)
        val_idx = torch.as_tensor(val_idx, dtype=torch.long, device=edge_index.device)

        # Split edges for cross-validation
        train_edge_index = edge_index[:, train_idx]
        val_edge_index = edge_index[:, val_idx]
        train_edge_y = edge_y[train_idx]
        val_edge_y = edge_y[val_idx]
        train_edge_attr = edge_attr[train_idx]
        val_edge_attr = edge_attr[val_idx]

        # Fresh optimizer per fold so no optimizer state carries over either.
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        criterion = torch.nn.CrossEntropyLoss()

        # Train
        model.train()
        for _ in range(epochs):
            optimizer.zero_grad()
            out = model(x, train_edge_index, train_edge_attr)
            loss = criterion(out, train_edge_y)
            loss.backward()
            optimizer.step()

        # Validate
        # NOTE(review): the model receives only the validation edges here, so
        # whatever graph propagation it performs runs over the held-out edges
        # rather than the training graph. Changing that would require a model
        # interface that separates propagation edges from scored edges.
        model.eval()
        with torch.no_grad():
            out = model(x, val_edge_index, val_edge_attr)
            pred = out.argmax(dim=1)
            accuracy = (pred == val_edge_y).float().mean().item()

            # Calculate MAE, MSE, RMSE on class indices
            error = pred.float() - val_edge_y.float()
            mae = error.abs().mean().item()
            mse = (error ** 2).mean().item()
            rmse = mse ** 0.5

        # Store results
        results.append({
            "model": model_name,
            "fold": fold,
            "accuracy": accuracy,
            "mae": mae,
            "mse": mse,
            "rmse": rmse,
            "hidden_dim": hidden_dim,
            "lr": lr,
            "num_layers": 2,
        })

    return results

# Seed the random number generators before any model is built so that a fresh
# Restart & Run All reproduces the same weights, folds and saved metrics.
SEED = 42
np.random.seed(SEED)     # KFold(shuffle=True) with no random_state draws from NumPy's global RNG
torch.manual_seed(SEED)  # layer weight initialisation and dropout

# Train and validate each model
models = {
    'GCN': GCNModel(num_classes=3),
    'SAGE': SAGEModel(num_classes=3),
    'GAT': GATModel(num_classes=3)
}

# One result row per (model, fold)
all_results = []
for name, model in models.items():
    results = train_and_validate(model, x, edge_index, edge_attr, edge_y, model_name=name)
    all_results.extend(results)

# Save to CSV
df_results = pd.DataFrame(all_results)
df_results.to_csv("model_results.csv", index=False)
