In [None]:
!pip install torch-geometric

In [None]:
import torch
import pandas as pd
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, SAGEConv, GATConv
import torch.nn.functional as F
import numpy as np
from sklearn.model_selection import KFold
from itertools import product
from tqdm import tqdm

In [None]:
# Read the LABR CSV from the Kaggle input directory into a DataFrame
LABR_CSV_PATH = '/kaggle/input/labr-dataset/labr_complete.csv'
df = pd.read_csv(LABR_CSV_PATH)

In [None]:
# Columns used as node features and as the regression target
user_features = ['user_id', 'sentiment_score_normalized', 'text_length']
book_features = ['book_id', 'score_normalized']
score_col = 'final_score'

user_feats = df[user_features].values
book_feats = df[book_features].values
scores = df[score_col].values

# Zero-pad the narrower feature matrix so user and book nodes share one width
max_features = max(user_feats.shape[1], book_feats.shape[1])
user_feats = np.pad(user_feats, ((0, 0), (0, max_features - user_feats.shape[1])), mode='constant')
book_feats = np.pad(book_feats, ((0, 0), (0, max_features - book_feats.shape[1])), mode='constant')

# Every DataFrame row contributes one "user" node followed by one "book" node:
# nodes [0, num_users) are users, nodes [num_users, num_users + num_books) are books.
num_users = len(user_feats)
num_books = len(book_feats)
x = torch.cat([torch.tensor(user_feats, dtype=torch.float),
               torch.tensor(book_feats, dtype=torch.float)], dim=0)

# Row i links user node i with book node num_users + i.
# PyG propagates messages from edge_index[0] (source) to edge_index[1] (target),
# so user->book edges alone would leave the labelled user nodes without any
# incoming book information. Store both directions to make the graph undirected.
user_nodes = torch.arange(num_users, dtype=torch.long)
book_nodes = torch.arange(num_users, num_users + num_books, dtype=torch.long)
edge_index = torch.cat([torch.stack([user_nodes, book_nodes], dim=0),
                        torch.stack([book_nodes, user_nodes], dim=0)], dim=1)

# Targets live on the user nodes only; book nodes keep a zero placeholder
y = torch.zeros(num_users + num_books, 1, dtype=torch.float)
y[:num_users] = torch.tensor(scores, dtype=torch.float).unsqueeze(1)

# Check if GPU is available and set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

data = Data(x=x, edge_index=edge_index, y=y)

# Move data to the selected device
data = data.to(device)

In [None]:
# GNN Models
class GCN(torch.nn.Module):
    """Stack of GCNConv layers with ReLU activations and a GCNConv output layer.

    Args:
        num_features: Width of each input node feature vector.
        num_layers: Number of hidden GCNConv layers (0 is allowed).
        hidden_dim: Output width of every hidden layer.
        out_dim: Width of the per-node output (default 1).
    """

    def __init__(self, num_features, num_layers, hidden_dim, out_dim=1):
        super().__init__()
        in_dim = num_features
        layers = []
        for _ in range(num_layers):
            layers.append(GCNConv(in_dim, hidden_dim))
            in_dim = hidden_dim
        self.layers = torch.nn.ModuleList(layers)
        # Feed the output layer from the width the stack actually ends on, so
        # num_layers=0 maps the raw features straight to the output.
        self.out = GCNConv(in_dim, out_dim)

    def forward(self, data):
        """Return per-node outputs for a graph object exposing `x` and `edge_index`."""
        x, edge_index = data.x, data.edge_index
        for layer in self.layers:
            x = F.relu(layer(x, edge_index))
        # No activation on the final layer: its raw output is returned as-is
        return self.out(x, edge_index)

class SAGE(torch.nn.Module):
    """Stack of SAGEConv layers with ReLU activations and a SAGEConv output layer.

    Args:
        num_features: Width of each input node feature vector.
        num_layers: Number of hidden SAGEConv layers (0 is allowed).
        hidden_dim: Output width of every hidden layer.
        out_dim: Width of the per-node output (default 1).
    """

    def __init__(self, num_features, num_layers, hidden_dim, out_dim=1):
        super().__init__()
        in_dim = num_features
        layers = []
        for _ in range(num_layers):
            layers.append(SAGEConv(in_dim, hidden_dim))
            in_dim = hidden_dim
        self.layers = torch.nn.ModuleList(layers)
        # Feed the output layer from the width the stack actually ends on, so
        # num_layers=0 maps the raw features straight to the output.
        self.out = SAGEConv(in_dim, out_dim)

    def forward(self, data):
        """Return per-node outputs for a graph object exposing `x` and `edge_index`."""
        x, edge_index = data.x, data.edge_index
        for layer in self.layers:
            x = F.relu(layer(x, edge_index))
        # No activation on the final layer: its raw output is returned as-is
        return self.out(x, edge_index)

class GAT(torch.nn.Module):
    """Stack of multi-head GATConv layers with ReLU and a single-head GATConv output.

    Args:
        num_features: Width of each input node feature vector.
        num_layers: Number of hidden GATConv layers (0 is allowed).
        hidden_dim: Per-head output width of every hidden layer.
        heads: Number of attention heads in each hidden layer.
        out_dim: Width of the per-node output (default 1).
    """

    def __init__(self, num_features, num_layers, hidden_dim, heads, out_dim=1):
        super().__init__()
        in_dim = num_features
        layers = []
        for _ in range(num_layers):
            layers.append(GATConv(in_dim, hidden_dim, heads=heads))
            # The next layer's input width is sized as hidden_dim * heads
            in_dim = hidden_dim * heads
        self.layers = torch.nn.ModuleList(layers)
        # Feed the output layer from the width the stack actually ends on, so
        # num_layers=0 maps the raw features straight to the output.
        self.out = GATConv(in_dim, out_dim, heads=1)

    def forward(self, data):
        """Return per-node outputs for a graph object exposing `x` and `edge_index`."""
        x, edge_index = data.x, data.edge_index
        for layer in self.layers:
            x = F.relu(layer(x, edge_index))
        # No activation on the final layer: its raw output is returned as-is
        return self.out(x, edge_index)


In [None]:
def generate_configs():
    """Yield one hyperparameter dict per point of the search grid.

    Each dict has the keys 'num_layers', 'hidden_dim', 'heads' and 'lr'.
    The grid is walked with 'lr' varying fastest and 'num_layers' slowest.
    Combinations whose num_layers * hidden_dim * heads exceeds 5000 are
    skipped as a rough memory guard.
    """
    search_space = {
        'num_layers': [3, 4, 5],
        'hidden_dim': [16, 32, 64],
        'heads': [2, 4, 8],
        'lr': [0.01, 0.1, 0.001],
    }
    keys = tuple(search_space)

    for combo in product(*search_space.values()):
        depth, width, n_heads, _ = combo
        # Size heuristic for the memory guard; adjust the threshold as needed
        if depth * width * n_heads > 5000:
            continue
        yield dict(zip(keys, combo))


In [None]:
# Cross-validation function with GPU and progress tracking
def cross_validate_gnn(model_class, data, configs, k=3, epochs=100):
    """Run k-fold cross-validation of one GNN class over several hyperparameter configs.

    The folds split the user-node indices ``0 .. num_users - 1``; book nodes are
    never part of the train or test mask. For every fold and every config a fresh
    model is trained full-batch with Adam on MSE loss and then scored on the
    held-out user nodes.

    Relies on the module-level ``num_users``, ``num_books`` and ``device``.

    Args:
        model_class: GNN class to instantiate. ``GAT`` additionally receives
            ``config['heads']``.
        data: Graph object with ``x``, ``edge_index`` and ``y``, already on ``device``.
        configs: Iterable of dicts with keys 'num_layers', 'hidden_dim', 'lr'
            (and 'heads' when ``model_class`` is ``GAT``).
        k: Number of folds.
        epochs: Training epochs per (fold, config) pair.

    Returns:
        A list with one dict per (fold, config) holding the hyperparameters and
        the test MSE, MAE and RMSE.
    """
    # Materialise once: the configs are re-used for every fold, and a generator
    # would be exhausted after the first one.
    configs = list(configs)
    is_gat = model_class is GAT
    total_nodes = num_users + num_books

    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    results = []
    for fold, (train_idx, test_idx) in enumerate(kf.split(np.arange(num_users))):
        # Boolean node masks for the current fold, allocated directly on the device
        train_mask = torch.zeros(total_nodes, dtype=torch.bool, device=device)
        test_mask = torch.zeros(total_nodes, dtype=torch.bool, device=device)
        train_mask[train_idx] = True
        test_mask[test_idx] = True
        data_fold = data.clone()
        data_fold.train_mask = train_mask
        data_fold.test_mask = test_mask

        for config in tqdm(configs, desc=f"Fold {fold+1}/{k}"):
            model_args = [data.num_features, config['num_layers'], config['hidden_dim']]
            if is_gat:
                model_args.append(config['heads'])
            model = model_class(*model_args).to(device)
            optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'])

            # Full-batch training on the training user nodes only
            model.train()
            for epoch in range(epochs):
                optimizer.zero_grad()
                out = model(data_fold)[data_fold.train_mask]
                loss = F.mse_loss(out, data_fold.y[data_fold.train_mask])
                loss.backward()
                optimizer.step()

            # Evaluate on the held-out user nodes
            model.eval()
            with torch.no_grad():
                out = model(data_fold)[data_fold.test_mask]
                y_true = data_fold.y[data_fold.test_mask]
                mse = F.mse_loss(out, y_true).item()
                mae = F.l1_loss(out, y_true).item()
                rmse = np.sqrt(mse)
            results.append({
                'model': model_class.__name__,
                'fold': fold,
                'num_layers': config['num_layers'],
                'hidden_dim': config['hidden_dim'],
                'heads': config['heads'] if is_gat else None,
                'lr': config['lr'],
                'test_mse': mse,
                'test_mae': mae,
                'test_rmse': rmse
            })
    return results

In [None]:
# Generate configs and run cross-validation for the GAT model
configs = list(generate_configs())
results = cross_validate_gnn(GAT, data, configs, k=3)

# Save results to CSV; the message reports the path that was actually written
results_path = 'gat_metric_labr_ver4.csv'
results_df = pd.DataFrame(results)
results_df.to_csv(results_path, index=False)
print(f"Cross-validation metrics saved to {results_path}")