# In [None]:
###Best Version###
import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.metrics import (
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv

class GNN_Autoencoder(torch.nn.Module):
    """GCN-based autoencoder that reconstructs node features.

    The layer widths are
    ``in_channels -> hidden_channels -> out_channels -> hidden_channels -> in_channels``,
    so the output has the same feature width as the input.

    Args:
        in_channels: Width of the input (and reconstructed) node features.
        out_channels: Width of the bottleneck produced by the encoder.
        hidden_channels: Width of the intermediate encoder/decoder layer.
    """

    def __init__(self, in_channels, out_channels, hidden_channels):
        super().__init__()
        # Encoder: compress node features down to the bottleneck width.
        self.encoder_conv = GCNConv(in_channels, hidden_channels)
        self.encoder_conv2 = GCNConv(hidden_channels, out_channels)
        # Decoder: mirror of the encoder, expanding back to the input width.
        self.decoder_conv = GCNConv(out_channels, hidden_channels)
        self.decoder_conv2 = GCNConv(hidden_channels, in_channels)

    def forward(self, x, edge_index):
        """Return the reconstruction of ``x`` over the graph ``edge_index``."""
        hidden = x
        # The first three layers are each followed by a ReLU.
        for layer in (self.encoder_conv, self.encoder_conv2, self.decoder_conv):
            hidden = F.relu(layer(hidden, edge_index))
        # The last layer is left linear so the output is not clamped at zero.
        return self.decoder_conv2(hidden, edge_index)

def pad_features(x, max_features):
    """Right-pad the feature dimension of ``x`` with zeros up to ``max_features``.

    Args:
        x: 2-D tensor indexed as ``[row, feature]``.
        max_features: Target size of dimension 1.

    Returns:
        ``x`` itself when it already has at least ``max_features`` columns
        (wider inputs are not truncated); otherwise a new tensor with zero
        columns appended on the right.
    """
    pad_size = max_features - x.shape[1]
    if pad_size <= 0:
        return x
    # new_zeros inherits dtype AND device from x, so the concatenation also
    # works when x does not live on the default (CPU) device.
    padding = x.new_zeros((x.shape[0], pad_size))
    return torch.cat([x, padding], dim=1)

def generate_graph_data(json_file, max_features):
    """Yield one ``(Data, label)`` pair per graph stored in ``json_file``.

    The file must contain a JSON list. Each item is read through
    ``item['data']['edge_index']``, ``item['data']['x']`` and the optional
    ``item['label']``.

    Args:
        json_file: Path of the JSON file to read.
        max_features: Feature width every ``x`` is zero-padded to.

    Yields:
        Tuples ``(Data(x=..., edge_index=...), label)`` where ``label`` is
        ``None`` when the item has no ``'label'`` key.
    """
    with open(json_file, 'r') as file:
        data_list = json.load(file)
    for item in data_list:
        graph = item['data']
        edge_index = torch.tensor(graph['edge_index'], dtype=torch.long)
        if edge_index.numel() == 0:
            # Edgeless graph: view(2, -1) cannot infer -1 for zero elements
            # and would raise, so build the empty (2, 0) shape explicitly.
            edge_index = edge_index.new_empty((2, 0))
        elif edge_index.dim() == 1 or edge_index.size(0) == 1:
            # Flat or single-row edge list: fold it into two rows.
            edge_index = edge_index.view(2, -1)
        x = torch.tensor(graph['x'], dtype=torch.float)
        x_padded = pad_features(x, max_features)
        yield Data(x=x_padded, edge_index=edge_index), item.get('label')

def find_max_features(json_dir):
    """Return the largest node-feature width found in ``json_dir``.

    Every ``*.json`` file directly inside ``json_dir`` is loaded as a list of
    items, and the size of dimension 1 of each ``item['data']['x']`` is
    compared. Returns 0 when no graph is found.
    """
    widest = 0
    for path in Path(json_dir).glob('*.json'):
        with open(path, 'r') as handle:
            records = json.load(handle)
        widths = [
            torch.tensor(record['data']['x'], dtype=torch.float).size(1)
            for record in records
        ]
        # Fold in the running maximum so a file with no records is harmless.
        widest = max([widest, *widths])
    return widest

def evaluate_model_with_threshold(model, criterion, device, test_data_loader, test_labels, threshold):
    """Evaluate the autoencoder as a graph-level anomaly detector.

    A graph's anomaly score is the mean, over its nodes, of the per-node mean
    squared reconstruction error. A graph is predicted as class 1 when its
    score is strictly greater than ``threshold``, otherwise as class 0.

    Args:
        model: Module called as ``model(batch.x, batch.edge_index)``.
        criterion: Loss called as ``criterion(output, batch.x)``.
        device: Device every batch is moved to before the forward pass.
        test_data_loader: Iterable of batches, in the same graph order as
            ``test_labels``.
        test_labels: Ground-truth 0/1 label per graph.
        threshold: Score above which a graph is predicted as class 1.

    Returns:
        Tuple ``(mean_batch_loss, precision, recall, f1,
        avg_reconstruction_error, auc, accuracy)``.

    Raises:
        ValueError: If the loader yields no graphs.

    Side effects:
        Writes the ROC plot to ``roc_curve.pdf`` in the working directory.
    """
    model.eval()
    total_loss = 0.0
    graph_level_errors = []
    with torch.no_grad():
        for batch in test_data_loader:
            batch = batch.to(device)
            output = model(batch.x, batch.edge_index)
            total_loss += criterion(output, batch.x).item()
            node_errors = torch.mean((output - batch.x) ** 2, dim=1)
            if hasattr(batch, 'batch'):
                # batch.batch maps each node to the graph it belongs to.
                for i in range(batch.num_graphs):
                    graph_level_errors.append(node_errors[batch.batch == i].mean().item())
            else:
                graph_level_errors.append(node_errors.mean().item())

    if not graph_level_errors:
        raise ValueError("test_data_loader produced no graphs to evaluate")

    predicted_labels = [1 if error > threshold else 0 for error in graph_level_errors]
    precision = precision_score(test_labels, predicted_labels)
    recall = recall_score(test_labels, predicted_labels)
    f1 = f1_score(test_labels, predicted_labels)
    avg_reconstruction_error = sum(graph_level_errors) / len(graph_level_errors)

    # Accuracy = (true positives + true negatives) / number of graphs.
    tp = sum(t == 1 and p == 1 for t, p in zip(test_labels, predicted_labels))
    tn = sum(t == 0 and p == 0 for t, p in zip(test_labels, predicted_labels))
    accuracy = (tp + tn) / len(test_labels)

    # ROC/AUC are ranking metrics: feed them the continuous reconstruction
    # errors, not the thresholded 0/1 predictions, which would collapse the
    # curve to a single operating point.
    has_positive = any(t == 1 for t in test_labels)
    has_negative = any(t == 0 for t in test_labels)
    if has_positive and has_negative:
        fpr, tpr, _ = roc_curve(test_labels, graph_level_errors)
        auc = roc_auc_score(test_labels, graph_level_errors)
    else:
        # AUC is undefined with a single class (scikit-learn raises or warns);
        # report chance level and draw the diagonal instead.
        fpr, tpr = [0.0, 1.0], [0.0, 1.0]
        auc = 0.5

    fig = plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {auc:.4f})')
    plt.title('ROC Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc='lower right')
    plt.grid()
    plt.savefig('roc_curve.pdf', format="pdf", dpi=300, bbox_inches='tight')
    # Release the figure so repeated evaluations do not accumulate open figures.
    plt.close(fig)

    return total_loss / len(test_data_loader), precision, recall, f1, avg_reconstruction_error, auc, accuracy

def main():
    """Train the GCN autoencoder, evaluate it on the test graphs and print metrics.

    Training graphs are read from every ``*.json`` file in ``json2`` and test
    graphs (with labels) from ``json4``. Node features are zero-padded to the
    widest feature size found across both directories. Test items without a
    label are treated as class 0.

    Raises:
        ValueError: If either directory yields no graphs.
    """
    json_dir = 'json2'  # Adjust path as needed
    test_json_dir = 'json4'  # Adjust path as needed
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    max_features = max(find_max_features(json_dir), find_max_features(test_json_dir))

    # Parameters from optimization
    # TODO: `xxx` is an unfilled placeholder; replace each with the tuned
    # value before running, otherwise this raises NameError.
    learning_rate = xxx
    weight_decay = xxx
    threshold = xxx
    out_channels = xxx
    hidden_channels = xxx

    model = GNN_Autoencoder(in_channels=max_features, out_channels=out_channels, hidden_channels=hidden_channels).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    criterion = torch.nn.MSELoss()

    # Prepare training data (labels are not used for autoencoder training).
    training_dataset = [
        data
        for json_file in Path(json_dir).glob('*.json')
        for data, _ in generate_graph_data(json_file, max_features)
    ]
    if not training_dataset:
        # Without this check an untrained model would be evaluated silently.
        raise ValueError(f"No training graphs found in '{json_dir}'")
    training_data_loader = DataLoader(training_dataset, batch_size=2, shuffle=True)

    # Training loop: minimise the node-feature reconstruction error.
    for epoch in range(5):  # Assuming a small number of epochs for demonstration
        model.train()
        for data in training_data_loader:
            data = data.to(device)
            optimizer.zero_grad()
            reconstructed = model(data.x, data.edge_index)
            loss = criterion(reconstructed, data.x)
            loss.backward()
            optimizer.step()

    # Prepare test data; shuffle=False keeps loader order aligned with test_labels.
    test_dataset, test_labels = [], []
    for json_file in Path(test_json_dir).glob('*.json'):
        for data, label in generate_graph_data(json_file, max_features):
            test_dataset.append(data)
            test_labels.append(label if label is not None else 0)
    if not test_dataset:
        raise ValueError(f"No test graphs found in '{test_json_dir}'")
    test_data_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

    # Evaluate the model
    test_loss, precision, recall, f1, avg_reconstruction_error, auc, accuracy = evaluate_model_with_threshold(
        model, criterion, device, test_data_loader, test_labels, threshold)

    print(f"Test Loss: {test_loss:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Average Reconstruction Error: {avg_reconstruction_error:.4f}")
    print(f"AUC: {auc:.4f}")
    print(f"Accuracy: {accuracy:.4f}")

    # Nothing is written to disk for the model here, so do not claim a save.
    print("Evaluation complete.")

# Run the train-and-evaluate pipeline only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()

