In [4]:
from torch_geometric.data import HeteroData, DataLoader
import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv, to_hetero , SAGEConv
from torch_geometric.utils import negative_sampling
from torch_geometric.loader import LinkNeighborLoader

import torch
from torch import nn 
import torch.nn.functional as F
import torch.optim as optim

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder , label_binarize , OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

import os 
import pandas as pd
import numpy as np
from tqdm import tqdm
from itertools import product
import random
from collections import Counter
import warnings
warnings.filterwarnings("ignore") 


In [3]:
# *****************************************************************************
# Load the serialized graph object used by every later cell :
path_work = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023"
# Location of the saved graph file, kept in its own variable so the path is easy to inspect.
graph_file = f'{path_work}/train_nn/graph_file.1107.pt'
graph_data = torch.load(graph_file)


In [7]:
# *****************************************************************************
# The model : GNN encoders + MLP edge decoder (link prediction)
class GNN(torch.nn.Module):
    """Thin wrapper around a single message-passing layer.

    Args:
        hidden_channels: Output feature size of the layer.
        conv: Layer class to instantiate. It is called as
            ``conv(-1, hidden_channels, aggr='mean')``.
    """

    def __init__(self, hidden_channels , conv=SAGEConv):
        super().__init__()
        # An input size of -1 leaves the input dimension to be inferred on the
        # first forward pass (lazy initialization); neighbours are mean-aggregated.
        self.conv = conv(-1, hidden_channels, aggr='mean')

    def forward(self, x, edge_index):
        """Apply the wrapped layer to ``x`` over ``edge_index`` and return its output."""
        return self.conv(x, edge_index)
        
class EdgeDecoder(torch.nn.Module):
    """Two-layer MLP that scores candidate ("B1", "infects", "A") edges.

    Args:
        hidden_channels: Size of each node embedding; the first linear layer
            takes the concatenation of two embeddings (``2 * hidden_channels``).
    """

    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = torch.nn.Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = torch.nn.Linear(hidden_channels, 1)

    def forward(self, features_A, features_B1, graph_data):
        """Return one raw score (logit) per entry of ``edge_label_index``.

        Args:
            features_A: Embeddings indexed by the second row of ``edge_label_index``.
            features_B1: Embeddings indexed by the first row of ``edge_label_index``.
            graph_data: Object whose ``["B1", "infects", "A"]`` store exposes
                ``edge_label_index``.

        Returns:
            1-D tensor with one score per candidate edge.
        """
        # Row 0 holds the B1 endpoint of every candidate edge, row 1 the A endpoint.
        src_B1, dst_A = graph_data["B1", "infects", "A"].edge_label_index
        pair_features = torch.cat((features_B1[src_B1], features_A[dst_A]), dim=-1)
        hidden = torch.relu(self.lin1(pair_features))
        logits = self.lin2(hidden)
        return logits.view(-1)

class Model(torch.nn.Module):
    """Two-hop encoder (B2 -> B1 -> A) followed by an edge decoder.

    Args:
        out_channels: Embedding size produced by both GNN layers and consumed
            by the decoder.
        conv: Message-passing layer class handed to ``GNN``. It must accept a
            ``(x_src, x_dst)`` tuple as input because both hops are bipartite
            (source and destination are different node types).
    """

    def __init__(self, out_channels , conv=SAGEConv):
        super().__init__()
        self.gnn_B2_B1 = GNN(out_channels, conv)
        self.gnn_B1_A = GNN(out_channels, conv)
        self.decode = EdgeDecoder(out_channels)

    def forward(self, graph_data):
        """Score the ("B1", "infects", "A") supervision edges of ``graph_data``.

        Args:
            graph_data: Heterogeneous graph (or sampled sub-graph) exposing
                ``x_dict``, ``edge_index_dict`` and the ``edge_label_index`` of
                the ("B1", "infects", "A") edge type.
                Assumes ``x_dict`` contains features for "A", "B1" and "B2"
                — TODO confirm for "A" and "B2".

        Returns:
            1-D tensor of logits, one per supervision edge.
        """
        x_dict = graph_data.x_dict
        edge_index_dict = graph_data.edge_index_dict

        # Hop 1: B2 -> B1. The edge index holds B2 indices in row 0 and B1
        # indices in row 1, so the layer needs both feature matrices. Passing
        # only the B1 features would index B1 rows with B2 ids.
        edge_B2_B1 = edge_index_dict[('B2','expressed','B1')]
        features_B1_updated = self.gnn_B2_B1((x_dict["B2"], x_dict["B1"]), edge_B2_B1)

        # Hop 2: B1 -> A. Supplying the A features as the destination makes the
        # output have one row per A node, which is what the decoder indexes
        # with the A half of edge_label_index.
        edge_B1_A = edge_index_dict[('B1','infects','A')]
        features_A_updated = self.gnn_B1_A((features_B1_updated, x_dict["A"]), edge_B1_A)

        return self.decode(features_A_updated , features_B1_updated, graph_data)


In [2]:
# *****************************************************************************
# Pre-process data :
# Edge type whose links are split and predicted, and its reverse edge type.
target_edge = ("B1", "infects", "A")
reverse_edge = ("A", "harbors", "B1")

transform = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.2,
    #disjoint_train_ratio=...,
    neg_sampling_ratio=1.0,
    add_negative_train_samples=True,
    edge_types=target_edge,
    rev_edge_types=reverse_edge,
)

train_data, val_data, test_data = transform(graph_data)


def _make_link_loader(split_data):
    """Build the mini-batch link loader for one split.

    The supervision edges and their labels are read from the target edge type
    of ``split_data``; every loader uses the same sampling settings.
    """
    supervision = split_data[target_edge]
    return LinkNeighborLoader(
        data=split_data,
        num_neighbors=[-1],
        edge_label_index=(target_edge, supervision.edge_label_index),
        edge_label=supervision.edge_label,
        batch_size=128,
        shuffle=True,
    )


train_loader = _make_link_loader(train_data)
val_loader = _make_link_loader(val_data)
test_loader = _make_link_loader(test_data)

NameError: name 'T' is not defined

> Batches 

In [None]:
# *****************************************************************************
# Training :
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def train(model, loader, optimizer, criterion, edge_type):
    """Run one training epoch over ``loader``.

    Args:
        model: Module called on each batch; its output is fed to ``criterion``.
        loader: Iterable of batches; each batch is moved to the global ``device``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss taking ``(model_output, edge_label)``.
        edge_type: Key of the edge store holding ``edge_label`` in each batch.

    Returns:
        Sum of the per-batch losses divided by ``len(loader)``.
    """
    model.train()
    batch_losses = []
    for batch in loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        logits = model(batch)
        loss = criterion(logits, batch[edge_type].edge_label)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())
    return sum(batch_losses) / len(loader)
    
@torch.no_grad()
def evaluate(model, loader, criterion, edge_type):
    """Evaluate ``model`` over ``loader`` without tracking gradients.

    Args:
        model: Module called on each batch; its output is treated as logits.
        loader: Iterable of batches; each batch is moved to the global ``device``.
        criterion: Loss taking ``(model_output, edge_label)``.
        edge_type: Key of the edge store holding ``edge_label`` in each batch.

    Returns:
        Tuple ``(mean_batch_loss, f1, precision, accuracy, auc)``.
    """
    model.eval()
    running_loss = 0
    predicted, expected, scores = [], [], []
    for batch in loader:
        batch = batch.to(device)
        logits = model(batch)
        targets = batch[edge_type].edge_label
        running_loss += criterion(logits, targets).item()
        # Sigmoid maps logits to probabilities; rounding gives the 0/1 class.
        batch_scores = torch.sigmoid(logits)
        predicted.extend(batch_scores.round().cpu().numpy())
        expected.extend(targets.cpu().numpy())
        scores.extend(batch_scores.cpu().numpy())
    # Class-based metrics use the rounded predictions; AUC uses the probabilities.
    f1 = f1_score(expected, predicted, average='binary')
    precision = precision_score(expected, predicted, average='binary')
    accuracy = accuracy_score(expected, predicted)
    auc = roc_auc_score(expected, scores)
    return running_loss / len(loader), f1, precision, accuracy, auc

def main():
    """Train the mini-batch model for 100 epochs and save its weights.

    Builds ``Model`` on the global ``device``, runs one dry forward pass so the
    lazily-sized layers get their shapes, then alternates training epochs with
    a metric report every 10 epochs, and finally writes the ``state_dict``
    under ``path_work``.
    """
    hidden_channels = 60
    model = Model(hidden_channels).to(device)
    # Due to lazy initialization, we need to run one model step so the number
    # of parameters can be inferred. The batch must live on the same device as
    # the model, otherwise the dry run fails when CUDA is in use.
    eg_gratia_data = next(iter(val_loader)).to(device)
    with torch.no_grad():
        model(eg_gratia_data)
    criterion = torch.nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    edge_type = ("B1", "infects", "A")
    for epoch in range(100):
        train_loss = train(model, train_loader, optimizer, criterion, edge_type)
        if epoch % 10 == 0:
            # NOTE(review): progress is reported on the test split while the
            # validation loader is only used for the dry run — confirm this is
            # intended before using these numbers for model selection.
            test_loss, f1, precision, accuracy, auc = evaluate(model, test_loader, criterion, edge_type)
            print(f'Epoch: {epoch}, Train Loss: {train_loss}, Test Loss: {test_loss}, F1 Score: {f1}, Precision: {precision}, Accuracy: {accuracy}, AUC: {auc}')
    # Save the model
    torch.save(model.state_dict(), f"{path_work}/SAGEConv.model.1307.pt")

if __name__ == "__main__":
    main()



> Whole graph at once : 

In [None]:
def train(model, data, optimizer, criterion, edge_type):
    """Perform a single optimisation step on the whole graph ``data``.

    Args:
        model: Module called on the graph; its output is fed to ``criterion``.
        data: Graph object, moved to the global ``device`` before the forward pass.
        optimizer: Optimizer stepped once.
        criterion: Loss taking ``(model_output, edge_label)``.
        edge_type: Key of the edge store holding ``edge_label``.

    Returns:
        The loss of this step as a Python float.
    """
    model.train()
    graph = data.to(device)
    optimizer.zero_grad()
    logits = model(graph)
    loss = criterion(logits, graph[edge_type].edge_label)
    loss.backward()
    optimizer.step()
    return loss.item()

@torch.no_grad()
def evaluate(model, data, criterion, edge_type):
    """Evaluate ``model`` on the whole graph ``data`` without tracking gradients.

    Args:
        model: Module called on the graph; its output is treated as logits.
        data: Graph object, moved to the global ``device`` before the forward pass.
        criterion: Loss taking ``(model_output, edge_label)``.
        edge_type: Key of the edge store holding ``edge_label``.

    Returns:
        Tuple ``(loss, f1, precision, accuracy, auc)``.
    """
    model.eval()
    graph = data.to(device)
    logits = model(graph)
    targets = graph[edge_type].edge_label
    loss_value = criterion(logits, targets)
    # Sigmoid maps logits to probabilities; rounding gives the 0/1 class.
    scores = torch.sigmoid(logits)
    predicted = scores.round().cpu().numpy()
    expected = targets.cpu().numpy()
    scores_np = scores.cpu().numpy()
    # Class-based metrics use the rounded predictions; AUC uses the probabilities.
    f1 = f1_score(expected, predicted, average='binary')
    precision = precision_score(expected, predicted, average='binary')
    accuracy = accuracy_score(expected, predicted)
    auc = roc_auc_score(expected, scores_np)
    return loss_value.item(), f1, precision, accuracy, auc

def main():
    """Train the full-graph model for 500 epochs and save its weights.

    Builds ``Model`` on the global ``device``, runs one dry forward pass on the
    training graph so the lazily-sized layers get their shapes, then performs
    one optimisation step per epoch with a metric report every 10 epochs, and
    finally writes the ``state_dict`` under ``path_work``.
    """
    hidden_channels = 60
    model = Model(hidden_channels).to(device)
    # Dry run for lazy initialization. The original called `model(data)`, but
    # no `data` exists in this scope; the training graph is the input actually
    # used below, so it is used here too (on the model's device, no gradients).
    with torch.no_grad():
        model(train_data.to(device))
    criterion = torch.nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    edge_type = ("B1", "infects", "A")
    for epoch in range(500):
        train_loss = train(model, train_data, optimizer, criterion, edge_type)
        if epoch % 10 == 0:
            test_loss, f1, precision, accuracy, auc = evaluate(model, test_data, criterion, edge_type)
            print(f'Epoch: {epoch}, Train Loss: {train_loss}, Test Loss: {test_loss}, F1 Score: {f1}, Precision: {precision}, Accuracy: {accuracy}, AUC: {auc}')
    # Save the model
    torch.save(model.state_dict(), f"{path_work}/SAGEConv.model.single_batch.1307.pt")

if __name__ == "__main__":
    main()

In [None]:
#!/bin/bash
#SBATCH --job-name=GCNConv__
#SBATCH --qos=short
#SBATCH --ntasks=1 
#SBATCH --cpus-per-task=10 
#SBATCH --mem=100gb 
#SBATCH --time=1-00:00:00 
#SBATCH --output=GCNConv__%j.log 

# Slurm batch script: activates the torch_geometric conda environment and runs
# the training script. Every scheduler directive above must start with
# "#SBATCH"; a line starting with anything else is treated as a plain comment
# and the option is ignored.
source /storage/apps/ANACONDA/anaconda3/etc/profile.d/conda.sh
conda activate torch_geometric

python /home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/train_nn/script_files/GCNConv_Hetero.dot.py


In [None]:
# Cross-validation
# For each fold: slice the graph into train/test parts, train for a fixed
# number of epochs, then report the test metrics of that fold.
# NOTE(review): `kfold`, `model`, `optimizer`, `criterion` and `test` are not
# defined in the cells visible here — confirm they exist before this cell runs.
# NOTE(review): `train` is called with four arguments, whereas the `train`
# functions defined earlier in this notebook also take an `edge_type` argument.
# NOTE(review): `node_items`, `y` and integer-array indexing of `graph_data`
# are used as-is; verify that the loaded graph object actually supports them.
for fold, (train_idx, test_idx) in enumerate(kfold.split(graph_data.node_items['A'], graph_data.y)):
    # These assignments rebind the module-level train_data / test_data and loaders.
    train_data = graph_data[train_idx]
    test_data = graph_data[test_idx]
    train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=32)
    for epoch in range(100):  # adjust as needed
        train_loss = train(model, train_loader, optimizer, criterion)
        print(f"Fold: {fold+1}, Epoch: {epoch+1}, Train loss: {train_loss}")
    # Fold-level metrics are computed once, after the last epoch.
    acc, prec, rec, f1, auroc = test(model, test_loader)
    print(f"Fold: {fold+1}, Accuracy: {acc}, Precision: {prec}, Recall: {rec}, F1-score: {f1}, AUROC: {auroc}")

