In [1]:
from torch_geometric.data import HeteroData, DataLoader
import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv, to_hetero , SAGEConv
from torch_geometric.utils import negative_sampling
from torch_geometric.loader import LinkNeighborLoader

import torch
from torch import nn 
import torch.nn.functional as F
import torch.optim as optim

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder , label_binarize , OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

import os 
import pandas as pd
import numpy as np
from tqdm import tqdm
from itertools import product
import random
from collections import Counter
import warnings
warnings.filterwarnings("ignore") 


In [2]:
# Directory holding the serialized graph. Set the PPT_WORK_DIR environment
# variable to run the notebook on another machine; the default is the path the
# notebook was originally written against.
path_work = os.environ.get("PPT_WORK_DIR", "/media/concha-eloko/Linux/PPT_clean")

# NOTE(review): torch.load unpickles arbitrary Python objects, so only load
# graph files that you produced yourself or otherwise trust.
graph_data = torch.load(f'{path_work}/graph_file.1107.pt')

# Last expression of the cell: echoes the HeteroData summary.
graph_data

HeteroData(
  A={ x=[4530, 127] },
  B1={ x=[11339, 0] },
  B2={ x=[3608, 1280] },
  (B1, infects, A)={
    edge_index=[2, 9677],
    y=[9677]
  },
  (B2, expressed, B1)={
    edge_index=[2, 13285],
    y=[13285]
  },
  (A, harbors, B1)={
    edge_index=[2, 9677],
    y=[9677]
  }
)

In [6]:
# Inspect the (B2 -> B1) "expressed" relation: first row lists the B2 source
# node indices, second row the B1 target node indices.
graph_data["B2", "expressed", "B1"].edge_index


tensor([[ 1909,  2314,   122,  ...,  3605,  3606,  3607],
        [    0,     0,     1,  ..., 11336, 11337, 11338]])

In [3]:
# *****************************************************************************
# Pre-process data :
# Relation whose links are predicted, and the reverse relation that mirrors it.
EDGE_TYPE = ("B1", "infects", "A")
REV_EDGE_TYPE = ("A", "harbors", "B1")

# Split the supervised relation into train / val / test link sets
# (70% / 10% / 20%) and add one sampled negative link per positive link.
transform = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.2,
    #disjoint_train_ratio=...,
    neg_sampling_ratio=1.0,
    add_negative_train_samples=True,
    edge_types=EDGE_TYPE,
    rev_edge_types=REV_EDGE_TYPE,
)

train_data, val_data, test_data = transform(graph_data)


def _make_link_loader(split_data, shuffle, batch_size=128):
    """Build a LinkNeighborLoader over the supervision links of one split.

    Args:
        split_data: One of the HeteroData objects returned by `transform`.
        shuffle: Whether the supervision links are visited in random order.
        batch_size: Number of supervision links per mini-batch.

    Returns:
        A LinkNeighborLoader that samples every neighbour ([-1]) for one hop
        around each batch of EDGE_TYPE supervision links.
    """
    supervision = split_data[EDGE_TYPE]
    return LinkNeighborLoader(
        data=split_data,
        num_neighbors=[-1],
        edge_label_index=(EDGE_TYPE, supervision.edge_label_index),
        edge_label=supervision.edge_label,
        batch_size=batch_size,
        shuffle=shuffle,
    )


# Only the training loader needs shuffling; keeping validation and test in a
# fixed order makes evaluation runs comparable.
train_loader = _make_link_loader(train_data, shuffle=True)
val_loader = _make_link_loader(val_data, shuffle=False)
test_loader = _make_link_loader(test_data, shuffle=False)

In [33]:
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np

def visualize_heterograph(data, node_types=None, edge_types=None):
    """Draw a heterogeneous graph as a labelled directed networkx graph.

    Nodes are named "<node_type>_<index>" and every edge carries its relation
    name as a label. Intended for small graphs (e.g. a sampled mini-batch);
    the spring layout and the label drawing do not scale to large graphs.

    Args:
        data: Heterogeneous graph object exposing `node_types`, `edge_types`,
            `data[node_type].num_nodes` and `data[edge_type].edge_index`.
        node_types: Optional iterable of node types to draw. Defaults to
            `data.node_types`.
        edge_types: Optional iterable of (src, relation, dst) triplets to
            draw. Defaults to `data.edge_types`.

    Returns:
        None. The figure is shown with `plt.show()`.
    """
    if node_types is None:
        node_types = data.node_types
    if edge_types is None:
        edge_types = data.edge_types

    G = nx.DiGraph()

    # Adding nodes
    for node_type in node_types:
        G.add_nodes_from(
            f'{node_type}_{node_id}' for node_id in range(data[node_type].num_nodes)
        )
    # Each node is labelled with its own name.
    node_labels = {node_name: node_name for node_name in G.nodes}

    # Adding edges
    for edge_type in edge_types:
        src_type, relation, dst_type = edge_type
        # Convert the whole [2, num_edges] tensor to Python ints once instead
        # of slicing one tensor column per edge.
        sources, targets = data[edge_type].edge_index.tolist()
        G.add_edges_from(
            (f'{src_type}_{src}', f'{dst_type}_{dest}', {'label': relation})
            for src, dest in zip(sources, targets)
        )

    # Plotting
    pos = nx.spring_layout(G)
    plt.figure(figsize=(12, 12))
    nx.draw(G, pos, labels=node_labels, with_labels=True, node_size=1000, node_color="skyblue", node_shape="s", alpha=0.5, linewidths=40)
    edge_labels = nx.get_edge_attributes(G, 'label')
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)
    plt.show()

ModuleNotFoundError: No module named 'matplotlib'

In [32]:
# NOTE(review): `sampled_data` is assigned in a cell that appears further down
# (the one calling next(iter(train_loader))); on a fresh kernel that cell has
# to run before this one.
sampled_data

HeteroData(
  A={
    x=[162, 127],
    n_id=[162]
  },
  B1={
    x=[299, 0],
    n_id=[299]
  },
  B2={
    x=[133, 1280],
    n_id=[133]
  },
  (B1, infects, A)={
    edge_index=[2, 293],
    y=[293],
    edge_label=[128],
    edge_label_index=[2, 128],
    e_id=[293],
    input_id=[128]
  },
  (B2, expressed, B1)={
    edge_index=[2, 167],
    y=[167],
    e_id=[167]
  },
  (A, harbors, B1)={
    edge_index=[2, 125],
    y=[125],
    e_id=[125]
  }
)

In [24]:
# Draw a single mini-batch from the training loader for interactive inspection.
sampled_data = next(iter(train_loader))
# Evaluated but not echoed: a notebook cell only displays its last expression.
sampled_data.x_dict["A"]
# Target (B1) indices of the sampled B2 -> B1 "expressed" edges.
sampled_data.edge_index_dict["B2", "expressed", "B1"][1]

tensor([  0,   1,   2,   3,   3,   4,   4,   5,   5,   6,   6,   7,   8,   9,
         10,  11,  12,  13,  14,  14,  14,  15,  15,  15,  16,  16,  16,  17,
         17,  17,  18,  18,  19,  19,  20,  20,  21,  21,  22,  22,  23,  23,
         24,  24,  25,  25,  26,  26,  27,  28,  29,  30,  31,  32,  32,  33,
         34,  35,  36,  36,  37,  37,  38,  39,  40,  40,  41,  42,  42,  43,
         43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  52,  53,  54,  54,
         55,  56,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  65,  66,
         67,  68,  68,  69,  70,  71,  72,  72,  73,  74,  75,  76,  77,  77,
         78,  79,  80,  80,  81,  82,  83,  84,  85,  86,  87,  87,  88,  89,
         90,  91,  91,  92,  93,  94,  95,  95,  96,  97,  98,  99, 100, 101,
        102, 103, 104, 105, 106, 107, 108, 108, 109, 110, 111, 112, 113, 114,
        115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127])

In [31]:
# Smoke-test the GNN encoder on the sampled batch: B2 features pushed through
# the B2 -> B1 "expressed" edges (20 hidden channels, 10 output channels).
debug_nn = GNN(20, 10)
out = debug_nn(
    sampled_data.x_dict["B2"],
    sampled_data.edge_index_dict["B2", "expressed", "B1"],
)

# Compare the first and the last output rows.
out[0], out[-1]

(tensor([ 0.2863, -0.1950,  0.0128,  0.1964,  0.0717, -0.1568, -0.2021, -0.0185,
         -0.1320, -0.2648], grad_fn=<SelectBackward0>),
 tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], grad_fn=<SelectBackward0>))

In [6]:
# *****************************************************************************
# The model : Dot product
class GNN(torch.nn.Module):
    """Two stacked GCNConv layers with a ReLU in between.

    Both layers are created with `in_channels=-1`, so their input size is
    inferred on the first forward pass, and with `add_self_loops=False`.

    NOTE(review): in this notebook the encoder is fed edge indices that connect
    two different node types. GCNConv is documented for homogeneous graphs;
    confirm that it is the intended layer for these bipartite relations.
    """

    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(-1, hidden_channels, add_self_loops=False)
        self.conv2 = GCNConv(-1, out_channels, add_self_loops=False)

    def forward(self, x, edge_index):
        """Return the second layer's output for features `x` over `edge_index`."""
        hidden = self.conv1(x, edge_index)
        hidden = F.relu(hidden)
        return self.conv2(hidden, edge_index)

class Classifier(torch.nn.Module):
    """Dot-product link scorer for the ("B1", "infects", "A") relation."""

    # Relation looked up when `edge_index` is given as a per-edge-type mapping.
    EDGE_TYPE = ("B1", "infects", "A")

    def forward(self, x, edge_index):
        """Score candidate B1 -> A links.

        Args:
            x: Mapping with node embeddings under the keys "B1" and "A".
            edge_index: Either a [2, num_links] index tensor (row 0 = B1
                indices, row 1 = A indices) or a mapping from edge type to
                such a tensor that contains ("B1", "infects", "A").

        Returns:
            A 1-D tensor with one raw score (logit) per link: the dot product
            of the B1 embedding and the A embedding at its two endpoints.
        """
        # Accept a bare tensor as well as an edge-index dict; indexing a
        # tensor with the edge-type tuple would raise.
        if not torch.is_tensor(edge_index):
            edge_index = edge_index[self.EDGE_TYPE]
        edge_feat_B1 = x["B1"][edge_index[0]]
        edge_feat_A = x["A"][edge_index[1]]
        return (edge_feat_B1 * edge_feat_A).sum(dim=-1)
    
class Model(torch.nn.Module):
    """Link predictor for ("B1", "infects", "A") built from two GNN encoders.

    B2 features are encoded over the B2 -> B1 "expressed" edges, the result is
    encoded again over the B1 -> A "infects" edges, and candidate links are
    scored with a dot product.

    NOTE(review): each GNN returns one row per row of the features it is given,
    whereas the edge indices used afterwards refer to B1 / A node ids. GCNConv
    is documented for homogeneous graphs, so the embeddings may not be aligned
    with the node types they are indexed as. Confirm whether a
    bipartite-capable layer (e.g. SAGEConv called with (x_src, x_dst)) is
    needed before trusting the scores.
    """

    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.gnn_B2_B1 = GNN(hidden_channels, out_channels)
        self.gnn_B1_A = GNN(hidden_channels, out_channels)
        self.classifier = Classifier()

    def forward(self, graph_data):
        """Return one raw score (logit) per candidate B1 -> A link.

        Args:
            graph_data: Heterogeneous graph (or sampled mini-batch) with node
                type "B2" carrying `x`, and the edge types
                ("B2", "expressed", "B1") and ("B1", "infects", "A").

        Returns:
            1-D tensor of scores. When the "infects" store carries
            `edge_label_index`, those links are scored so the output lines up
            with `edge_label`; otherwise the links in `edge_index` are scored.
        """
        infects_type = ('B1', 'infects', 'A')

        # Propagate B2 features to B1
        x_B2 = graph_data['B2'].x
        edge_index_B2_B1 = graph_data[('B2', 'expressed', 'B1')].edge_index
        x_B1_from_B2 = self.gnn_B2_B1(x_B2, edge_index_B2_B1)

        # Propagate new B1 features to A
        infects = graph_data[infects_type]
        edge_index_B1_A = infects.edge_index
        x_A_from_B1 = self.gnn_B1_A(x_B1_from_B2, edge_index_B1_A)

        # Prefer the supervision links when the loader attached them.
        scored_links = getattr(infects, 'edge_label_index', None)
        if scored_links is None:
            scored_links = edge_index_B1_A

        # Classification based on new features. The classifier looks the
        # relation up by edge type, so hand it a mapping rather than a tensor.
        x = {'B1': x_B1_from_B2, 'A': x_A_from_B1}
        pred = self.classifier(x, {infects_type: scored_links})
        return pred
    


In [None]:
# *****************************************************************************
# Training :
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # Use GPU if available

def train(model, loader, optimizer, criterion, edge_type):
    """Run one optimisation epoch and return the mean batch loss.

    Args:
        model: Module called as `model(batch)` that returns one logit per
            labelled link.
        loader: Iterable of mini-batches; `len(loader)` must be defined.
        optimizer: Optimizer over the model's parameters.
        criterion: Loss called as `criterion(logits, labels)`.
        edge_type: Key of the supervised relation inside each batch.

    Returns:
        Sum of the per-batch losses divided by `len(loader)`.
    """
    model.train()
    total_loss = 0
    for data in loader:
        data = data.to(device)
        optimizer.zero_grad()
        out = model(data)
        store = data[edge_type]
        train_mask = getattr(store, 'train_mask', None)
        if train_mask is not None:
            # Mask-based supervision: labels in `y`, training edges selected
            # by `train_mask`.
            logits, edge_labels = out[train_mask], store.y[train_mask]
        else:
            # Link-split batches carry no `train_mask`; their labels are in
            # `edge_label`, one per supervision link.
            logits, edge_labels = out, store.edge_label
        loss = criterion(logits, edge_labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

@torch.no_grad()
def evaluate(model, loader, criterion, edge_type):
    """Evaluate a binary link predictor and return loss and metrics.

    Args:
        model: Module called as `model(batch)` that returns a 1-D tensor with
            one logit per labelled link.
        loader: Iterable of mini-batches; `len(loader)` must be defined.
        criterion: Loss called as `criterion(logits, labels)`.
        edge_type: Key of the supervised relation inside each batch.

    Returns:
        Tuple `(mean_loss, f1, precision, accuracy, auc)`. F1 and precision
        are macro-averaged over hard predictions (probability > 0.5); AUC is
        computed from the sigmoid probabilities.
    """
    model.eval()
    total_loss = 0
    all_scores = []
    all_preds = []
    all_labels = []
    for data in loader:
        data = data.to(device)
        pred = model(data)
        store = data[edge_type]
        val_mask = getattr(store, 'val_mask', None)
        if val_mask is not None:
            # Mask-based supervision: labels in `y`, evaluated edges selected
            # by `val_mask`.
            logits, edge_labels = pred[val_mask], store.y[val_mask]
        else:
            # Link-split batches carry no `val_mask`; their labels are in
            # `edge_label`, one per supervision link.
            logits, edge_labels = pred, store.edge_label
        val_loss = criterion(logits, edge_labels)
        total_loss += val_loss.item()
        # The logits are 1-D, so there is no class axis to arg-max over:
        # threshold the sigmoid probability instead.
        probs = torch.sigmoid(logits)
        all_scores.extend(probs.cpu().tolist())
        all_preds.extend((probs > 0.5).long().cpu().tolist())
        all_labels.extend(edge_labels.long().cpu().tolist())
    # Calculate the metrics
    f1 = f1_score(all_labels, all_preds, average='macro')
    precision = precision_score(all_labels, all_preds, average='macro')
    accuracy = accuracy_score(all_labels, all_preds)
    # ROC AUC is a ranking metric: feed it continuous scores, not hard classes.
    auc = roc_auc_score(all_labels, all_scores)
    return total_loss / len(loader), f1, precision, accuracy, auc

def main(hidden_channels=580, out_channels=100, lr=0.01, epochs=100, eval_every=10):
    """Train the link predictor, report test metrics periodically, save weights.

    Args:
        hidden_channels: Width of the first layer of each GNN encoder.
        out_channels: Size of the node embeddings fed to the dot product.
        lr: Adam learning rate.
        epochs: Number of passes over `train_loader`.
        eval_every: Evaluate on `test_loader` every this many epochs.

    The trained state dict is written to
    `{path_work}/GCNConv.model.1307.pt`.
    """
    model = Model(hidden_channels, out_channels).to(device)
    edge_type = ("B1", "infects", "A")  # adjust this as per your requirement

    # The GCNConv layers are built with in_channels=-1 (lazy). Run one batch
    # through the model so their parameters are materialized before the
    # optimizer collects them.
    with torch.no_grad():
        model(next(iter(train_loader)).to(device))

    criterion = torch.nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for epoch in range(epochs):
        train_loss = train(model, train_loader, optimizer, criterion, edge_type)
        if epoch % eval_every == 0:
            test_loss, f1, precision, accuracy, auc = evaluate(model, test_loader, criterion, edge_type)
            print(f'Epoch: {epoch}, Train Loss: {train_loss}, Test Loss: {test_loss}, F1 Score: {f1}, Precision: {precision}, Accuracy: {accuracy}, AUC: {auc}')
    # Save the model
    torch.save(model.state_dict(), f"{path_work}/GCNConv.model.1307.pt")

if __name__ == "__main__":
    main()

In [None]:
#!/bin/bash
#SBATCH --job-name=GCNConv__
#SBATCH --qos=short
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=10
#SBATCH --mem=100gb
#SBATCH --time=1-00:00:00
#SBATCH --output=GCNConv__%j.log

# Slurm only honours directives spelled exactly "#SBATCH"; any other spelling
# is treated as a plain comment and the option is silently dropped.

# Activate the conda environment that provides torch_geometric.
source /storage/apps/ANACONDA/anaconda3/etc/profile.d/conda.sh
conda activate torch_geometric

# Run the training script.
python /home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/train_nn/script_files/GCNConv_Hetero.dot.py


In [None]:
# Cross-validation
# For every fold: rebuild the train/test loaders, train for 100 epochs while
# printing the training loss, then print the fold's test metrics.
# NOTE(review): `kfold`, `model`, `optimizer`, `criterion` and `test` are not
# defined in the cells visible here; they must exist before this cell can run.
# NOTE(review): `model` and `optimizer` are not re-created inside the loop, so
# each fold continues from the previous fold's weights — confirm this is
# intended.
for fold, (train_idx, test_idx) in enumerate(kfold.split(graph_data.node_items['A'], graph_data.y)):
    # NOTE(review): assumes `graph_data` exposes `node_items` / `y` and can be
    # indexed by an array of positions — TODO confirm for this graph object.
    train_data = graph_data[train_idx]
    test_data = graph_data[test_idx]
    # These assignments rebind the notebook-level train_data / test_data /
    # train_loader / test_loader names.
    train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=32)
    for epoch in range(100):  # adjust as needed
        # NOTE(review): the `train` defined earlier in this notebook takes a
        # fifth `edge_type` argument; this four-argument call would raise a
        # TypeError against it.
        train_loss = train(model, train_loader, optimizer, criterion)
        print(f"Fold: {fold+1}, Epoch: {epoch+1}, Train loss: {train_loss}")
    acc, prec, rec, f1, auroc = test(model, test_loader)
    print(f"Fold: {fold+1}, Accuracy: {acc}, Precision: {prec}, Recall: {rec}, F1-score: {f1}, AUROC: {auroc}")

