In [None]:
from torch_geometric.data import HeteroData, DataLoader
import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv, SAGEConv, HeteroConv , GATConv
from torch_geometric.utils import negative_sampling
from torch_geometric.loader import LinkNeighborLoader

import torch
from torch import nn 
import torch.nn.functional as F
import torch.optim as optim

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder , label_binarize , OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef

import os 
import pandas as pd
import numpy as np
from tqdm import tqdm
from itertools import product
import random
from collections import Counter
import optuna
from optuna.samplers import TPESampler
import warnings
import logging

warnings.filterwarnings("ignore")


In [None]:
# *****************************************************************************
# Load the Dataframes :
# Working directory that holds the serialized graph and receives the log files.
path_work = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023"

# Root logger: level NOTSET, log file opened with filemode 'w'.
logging.basicConfig(
    level=logging.NOTSET,
    format='%(asctime)s | %(levelname)s: %(message)s',
    filename=f"{path_work}/optuna_2607.log",
    filemode='w',
)

# Graph object used by the split / training cells below
# (presumably a HeteroData with "A", "B1" and "B2" node types; verify against the file).
graph_data = torch.load(f'{path_work}/train_nn/graph_file.2607.OHE.pt')


In [None]:
# *****************************************************************************
# logging.basicConfig() does nothing when the root logger already has handlers,
# which is the case once an earlier cell has configured logging. Detach (and
# close) the existing handlers first so this cell really redirects the log
# output to the GATConv log file, also when the cell is re-run.
for _old_handler in logging.root.handlers[:]:
    logging.root.removeHandler(_old_handler)
    _old_handler.close()
logging.basicConfig(filename = f"{path_work}/train_nn/GATConv.2607.optuna.log",format='%(asctime)s | %(levelname)s: %(message)s', level=logging.NOTSET, filemode='w')

# The model : Classifier
class GNN(torch.nn.Module):
    """One message-passing layer applied along a single edge type.

    Args:
        edge_type: ``(source, relation, target)`` triple the layer operates on.
        conv: convolution *class* (e.g. ``GATConv``); it is instantiated here
            with ``(-1, -1)`` as input sizes.
        hidden_channels: output size passed to ``conv``.
        heads: ``heads`` keyword passed to ``conv``.
        dropout: ``dropout`` keyword passed to ``conv``.
    """

    def __init__(self, edge_type , conv, hidden_channels, heads, dropout): # GCNConv(-1, 64) , SAGEConv((-1, -1), 64), GATConv((-1, -1), 64)
        super().__init__()
        # NOTE(review): `shared_weights` is forwarded verbatim to `conv`;
        # confirm the chosen layer actually accepts a keyword with this spelling.
        layer_options = dict(
            add_self_loops=False,
            heads=heads,
            dropout=dropout,
            shared_weights=True,
        )
        self.conv = conv((-1, -1), hidden_channels, **layer_options)
        # Wrap the layer so it is applied only to `edge_type`.
        self.hetero_conv = HeteroConv({edge_type: self.conv})

    def forward(self, x_dict, edge_index_dict):
        """Return the dictionary produced by the wrapped ``HeteroConv``."""
        return self.hetero_conv(x_dict, edge_index_dict)

# Classifier, Binary :
class EdgeDecoder(torch.nn.Module):
    """Binary edge classifier for ("B1", "infects", "A") candidate edges.

    The feature rows of the two endpoints of every candidate edge are
    concatenated ("A" features first, then "B1" features) and passed through
    a two-layer MLP that outputs one logit per edge.

    Args:
        hidden_channels: per-head size of the "B1" embeddings.
        heads: number of heads; the "B1" embedding width is assumed to be
            ``heads * hidden_channels``.
        a_channels: width of the "A" node features. Defaults to 127, the
            value that used to be hard-coded, so existing callers are
            unaffected.
    """

    def __init__(self, hidden_channels, heads, a_channels=127):
        super().__init__()
        self.lin1 = torch.nn.Linear(heads * hidden_channels + a_channels, 512)
        self.lin2 = torch.nn.Linear(512, 1)

    def forward(self, x_dict_A , x_dict_B1, graph_data):
        """Return a 1-D tensor holding one logit per row of ``edge_label_index``.

        Args:
            x_dict_A: mapping with an ``"A"`` entry (node feature matrix).
            x_dict_B1: mapping with a ``"B1"`` entry (node embedding matrix).
            graph_data: object indexable by edge type whose entry exposes
                ``edge_label_index`` (row 0: "B1" indices, row 1: "A" indices).
        """
        edge_type = ("B1", "infects", "A")
        label_index = graph_data[edge_type].edge_label_index
        edge_feat_A = x_dict_A["A"][label_index[1]]
        edge_feat_B1 = x_dict_B1["B1"][label_index[0]]
        features_phage = torch.cat((edge_feat_A, edge_feat_B1), dim=-1)
        x = self.lin1(features_phage).relu()
        x = self.lin2(x)
        return x.view(-1)

class Model(torch.nn.Module):
    """Link predictor: one GNN layer on ("B2", "expressed", "B1") plus an MLP edge decoder.

    Args:
        conv: convolution class handed to ``GNN``.
        hidden_channels: output size of the convolution (per head).
        heads: number of heads, shared by the GNN layer and the decoder.
        dropout: dropout value handed to the convolution.
    """

    def __init__(self, conv, hidden_channels, heads, dropout):
        super().__init__()
        message_edge = ("B2", "expressed", "B1")
        self.single_layer_model = GNN(message_edge, conv, hidden_channels, heads, dropout)
        self.EdgeDecoder = EdgeDecoder(hidden_channels, heads)

    def forward(self, graph_data):
        """Return one logit per labelled ("B1", "infects", "A") edge of ``graph_data``."""
        # "B1" embeddings come from the GNN layer ...
        b1_embeddings = self.single_layer_model(graph_data.x_dict, graph_data.edge_index_dict)
        # ... while "A" nodes are decoded from their input features.
        input_features = graph_data.x_dict
        return self.EdgeDecoder(input_features, b1_embeddings, graph_data)


In [None]:
# *****************************************************************************
# Pre-process data :
# Split the ("B1", "infects", "A") edges: 10 % validation, 20 % test, the rest
# for training, with sampled negatives at a 1:1 ratio (also for the training split).
transform = T.RandomLinkSplit(
    edge_types=("B1", "infects", "A"),
    rev_edge_types=("A", "harbors", "B1"),
    num_val=0.1,
    num_test=0.2,
    neg_sampling_ratio=1.0,
    add_negative_train_samples=True,
    # disjoint_train_ratio=...,
)

train_data, val_data, test_data = transform(graph_data)


> Minimizing the loss : 

In [None]:
# *****************************************************************************
# Training :
# Run on the GPU when one is visible, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


def train(model, data, optimizer, criterion, edge_type):
    """Run one full-batch optimisation step and return the loss as a float.

    Args:
        model: module called as ``model(data)`` that returns one logit per labelled edge.
        data: graph object supporting ``.to(device)`` and ``data[edge_type].edge_label``.
        optimizer: optimizer bound to ``model``'s parameters.
        criterion: loss called as ``criterion(logits, labels)``.
        edge_type: key of the supervised edge type.
    """
    batch = data.to(device)
    model.train()
    optimizer.zero_grad()
    logits = model(batch)
    loss = criterion(logits, batch[edge_type].edge_label)
    loss.backward()
    optimizer.step()
    return loss.item()

@torch.no_grad()
def evaluate(model, data, criterion, edge_type):
    """Score ``model`` on ``data`` without tracking gradients.

    Probabilities are the sigmoid of the logits; class predictions are the
    rounded probabilities.

    Returns:
        Tuple ``(loss, f1, precision, recall, mcc, accuracy, auc)``.
    """
    model.eval()
    batch = data.to(device)
    logits = model(batch)
    targets = batch[edge_type].edge_label
    val_loss = criterion(logits, targets)

    scores = torch.sigmoid(logits)
    y_pred = scores.round().cpu().numpy()
    y_true = targets.cpu().numpy()
    y_score = scores.cpu().numpy()

    # Threshold-based metrics use the rounded predictions ...
    f1 = f1_score(y_true, y_pred, average='binary')
    precision = precision_score(y_true, y_pred, average='binary')
    recall = recall_score(y_true, y_pred, average='binary')
    mcc = matthews_corrcoef(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    # ... whereas the ROC AUC is computed on the raw probabilities.
    auc = roc_auc_score(y_true, y_score)
    return val_loss.item(), f1, precision, recall, mcc, accuracy, auc


def objective(trial):
    """Optuna objective: train a GATConv link predictor and return its best validation loss.

    Samples ``lr``, ``hidden_channels``, ``dropout`` and ``heads``, trains on
    ``train_data`` for at most 3000 epochs and evaluates every 50 epochs.
    Training stops at the first evaluation whose loss is higher than the best
    one seen so far.

    Args:
        trial: the ``optuna`` trial used to sample hyperparameters.

    Returns:
        The lowest loss observed on the validation split (to be minimised).
    """
    # Define the hyperparameters
    conv = GATConv
    # suggest_float supersedes the deprecated suggest_loguniform / suggest_uniform.
    lr = trial.suggest_float('lr', 1e-5, 1e-2, log=True)
    hidden_channels = trial.suggest_int('hidden_channels', 500, 2000, step=50)
    dropout = trial.suggest_float('dropout', 0.0, 0.5)
    heads = trial.suggest_int('heads', 1, 8, step=1)
    # Define and train the model using the given hyperparameters
    model = Model(conv, hidden_channels, heads, dropout).to(device)
    criterion = torch.nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    edge_type = ("B1", "infects", "A")
    best_loss = float('inf')
    for epoch in range(3000):
        train(model, train_data, optimizer, criterion, edge_type)
        if epoch % 50 == 0:
            # Model selection must use the validation split: scoring trials on
            # test_data would leak the held-out set into the hyperparameter search.
            val_loss, *_ = evaluate(model, val_data, criterion, edge_type)
            # Early stopping based on validation loss
            if val_loss <= best_loss:
                best_loss = val_loss
            else:
                break  # stop training if the validation loss does not decrease

    return best_loss  # this is the value to minimize


# Run the hyperparameter search: 500 TPE-sampled trials minimising the objective value.
logging.info("Let's start the work")
study = optuna.create_study(direction='minimize', sampler=TPESampler())
study.optimize(objective, n_jobs=-1, n_trials=500)

# Report the winning configuration on stdout and in the log file.
best_params_message = f"Best parameters: {study.best_params}"
print(best_params_message)
logging.info(best_params_message)



> Maximizing the MCC (Matthews correlation coefficient):

In [None]:
# Prefer CUDA when it is available; fall back to the CPU otherwise.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


def train(model, data, optimizer, criterion, edge_type):
    """Perform a single full-batch training step.

    The graph is moved to ``device``, the model is put in training mode, and
    one forward/backward/optimizer step is executed against the labels stored
    in ``data[edge_type].edge_label``.

    Returns:
        The training loss of this step as a Python float.
    """
    graph = data.to(device)
    model.train()
    optimizer.zero_grad()
    predictions = model(graph)
    step_loss = criterion(predictions, graph[edge_type].edge_label)
    step_loss.backward()
    optimizer.step()
    return step_loss.item()

@torch.no_grad()
def evaluate(model, data, criterion, edge_type):
    """Compute the loss and binary-classification metrics of ``model`` on ``data``.

    Returns:
        ``(loss, f1, precision, recall, mcc, accuracy, auc)`` where the loss is
        a float and the metrics are computed with scikit-learn on CPU arrays.
    """
    model.eval()
    graph = data.to(device)
    logits = model(graph)
    edge_labels = graph[edge_type].edge_label
    loss_tensor = criterion(logits, edge_labels)

    # Sigmoid -> probability; rounding -> hard 0/1 prediction.
    probabilities = torch.sigmoid(logits)
    hard_predictions = probabilities.round()

    labels_np = edge_labels.cpu().numpy()
    preds_np = hard_predictions.cpu().numpy()
    probs_np = probabilities.cpu().numpy()

    f1 = f1_score(labels_np, preds_np, average='binary')
    precision = precision_score(labels_np, preds_np, average='binary')
    recall = recall_score(labels_np, preds_np, average='binary')
    mcc = matthews_corrcoef(labels_np, preds_np)
    accuracy = accuracy_score(labels_np, preds_np)
    auc = roc_auc_score(labels_np, probs_np)
    return loss_tensor.item(), f1, precision, recall, mcc, accuracy, auc


def objective(trial):
    """Optuna objective: train a GATConv link predictor and return its best validation MCC.

    Samples ``lr``, ``hidden_channels``, ``dropout`` and ``heads``, trains on
    ``train_data`` for at most 3000 epochs and evaluates every 10 epochs.
    Training stops at the first evaluation whose MCC is lower than the best
    one seen so far.

    Args:
        trial: the ``optuna`` trial used to sample hyperparameters.

    Returns:
        The highest Matthews correlation coefficient observed on the
        validation split (to be maximised).
    """
    # Define the hyperparameters
    # suggest_float supersedes the deprecated suggest_loguniform / suggest_uniform.
    lr = trial.suggest_float('lr', 1e-5, 1e-2, log=True)
    # (1280 - 500) is not a multiple of the step 50; 1250 is the largest value on the grid.
    hidden_channels = trial.suggest_int('hidden_channels', 500, 1250, step=50)
    dropout = trial.suggest_float('dropout', 0.0, 0.5)
    heads = trial.suggest_int('heads', 1, 8, step=1)
    # Define and train the model using the given hyperparameters.
    # Model expects (conv, hidden_channels, heads, dropout); omitting the
    # convolution class shifted every argument and raised a TypeError.
    model = Model(GATConv, hidden_channels, heads, dropout).to(device)
    criterion = torch.nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    edge_type = ("B1", "infects", "A")
    best_mcc = float('-inf')
    for epoch in range(3000):
        train(model, train_data, optimizer, criterion, edge_type)
        if epoch % 10 == 0:
            # Model selection must use the validation split: scoring trials on
            # test_data would leak the held-out set into the hyperparameter search.
            val_loss, f1, precision, recall, mcc, accuracy, auc = evaluate(model, val_data, criterion, edge_type)
            logging.info(f"lr:{lr}\thidden_channels:{hidden_channels}\tdropout:{dropout}\tf1:{f1}\tprecision:{precision}\tmcc:{mcc}\taccuracy:{accuracy}\trecall:{recall}\tauc:{auc}")
            # Early stopping based on MCC
            if mcc >= best_mcc:
                best_mcc = mcc
            else:
                logging.info(f"Next round after {epoch} epochs")
                break
    return best_mcc

# Run the hyperparameter search: 50 TPE-sampled trials maximising the objective value.
logging.info("Let's start the work")
study = optuna.create_study(direction='maximize', sampler=TPESampler())
# n_jobs=-1 asks Optuna to run trials in parallel.
study.optimize(objective, n_jobs=-1, n_trials=50)

# Report the winning configuration on stdout and in the log file.
best_params_message = f"Best parameters: {study.best_params}"
print(best_params_message)
logging.info(best_params_message)

In [None]:
# Best parameters: {'lr': 0.005082551361677657, 'hidden_channels': 800, 'dropout': 0.3185921909740461, 'heads': 6}


> Make predictions 

In [None]:
def make_predictions(model, data):
    """Predict labels for the labelled edges of ``data``.

    Args:
        model: trained module called as ``model(data)`` that returns logits.
        data: graph object supporting ``.to(device)``.

    Returns:
        Tuple ``(predictions, probabilities)`` of NumPy arrays: the rounded
        probabilities and the sigmoid of the model output.
    """
    model.eval()  # inference mode
    graph = data.to(device)
    with torch.no_grad():  # gradients are not needed for prediction
        logits = model(graph)
    probabilities = torch.sigmoid(logits)
    predictions = probabilities.round()
    return predictions.cpu().numpy(), probabilities.cpu().numpy()

# Load the saved model.
# Model.__init__ takes (conv, hidden_channels, heads, dropout); calling it with
# only hidden_channels raised a TypeError. The architecture hyperparameters must
# match the ones the checkpoint was trained with, otherwise load_state_dict
# fails with a shape mismatch.
hidden_channels = 1000  # this should be the same value you used during training #used to be 1000
heads = 1  # TODO(review): set to the number of heads used for the 1807 checkpoint
dropout = 0.0  # presumably irrelevant here: make_predictions puts the model in eval mode
model = Model(GATConv, hidden_channels, heads, dropout)
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
model.load_state_dict(torch.load(f"{path_work}/GATConv.model.single_batch.1807.pt", map_location=device))
model = model.to(device)

# Predict using the new data
new_data = ...  # load or create new data here. It should have the same format as your training/test data
predictions, probabilities = make_predictions(model, new_data)

print(f"Predictions: {predictions}")
print(f"Probabilities: {probabilities}")

In [None]:
from torch_geometric.data import HeteroData, DataLoader
import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv, SAGEConv, HeteroConv , GATConv
from torch_geometric.utils import negative_sampling
from torch_geometric.loader import LinkNeighborLoader

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder , label_binarize , OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef

import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from itertools import product
import random
from collections import Counter
import optuna
import warnings
import logging

warnings.filterwarnings("ignore")

# *****************************************************************************
# Load the Dataframes :
path_work = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023"
# logging.basicConfig() does nothing when the root logger already has handlers
# (e.g. an earlier cell configured logging in the same kernel). Detach and
# close them first so the records really go to the file named below.
for _old_handler in logging.root.handlers[:]:
    logging.root.removeHandler(_old_handler)
    _old_handler.close()
logging.basicConfig(filename = f"{path_work}/optuna_2107.loss.log",format='%(asctime)s | %(levelname)s: %(message)s', level=logging.NOTSET,filemode='w')

# Graph object used by the split / training code below.
graph_data = torch.load(f'{path_work}/train_nn/graph_file.1107.pt')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # Use GPU if available

# *****************************************************************************
# The model : dot product
class GNN(torch.nn.Module):
    """One message-passing layer applied along a single edge type.

    Args:
        edge_type: ``(source, relation, target)`` triple the layer operates on.
        hidden_channels: output size passed to ``conv``.
        dropout: ``dropout`` keyword passed to ``conv``.
        heads: ``heads`` keyword passed to ``conv``.
        conv: convolution class to instantiate (``GATConv`` by default).
    """

    def __init__(self, edge_type , hidden_channels, dropout, heads, conv=GATConv):
        super().__init__()
        self.conv = conv(
            (-1, -1),
            hidden_channels,
            heads=heads,
            dropout=dropout,
            add_self_loops=False,
        )
        # Wrap the layer so it is applied only to `edge_type`.
        self.hetero_conv = HeteroConv({edge_type: self.conv})

    def forward(self, x_dict, edge_index_dict):
        """Return the dictionary produced by the wrapped ``HeteroConv``."""
        return self.hetero_conv(x_dict, edge_index_dict)
# Dot product :
class Classifier(torch.nn.Module):
    """Parameter-free edge scorer: dot product of the two endpoint embeddings."""

    def forward(self, x_dict_A , x_dict_B1, edge_index):
        """Return one score per column of the ("B1", "infects", "A") ``edge_label_index``.

        Args:
            x_dict_A: mapping with an ``"A"`` entry (node embedding matrix).
            x_dict_B1: mapping with a ``"B1"`` entry (node embedding matrix).
            edge_index: despite its name, an object indexable by edge type
                whose entry exposes ``edge_label_index``
                (row 0: "B1" indices, row 1: "A" indices).
        """
        label_index = edge_index[("B1", "infects", "A")].edge_label_index
        a_rows = x_dict_A["A"][label_index[1]]
        b1_rows = x_dict_B1["B1"][label_index[0]]
        return torch.sum(a_rows * b1_rows, dim=-1)

class Model(torch.nn.Module):
    """Two-step link predictor scored with a dot product.

    A first GNN layer updates "B1" nodes along ("B2", "expressed", "B1"); a
    second layer updates "A" nodes along ("B1", "infects", "A") using the new
    "B1" embeddings; candidate edges are scored by ``Classifier``.

    Args:
        out_channels: output size (per head) of both GNN layers.
        dropout: dropout value handed to both GNN layers.
        heads: number of heads handed to both GNN layers. ``GNN`` requires
            this argument and the Optuna objective passes ``heads=...``;
            it was previously neither accepted nor forwarded, so building the
            model raised a TypeError. Defaults to 1.
    """

    def __init__(self, out_channels, dropout, heads=1):
        super().__init__()
        self.single_layer_model = GNN(("B2", "expressed", "B1"), out_channels, dropout, heads)
        self.second_layer_model = GNN(("B1", "infects", "A"), out_channels, dropout, heads)
        self.classifier_dot = Classifier()

    def forward(self, graph_data):
        """Return one dot-product score per labelled ("B1", "infects", "A") edge."""
        x_dict = graph_data.x_dict
        edge_index_dict = graph_data.edge_index_dict
        b1_nodes = self.single_layer_model(x_dict, edge_index_dict)
        # Second layer input: original "A"/"B2" features plus the updated "B1" embeddings.
        updated_dict = {
            "A": x_dict["A"],
            "B2": x_dict["B2"],
            "B1": b1_nodes["B1"],
        }
        a_nodes = self.second_layer_model(updated_dict, edge_index_dict)
        dot_product = self.classifier_dot(a_nodes, b1_nodes, graph_data)

        return dot_product
# *****************************************************************************
# Pre-process data :
# Hold out 10 % of the ("B1", "infects", "A") edges for validation and 20 % for
# testing; negatives are sampled at a 1:1 ratio, including for the training split.
transform = T.RandomLinkSplit(
    edge_types=("B1", "infects", "A"),
    rev_edge_types=("A", "harbors", "B1"),
    num_val=0.1,
    num_test=0.2,
    neg_sampling_ratio=1.0,
    add_negative_train_samples=True,
    # disjoint_train_ratio=...,
)

train_data, val_data, test_data = transform(graph_data)


def train(model, data, optimizer, criterion, edge_type):
    """Run one full-batch optimisation step and return the loss as a float.

    Args:
        model: module called as ``model(data)`` that returns one score per labelled edge.
        data: graph object supporting ``.to(device)`` and ``data[edge_type].edge_label``.
        optimizer: optimizer bound to ``model``'s parameters.
        criterion: loss called as ``criterion(scores, labels)``.
        edge_type: key of the supervised edge type.
    """
    graph = data.to(device)
    model.train()
    optimizer.zero_grad()
    scores = model(graph)
    step_loss = criterion(scores, graph[edge_type].edge_label)
    step_loss.backward()
    optimizer.step()
    return step_loss.item()
@torch.no_grad()
def evaluate(model, data, criterion, edge_type):
    """Evaluate ``model`` on ``data`` with gradient tracking disabled.

    Returns:
        ``(loss, f1, precision, recall, mcc, accuracy, auc)``; hard predictions
        are the rounded sigmoid of the model output, the AUC uses the
        un-rounded probabilities.
    """
    model.eval()
    graph = data.to(device)
    raw_scores = model(graph)
    truth = graph[edge_type].edge_label
    loss_tensor = criterion(raw_scores, truth)

    probabilities = torch.sigmoid(raw_scores)
    predicted = probabilities.round().cpu().numpy()
    expected = truth.cpu().numpy()
    probabilities_np = probabilities.cpu().numpy()

    # Metrics, computed in a fixed order on CPU arrays.
    f1 = f1_score(expected, predicted, average='binary')
    precision = precision_score(expected, predicted, average='binary')
    recall = recall_score(expected, predicted, average='binary')
    mcc = matthews_corrcoef(expected, predicted)
    accuracy = accuracy_score(expected, predicted)
    auc = roc_auc_score(expected, probabilities_np)
    return loss_tensor.item(), f1, precision, recall, mcc, accuracy, auc


def objective(trial):
    """Optuna objective: train the dot-product link predictor and return its best validation loss.

    Samples ``lr``, ``hidden_channels``, ``dropout`` and ``heads``, trains on
    ``train_data`` for at most 5000 epochs and evaluates every 10 epochs.
    Training stops at the first evaluation whose loss is higher than the best
    one seen so far.

    Args:
        trial: the ``optuna`` trial used to sample hyperparameters.

    Returns:
        The lowest loss observed on the validation split (to be minimised).
    """
    # Define the hyperparameters
    # suggest_float supersedes the deprecated suggest_loguniform / suggest_uniform.
    lr = trial.suggest_float('lr', 1e-5, 1e-2, log=True)
    # (1280 - 500) is not a multiple of the step 50; 1250 is the largest value on the grid.
    hidden_channels = trial.suggest_int('hidden_channels', 500, 1250, step=50)
    dropout = trial.suggest_float('dropout', 0.0, 0.5)
    # An attention layer needs at least one head; the range used to start at 0.
    heads = trial.suggest_int('heads', 1, 8, step=1)
    # Define and train the model using the given hyperparameters
    model = Model(hidden_channels, dropout=dropout,heads=heads).to(device)
    criterion = torch.nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    edge_type = ("B1", "infects", "A")
    best_loss = float('inf')
    for epoch in range(5000):
        train(model, train_data, optimizer, criterion, edge_type)
        if epoch % 10 == 0:
            # Model selection must use the validation split: scoring trials on
            # test_data would leak the held-out set into the hyperparameter search.
            val_loss, f1, precision, recall, mcc, accuracy, auc = evaluate(model, val_data, criterion, edge_type)
            logging.info(f"lr:{lr}\thidden_channels:{hidden_channels}\tdropout:{dropout}\tf1:{f1}\tprecision:{precision}\tmcc:{mcc}\taccuracy:{accuracy}\trecall:{recall}\tauc:{auc}")
            if val_loss <= best_loss:
                best_loss = val_loss
            else:
                break  # stop training if the validation loss does not decrease
    return best_loss  # this is the value to minimize
# Run the hyperparameter search: 50 trials minimising the objective value.
logging.info("Let's start the work")
study = optuna.create_study(direction='minimize')
# n_jobs=-1 asks Optuna to run trials in parallel.
study.optimize(objective, n_jobs=-1, n_trials=50)

# Report the winning configuration on stdout and in the log file.
best_params_message = f"Best parameters: {study.best_params}"
print(best_params_message)
logging.info(best_params_message)

