This notebook is used to train and evaluate OncoPlex on the pan-cancer dataset

- Load the data preprocessed previously
- Model class
- Train and eval functions

In [None]:
import numpy as np
import pandas as pd
import math
import os
import pickle
import random
 
import torch
import torch_geometric
from torch_geometric.nn import GCNConv, ChebConv, GATConv
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter


from sklearn.model_selection import ParameterGrid
from sklearn.metrics import precision_recall_curve, auc, f1_score, roc_auc_score

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

# Define model

In [None]:
class conv_layer(nn.Module):
    """Linear projection followed by propagation through a matrix ``G``.

    The layer computes ``G @ (x @ weight + bias)``; the bias term is skipped
    when the layer is built with ``bias=False``.
    """

    def __init__(self, in_ft, out_ft, bias=True):
        super().__init__()

        # Storage is allocated uninitialised and filled by reset_parameters().
        self.weight = Parameter(torch.empty(in_ft, out_ft))
        if not bias:
            self.register_parameter('bias', None)
        else:
            self.bias = Parameter(torch.empty(out_ft))
        self.reset_parameters()

    def reset_parameters(self):
        """Draw weight and bias uniformly from ``[-b, b]``, ``b = 1/sqrt(out_ft)``."""
        bound = 1. / math.sqrt(self.weight.size(1))
        with torch.no_grad():
            self.weight.uniform_(-bound, bound)
            if self.bias is not None:
                self.bias.uniform_(-bound, bound)

    def forward(self, x: torch.Tensor, G: torch.Tensor):
        """Project ``x`` with the layer weights, then left-multiply by ``G``."""
        projected = torch.matmul(x, self.weight)
        if self.bias is not None:
            projected = projected + self.bias
        return torch.matmul(G, projected)

#===========================================================
class HGCN_layer(nn.Module):
    """One square ``conv_layer`` followed by LeakyReLU and dropout.

    Args:
        n_hid: Input and output feature width of the convolution.
        dropout: Dropout probability applied after the activation.
    """

    def __init__(self, n_hid, dropout=0.5):
        super().__init__()
        self.hgc1 = conv_layer(n_hid, n_hid)
        self.act = nn.LeakyReLU()
        self.dropout = dropout

    def forward(self, x, G):
        """Return ``dropout(leaky_relu(hgc1(x, G)))``; dropout is active only in training mode."""
        activated = self.act(self.hgc1(x, G))
        return F.dropout(activated, p=self.dropout, training=self.training)

#=======================================================
class HD_sim(nn.Module):
    """Residual block: an ``HGCN_layer`` followed by a linear embedding.

    ``forward`` computes ``emb(h) + h`` where ``h`` is the (re-activated)
    output of the graph-convolution layer.

    Args:
        h_dim: Feature width; the block keeps the width unchanged.
        dropout: Dropout probability used inside the graph-convolution layer.
    """

    def __init__(self, h_dim, dropout=0.5):
        super(HD_sim, self).__init__()
        # Forward the dropout rate to the inner layer. Previously the argument
        # was stored but ignored, so the inner layer always used its default
        # of 0.5 regardless of what the caller requested.
        self.HD1 = HGCN_layer(h_dim, dropout)
        self.emb = nn.Linear(h_dim, h_dim)
        #self.norm = nn.LayerNorm(h_dim)
        self.dropout = dropout

    def forward(self, x, G):
        """Apply the graph convolution, the linear embedding and the skip connection."""
        # HD1 already applies LeakyReLU and dropout; the extra in-place
        # leaky_relu_ is kept from the original design.
        x = F.leaky_relu_(self.HD1(x, G))
        x1 = self.emb(x)
        #x1 = self.norm(x1)
        x1 += x  # residual
        return x1

#=============================================================
class OncoNet(nn.Module):
    """Input projection, a stack of ``HD_sim`` blocks and a linear classifier head.

    Args:
        in_dim: Width of the input node features.
        hid_dim: Hidden width used by the projection and every ``HD_sim`` block.
        out_dim: Number of output classes.
        num_layer: Number of stacked ``HD_sim`` blocks.
        dropout: Dropout probability after the input projection; also handed
            to every ``HD_sim`` block.
    """

    def __init__(self, in_dim, hid_dim, out_dim, num_layer=3, dropout=0.5):
        super().__init__()

        # Sub-modules are created in a fixed order (mlp, convs, fc2) so that
        # seeded parameter initialisation stays reproducible.
        self.mlp = nn.Linear(in_dim, hid_dim)
        blocks = [HD_sim(hid_dim, dropout) for _ in range(num_layer)]
        self.convs = nn.ModuleList(blocks)
        self.fc2 = nn.Linear(hid_dim, out_dim)
        self.dropout = dropout

    def forward(self, x, G):
        """Return per-row log-probabilities (log-softmax over dim 1)."""
        hidden = F.leaky_relu(self.mlp(x))
        hidden = F.dropout(hidden, p=self.dropout, training=self.training)

        for block in self.convs:
            hidden = block(hidden, G)

        scores = self.fc2(hidden)
        return F.log_softmax(scores, dim=1)


# Pan cancer training and evaluation

In [None]:
# Evaluation metrics
def cal_auc(y_true, y_pred):
    """Compute AUROC and AUPRC for the class in column 1.

    Args:
        y_true: Tensor of ground-truth labels.
        y_pred: Tensor of per-class scores; they are exponentiated before use
            (the model in this file emits log-softmax outputs).

    Returns:
        Tuple ``(AUROC, AUPRC)``; AUPRC is the area under the
        precision-recall curve.
    """
    class_probs = np.exp(y_pred.cpu().detach().numpy())
    positive_scores = class_probs[:, 1]
    labels = y_true.cpu().numpy()

    auroc = roc_auc_score(labels, positive_scores)
    precision, recall, _thresholds = precision_recall_curve(labels, positive_scores)
    auprc = auc(recall, precision)
    return auroc, auprc

def accuracy_fn(y_true, y_pred):
    """Return the fraction of rows whose arg-max class equals the label.

    Args:
        y_true: Tensor of ground-truth class indices.
        y_pred: Tensor of per-class scores, one row per sample.
    """
    predicted = y_pred.argmax(dim=1).cpu().numpy()
    labels = y_true.cpu().numpy()
    hits = predicted == labels
    return hits.mean()


def f1_score_(y_true, y_pred):
    """Compute the F1 score after thresholding the class-1 probability at 0.5.

    Args:
        y_true: Tensor of ground-truth labels.
        y_pred: Tensor of per-class scores; they are exponentiated before the
            threshold is applied.
    """
    positive_prob = np.exp(y_pred.cpu().detach().numpy())[:, 1]
    hard_calls = (positive_prob > 0.5).astype(float)
    labels = y_true.cpu().numpy()
    return f1_score(labels, hard_calls)

In [None]:
#  Train, validation, and test 
def train(model, optimizer, x, G, y, train_idx, weight):
    """Run one full-batch optimisation step and report training metrics.

    Args:
        model: Module called as ``model(x, G)``.
        optimizer: Optimiser bound to ``model``'s parameters.
        x: Node feature tensor.
        G: Propagation matrix passed to the model.
        y: Label tensor covering all nodes.
        train_idx: Indices of the nodes that contribute to the loss.
        weight: Per-class loss weights (sequence or tensor).

    Returns:
        Tuple ``(loss, auroc, auprc, f1)`` computed on ``train_idx`` with the
        logits produced before the parameter update.
    """
    model.train()
    optimizer.zero_grad()
    logits = model(x, G)
    # Build the class weights on the logits' device/dtype; a plain
    # torch.tensor(weight) lives on the CPU and breaks nll_loss on CUDA.
    class_weight = torch.as_tensor(weight, dtype=logits.dtype, device=logits.device)
    loss = F.nll_loss(logits[train_idx], y[train_idx], weight=class_weight)
    train_auroc, train_auprc = cal_auc(y[train_idx], logits[train_idx])
    train_f1 = f1_score_(y[train_idx], logits[train_idx])
    loss.backward()
    optimizer.step()
    return loss.item(), train_auroc, train_auprc, train_f1



@torch.no_grad()
def val(model, x, G, y, val_idx, weight):
    """Evaluate the model on the validation nodes without tracking gradients.

    Args:
        model: Module called as ``model(x, G)``.
        x: Node feature tensor.
        G: Propagation matrix passed to the model.
        y: Label tensor covering all nodes.
        val_idx: Indices of the validation nodes.
        weight: Per-class loss weights (sequence or tensor).

    Returns:
        Tuple ``(loss, accuracy, auroc, auprc, f1)`` computed on ``val_idx``.
    """
    model.eval()
    logits = model(x, G)
    # Match the logits' device/dtype so the weighted loss also works on CUDA.
    class_weight = torch.as_tensor(weight, dtype=logits.dtype, device=logits.device)
    loss = F.nll_loss(logits[val_idx], y[val_idx], weight=class_weight)
    val_acc = accuracy_fn(y[val_idx], logits[val_idx])
    val_auroc, val_auprc = cal_auc(y[val_idx], logits[val_idx])
    val_f1 = f1_score_(y[val_idx], logits[val_idx])
    return loss.item(), val_acc, val_auroc, val_auprc, val_f1



@torch.no_grad()
def test(model, x, G, y, test_idx, nodes, unknown_idx, weight):
    """Evaluate on the test nodes and export class probabilities.

    Args:
        model: Module called as ``model(x, G)``.
        x: Node feature tensor.
        G: Propagation matrix passed to the model.
        y: Label tensor covering all nodes.
        test_idx: Indices of the labelled test nodes (tensor or array-like).
        nodes: Indexable collection of node names, aligned with the rows of ``x``.
        unknown_idx: Indices of unlabelled nodes to score (tensor or array-like).
        weight: Per-class loss weights (sequence or tensor).

    Returns:
        Tuple ``(loss, accuracy, auroc, auprc, f1, final_results,
        test_results, unknown_results)``. The three result objects are
        DataFrames indexed by node name with columns ``non_driver`` and
        ``driver``; ``final_results`` is the test rows followed by the
        unknown rows.
    """

    def _as_positions(idx):
        # NumPy cannot be indexed with CUDA tensors, so always fall back to a
        # host-side integer array (also keeps empty index lists integral).
        if torch.is_tensor(idx):
            return idx.detach().cpu().numpy()
        return np.asarray(idx, dtype=np.int64)

    model.eval()
    logits = model(x, G)
    # Match the logits' device/dtype so the weighted loss also works on CUDA.
    class_weight = torch.as_tensor(weight, dtype=logits.dtype, device=logits.device)
    loss = F.nll_loss(logits[test_idx], y[test_idx], weight=class_weight)
    test_acc = accuracy_fn(y[test_idx], logits[test_idx])
    test_auroc, test_auprc = cal_auc(y[test_idx], logits[test_idx])
    test_f1 = f1_score_(y[test_idx], logits[test_idx])

    test_pos = _as_positions(test_idx)
    unknown_pos = _as_positions(unknown_idx)

    test_genes = [nodes[i] for i in test_pos.tolist()]
    unknown_genes = [nodes[i] for i in unknown_pos.tolist()]

    # Exponentiate once: the model emits log-probabilities.
    probs = logits.exp().detach().cpu().numpy()
    prob_test = probs[test_pos]
    prob_unknown = probs[unknown_pos]

    test_results = pd.DataFrame(prob_test, index=test_genes, columns=["non_driver", "driver"])
    unknown_results = pd.DataFrame(prob_unknown, index=unknown_genes, columns=["non_driver", "driver"])
    final_results = pd.concat([test_results, unknown_results])

    return loss.item(), test_acc, test_auroc, test_auprc, test_f1, final_results, test_results, unknown_results


In [None]:
def load_data(path):
    """Unpickle and return the object stored in the file at ``path``.

    Only use this on trusted files: unpickling can execute arbitrary code.
    """
    with open(path, mode='rb') as handle:
        return pickle.load(handle)



def main(seed):
    """Run nested cross-validation of OncoNet on the pan-cancer dataset.

    The outer loop (5 stratified folds over the dataset's ``train_idx`` nodes)
    estimates generalisation performance. For each outer fold an inner
    3-fold grid search picks the hyper-parameters with the lowest mean
    early-stopped validation loss. A fresh model is then trained on the whole
    outer-training part and evaluated on the held-out outer fold. Per-fold
    predictions and a mean/std summary are written under
    ``results/pan_cancer``.

    Args:
        seed: Seed for torch, NumPy, ``random`` and the fold shuffling.
    """
    # Set device and seeds
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    num_epochs = 300
    patience = 20

    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    data = load_data('drive/MyDrive/OncoPlex/cancer_data/TCGA/processed/OncoPlex_dataset.pkl')
    pancancer = data['pancancer']

    # Swap 'core_features' for the comprehensive feature set if desired; for
    # incidence-based features use torch.eye(x.shape[0], device=device).
    x = pancancer['core_features'].to(device)
    y = torch.tensor(pancancer['label'], dtype=torch.long).to(device)
    # NOTE(review): the 'edge_index' entry is consumed as a matrix that
    # left-multiplies the features (G.matmul in conv_layer) - confirm it is a
    # dense (num_nodes, num_nodes) propagation matrix, not a COO edge list.
    G = torch.tensor(pancancer['edge_index'], dtype=torch.float).to(device)
    nodes = pancancer['nodes']

    # Host-side copies: scikit-learn cannot consume device tensors, and NumPy
    # index arrays can index both torch tensors and NumPy arrays.
    labels_np = y.cpu().numpy()
    unknown_idx = np.flatnonzero(labels_np == -1)  # unlabelled nodes, scored only
    train_idx = torch.as_tensor(pancancer['train_idx'], dtype=torch.long).cpu().numpy()
    # The dataset's predefined 'test_idx' is not used here: each outer fold
    # holds out its own part of train_idx for testing.

    param_grid = {
        'lr': [0.001, 0.005, 0.0005],
        'weight_decay': [0.001, 0.0001],
        'hidden_dim': [128, 64, 256],
        'dropout': [0.5, 0.4, 0.25],
        'num_layers': [2, 3, 4],
        'class_weight': [[1.0, 0.4], [1.0, 0.2]]
    }

    def build_model(params):
        # Fresh model (on the same device as the data) plus its optimiser.
        model = OncoNet(
            x.shape[1],
            hid_dim=params['hidden_dim'],
            num_layer=params['num_layers'],
            dropout=params['dropout'],
            out_dim=2
        ).to(device)
        optimizer = torch.optim.AdamW(model.parameters(), lr=params['lr'], weight_decay=params['weight_decay'])
        return model, optimizer

    def class_weight_for(params):
        # Created on the compute device so the weighted loss matches the logits.
        return torch.tensor(params['class_weight'], dtype=torch.float, device=device)

    outer_k = 5
    inner_k = 3
    outer_kfold = StratifiedKFold(n_splits=outer_k, shuffle=True, random_state=seed)
    outer_results = []

    # split() yields positions *within* the array it is given, so every
    # position is mapped back to a global node index before use.
    outer_splits = outer_kfold.split(np.zeros(len(train_idx)), labels_np[train_idx])
    for fold, (train_val_pos, test_pos) in enumerate(outer_splits):
        print(f"\n Outer Fold {fold + 1}/{outer_k}")
        train_val_idx = train_idx[train_val_pos]
        fold_test_idx = train_idx[test_pos]

        best_hyperparams = None
        best_val_loss = float('inf')

        inner_kfold = StratifiedKFold(n_splits=inner_k, shuffle=True, random_state=seed)
        # The inner folds do not depend on the hyper-parameters: compute once.
        inner_splits = [
            (train_val_idx[tr_pos], train_val_idx[va_pos])
            for tr_pos, va_pos in inner_kfold.split(np.zeros(len(train_val_idx)), labels_np[train_val_idx])
        ]

        for params in ParameterGrid(param_grid):
            val_losses = []
            weight = class_weight_for(params)

            for inner_train_idx, inner_val_idx in inner_splits:
                model, optimizer = build_model(params)

                best_inner_val_loss = float('inf')
                patience_counter = 0

                for epoch in range(num_epochs):
                    train_loss, *_ = train(model, optimizer, x, G, y, inner_train_idx, weight=weight)
                    val_loss, *_ = val(model, x, G, y, inner_val_idx, weight=weight)

                    if val_loss < best_inner_val_loss:
                        best_inner_val_loss = val_loss
                        patience_counter = 0
                    else:
                        patience_counter += 1
                        if patience_counter >= patience:
                            print(f"  Early stopping at epoch {epoch + 1} in inner fold (no improvement for {patience} epochs)")
                            break

                val_losses.append(best_inner_val_loss)

            avg_val_loss = np.mean(val_losses)
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                best_hyperparams = params

        print(f"Best hyperparameters for fold {fold + 1}: {best_hyperparams}")

        # Retrain from scratch on the full outer-training part.
        model, optimizer = build_model(best_hyperparams)
        best_weight = class_weight_for(best_hyperparams)

        for epoch in range(num_epochs):
            train_loss, *_ = train(model, optimizer, x, G, y, train_val_idx, weight=best_weight)

        # test() takes no optimiser; passing one shifted every argument.
        test_loss, test_acc, test_auroc, test_auprc, test_f1, final_results, test_results, unknown_results = test(
            model, x, G, y, fold_test_idx, nodes, unknown_idx, weight=best_weight
        )

        outer_results.append({
            'test_loss': test_loss,
            'test_acc': test_acc,
            'test_auroc': test_auroc,
            'test_auprc': test_auprc,
            'test_f1': test_f1
        })

        fold_dir = f"results/pan_cancer/fold_{fold+1}"
        os.makedirs(fold_dir, exist_ok=True)
        test_results.to_csv(f"{fold_dir}/test_results.csv")
        unknown_results.to_csv(f"{fold_dir}/unknown_results.csv")
        final_results.to_csv(f"{fold_dir}/final_results.csv")

    metrics_df = pd.DataFrame(outer_results)
    mean_metrics = metrics_df.mean()
    std_metrics = metrics_df.std()

    summary_df = pd.DataFrame({
        "Metric": mean_metrics.index,
        "Mean": mean_metrics.values,
        "Std": std_metrics.values
    })
    summary_dir = "results/pan_cancer"
    os.makedirs(summary_dir, exist_ok=True)
    summary_df.to_csv(os.path.join(summary_dir, "outer_fold_summary.csv"), index=False)

    print("\nAverage Results Across Outer Folds:")
    print(summary_df)

# Script entry point: run the full nested cross-validation once.
if __name__ == "__main__":

    # This single seed is passed to main(), which uses it for torch, NumPy,
    # random and the fold shuffling.
    seed = 42
    print(f"Starting training with seed {seed}...")
    main(seed)