In [1]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torch_geometric
import pickle
import optuna
from torch_geometric.nn import GCNConv

### data

In [2]:
# model structure
class GCN(torch.nn.Module):
    def __init__(self, size1, size2, size3, size4, size5):
        super(GCN, self).__init__()

        self.conv1 = GCNConv(1969, size1)
        self.conv2 = GCNConv(size1, size2)
        self.conv3 = GCNConv(size2, size3)
        self.conv4 = GCNConv(size3, size4)
        self.fc1 = torch.nn.Linear(size4, size5)
        self.fc2 = torch.nn.Linear(size5, 1)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = torch.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        x = torch.relu(x)
        x = self.conv3(x, edge_index)
        x = torch.relu(x)
        x = self.conv4(x, edge_index)
        x1 = torch.relu(x)
        x = self.fc1(x1)
        x = torch.relu(x)
        x = self.fc2(x)       
        return x, x1

def train(model, data, optimizer, mask):
    model.train()
    optimizer.zero_grad()
    out, _ = model(data)
    loss = F.mse_loss(out[mask].view(-1), data.y[mask])
    loss.backward()
    optimizer.step()
    return loss.item()

def evaluate(model, data, mask):
    model.eval()
    with torch.no_grad():
        out, _ = model(data)
        loss = F.mse_loss(out[mask].view(-1), data.y[mask])
    return loss.item()

def objective(trial: optuna.Trial):   
    size1 = trial.suggest_int("size1", 32, 1024)
    size2 = trial.suggest_int("size2", 4, 512)
    size3 = trial.suggest_int("size3", 4, 512)
    size4 = trial.suggest_int("size4", 4, 512)
    size5 = trial.suggest_int("size5", 4, 512)
    num_epochs = trial.suggest_int("num_epochs", 1, 200)
    lr = trial.suggest_float("lr", 1e-5, 1e-2)

    model = GCN(size1, size2, size3, size4, size5).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # find best hyperparameter by training on the training set and get loss on the validating set
    for epoch in range(1, num_epochs + 1):
        train_loss = train(model, data, optimizer, data.train_mask)
        val_loss = evaluate(model, data, data.valid_mask)
        trial.report(val_loss, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
    return val_loss

In [None]:
for rs in range(10):
    
    # read data
    X_train = pd.read_csv("../../result/input_perturb_phyloP/%d/X_train_stratified" % rs, sep="\t", index_col=0).values
    X_valid = pd.read_csv("../../result/input_perturb_phyloP/%d/X_valid_stratified" % rs, sep="\t", index_col=0).values
    X_test = pd.read_csv("../../result/input_perturb_phyloP/%d/X_test_stratified" % rs, sep="\t", index_col=0).values
    Y_train = pd.read_csv("../../result/input_perturb_phyloP/%d/Y_train_stratified" % rs, sep="\t", index_col=0).values.reshape(-1)
    Y_valid = pd.read_csv("../../result/input_perturb_phyloP/%d/Y_valid_stratified" % rs, sep="\t", index_col=0).values.reshape(-1)
    Y_test = pd.read_csv("../../result/input_perturb_phyloP/%d/Y_test_stratified" % rs, sep="\t", index_col=0).values.reshape(-1)
    Y_test_gene = pd.read_csv("../../result/input_perturb_phyloP/%d/Y_test_stratified" % rs, sep="\t", index_col=0).index

    # concat all subsets
    X = np.concatenate([X_train, X_valid, X_test])
    Y = np.concatenate([Y_train, Y_valid, Y_test])
    train_mask = np.concatenate([[True] * len(X_train), [False] * len(X_valid), [False] * len(X_test)])
    valid_mask = np.concatenate([[False] * len(X_train), [True] * len(X_valid), [False] * len(X_test)])
    test_mask = np.concatenate([[False] * len(X_train), [False] * len(X_valid), [True] * len(X_test)])
    
    # mask of whether a test node is in the DAGMA graph
    dag = pd.read_csv("../../result/network_perturb_phyloP/DAGMA_thresholdAdaptive.tsv", sep="\t", header=None)
    id2genes = pd.read_csv("../../result/network_perturb_phyloP/valid_genes", sep="\t").set_index("ID")['genes'].to_dict()
    dag[0] = dag[0].map(id2genes)
    dag[1] = dag[1].map(id2genes)
    dag_genes = list(set.union(set(dag[0]), set(dag[1])))
    test_inDAG = Y_test_gene.isin(dag_genes)

    for network_name in ["DAGMA_thresholdAdaptive", "STRING", "BIOGRID", "Combine", "ChIP_hTFtarget", "ChIP_TIP", "ChIP_TIP_K562", "CoExpr_ENCODE_K562_0.75", "CoExpr_GTEx_WholeBlood_0.75", "CoExpr_perturb_0.5", "Random_ER", "Random_SF", "NOTEARS_thresholdAdaptive"]:

        # read network
        edge_index = torch.tensor(pd.read_csv("../../result/network_perturb_phyloP/%s.tsv" % network_name, sep="\t", header=None).values.T)

        # convert to data object
        data = torch_geometric.data.Data(x=torch.tensor(X).float(), edge_index=edge_index, y=torch.tensor(Y).float())
        data.train_mask = torch.tensor(train_mask)
        data.valid_mask = torch.tensor(valid_mask)
        data.test_mask = torch.tensor(test_mask)
        data.test_net_mask = np.concatenate([[False] * len(Y_train), [False] * len(Y_valid), test_inDAG])

        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        data = data.to(device)


        # repeat the whole process multiple times for mean and std of the loss
        test_loss_reps = []
        test_net_loss_reps = []

        os.makedirs("../../result/model_perturb_phyloP_deep4/%s/%d/" % (network_name, rs), exist_ok=True)

        for rep in range(10):
            # tune hyperparameters
            study = optuna.create_study(direction="minimize")
            study.optimize(objective, n_trials=20, timeout=600)
            best_params = study.best_params

            # final model with best hyperparameters and use train and valid data to train
            final_model = GCN(best_params["size1"], best_params["size2"], best_params["size3"], best_params["size4"], best_params["size5"]).to(device)
            optimizer = torch.optim.Adam(final_model.parameters(), lr=best_params["lr"])

            # train final model
            for epoch in range(1, best_params["num_epochs"] + 1):
                train_loss = train(final_model, data, optimizer, data.train_mask|data.valid_mask)

            # prediction from final model
            final_model.eval()
            with torch.no_grad():
                out, X1 = final_model(data)
                # loss on the testing set
                loss1 = F.mse_loss(out[data.test_mask].view(-1), data.y[data.test_mask])
                test_loss_reps.append(loss1.item())
                # loss on the testing set, but only the nodes in the DAGMA
                loss2 = F.mse_loss(out[data.test_net_mask].view(-1), data.y[data.test_net_mask])
                test_net_loss_reps.append(loss2.item())

            # save model, hyperparameters and results
            prefix = "../../result/model_perturb_phyloP_deep4/%s/%d/model%d" % (network_name, rs, rep)
            torch.save(final_model.state_dict(), prefix+".model")
            with open(prefix+".para", "wb") as f:
                pickle.dump([best_params, X1.cpu().numpy(), out.cpu().numpy(), loss1.item(), loss2.item()], f)

[I 2023-12-05 19:06:11,601] A new study created in memory with name: no-name-8ae9a3d1-3a60-4ffb-8e2b-b370705bdc0b
[I 2023-12-05 19:06:13,305] Trial 0 finished with value: 0.1599070131778717 and parameters: {'size1': 752, 'size2': 133, 'size3': 199, 'size4': 379, 'size5': 299, 'num_epochs': 111, 'lr': 0.006362425789208199}. Best is trial 0 with value: 0.1599070131778717.
[I 2023-12-05 19:06:13,780] Trial 1 finished with value: 0.16229307651519775 and parameters: {'size1': 289, 'size2': 500, 'size3': 122, 'size4': 117, 'size5': 108, 'num_epochs': 65, 'lr': 0.007182656000856493}. Best is trial 0 with value: 0.1599070131778717.
[I 2023-12-05 19:06:15,046] Trial 2 finished with value: 0.15499551594257355 and parameters: {'size1': 74, 'size2': 39, 'size3': 177, 'size4': 310, 'size5': 230, 'num_epochs': 178, 'lr': 0.008076221543170326}. Best is trial 2 with value: 0.15499551594257355.
[I 2023-12-05 19:06:16,550] Trial 3 finished with value: 0.1597518026828766 and parameters: {'size1': 838, 's

[I 2023-12-05 19:06:32,542] Trial 11 pruned. 
[I 2023-12-05 19:06:32,589] Trial 12 pruned. 
[I 2023-12-05 19:06:32,779] Trial 13 finished with value: 0.157821387052536 and parameters: {'size1': 658, 'size2': 77, 'size3': 245, 'size4': 152, 'size5': 410, 'num_epochs': 19, 'lr': 0.002686430329199056}. Best is trial 1 with value: 0.15068943798542023.
[I 2023-12-05 19:06:32,825] Trial 14 pruned. 
[I 2023-12-05 19:06:32,927] Trial 15 pruned. 
[I 2023-12-05 19:06:32,980] Trial 16 pruned. 
[I 2023-12-05 19:06:33,024] Trial 17 pruned. 
[I 2023-12-05 19:06:33,146] Trial 18 pruned. 
[I 2023-12-05 19:06:33,739] Trial 19 pruned. 
[I 2023-12-05 19:06:33,959] A new study created in memory with name: no-name-56584b59-5ed7-429d-9bf3-dbd63103123e
[I 2023-12-05 19:06:34,567] Trial 0 finished with value: 0.17857742309570312 and parameters: {'size1': 62, 'size2': 467, 'size3': 136, 'size4': 380, 'size5': 412, 'num_epochs': 83, 'lr': 0.0011593049715455817}. Best is trial 0 with value: 0.17857742309570312.


[I 2023-12-05 19:06:57,433] Trial 2 finished with value: 0.14812985062599182 and parameters: {'size1': 577, 'size2': 492, 'size3': 109, 'size4': 429, 'size5': 361, 'num_epochs': 85, 'lr': 0.0040433793963635814}. Best is trial 2 with value: 0.14812985062599182.
[I 2023-12-05 19:06:58,045] Trial 3 finished with value: 0.15092280507087708 and parameters: {'size1': 770, 'size2': 203, 'size3': 361, 'size4': 494, 'size5': 164, 'num_epochs': 75, 'lr': 0.009862566740354017}. Best is trial 2 with value: 0.14812985062599182.
[I 2023-12-05 19:06:58,377] Trial 4 finished with value: 0.16972953081130981 and parameters: {'size1': 981, 'size2': 315, 'size3': 181, 'size4': 225, 'size5': 336, 'num_epochs': 38, 'lr': 0.006139860534580405}. Best is trial 2 with value: 0.14812985062599182.
[I 2023-12-05 19:06:58,774] Trial 5 finished with value: 0.14579330384731293 and parameters: {'size1': 267, 'size2': 453, 'size3': 137, 'size4': 473, 'size5': 86, 'num_epochs': 52, 'lr': 0.0054960111585497665}. Best is 

[I 2023-12-05 19:07:24,401] Trial 2 finished with value: 0.15659858286380768 and parameters: {'size1': 752, 'size2': 371, 'size3': 141, 'size4': 356, 'size5': 389, 'num_epochs': 125, 'lr': 0.0053774567565714314}. Best is trial 2 with value: 0.15659858286380768.
[I 2023-12-05 19:07:26,075] Trial 3 finished with value: 0.15814366936683655 and parameters: {'size1': 793, 'size2': 260, 'size3': 31, 'size4': 223, 'size5': 68, 'num_epochs': 89, 'lr': 0.003290897050822017}. Best is trial 2 with value: 0.15659858286380768.
[I 2023-12-05 19:07:27,404] Trial 4 finished with value: 0.15715447068214417 and parameters: {'size1': 357, 'size2': 488, 'size3': 223, 'size4': 45, 'size5': 52, 'num_epochs': 84, 'lr': 0.006171768534725502}. Best is trial 2 with value: 0.15659858286380768.
[I 2023-12-05 19:07:27,438] Trial 5 pruned. 
[I 2023-12-05 19:07:27,926] Trial 6 finished with value: 0.15343284606933594 and parameters: {'size1': 47, 'size2': 358, 'size3': 374, 'size4': 80, 'size5': 53, 'num_epochs': 37

[I 2023-12-05 19:08:07,217] Trial 2 finished with value: 0.1528717577457428 and parameters: {'size1': 828, 'size2': 98, 'size3': 157, 'size4': 395, 'size5': 199, 'num_epochs': 39, 'lr': 0.009282579489921724}. Best is trial 2 with value: 0.1528717577457428.
[I 2023-12-05 19:08:10,691] Trial 3 finished with value: 0.18673257529735565 and parameters: {'size1': 739, 'size2': 396, 'size3': 58, 'size4': 55, 'size5': 19, 'num_epochs': 192, 'lr': 0.001702041773588769}. Best is trial 2 with value: 0.1528717577457428.
[I 2023-12-05 19:08:11,271] Trial 4 finished with value: 0.1579645872116089 and parameters: {'size1': 292, 'size2': 133, 'size3': 156, 'size4': 90, 'size5': 55, 'num_epochs': 51, 'lr': 0.008804398327843792}. Best is trial 2 with value: 0.1528717577457428.
[I 2023-12-05 19:08:15,147] Trial 5 finished with value: 0.16726583242416382 and parameters: {'size1': 499, 'size2': 344, 'size3': 241, 'size4': 440, 'size5': 161, 'num_epochs': 193, 'lr': 0.0030990659727308735}. Best is trial 2 w

[I 2023-12-05 19:08:57,146] Trial 2 finished with value: 0.15770530700683594 and parameters: {'size1': 414, 'size2': 397, 'size3': 69, 'size4': 287, 'size5': 187, 'num_epochs': 155, 'lr': 0.008005788829799082}. Best is trial 1 with value: 0.15379589796066284.
[I 2023-12-05 19:09:01,494] Trial 3 finished with value: 0.1577298790216446 and parameters: {'size1': 950, 'size2': 238, 'size3': 474, 'size4': 201, 'size5': 7, 'num_epochs': 180, 'lr': 0.007742502481377125}. Best is trial 1 with value: 0.15379589796066284.
[I 2023-12-05 19:09:05,180] Trial 4 finished with value: 0.15333260595798492 and parameters: {'size1': 751, 'size2': 386, 'size3': 31, 'size4': 149, 'size5': 216, 'num_epochs': 196, 'lr': 0.007179574797942375}. Best is trial 4 with value: 0.15333260595798492.
[I 2023-12-05 19:09:05,259] Trial 5 pruned. 
[I 2023-12-05 19:09:07,156] Trial 6 finished with value: 0.16014736890792847 and parameters: {'size1': 272, 'size2': 142, 'size3': 58, 'size4': 505, 'size5': 214, 'num_epochs': 

[I 2023-12-05 19:09:54,696] Trial 3 finished with value: 0.15610313415527344 and parameters: {'size1': 216, 'size2': 102, 'size3': 120, 'size4': 412, 'size5': 203, 'num_epochs': 93, 'lr': 0.007473834766459472}. Best is trial 0 with value: 0.15557219088077545.
[I 2023-12-05 19:09:58,358] Trial 4 finished with value: 0.16348141431808472 and parameters: {'size1': 777, 'size2': 305, 'size3': 351, 'size4': 38, 'size5': 387, 'num_epochs': 179, 'lr': 0.0003716596473128483}. Best is trial 0 with value: 0.15557219088077545.
[I 2023-12-05 19:09:58,412] Trial 5 pruned. 
[I 2023-12-05 19:09:58,624] Trial 6 finished with value: 0.1554340124130249 and parameters: {'size1': 40, 'size2': 344, 'size3': 505, 'size4': 338, 'size5': 212, 'num_epochs': 12, 'lr': 0.0022986107822212647}. Best is trial 6 with value: 0.1554340124130249.
[I 2023-12-05 19:09:59,085] Trial 7 pruned. 
[I 2023-12-05 19:10:02,249] Trial 8 finished with value: 0.15892718732357025 and parameters: {'size1': 818, 'size2': 170, 'size3': 

[I 2023-12-05 19:10:25,468] Trial 11 pruned. 
[I 2023-12-05 19:10:26,879] Trial 12 finished with value: 0.17888186872005463 and parameters: {'size1': 765, 'size2': 180, 'size3': 250, 'size4': 209, 'size5': 121, 'num_epochs': 99, 'lr': 0.001778803126883852}. Best is trial 3 with value: 0.15964339673519135.
[I 2023-12-05 19:10:27,752] Trial 13 finished with value: 0.19333554804325104 and parameters: {'size1': 490, 'size2': 368, 'size3': 400, 'size4': 39, 'size5': 284, 'num_epochs': 64, 'lr': 0.0021646272402995013}. Best is trial 3 with value: 0.15964339673519135.
[I 2023-12-05 19:10:27,822] Trial 14 pruned. 
[I 2023-12-05 19:10:27,884] Trial 15 pruned. 
[I 2023-12-05 19:10:27,936] Trial 16 pruned. 
[I 2023-12-05 19:10:28,022] Trial 17 pruned. 
[I 2023-12-05 19:10:28,942] Trial 18 finished with value: 0.1800566166639328 and parameters: {'size1': 447, 'size2': 223, 'size3': 181, 'size4': 137, 'size5': 318, 'num_epochs': 81, 'lr': 0.001312460537600341}. Best is trial 3 with value: 0.1596433

[I 2023-12-05 19:10:52,782] Trial 6 pruned. 
[I 2023-12-05 19:10:52,804] Trial 7 pruned. 
[I 2023-12-05 19:10:52,829] Trial 8 pruned. 
[I 2023-12-05 19:10:52,871] Trial 9 pruned. 
[I 2023-12-05 19:10:52,949] Trial 10 pruned. 
[I 2023-12-05 19:10:52,988] Trial 11 pruned. 
[I 2023-12-05 19:10:53,039] Trial 12 pruned. 
[I 2023-12-05 19:10:53,086] Trial 13 pruned. 
[I 2023-12-05 19:10:53,140] Trial 14 pruned. 
[I 2023-12-05 19:10:53,186] Trial 15 pruned. 
[I 2023-12-05 19:10:53,241] Trial 16 pruned. 
[I 2023-12-05 19:10:53,302] Trial 17 pruned. 
[I 2023-12-05 19:10:53,346] Trial 18 pruned. 
[I 2023-12-05 19:10:53,397] Trial 19 pruned. 
[I 2023-12-05 19:10:53,551] A new study created in memory with name: no-name-d590f66e-f3cb-4012-92ba-b2f9f8675c27
[I 2023-12-05 19:10:54,092] Trial 0 finished with value: 0.1561325043439865 and parameters: {'size1': 705, 'size2': 389, 'size3': 194, 'size4': 439, 'size5': 471, 'num_epochs': 33, 'lr': 0.005375282586575399}. Best is trial 0 with value: 0.156132

[I 2023-12-05 19:11:26,000] Trial 1 finished with value: 0.1980733871459961 and parameters: {'size1': 830, 'size2': 60, 'size3': 367, 'size4': 385, 'size5': 509, 'num_epochs': 83, 'lr': 0.0014951167413774916}. Best is trial 0 with value: 0.17320573329925537.
[I 2023-12-05 19:11:26,442] Trial 2 finished with value: 0.16553647816181183 and parameters: {'size1': 140, 'size2': 463, 'size3': 478, 'size4': 159, 'size5': 28, 'num_epochs': 37, 'lr': 0.003121911708369945}. Best is trial 2 with value: 0.16553647816181183.
[I 2023-12-05 19:11:28,107] Trial 3 finished with value: 0.1754598617553711 and parameters: {'size1': 86, 'size2': 118, 'size3': 452, 'size4': 344, 'size5': 70, 'num_epochs': 161, 'lr': 0.0059434413124018915}. Best is trial 2 with value: 0.16553647816181183.
[I 2023-12-05 19:11:29,964] Trial 4 finished with value: 0.15688224136829376 and parameters: {'size1': 133, 'size2': 305, 'size3': 340, 'size4': 70, 'size5': 154, 'num_epochs': 192, 'lr': 0.009257271731354502}. Best is tria

[I 2023-12-05 19:12:57,621] Trial 4 finished with value: 0.15609131753444672 and parameters: {'size1': 148, 'size2': 422, 'size3': 187, 'size4': 48, 'size5': 123, 'num_epochs': 105, 'lr': 0.008438991568484338}. Best is trial 2 with value: 0.15607409179210663.
[I 2023-12-05 19:12:57,698] Trial 5 pruned. 
[I 2023-12-05 19:12:57,790] Trial 6 pruned. 
[I 2023-12-05 19:12:57,899] Trial 7 finished with value: 0.1798415333032608 and parameters: {'size1': 1011, 'size2': 240, 'size3': 183, 'size4': 397, 'size5': 16, 'num_epochs': 1, 'lr': 0.0009672329387834794}. Best is trial 2 with value: 0.15607409179210663.
[I 2023-12-05 19:12:58,329] Trial 8 pruned. 
[I 2023-12-05 19:12:58,443] Trial 9 pruned. 
[I 2023-12-05 19:12:58,548] Trial 10 pruned. 
[I 2023-12-05 19:13:01,056] Trial 11 finished with value: 0.15653260052204132 and parameters: {'size1': 205, 'size2': 384, 'size3': 241, 'size4': 24, 'size5': 149, 'num_epochs': 66, 'lr': 0.006280833986432997}. Best is trial 2 with value: 0.15607409179210