In [8]:
import scanpy as sc
import squidpy as sq
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from torch_geometric.data import Data, LightningNodeData
import torch_sparse
import torch_geometric.nn as geom_nn
from torch_geometric.loader import NeighborLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
import torch.optim as optim
import os
from torch_geometric.loader import DataLoader
from sklearn.metrics import r2_score

In [5]:
adata = sq.datasets.mibitof()

In [7]:
adata.obs

Unnamed: 0,row_num,point,cell_id,X1,center_rowcoord,center_colcoord,cell_size,category,donor,Cluster,batch,library_id
3034-0,3086,23,2,60316.0,269.0,7.0,408.0,carcinoma,21d7,Epithelial,0,point23
3035-0,3087,23,3,60317.0,294.0,6.0,408.0,carcinoma,21d7,Epithelial,0,point23
3036-0,3088,23,4,60318.0,338.0,4.0,304.0,carcinoma,21d7,Imm_other,0,point23
3037-0,3089,23,6,60320.0,372.0,6.0,219.0,carcinoma,21d7,Myeloid_CD11c,0,point23
3038-0,3090,23,8,60322.0,417.0,5.0,303.0,carcinoma,21d7,Myeloid_CD11c,0,point23
...,...,...,...,...,...,...,...,...,...,...,...,...
47342-2,48953,16,1103,2779.0,143.0,1016.0,283.0,carcinoma,90de,Fibroblast,2,point16
47343-2,48954,16,1104,2780.0,814.0,1017.0,147.0,carcinoma,90de,Fibroblast,2,point16
47344-2,48955,16,1105,2781.0,874.0,1018.0,142.0,carcinoma,90de,Imm_other,2,point16
47345-2,48956,16,1106,2782.0,257.0,1019.0,108.0,carcinoma,90de,Fibroblast,2,point16


In [3]:
#Load dataset 
adata = sq.datasets.imc()

In [2]:
from dataset import HartmannWrapper

2022-07-24 08:15:36.259186: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-07-24 08:15:36.259410: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [3]:
dataset = HartmannWrapper("./data/")

In [10]:
dataset[0].train_mask

AttributeError: 'GlobalStorage' object has no attribute 'train_mask'

In [19]:
loader = DataLoader(dataset, batch_size=58, shuffle=True)

In [9]:
class GCN(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = geom_nn.GCNConv(dataset.num_node_features, dataset.y.shape[1])

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)

        return x

In [None]:
class LinearNCEM(pl.LightningModule):
    def __init__(self, **model_kwargs):
        super().__init__()
        # Saving hyperparameters
        self.save_hyperparameters(model_kwargs)

        self.model_mu = GCN(
            in_channels=self.hparams.in_channels,
            hidden_dims=self.hparams.encoder_hidden_dims,
            out_channels=self.hparams.latent_dim,
        )

        self.model_sigma = GCN(
            in_channels=self.hparams.in_channels,
            hidden_dims=self.hparams.encoder_hidden_dims,
            out_channels=self.hparams.latent_dim,
        )

        self.loss_module = nn.GaussianNLLLoss(eps=1e-5)

        def init_weights(m):
            if isinstance(m, geom_nn.GCNConv):
                pass
                # TODO: how to init weights of GNN's?
                # torch.nn.init.kaiming_normal_(m.weight, nonlinearity='relu')
                # m.bias.data.fill_(0.01)


        self.model_mu.apply(init_weights)
        self.model_sigma.apply(init_weights)


    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        sigma = self.model_mu(x, edge_index)
        mu = self.model_sigma(x, edge_index)
        return mu, sigma

    def configure_optimizers(self):
        # We use SGD here, but Adam works as well
        optimizer = optim.AdamW(
            self.parameters(),
            lr=0.05,#self.hparams["lr"],
            weight_decay=0#self.hparams["weight_decay"],
        )
        return optimizer

    def training_step(self, batch, _):
        mu, sigma = self.forward(batch)
        loss = self.loss_module(mu, batch.y, sigma)
        self.log('train_loss', loss, batch_size=batch.batch_size)
        return loss

    def validation_step(self, batch, _):
        mu, sigma = self.forward(batch)
        val_loss = self.loss_module(mu, batch.y, sigma)
        val_r2_score = r2_score(batch.y.cpu(), mu.cpu())
        self.log('val_r2_score', val_r2_score, batch_size=batch.batch_size, prog_bar=True)
        self.log('val_loss', val_loss, batch_size=batch.batch_size, prog_bar=True)

    def test_step(self, batch, _):
        mu, sigma = self.forward(batch)
        self.log('test_loss', loss, batch_size=batch.batch_size)


In [4]:
#Get adjacency matrix
sq.gr.spatial_neighbors(adata, coord_type="generic")
A=adata.obsp['spatial_connectivities']

#Get features of nodes 
X=adata.obs
X=pd.get_dummies(X)
X=X.to_numpy()
X=torch.tensor(X)

#Get labels of nodes
Y=adata.X
Y=torch.tensor(Y)

In [5]:
Acoo = A.tocoo()
A_sparse = torch.sparse.FloatTensor(torch.LongTensor([Acoo.row.tolist(), Acoo.col.tolist()]),
                              torch.FloatTensor(Acoo.data)).coalesce()

In [6]:
data = Data(x=X,edge_index=A_sparse.indices(), y=Y, train_mask=torch.arange(A.shape[0]))

In [7]:
data

Data(x=[4668, 11], edge_index=[2, 28008], y=[4668, 34], train_mask=[4668])

In [None]:
loss = nn.GaussianNLLLoss()
input = torch.randn(5, 2, requires_grad=True)
target = torch.randn(5, 2)
var = torch.ones(5, 2, requires_grad=True) #heteroscedastic
output = loss(input, target, var)
output.backward()

In [10]:
loader = NeighborLoader(
    data,
    # Sample 30 neighbors for each node for 2 iterations
    num_neighbors=[30] * 2,
    # Use a batch size of 128 for sampling training nodes
    batch_size=128,
)

In [23]:
lightning_loader = LightningNodeData(data, data.train_mask, loader='neighbor',
                                   num_neighbors=[30] * 2, batch_size=128,
                                   num_workers=8)

In [25]:
sampled_data = next(iter(loader))

In [13]:
sampled_data ## this is one batch

Data(x=[292, 11], edge_index=[2, 1254], y=[292, 34], train_mask=[292], batch_size=128)

In [26]:
def print_results(result_dict):
    if "train" in result_dict:
        print(f"Train loss: {result_dict['train']}")
    if "val" in result_dict:
        print(f"Val loss:   {result_dict['val']}")
    print(f"Test loss:  {result_dict['test']}")

In [39]:
def train_node_classifier(cur_dir, model_name, data, **model_kwargs):
    pl.seed_everything(42)
    node_data_loader = lightning_loader 
    strategy = pl.strategies.DDPStrategy(find_unused_parameters=False)
    '''NeighborLoader(
    data,
    # Sample 30 neighbors for each node for 2 iterations
    num_neighbors=[30] * 2,
    # Use a batch size of 128 for sampling training nodes
    batch_size=128)'''

    CHECKPOINT_PATH = cur_dir + "checkpoints"

    # Create a PyTorch Lightning trainer with the generation callback
    root_dir = os.path.join(CHECKPOINT_PATH, model_name)
    os.makedirs(root_dir, exist_ok=True)
    trainer = pl.Trainer(strategy=strategy, default_root_dir=root_dir, callbacks=[ModelCheckpoint(save_weights_only=True, mode="min", monitor="val_loss")],
                         max_epochs=200,
                         progress_bar_refresh_rate=0) # 0 because epoch size is 1


    # Check whether pretrained model exists. If yes, load it and skip training
    pretrained_filename = os.path.join(CHECKPOINT_PATH, f"{model_name}.ckpt")
    if os.path.isfile(pretrained_filename):
        print("Found pretrained model, loading...")
        model = NodeLevelGNN.load_from_checkpoint(pretrained_filename)
    else:
        pl.seed_everything()
        model = NodeLevelGNN(model_name=model_name, c_in=data.x.shape[1], c_out=data.y.shape[1], **model_kwargs)
        trainer.fit(model, node_data_loader)
        model = NodeLevelGNN.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)

    # Test best model on the test set
    test_result = trainer.test(model, node_data_loader, verbose=False)
    batch = next(iter(node_data_loader))
    batch = batch.to(model.device)
    train_loss = model.forward(batch, mode="train")
    val_loss = model.forward(batch, mode="val")
    result = {"train": train_loss,
              "val": val_loss,
              "test": test_result['test_loss']}
    return model, result

In [40]:
cwd=os.getcwd()
model, result = train_node_classifier(cur_dir=cwd, model_name="nonlinear NCEM",data=data,c_hidden=30,num_layers=2,dp_rate=0.1)

print_results(result)

Global seed set to 42


MisconfigurationException: `Trainer(strategy='ddp')` or `Trainer(accelerator='ddp')` is not compatible with an interactive environment. Run your code as a script, or choose one of the compatible strategies: Trainer(strategy=None|dp|tpu_spawn). In case you are spawning processes yourself, make sure to include the Trainer creation inside the worker function.