<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#DGL--lifescience-example" data-toc-modified-id="DGL--lifescience-example-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>DGL- lifescience example</a></span><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Jina-graph" data-toc-modified-id="Jina-graph-1.0.1"><span class="toc-item-num">1.0.1&nbsp;&nbsp;</span>Jina graph</a></span></li></ul></li></ul></li></ul></div>

# DGL- lifescience example


This tutorial uses the DGL-LifeSci library: https://github.com/awslabs/dgl-lifesci

In [1]:
import jina
from jina import Document
from jina.types.document.graph import GraphDocument
from jina import Document

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import dgllife
print(dgllife.__version__)
#import rdkit
from pprint import pprint 

Using backend: pytorch


0.2.8


We will use an atom featurizer that maps every atom of a molecule (given as a SMILES string) to a feature vector, producing one row of features per atom.

In [4]:
from rdkit import Chem
from dgllife.utils import CanonicalAtomFeaturizer

# A single featurizer instance is enough; it is reused for every molecule.
# The per-atom features are returned under the 'feat' key.
atom_featurizer = CanonicalAtomFeaturizer(atom_data_field='feat')

# Featurize each SMILES string and report the shape of the resulting
# feature matrix (one row per atom, as the printed shapes show).
for mol_str in ('CCO', 'CCCO'):
    mol = Chem.MolFromSmiles(mol_str)
    x = atom_featurizer(mol)['feat']
    print(f'representation for {mol_str} is array {x.shape}')

representation for CCO is array torch.Size([3, 74])
representation for CCCO is array torch.Size([4, 74])


The following cell constructs a DGL graph with node and edge features

In [5]:
from dgllife.utils import smiles_to_bigraph, CanonicalAtomFeaturizer, CanonicalBondFeaturizer

# Node featurizer: stores the per-atom features under the node-data key 'h'
node_featurizer = CanonicalAtomFeaturizer(atom_data_field='h')
# Edge featurizer: stores the per-bond features under the edge-data key 'h'
edge_featurizer = CanonicalBondFeaturizer(bond_data_field='h')
# SMILES (a string representation of a molecule) for Penicillin
smiles = 'CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C'
# Build the DGL graph for the molecule, attaching both feature sets
g = smiles_to_bigraph(smiles=smiles, 
                      node_featurizer=node_featurizer,
                      edge_featurizer=edge_featurizer)
# Printing the graph shows the node/edge counts and the feature schemes
print(g)

Graph(num_nodes=23, num_edges=50,
      ndata_schemes={'h': Scheme(shape=(74,), dtype=torch.float32)}
      edata_schemes={'h': Scheme(shape=(12,), dtype=torch.float32)})


In [6]:
# Inspect the graph `g`: shapes of the node/edge feature tensors stored
# under the 'h' key, followed by the node and edge counts.
print(f"Node features are kept as a {g.ndata['h'].shape} array")
print(f"Edge features are kept as a {g.edata['h'].shape} array")
print(f"g.num_nodes()={g.num_nodes()}")
print(f"g.num_edges()={g.num_edges()}")

Node features are kept as a torch.Size([23, 74]) array
Edge features are kept as a torch.Size([50, 12]) array
g.num_nodes()=23
g.num_edges()=50


In [34]:
from dgllife.model import GCNPredictor

from dgllife.data import Tox21
from dgllife.model import load_pretrained
from dgllife.utils import smiles_to_bigraph, CanonicalAtomFeaturizer

# Tox21 molecules converted to DGL graphs. The atom features are read from
# g.ndata['h'] below, i.e. the key used by CanonicalAtomFeaturizer() here.
dataset = Tox21(smiles_to_bigraph, CanonicalAtomFeaturizer())

# Load the GCN pre-trained on Tox21 and switch it to evaluation mode so the
# Dropout / BatchNorm layers behave deterministically at inference time.
# (No untrained GCNPredictor is built first: it would be overwritten right away.)
model = load_pretrained('GCN_Tox21') # Pretrained model loaded
model.eval()

# Each dataset item is (SMILES string, DGL graph, labels, label mask).
smiles, g, label, mask = dataset[0]
# Predict for the first molecule directly from its node features.
label_pred = model(g, g.ndata['h'])

Processing dgl graphs from scratch...
Processing molecule 1000/7831
Processing molecule 2000/7831
Processing molecule 3000/7831
Processing molecule 4000/7831
Processing molecule 5000/7831
Processing molecule 6000/7831
Processing molecule 7000/7831
Downloading GCN_Tox21_pre_trained.pth from https://data.dgl.ai/dgllife/pre_trained/gcn_tox21.pth...
Pretrained model loaded


In [42]:
import torch
# Feed the model node features that were round-tripped through NumPy
# (tensor -> ndarray -> tensor) instead of the original tensor.
label_pred = model(g, torch.tensor(g.ndata['h'].detach().numpy()))
label_pred

tensor([[ 1.4190, -0.1820,  1.2974,  1.8104,  0.5580,  1.4416,  0.6914,  2.0957,
          0.5919,  0.7715,  1.7273,  0.2070]], grad_fn=<AddmmBackward>)

In [8]:
model.forward

<bound method GCNPredictor.forward of GCNPredictor(
  (gnn): GCN(
    (gnn_layers): ModuleList(
      (0): GCNLayer(
        (graph_conv): GraphConv(in=74, out=64, normalization=none, activation=<function relu at 0x7fb999982c10>)
        (dropout): Dropout(p=0.0, inplace=False)
        (res_connection): Linear(in_features=74, out_features=64, bias=True)
        (bn_layer): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): GCNLayer(
        (graph_conv): GraphConv(in=64, out=64, normalization=none, activation=<function relu at 0x7fb999982c10>)
        (dropout): Dropout(p=0.0, inplace=False)
        (res_connection): Linear(in_features=64, out_features=64, bias=True)
        (bn_layer): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
  )
  (readout): WeightedSumAndMax(
    (weight_and_sum): WeightAndSum(
      (atom_weighting): Sequential(
        (0): Linear(in_features=64, out_features=1, bi

#### How to generate the embedding for a single molecule

In [9]:
smiles, g, label, mask = dataset[0]

In [10]:
model.forward(g, feats= g.ndata['h'])

tensor([[ 1.4190, -0.1820,  1.2974,  1.8104,  0.5580,  1.4416,  0.6914,  2.0957,
          0.5919,  0.7715,  1.7273,  0.2070]], grad_fn=<AddmmBackward>)

In [11]:
g.ndata['h'].shape

torch.Size([16, 74])

#### Generating embeddings for each of the `n_samples` graphs

In [43]:
from tqdm import tqdm

In [44]:
n_samples = len(dataset)

In [45]:
# Run the pre-trained model on every molecule and collect one output array
# per graph.
# NOTE: `g` is intentionally kept as the loop variable -- later cells reuse
# the last (self-looped) graph that is left in `g` after this loop.
embeddings = []
with torch.no_grad():  # inference only: no autograd graph is needed
    for i in tqdm(range(n_samples)):
        smiles, g, label, mask = dataset[i]
        # Add a self-loop edge to every node before running the model
        g = dgl.add_self_loop(g)
        # Call the module itself rather than .forward so module hooks run
        emb = model(g, feats=g.ndata['h'])
        embeddings.append(emb.numpy())

100%|██████████| 7831/7831 [00:11<00:00, 671.04it/s]


In [46]:
embeddings[1].shape

(1, 12)

In [47]:
#aux = GraphDocument.load_from_dgl_graph(g)

### Jina graph

In [48]:
jina_graph = GraphDocument.load_from_dgl_graph(g)

In [49]:
# Row and column index arrays of the sparse adjacency matrix held by the
# Jina graph document, followed by its dense form.
print(jina_graph.adjacency.row)
print(jina_graph.adjacency.col)
print(jina_graph.adjacency.todense())

[ 0  9  9  4  4 15 15  3  3 18 18 23 23 13 13 21 21 12 23 10 10 24 24 22
 22  1  1  8  8 14 14  5  5 16 16 17 17  2  2 19 19  6 16  7  7 20 20 11
 13  4 19 24  6 18  2  8 11  5  0  1  2  3  4  5  6  7  8  9 10 11 12 13
 14 15 16 17 18 19 20 21 22 23 24]
[ 9  0  4  9 15  4  3 15 18  3 23 18 13 23 21 13 12 21 10 23 24 10 22 24
  1 22  8  1 14  8  5 14 16  5 17 16  2 17 19  2  6 19  7 16 20  7 11 20
  4 13 24 19 18  6  8  2  5 11  0  1  2  3  4  5  6  7  8  9 10 11 12 13
 14 15 16 17 18 19 20 21 22 23 24]
[[1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0]
 [0 1 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 1 0 0

In [50]:
jina_graph.num_nodes, jina_graph.num_edges

(25, 83)

Note that both graphs have exactly the same adjacency matrix

In [51]:
import numpy as np

# Fraction of entries on which the DGL and Jina adjacency matrices agree
# (1.0 means the two matrices are identical).
np.mean(g.adjacency_matrix(scipy_fmt='csr').todense() == jina_graph.adjacency.todense())

1.0

#### Basic Jina app

In [20]:
dataset[0]

('CCOc1ccc2nc(S(N)(=O)=O)sc2c1',
 Graph(num_nodes=16, num_edges=34,
       ndata_schemes={'h': Scheme(shape=(74,), dtype=torch.float32)}
       edata_schemes={}),
 tensor([0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0.]),
 tensor([1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1.]))

In [21]:
# Take the DGL graph of the first dataset item
g = dataset[0][1]

# Node features as a NumPy array
features = g.ndata['h'].detach().numpy()
features.shape  # NOTE(review): not the cell's last expression, so this value is never displayed

# Wrap the DGL graph in a Jina GraphDocument and store the node features as its blob
d = GraphDocument.load_from_dgl_graph(g);
d.blob = features
d.blob

array([[1., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [24]:
from jina.types.document.graph import GraphDocument
from jina import  Document, DocumentArray

# An empty GraphDocument, placed twice in a DocumentArray
d = GraphDocument()
docarray_graph = DocumentArray([d,d])

# Compare the type of the original object with what indexing the array returns.
# Per the recorded output, indexing gives back a plain Document, not a GraphDocument.
print(type(d))
print(type(docarray_graph))
print(type(docarray_graph[0]))
print('isinstance(docarray_graph[0], GraphDocument)=',isinstance(docarray_graph[0], GraphDocument))
print('isinstance(d,GraphDocument)=', isinstance(d,GraphDocument))

<class 'jina.types.document.graph.GraphDocument'>
<class 'jina.types.arrays.document.DocumentArray'>
<class 'jina.types.document.Document'>
isinstance(docarray_graph[0], GraphDocument)= False
isinstance(d,GraphDocument)= True


In [27]:
# Same type check, but iterating over the DocumentArray instead of indexing it.
# NOTE: the loop variable rebinds the notebook-level name `d`.
for d in docarray_graph:
    print('type(d)=',type(d))
    print('isinstance(d, GraphDocument)=',isinstance(d, GraphDocument))

type(d)= <class 'jina.types.document.graph.GraphDocument'>
isinstance(d, GraphDocument)= True
type(d)= <class 'jina.types.document.graph.GraphDocument'>
isinstance(d, GraphDocument)= True


In [52]:
from jina import DocumentArray

def create_docs(dataset):
    """Convert a molecule dataset into a DocumentArray of graph documents.

    Args:
        dataset: iterable yielding ``(molecule_str, dgl_graph, label, mask)``
            tuples, where ``dgl_graph.ndata['h']`` holds the node features.

    Returns:
        A ``DocumentArray`` with one ``GraphDocument`` per molecule. Each
        document carries the graph structure, the node features as ``blob``
        and the molecule string under ``tags['molecule_str']``. The label
        and mask of each item are not stored.
    """
    docs = []
    for molecule_str, dgl_graph, _label, _mask in dataset:
        gdoc = GraphDocument.load_from_dgl_graph(dgl_graph)
        gdoc.tags = {'molecule_str': molecule_str}
        gdoc.blob = dgl_graph.ndata['h'].detach().numpy()
        # Append the document itself (not just its tags), otherwise the
        # graph structure and the node features are lost.
        docs.append(gdoc)

    return DocumentArray(docs)

In [53]:
docarray_of_graphs = create_docs(dataset)

In [54]:
type(docarray_of_graphs[0])

jina.types.document.Document

In [58]:
from jina import Executor, requests 

class MoleculeEncoder(Executor):
    """Jina Executor that encodes molecule graphs with a pre-trained DGL-LifeSci model."""

    def __init__(self, model_type: str='GCN_Tox21', *args, **kwargs):
        """Load the pre-trained model and put it in evaluation mode.

        Args:
            model_type: name passed to ``dgllife.model.load_pretrained``.
            *args, **kwargs: forwarded unchanged to ``Executor.__init__``.
        """
        super().__init__(*args, **kwargs)
        from dgllife.model import load_pretrained
        self.model = load_pretrained(model_type)
        self.model.eval()

    @requests()
    def encode(self, docs: DocumentArray, *args, **kwargs):
        """Compute an embedding for every graph document in ``docs``, in place.

        Each document must expose ``to_dgl_graph()`` and carry its node
        features in ``blob``. The model output is written to ``embedding``
        as a NumPy array.

        Args:
            docs: documents to encode.
        """
        # Imported here so the executor does not depend on notebook globals.
        import dgl
        import torch

        with torch.no_grad():  # inference only: no autograd graph is needed
            for d in docs:
                # NOTE(review): assumes iteration yields GraphDocument objects
                # (plain Documents have no to_dgl_graph) -- confirm for the
                # Jina version in use.
                dgl_graph = d.to_dgl_graph()
                dgl_graph = dgl.add_self_loop(dgl_graph)
                # Use the model owned by this executor, not a notebook global.
                output = self.model(dgl_graph, feats=torch.tensor(d.blob))
                d.embedding = output.numpy()
    