<a href="https://colab.research.google.com/github/dcolinmorgan/grph/blob/main/accelerating_chemical_mappings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Accelerated Chemical Mapping with [Graphistry](https://www.graphistry.com)

This notebook visualizes a chemical dataset describing Blood Brain Barrier Permeability (BBBP) from [MoleculeNet](http://moleculenet.ai/datasets-1). Molecules are featurized with [ECFPs](https://pubs.acs.org/doi/10.1021/ci100050t), and those high-dimensional vectors are then reduced to 2 dimensions using conventional statistical tools. Ultimately we demonstrate how such an OPEN-SOURCE analysis can be sped up and scaled up massively with the [graphistry](https://www.graphistry.com) environment and toolkit.

### [baseline HERE](https://github.com/dcolinmorgan/grph/blob/main/generic_chemical_mappings.ipynb)

# Import accelerator libraries

In [None]:
!pip install --extra-index-url=https://pypi.nvidia.com cuml-cu11 cudf-cu11 cugraph-cu11 pylibraft_cu11 raft_dask_cu11 dask_cudf_cu11 pylibcugraph_cu11 pylibraft_cu11
import cuml,cudf
print(cuml.__version__)

!pip install -U --force git+https://github.com/graphistry/pygraphistry.git@cudf
!pip install -U git+https://github.com/graphistry/cu-cat.git@DT3
# !pip install dirty_cat

In [2]:
# Standard library
import os
from collections import Counter
import cProfile
from getpass import getpass
from pstats import Stats
from time import time
from typing import List
import warnings

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# GPU stack
import cuml,cudf
import cu_cat
import graphistry

warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', 200)

# Credentials must never be stored in the notebook. They are read from the
# GRAPHISTRY_USERNAME / GRAPHISTRY_PASSWORD environment variables, falling back
# to an interactive prompt (getpass does not echo the password).
# NOTE: a password was previously committed in this cell; it should be rotated.
graphistry.register(
    api=3,
    protocol="https",
    server="hub.graphistry.com",
    username=os.environ.get("GRAPHISTRY_USERNAME") or input("Graphistry username: "),
    password=os.environ.get("GRAPHISTRY_PASSWORD") or getpass("Graphistry password: "),
)
print(graphistry.__version__)
print(cu_cat.__file__)

0.28.7+463.gfb96400
/usr/local/lib/python3.10/dist-packages/cu_cat/__init__.py


In [3]:
# Print the name of the GPU visible to this runtime (CSV output, header suppressed)
!nvidia-smi --query-gpu=gpu_name --format=csv,noheader

Tesla V100-SXM2-16GB


# Import Basics

In [None]:
!pip install rdkit
!pip install --pre deepchem

from rdkit import Chem, DataStructs
from rdkit.Chem.rdchem import Mol
from rdkit.Chem.MolStandardize.rdMolStandardize import LargestFragmentChooser

from rdkit import RDLogger
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

# Embed BBBP in Global Chemical Space Approximation (Dataset-Agnostic Embedding)

### Read in and process ChEMBL data
_Note: We want this walk-through to be fully reproducible with publicly available data. Since the dataset we used at Reverie to create our internal Dataset-Agnostic Embedding can't be shared here, we instead select a random sample of ChEMBL compounds as our approximation of global chemical space._

In [10]:
# Read in data from MoleculeNet
chembl = pd.read_csv("https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/chembl_sparse.csv.gz", compression='gzip')

# Sample a random 20k compounds; the fixed seed makes the sample identical on every run
chembl = chembl.sample(n=20000, random_state=42)

In [11]:

# `chembl` (the sampled frame from the previous cell) is the only input used here,
# so the full ChEMBL CSV is no longer downloaded a second time into an unused variable.
# To benchmark other sample sizes, load the full file once and loop over sizes:
# chemblA = pd.read_csv("https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/chembl_sparse.csv.gz", compression='gzip')
# p=''
# for i in [150000,230000]:
#     chembl = chemblA.sample(n=i)

# Keep only SMILES shorter than 500 characters and move them to the GPU.
g2 = graphistry.nodes(cudf.from_pandas(chembl.loc[chembl.smiles.str.len()<500, "smiles"]))

# t=time()
# g2=g2.featurize(feature_engine='cu_cat',memoize=True)
# print("featurize: \n"+str(time()-t))

# Time the GPU featurization + UMAP embedding.
t=time()
g4=g2.umap(feature_engine='cu_cat',engine='cuml',metric = "jaccard",
                      n_neighbors = 25,
                      n_components = 2,
                      dbscan=True,
                      # low_memory = True,
                      min_dist = 0.001)
j=time()-t
print("umap: \n"+str(j))
# p=p+'_'+str(i)+':'+str(j)
# g4.plot()



Using GPU: cu_cat




umap: 
46.321837425231934


In [None]:
emb2=g4._node_embedding
# reset_index() places the former index (the node ids) in the first column.
# Bind to that column by its actual name instead of a hard-coded 'ID', which
# only exists if the index happens to be named 'ID'.
emb2_nodes=emb2.reset_index()
g22=graphistry.nodes(emb2_nodes,emb2_nodes.columns[0]).edges(g4._edges,'_src_implicit','_dst_implicit').bind(point_x="x",point_y="y").settings(url_params={"play":0})
# g22.plot()

# Embed BBBP with UMAP

### Read in and process small data

In [None]:
# Read in data from MoleculeNet
bbbp = pd.read_csv("https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/BBBP.csv")

# Clean up column names so they are easier to interpret
bbbp = bbbp[["smiles", "p_np", "name"]].reset_index(drop=True).rename({"p_np": "permeable"}, axis=1)

# Remove extra fragments in SMILES (typically salts, which are irrelevant to BBB permeability)
bbbp["smiles"] = bbbp["smiles"].apply(get_largest_fragment_from_smiles)
# Drop the rows whose SMILES is missing after the step above. Calling .dropna() on the
# Series before assigning it back had no effect, because column assignment realigns
# on the index and re-inserts the missing values.
bbbp = bbbp.dropna(subset=["smiles"])

t=time()
# Compute descriptors and keep track of which failed to featurize
ecfp_descriptors, keep_idx = compute_ecfp_descriptors(bbbp["smiles"])

# Only keep those that successfully featurized (keep_idx is used positionally)
bbbp = bbbp.iloc[keep_idx]
print(time()-t)
print(bbbp.shape)

In [None]:
# One row per compound name; the first occurrence is kept.
BBBP = bbbp.drop_duplicates(subset='name', keep='first')
# Show name and permeability label for rows whose SMILES is longer than 3 characters.
BBBP.loc[BBBP.smiles.str.len() > 3, ['name', 'permeable']]

### ... and with graphistry

In [None]:
# Display the descriptor array returned by compute_ecfp_descriptors in the BBBP loading cell
ecfp_descriptors

In [None]:
# One row per compound name; the first occurrence is kept.
BBBP = bbbp.drop_duplicates(subset='name', keep='first')

# Node table on the GPU: SMILES plus label, restricted to SMILES longer than 3 characters.
g = graphistry.nodes(
    cudf.from_pandas(BBBP.loc[BBBP.smiles.str.len() > 3, ['smiles', 'permeable']])
)

# Time featurization + UMAP.
t = time()
g3 = g.umap(
    feature_engine='cu_cat',
    engine='cuml',
    metric="jaccard",
    n_neighbors=25,
    n_components=2,
    low_memory=False,
    min_dist=0.001,
)
print(f"\n{time() - t}")

In [None]:
# Colour each point by its `permeable` label and render the graph.
g3.encode_point_color(
    'permeable',
    palette=["hotpink", "dodgerblue"],
    as_continuous=True,
).plot()


## using a [universal fingerprint](https://jcheminf.biomedcentral.com/articles/10.1186/s13321-020-00445-4) rather than [morgan fingerprint from 1965](https://pubs.acs.org/doi/abs/10.1021/c160017a018)

In [None]:
import locale
# Replace locale.getpreferredencoding so it always reports UTF-8 before the shell
# installs below run.
# NOTE(review): presumably a workaround for a locale error raised by `!` commands in
# Colab — confirm it is still needed.
def getpreferredencoding(do_setlocale = True):
    """Return "UTF-8" unconditionally; `do_setlocale` is accepted but ignored."""
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

!pip install tmap
!pip install git+https://github.com/reymond-group/map4@v1.0


In [None]:
# !pip install -q condacolab
# import condacolab
condacolab.install()

!conda install -c tmap tmap

In [None]:
from rdkit import Chem
import tmap as tm
from map4 import MAP4Calculator

# Dimension shared by the MAP4 calculator and the MinHash encoder below
dim = 1024

MAP4 = MAP4Calculator(dimensions=dim)
ENC = tm.Minhash(dim)

# Example: compute the MAP4 fingerprint of benzene
smiles_a = 'c1ccccc1'
mol_a = Chem.MolFromSmiles(smiles_a)
map4_a = MAP4.calculate(mol_a)



### ... and with graphistry

In [None]:
# This cell previously read `PPB`, which is only created in the "and larger data"
# section further down, so it raised NameError on a fresh top-to-bottom run.
# It now uses the BBBP frame that is already loaded at this point.
# NOTE(review): BBBP is inferred from the section context — confirm the intended dataset.
g = graphistry.nodes(cudf.from_pandas(BBBP[['smiles']][BBBP.smiles.str.len()>3]))

# Time featurization + UMAP.
t=time()
# g2=g.featurize(feature_engine='cu_cat',memoize=True)
g3=g.umap(feature_engine='cu_cat',engine='cuml',metric = "jaccard",
                      n_neighbors = 25,
                      n_components = 2,
                      low_memory = False,
                      min_dist = 0.001)
print("\n"+str(time()-t))

# Build a graph positioned by the 2-D embedding (x, y) with the layout paused ("play": 0).
emb2=g3._node_embedding
g222=graphistry.nodes(emb2.reset_index(),'index').edges(g3._edges,'_src_implicit','_dst_implicit').bind(point_x="x",point_y="y").settings(url_params={"play":0})
g222.plot()

## and larger data

In [None]:
# Load the QM9 dataset from MoleculeNet (note: the variable is named PPB but holds qm9.csv)
PPB = pd.read_csv("https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/qm9.csv")
# Start timing the descriptor computation
t=time()
#Compute ECFP Descriptors
ecfp_descriptors, keep_idx = compute_ecfp_descriptors(PPB["smiles"])
# Keep only the rows (by position) that were featurized
PPB = PPB.iloc[keep_idx]
print(PPB.shape)
print(time()-t)

### ... and with graphistry

In [None]:
# GPU node table holding the SMILES column, restricted to strings longer than 3 characters.
g = graphistry.nodes(
    cudf.from_pandas(PPB.loc[PPB.smiles.str.len() > 3, ['smiles']])
)

# Time featurization + UMAP.
t = time()
g3 = g.umap(
    feature_engine='cu_cat',
    engine='cuml',
    metric="jaccard",
    n_neighbors=25,
    n_components=2,
    low_memory=False,
    min_dist=0.001,
)

print(f"\n{time() - t}")


In [None]:
# Graph positioned by the 2-D embedding (x, y), with the layout paused ("play": 0).
emb2 = g3._node_embedding
g222 = (
    graphistry.nodes(emb2.reset_index(), 'index')
    .edges(g3._edges, '_src_implicit', '_dst_implicit')
    .bind(point_x="x", point_y="y")
    .settings(url_params={"play": 0})
)
g222.plot()