# Featurize data for UMAP visualization
After curating your dataset, you need to featurize the data by taking the smiles string representations of each compound and translating that into fingerprints or descriptors. During modeling, this happens within the pipeline. But to explore chemical space and do other visualizations, you need to featurize separately. 
- ECFP4 fingerprints (radius 2 / 1024 length and radius 3 / 2048 length)
- Mordred descriptors
- RDKit descriptors

# Load packages, functions & dataset

In [1]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from rdkit import Chem
from rdkit.Chem import AllChem

import deepchem as dc

import atomsci.ddm.pipeline.featurization as feat 

# helper functions to visualize compounds
import tempfile
from rdkit.Chem import Draw
from itertools import islice
from IPython.display import Image, display

def display_images(filenames):
    """Helper to pretty-print images."""
    for file in filenames:
      display(Image(file))

def mols_to_pngs(mols, basename="test"):
    """Helper to write RDKit mols to png files."""
    filenames = []
    for i, mol in enumerate(mols):
        filename = "%s%d.png" % (basename, i)
        Draw.MolToFile(mol, filename)
        filenames.append(filename)
    return filenames

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
dataset_file = "transporters_pIC50_crada_merged.csv"
df = pd.read_csv(dataset_file, index_col = 0)

df

FileNotFoundError: [Errno 2] File b'transporters_pIC50_crada_merged.csv' does not exist: b'transporters_pIC50_crada_merged.csv'

# Morgan ECFP fingerprints
Use rdkit `GetMorganFingerprintAsBitVect()` function. 

In [3]:
mols = [Chem.MolFromSmiles(s) for s in df["base_rdkit_smiles"]]
fprints1024 = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, 1024) for mol in mols]
fprints1024data = np.array(fprints1024, dtype=int)
fprints1024data.shape

(1334, 1024)

In [4]:
df_fprints1024 = df.join(pd.DataFrame(fprints1024data).add_prefix("ecfp2_"))
print(df_fprints1024.shape)
df_fprints1024.to_csv("transporters_morgan_2_1024.csv")

(1334, 1039)


In [5]:
fprints2048 = [AllChem.GetMorganFingerprintAsBitVect(mol, 3, 2048) for mol in mols]
fprints2048data = np.array(fprints2048, dtype=int)
fprints2048data.shape

(1334, 2048)

In [6]:
df_fprints2048 = df.join(pd.DataFrame(fprints2048data).add_prefix("ecfp3_"))
print(df_fprints2048.shape)
df_fprints2048.to_csv("transporters_morgan_3_2048.csv")

(1334, 2063)


In [7]:
df_fprints2048

Unnamed: 0,base_rdkit_smiles,relation_MRP3,pIC50_MRP3,active_MRP3,relation_MRP4,pIC50_MRP4,active_MRP4,relation_BSEP,pIC50_BSEP,active_BSEP,...,ecfp3_2038,ecfp3_2039,ecfp3_2040,ecfp3_2041,ecfp3_2042,ecfp3_2043,ecfp3_2044,ecfp3_2045,ecfp3_2046,ecfp3_2047
3,COc1ccc(N2CCN(Cc3nc(-c4ccc(C(F)(F)F)cc4)sc3CSc...,,4.42,Active,,5.31,Active,,5.550000,Active,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
10,NS(=O)(=O)c1ccc(-c2c(-c3cccc(F)c3)nn3cc(C(F)(F...,,4.85,Active,,4.92,Active,,4.780000,Active,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,CN1CCC(C2=CCN(Cc3cc(Cl)cc(Cl)c3)C2=O)(c2ccc(F)...,<,4.00,Inactive,,4.04,Active,<,4.000000,Inactive,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31,CC(C)[C@@H]1C(=O)N(S(C)(=O)=O)[C@@H]2CCN(C(=O)...,<,4.00,Inactive,<,4.00,Inactive,<,4.000000,Inactive,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33,CC(C)c1onc(-c2c(Cl)cccc2Cl)c1COc1ccc(-c2ccc3nc...,<,4.00,Inactive,,6.00,Active,<,4.000000,Inactive,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1331,CC(=O)NC[C@H]1CN(c2cc(F)c(C3=CCN(C(=O)CO)CC3)c...,,,,,,,<,3.000000,Inactive,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1332,COC(=O)N/N=C/c1c[n+]([O-])c2ccccc2[n+]1[O-],,,,,,,,3.026965,Inactive,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1333,CC(=O)OCC[N+](C)(C)C,,,,,,,<,3.000000,Inactive,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1334,CCN(CC)CCOC(=O)C(c1ccccc1)c1ccccc1,,,,,,,,3.800245,Inactive,...,,,,,,,,,,


# Mordred Descriptors
These take a bit of time to compute, ~5 minutes. Use AMPL `compute_mordred_descriptors_from_smiles()` function.

In [10]:
desc_df, is_valid = feat.compute_mordred_descriptors_from_smiles(df["base_rdkit_smiles"], smiles_col='base_rdkit_smiles')

RDKit ERROR: [11:57:18] UFFTYPER: Unrecognized charge state for atom: 8
RDKit ERROR: [11:57:43] UFFTYPER: Unrecognized charge state for atom: 14
RDKit ERROR: [11:57:44] UFFTYPER: Unrecognized charge state for atom: 8
RDKit ERROR: [11:57:47] UFFTYPER: Unrecognized charge state for atom: 23
RDKit ERROR: [11:58:10] UFFTYPER: Unrecognized charge state for atom: 14
RDKit ERROR: [11:58:40] UFFTYPER: Unrecognized charge state for atom: 20
RDKit ERROR: [11:58:41] UFFTYPER: Unrecognized charge state for atom: 12
RDKit ERROR: [11:59:04] UFFTYPER: Unrecognized charge state for atom: 0
RDKit ERROR: [11:59:06] UFFTYPER: Unrecognized charge state for atom: 3
RDKit ERROR: [11:59:18] UFFTYPER: Unrecognized charge state for atom: 14
RDKit ERROR: [11:59:18] UFFTYPER: Unrecognized charge state for atom: 5
RDKit ERROR: [11:59:30] UFFTYPER: Unrecognized charge state for atom: 5
RDKit ERROR: [11:59:34] UFFTYPER: Unrecognized atom type: Se2+2 (8)
RDKit ERROR: [11:59:34] UFFTYPER: Unrecognized atom type: Se2+

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


In [11]:
print("Number of descriptors:", desc_df.shape[1])
print("Number of smiles successfully featurized:", is_valid.sum())

Number of descriptors: 1557
Number of smiles successfully featurized: 1334


In [12]:
# from AMPL/atomsci/ddm/data/descriptor_sets_sources_by_descr_type.csv:
# we should be using "mordred_filtered" but the only difference between 
# compute_mordred_descriptors_from_smiles() and mordred_filtered is "PBF"
# so, drop column "PBF"
desc_df.drop(columns="PBF", inplace=True)
desc_df.shape

(1334, 1556)

In [13]:
df_mordred = df.merge(desc_df)
print(df_mordred.shape)
df_mordred.to_csv("transporters_mordred_1555.csv")

(1334, 1570)


In [14]:
desc_noNA = desc_df.dropna(axis = "columns")
print(desc_noNA.shape)

(1334, 1013)


In [15]:
mordred_nonan = pd.merge(df, desc_noNA)
mordred_nonan.to_csv('transporters_mordred_1012.csv')
mordred_nonan

Unnamed: 0,base_rdkit_smiles,relation_MRP3,pIC50_MRP3,active_MRP3,relation_MRP4,pIC50_MRP4,active_MRP4,relation_BSEP,pIC50_BSEP,active_BSEP,...,SdO,SssO,SaaO,SsF,SdS,SssS,SaaS,SddssS,SsCl,SsBr
0,Cc1ncc([N+](=O)[O-])n1CC(O)CCl,<,3.88,Inactive,<,3.88,Inactive,<,3.876148,Inactive,...,10.535014,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,5.399566,0.0
1,C[C@@H](O)[C@H]1C(=O)N2C(C(=O)O)=C(S[C@@H]3CN[...,<,3.88,Inactive,<,3.88,Inactive,<,3.876148,Inactive,...,36.249155,0.000000,0.0,0.000000,0.0,1.445549,0.000000,0.0,0.000000,0.0
2,Clc1cccc(N2CCNCC2)c1,<,3.88,Inactive,<,3.88,Inactive,<,3.876148,Inactive,...,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,5.921780,0.0
3,COc1ccc(N2CCN(Cc3nc(-c4ccc(C(F)(F)F)cc4)sc3CSc...,,4.42,Active,,5.31,Active,,5.550000,Active,...,11.400377,10.885459,0.0,39.478052,0.0,1.610467,1.496791,0.0,0.000000,0.0
4,COc1c(N2CCNC(C)C2)c(F)cc2c(=O)c(C(=O)O)cn(C3CC...,<,3.88,Inactive,<,3.88,Inactive,<,3.876148,Inactive,...,24.182095,5.592648,0.0,15.102264,0.0,0.000000,0.000000,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1329,CC(=O)NC[C@H]1CN(c2cc(F)c(C3=CCN(C(=O)CO)CC3)c...,,,,,,,<,3.000000,Inactive,...,34.528693,5.108030,0.0,29.423488,0.0,0.000000,0.000000,0.0,0.000000,0.0
1330,COC(=O)N/N=C/c1c[n+]([O-])c2ccccc2[n+]1[O-],,,,,,,,3.026965,Inactive,...,10.767305,4.296691,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0
1331,CC(=O)OCC[N+](C)(C)C,,,,,,,<,3.000000,Inactive,...,10.300428,4.763333,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0
1332,CCN(CC)CCOC(=O)C(c1ccccc1)c1ccccc1,,,,,,,,3.800245,Inactive,...,12.661534,5.566549,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0


# RDKit descriptors
Use deepchem `featurizer` class with `RDKitDescriptors()`.

In [16]:
featurizer = dc.feat.RDKitDescriptors()

In [17]:
loader = dc.data.CSVLoader(
      tasks=["pIC50_MRP3", "pIC50_MRP4", "pIC50_BSEP"], 
      smiles_field="base_rdkit_smiles",
      id_field="base_rdkit_smiles",
      featurizer=featurizer)
dataset = loader.featurize(dataset_file)

Loading raw samples now.
shard_size: 8192
About to start loading CSV from transporters_pIC50_crada_merged.csv
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
TIMING: featurizing shard 0 took 11.972 s
TIMING: dataset construction took 12.009 s
Loading dataset from disk.


In [18]:
# from deepchem.data.datasets.py function to_dataframe
X = dataset.X
ids = dataset.ids
if len(X.shape) == 1 or X.shape[1] == 1:
  columns = ['X']
else:
  columns = [f'X{i+1}' for i in range(X.shape[1])]
X_df = pd.DataFrame(X, columns=columns)

ids_df = pd.DataFrame(ids, columns=['ids'])
feat_df = pd.concat([ids_df, X_df], axis=1, sort=False)
feat_df.shape

(1334, 112)

In [19]:
# add more columns back in like custom labels
rdkit_df = pd.merge(df, feat_df, left_on = "base_rdkit_smiles", right_on = "ids")
# save to csv
rdkit_df.to_csv('transporters_rdkit_118.csv')

In [20]:
feat_noNA = feat_df.dropna(axis = "columns")
feat_noNA.shape

(1334, 108)

In [21]:
rdkit_nonan = pd.merge(df, feat_noNA, left_on = "base_rdkit_smiles", right_on = "ids")
rdkit_nonan.to_csv('transporters_rdkit_107.csv')
rdkit_nonan

Unnamed: 0,base_rdkit_smiles,relation_MRP3,pIC50_MRP3,active_MRP3,relation_MRP4,pIC50_MRP4,active_MRP4,relation_BSEP,pIC50_BSEP,active_BSEP,...,X102,X103,X104,X105,X106,X107,X108,X109,X110,X111
0,Cc1ncc([N+](=O)[O-])n1CC(O)CCl,<,3.88,Inactive,<,3.88,Inactive,<,3.876148,Inactive,...,5.0,1.0,7.0,4.0,0.0,0.0,0.0,1.0,0.69942,50.3452
1,C[C@@H](O)[C@H]1C(=O)N2C(C(=O)O)=C(S[C@@H]3CN[...,<,3.88,Inactive,<,3.88,Inactive,<,3.876148,Inactive,...,6.0,3.0,9.0,5.0,0.0,2.0,2.0,3.0,-0.30800,96.1113
2,Clc1cccc(N2CCNCC2)c1,<,3.88,Inactive,<,3.88,Inactive,<,3.876148,Inactive,...,2.0,1.0,3.0,1.0,0.0,1.0,1.0,2.0,1.74960,56.5347
3,COc1ccc(N2CCN(Cc3nc(-c4ccc(C(F)(F)F)cc4)sc3CSc...,,4.42,Active,,5.31,Active,,5.550000,Active,...,8.0,1.0,12.0,11.0,0.0,1.0,1.0,5.0,7.69400,171.3818
4,COc1c(N2CCNC(C)C2)c(F)cc2c(=O)c(C(=O)O)cn(C3CC...,<,3.88,Inactive,<,3.88,Inactive,<,3.876148,Inactive,...,6.0,2.0,8.0,4.0,1.0,1.0,2.0,4.0,1.98040,99.6260
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1329,CC(=O)NC[C@H]1CN(c2cc(F)c(C3=CCN(C(=O)CO)CC3)c...,,,,,,,<,3.000000,Inactive,...,5.0,2.0,10.0,5.0,0.0,1.0,1.0,3.0,1.03410,98.9165
1330,COC(=O)N/N=C/c1c[n+]([O-])c2ccccc2[n+]1[O-],,,,,,,,3.026965,Inactive,...,5.0,1.0,8.0,2.0,0.0,0.0,0.0,2.0,-0.20350,65.0027
1331,CC(=O)OCC[N+](C)(C)C,,,,,,,<,3.000000,Inactive,...,2.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.25570,39.5324
1332,CCN(CC)CCOC(=O)C(c1ccccc1)c1ccccc1,,,,,,,,3.800245,Inactive,...,3.0,0.0,3.0,8.0,0.0,0.0,0.0,2.0,3.70350,93.4740


# Visualize chemicals with UMAP

In [8]:
import umap
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import umap.plot
import seaborn as sns

from matplotlib.ticker import MultipleLocator

sns.set_context("poster")
sns.set_style("whitegrid")
sns.set_palette("Set2")
pal = sns.color_palette()
plt.rcParams['figure.figsize'] = [10,10]

In [10]:
# MRP3
morgan31 = df_fprints1024.dropna(subset = ["pIC50_MRP3"]).reset_index(drop=True)
# morgan32 = df_fprints2048.dropna(subset = ["pIC50_MRP3"]).reset_index(drop=True)
# mordred3 = mordred_nonan.dropna(subset = ["pIC50_MRP3"]).reset_index(drop=True)
# rdkit3 = rdkit_nonan.dropna(subset = ["pIC50_MRP3"]).reset_index(drop=True)

# # MRP4
# morgan41 = df_fprints1024.dropna(subset = ["pIC50_MRP4"]).reset_index(drop=True)
# morgan42 = df_fprints2048.dropna(subset = ["pIC50_MRP4"]).reset_index(drop=True)
# mordred4 = mordred_nonan.dropna(subset = ["pIC50_MRP4"]).reset_index(drop=True)
# rdkit4 = rdkit_nonan.dropna(subset = ["pIC50_MRP4"]).reset_index(drop=True)

# # BSEP
# morganb1 = df_fprints1024.dropna(subset = ["pIC50_BSEP"]).reset_index(drop=True)
# morganb2 = df_fprints2048.dropna(subset = ["pIC50_BSEP"]).reset_index(drop=True)
# mordredb = mordred_nonan.dropna(subset = ["pIC50_BSEP"]).reset_index(drop=True)
# rdkitb = rdkit_nonan.dropna(subset = ["pIC50_BSEP"]).reset_index(drop=True)

In [14]:
morgan31.columns#[1:20]

Index(['base_rdkit_smiles', 'relation_MRP3', 'pIC50_MRP3', 'active_MRP3',
       'relation_MRP4', 'pIC50_MRP4', 'active_MRP4', 'relation_BSEP',
       'pIC50_BSEP', 'active_BSEP',
       ...
       'ecfp2_1014', 'ecfp2_1015', 'ecfp2_1016', 'ecfp2_1017', 'ecfp2_1018',
       'ecfp2_1019', 'ecfp2_1020', 'ecfp2_1021', 'ecfp2_1022', 'ecfp2_1023'],
      dtype='object', length=1039)

In [28]:
# here i choose to plot MRP4 data
plot_df = morgan31       # plot with whole df

map_df = plot_df.loc[:,"ABC":"SsBr"] # umap on feat values only

feat_type = "Mordred"
dataset_name = "MRP4"
labelcol = "active_MRP4"
valuecol = "pIC50_MRP4"
labelcols = ["active_MRP4", "pIC50_MRP4", "compound_id", "Morgan", "SMDC", "Prop"]

In [29]:
# run the rest of the plotting code without changing anything
mapper = umap.UMAP(n_neighbors=15, n_components=2, metric='jaccard').fit(map_df)

umap_coords = pd.DataFrame(mapper.embedding_, columns=("UMAP_X", "UMAP_Y"))
umap_coords = umap_coords.join(plot_df[labelcols])

  "inverse_transform will be unavailable".format(self.metric)


In [None]:
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(20,8), gridspec_kw={'width_ratios': [8, 10]})

# active labels
sclab = ax1.scatter(umap_coords["UMAP_X"], umap_coords["UMAP_Y"],
            s=15, c=umap_coords[labelcol],  cmap="spring", )
ax1.set_facecolor('lightgray')
ax1.legend(*sclab.legend_elements(), loc="lower left")
ax1.title.set_text(f"{feat_type} UMAP with {labelcol} labels")

# pIC50 value labels
umap.plot._matplotlib_points(mapper.embedding_, 
            ax = ax2,
            values=plot_df[valuecol],
            cmap="jet",
            background="lightgray",
            width=8, height=10)
ax2.title.set_text(f"{feat_type} UMAP with {valuecol} values")

# color bar properties
norm = matplotlib.colors.Normalize(vmin=plot_df.loc[:, valuecol].min(), vmax=plot_df.loc[:,valuecol].max())
mappable = cm.ScalarMappable(norm=norm, cmap='jet')
fig.colorbar(mappable, ax = ax2);

plt.savefig(f"{dataset_name}_{feat_type}_UMAP.png", dpi=300)

In [None]:
hover_data = pd.DataFrame({'CID':plot_df.compound_id, 
                           'active':plot_df[labelcol],
                           'pIC50':plot_df[valuecol]})
umap.plot.output_notebook()
p = umap.plot.interactive(mapper, labels=plot_df[valuecol], hover_data=hover_data, point_size=5, color_key_cmap = 'jet', background = 'lightgray')
umap.plot.show(p)

In [None]:
# diagnostic plots
#ax1 = umap.plot.diagnostic(mapper, diagnostic_type='pca')
#ax2 = umap.plot.diagnostic(mapper, diagnostic_type='local_dim')
#ax3 = umap.plot.diagnostic(mapper, diagnostic_type='neighborhood')
#ax4 = umap.plot.connectivity(mapper, show_points=True)
#ax5 = umap.plot.connectivity(mapper, edge_bundling='hammer')

# Visualize groups of molecules on plot

In [None]:
fig, ax = plt.subplots(figsize = (12,12))
ax=umap.plot._matplotlib_points(mapper.embedding_,
                             ax=ax,
                             labels=plot_df[labelcol],
                             color_key_cmap="spring",
                             background="lightgray")
ax.title.set_text(f"{feat_type} UMAP with {labelcol} labels")

# Set minor tick locations.
ax.yaxis.set_minor_locator(MultipleLocator(0.5))
ax.xaxis.set_minor_locator(MultipleLocator(0.5))

# Set grid to use minor tick locations. 
ax.grid(which = 'both', axis = 'both', linewidth=1)

In [None]:
ids = umap_coords.loc[
    ((umap_coords['UMAP_X'] >= 7) & 
     (umap_coords['UMAP_X'] <= 8)) & 
    
    ((umap_coords['UMAP_Y'] >= 13.5) & 
     (umap_coords['UMAP_Y'] <= 14))]

ids

In [None]:
# plot actives
viz=plot_df[plot_df['compound_id'].isin(ids['compound_id'])]
viz = viz[viz[labelcol]==1] # select actives
viz

In [None]:
molecules = []
for _, data in islice(viz.iterrows(), len(viz)):
    molecules.append(Chem.MolFromSmiles(data["base_rdkit_smiles"]))
    
display_images(mols_to_pngs(molecules))

In [None]:
# plot inactives
viz=plot_df[plot_df['compound_id'].isin(ids['compound_id'])]
viz = viz[viz[labelcol]==0] # select inactives
molecules = []
for _, data in islice(viz.iterrows(), len(viz)):
    molecules.append(Chem.MolFromSmiles(data["base_rdkit_smiles"]))
    
display_images(mols_to_pngs(molecules))

In [None]:
from IPython.display import Image
from IPython.display import display

w = Image(filename='UMAP_ECFP_1024_MRP3.png') 
x = Image(filename='UMAP_ECFP_2048_MRP3.png') 
y = Image(filename='UMAP_Mordred_MRP3.png') 
z = Image(filename='UMAP_RDKit_MRP3.png') 
display(w, x, y, z)

In [3]:
# start with featurized matrix of compounds x features: df

# subset matrix to be only Tox21 - also subset for feature columns only
df_tox21=df[df["dataset_type"]=="Tox21"]
df_tox21 = [["feature columns"]]

# create a mapper to calculate dimensionality reduction on Tox21-specific space
mapper = umap.UMAP(n_neighbors=15, n_components=2, metric='jaccard').fit(df_tox21)

# use the mapper to transform all of the data (feature columns only)
# into Tox21-dimension-reduced-space
all_mapped = mapper.transform(df[["feature columns"]])

# plot data - this is how to plot it directly from the UMAP object
fig, ax = plt.subplots(figsize = (12,12))
ax=umap.plot._matplotlib_points(mapper.embedding_,
                             ax=ax,
                             labels=plot_df["dataset_type"],
                             color_key_cmap="spring",
                             background="lightgray")
ax.title.set_text(f"{feat_type} UMAP with {labelcol} labels")

# you can also extract the data from the UMAP object and plot it 
# more freely, examples in notebook


NameError: name 'df' is not defined