## Introduction

In this notebook, I generate [MolCLR](https://github.com/yuyangw/MolCLR) embeddings for ECBD compounds based on their SMILES strings. MolCLR uses graph-based contrastive learning to produce general-purpose molecular representations that can capture complex chemical structure patterns.

Main steps:
- Load SMILES strings from the preprocessed compound dataset.
- Convert SMILES to molecular graphs using RDKit.
- Use a pretrained MolCLR encoder to generate embeddings with dimensionality 512.
- Save the resulting feature vectors for downstream classification tasks.

In [17]:
import os
import glob
from datetime import date

import numpy as np
import pandas as pd
from tqdm import tqdm

In [18]:
# paths to data
# directory that is searched for the input *_ecbd.csv file and that receives the output CSV
save_path = "result/"

# number of embedding columns (fp_0 ... fp_<n-1>); also passed to GINet as feat_dim
features_count = 512

In [19]:
# loading our initial dataset: any file in save_path whose name ends with "<digit>_ecbd.csv"
file_ecbd_type = '/*[0-9]_' + 'ecbd.csv'
files_ecbd = glob.glob(save_path + file_ecbd_type)

# fail early with a readable message; max() on an empty list would only report
# "max() arg is an empty sequence"
if not files_ecbd:
    raise FileNotFoundError(
        f"No file matching '{save_path + file_ecbd_type}' was found; "
        "create the preprocessed ECBD dataset before running this notebook."
    )

# gets latest file, ranked by os.path.getctime
# NOTE(review): getctime is the creation time on Windows but the last metadata
# change time on Unix — confirm this is the intended notion of "latest".
max_file_ecbd = max(files_ecbd, key=os.path.getctime)

# load file
df_ecbd = pd.read_csv(max_file_ecbd)
df_ecbd

Unnamed: 0,Metadata_EOS,Metadata_pdid,Metadata_name,Metadata_smiles,Metadata_inchi,Metadata_inchikey,probe,experimental probe,calculated probe,available,...,moa_potentiator,moa_stabiliser,moa_stimulator,moa_substrate,drug_status_approved,drug_status_experimental,drug_status_investigational,drug_status_nutraceutical,drug_status_vet_approved,drug_status_withdrawn
0,EOS101163,PD000002,I-BRD9,CCn1cc(-c2cccc(C(F)(F)F)c2)c2sc(C(=N)NC3CCS(=O...,InChI=1S/C22H22F3N3O3S2/c1-2-28-12-17(13-4-3-5...,WRUWGLUCNBMGPS-UHFFFAOYSA-N,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,EOS101593,PD000003,UNC0638,COc1cc2c(NC3CCN(C(C)C)CC3)nc(C3CCCCC3)nc2cc1OC...,InChI=1S/C30H47N5O2/c1-22(2)35-17-12-24(13-18-...,QOECJCJVIMVJGX-UHFFFAOYSA-N,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,EOS101154,PD000005,UNC1215,O=C(c1ccc(C(=O)N2CCC(N3CCCC3)CC2)c(Nc2ccccc2)c...,InChI=1S/C32H43N5O2/c38-31(36-20-12-27(13-21-3...,PQOOIERVZAXHBP-UHFFFAOYSA-N,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,EOS101601,PD000007,IOX1,O=C(O)c1ccc(O)c2ncccc12,InChI=1S/C10H7NO3/c12-8-4-3-7(10(13)14)6-2-1-5...,JGRPKOGHYBAVMW-UHFFFAOYSA-N,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,EOS101116,PD000008,IOX2,O=C(O)CNC(=O)c1c(O)c2ccccc2n(Cc2ccccc2)c1=O,InChI=1S/C19H16N2O5/c22-15(23)10-20-18(25)16-1...,CAOSCCRYLYQBES-UHFFFAOYSA-N,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2459,EOS102392,PD164271,PD164271,N#CCc1ccccn1,"InChI=1S/C7H6N2/c8-5-4-7-3-1-2-6-9-7/h1-3,6H,4H2",UKVQBONVSSLJBB-UHFFFAOYSA-N,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2460,EOS102394,PD164272,ALPHA-HEXACHLOROCYCLOHEXANE,Cl[C@H]1[C@H](Cl)[C@H](Cl)[C@@H](Cl)[C@H](Cl)[...,InChI=1S/C6H6Cl6/c7-1-2(8)4(10)6(12)5(11)3(1)9...,JLYXXMFPNIAWKQ-LKPKBOIGSA-N,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2461,EOS102396,PD164274,PD164274,O=C(O)/C=C1/CCCc2ccccc2[C@@H]1O,InChI=1S/C13H14O3/c14-12(15)8-10-6-3-5-9-4-1-2...,UADPGHINQMWEAG-CHOZFAJLSA-N,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2462,EOS102400,PD164275,PD164275,CC1CCC2C(=O)OC(=O)C2C1,InChI=1S/C9H12O3/c1-5-2-3-6-7(4-5)9(11)12-8(6)...,FKBMTBAXDISZGN-UHFFFAOYSA-N,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


Let's use code from MolCLR repository to convert SMILES strings into molecular graphs:

In [20]:
from rdkit import Chem
import torch
import numpy as np
from torch_geometric.data import Data

# List of features for MolCLR
# Each list is a lookup table: smiles2graph stores the *position* of a value in
# its list (via list.index), not the value itself.

# Atomic numbers 1..118, so the stored index is atomic number - 1.
ATOM_LIST = list(range(1, 119))
# Atom chirality tags.
# NOTE(review): this list has 4 entries, while the loaded model prints
# x_embedding2 as Embedding(3, 300); index 3 (CHI_OTHER) would presumably be
# out of range for that table — confirm against the GINet implementation.
CHIRALITY_LIST = [
    Chem.rdchem.ChiralType.CHI_UNSPECIFIED,
    Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW,
    Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW,
    Chem.rdchem.ChiralType.CHI_OTHER
]
# Bond types; a bond type outside this list makes list.index raise ValueError.
BOND_LIST = [
    Chem.rdchem.BondType.SINGLE,
    Chem.rdchem.BondType.DOUBLE,
    Chem.rdchem.BondType.TRIPLE,
    Chem.rdchem.BondType.AROMATIC
]
# Bond directions; a direction outside this list makes list.index raise ValueError.
BONDDIR_LIST = [
    Chem.rdchem.BondDir.NONE,
    Chem.rdchem.BondDir.ENDUPRIGHT,
    Chem.rdchem.BondDir.ENDDOWNRIGHT
]

def smiles2graph(smiles: str) -> Data:
    """Convert one SMILES string into a single-molecule PyTorch Geometric graph.

    Node features ``x`` have shape ``(num_atoms, 2)``: the position of the atomic
    number in ``ATOM_LIST`` and the position of the chirality tag in
    ``CHIRALITY_LIST``. Every bond is stored twice (i -> j and j -> i), so
    ``edge_index`` has shape ``(2, 2 * num_bonds)`` and ``edge_attr`` has shape
    ``(2 * num_bonds, 2)``: the position of the bond type in ``BOND_LIST`` and of
    the bond direction in ``BONDDIR_LIST``.

    Args:
        smiles: SMILES string of one molecule.

    Returns:
        A ``Data`` object with ``x``, ``edge_index``, ``edge_attr`` and an
        all-zero ``batch`` vector (all atoms belong to graph 0).

    Raises:
        AssertionError: if RDKit cannot parse ``smiles``.
        ValueError: if an atom or bond property is missing from the lookup lists
            (raised by ``list.index``).
    """
    mol = Chem.MolFromSmiles(smiles)
    assert mol is not None, f"Invalid SMILES: {smiles}"

    # Vertices (x): atomic number + chirality
    type_idx = []
    chirality_idx = []
    for atom in mol.GetAtoms():
        type_idx.append(ATOM_LIST.index(atom.GetAtomicNum()))
        chirality_idx.append(CHIRALITY_LIST.index(atom.GetChiralTag()))
    x = torch.tensor(np.stack([type_idx, chirality_idx], axis=1), dtype=torch.long)

    # Edges (edge_index) and features (edge_attr): connection type and direction
    row, col, edge_feat = [], [], []
    for bond in mol.GetBonds():
        i, j = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        row += [i, j]
        col += [j, i]
        bt = BOND_LIST.index(bond.GetBondType())
        bd = BONDDIR_LIST.index(bond.GetBondDir())
        edge_feat += [[bt, bd], [bt, bd]]  # both directions

    edge_index = torch.tensor([row, col], dtype=torch.long)
    # Explicit reshape keeps the two-column layout for molecules without bonds
    # (single atoms, ions): torch.tensor([]) alone would be 1-D with shape (0,).
    # The row count is spelled out because reshape(-1, 2) is ambiguous for an
    # empty tensor.
    edge_attr = torch.tensor(edge_feat, dtype=torch.long).reshape(len(edge_feat), 2)

    data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)
    data.batch = torch.zeros(x.size(0), dtype=torch.long)  # batch with size 1

    return data

We will use a pretrained MolCLR model:

In [21]:
import torch
from mol_crl.ginet_molclr import GINet

# Build the encoder; feat_dim=features_count sets the output width of feat_lin
# (512 in the printed model below).
# NOTE(review): num_layer / emb_dim presumably have to match the checkpoint —
# confirm against the MolCLR pretraining config.
model = GINet(num_layer=5, emb_dim=300, feat_dim=features_count, drop_ratio=0.0, pool="mean")
# map_location="cpu" loads the stored tensors onto the CPU.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
model.load_state_dict(torch.load("mol_crl/model.pth", map_location="cpu"))
# Switch to evaluation mode; eval() returns the model, so its repr becomes the cell output.
model.eval()

GINet(
  (x_embedding1): Embedding(119, 300)
  (x_embedding2): Embedding(3, 300)
  (gnns): ModuleList(
    (0-4): 5 x GINEConv()
  )
  (batch_norms): ModuleList(
    (0-4): 5 x BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (feat_lin): Linear(in_features=300, out_features=512, bias=True)
  (out_lin): Sequential(
    (0): Linear(in_features=512, out_features=512, bias=True)
    (1): ReLU(inplace=True)
    (2): Linear(in_features=512, out_features=256, bias=True)
  )
)

In [22]:
# embed a single SMILES string with the pretrained encoder
@torch.no_grad()
def smiles_to_embedings(smiles):
    """Return the encoder's feature vector for one SMILES string as a NumPy array.

    The string is converted with ``smiles2graph`` and passed through the global
    ``model``. Of the two tensors the model returns, only the first is kept;
    size-1 dimensions are squeezed away before the conversion to NumPy.
    Gradient tracking is disabled by the decorator.
    """
    graph = smiles2graph(smiles)
    features, _unused = model(graph)
    squeezed = features.squeeze()
    return squeezed.numpy()

In [23]:
# run the encoder over every SMILES string; tqdm.pandas() is what makes
# progress_apply (apply with a progress bar) available on the Series
tqdm.pandas()
embedings = df_ecbd['Metadata_smiles'].progress_apply(smiles_to_embedings)

# stack the per-compound vectors into one matrix with fp_-prefixed columns,
# reusing the index of df_ecbd so the rows line up with the original DataFrame
df_embedings = pd.DataFrame(
    data=np.vstack(embedings),
    index=df_ecbd.index,
    columns=[f'fp_{i}' for i in range(features_count)],
)
df_embedings

100%|██████████| 2464/2464 [00:11<00:00, 221.01it/s]


Unnamed: 0,fp_0,fp_1,fp_2,fp_3,fp_4,fp_5,fp_6,fp_7,fp_8,fp_9,...,fp_502,fp_503,fp_504,fp_505,fp_506,fp_507,fp_508,fp_509,fp_510,fp_511
0,0.019976,-0.011272,-0.008687,-0.098019,-0.159003,-0.002874,0.005149,0.358278,0.221513,-0.026369,...,-0.009652,-0.009014,-0.004872,0.001793,0.006755,-0.015224,-0.003066,0.003297,0.004927,0.005235
1,0.020783,-0.014937,-0.007422,-0.061525,-0.192063,-0.003884,0.004853,0.527803,0.224431,-0.027390,...,-0.004147,-0.011597,0.000007,-0.000260,0.006644,-0.014582,-0.005006,0.003152,0.009103,0.003386
2,0.020444,-0.014176,-0.008218,-0.064312,-0.238408,-0.001753,0.005848,0.441818,0.237485,-0.028086,...,-0.005562,-0.010420,-0.001604,0.000005,0.006492,-0.015744,-0.004839,0.005303,0.006258,0.003480
3,0.022697,-0.010005,-0.004576,0.015846,-0.200103,-0.004072,0.010034,0.629596,0.203441,-0.030663,...,-0.005991,-0.010519,0.000367,-0.004945,0.009628,-0.012861,-0.008997,0.003438,0.011160,0.006716
4,0.020307,-0.009704,-0.006840,0.008118,-0.209203,-0.001726,0.008003,0.533234,0.202700,-0.030358,...,-0.008961,-0.007427,-0.003365,-0.002786,0.007068,-0.014306,-0.006787,0.005757,0.005799,0.006099
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2459,0.021565,-0.008765,-0.009557,-0.074770,-0.052879,-0.001751,0.011400,0.404451,0.125579,-0.030505,...,-0.004179,-0.003555,-0.012376,0.000566,0.004126,-0.015840,0.002919,0.007134,0.004434,0.008699
2460,0.019851,-0.014923,-0.009927,0.048902,-0.147366,-0.007193,0.003019,0.451565,0.218986,-0.024493,...,0.003403,-0.016317,0.010066,0.004196,0.000373,-0.019137,-0.002833,-0.007359,0.013387,-0.003956
2461,0.021020,-0.009558,-0.008151,-0.011575,-0.138371,-0.002516,0.007459,0.429901,0.178988,-0.025446,...,-0.004886,-0.007972,-0.001957,-0.001061,0.006149,-0.013120,-0.006037,0.004062,0.007926,0.005134
2462,0.019772,-0.013306,-0.010161,-0.066725,-0.113565,-0.003066,0.005415,0.487236,0.129863,-0.023799,...,-0.006399,-0.005893,-0.001018,0.002218,0.004813,-0.016106,-0.002260,0.003595,0.005777,0.004184


In [24]:
# place the fp_* embedding columns to the right of the original compound columns;
# rows are aligned on the index the two DataFrames share
df_ecbd_embedings = pd.concat(objs=[df_ecbd, df_embedings], axis="columns")
df_ecbd_embedings

Unnamed: 0,Metadata_EOS,Metadata_pdid,Metadata_name,Metadata_smiles,Metadata_inchi,Metadata_inchikey,probe,experimental probe,calculated probe,available,...,fp_502,fp_503,fp_504,fp_505,fp_506,fp_507,fp_508,fp_509,fp_510,fp_511
0,EOS101163,PD000002,I-BRD9,CCn1cc(-c2cccc(C(F)(F)F)c2)c2sc(C(=N)NC3CCS(=O...,InChI=1S/C22H22F3N3O3S2/c1-2-28-12-17(13-4-3-5...,WRUWGLUCNBMGPS-UHFFFAOYSA-N,1,1,1,1,...,-0.009652,-0.009014,-0.004872,0.001793,0.006755,-0.015224,-0.003066,0.003297,0.004927,0.005235
1,EOS101593,PD000003,UNC0638,COc1cc2c(NC3CCN(C(C)C)CC3)nc(C3CCCCC3)nc2cc1OC...,InChI=1S/C30H47N5O2/c1-22(2)35-17-12-24(13-18-...,QOECJCJVIMVJGX-UHFFFAOYSA-N,1,1,1,1,...,-0.004147,-0.011597,0.000007,-0.000260,0.006644,-0.014582,-0.005006,0.003152,0.009103,0.003386
2,EOS101154,PD000005,UNC1215,O=C(c1ccc(C(=O)N2CCC(N3CCCC3)CC2)c(Nc2ccccc2)c...,InChI=1S/C32H43N5O2/c38-31(36-20-12-27(13-21-3...,PQOOIERVZAXHBP-UHFFFAOYSA-N,1,1,1,1,...,-0.005562,-0.010420,-0.001604,0.000005,0.006492,-0.015744,-0.004839,0.005303,0.006258,0.003480
3,EOS101601,PD000007,IOX1,O=C(O)c1ccc(O)c2ncccc12,InChI=1S/C10H7NO3/c12-8-4-3-7(10(13)14)6-2-1-5...,JGRPKOGHYBAVMW-UHFFFAOYSA-N,1,1,0,1,...,-0.005991,-0.010519,0.000367,-0.004945,0.009628,-0.012861,-0.008997,0.003438,0.011160,0.006716
4,EOS101116,PD000008,IOX2,O=C(O)CNC(=O)c1c(O)c2ccccc2n(Cc2ccccc2)c1=O,InChI=1S/C19H16N2O5/c22-15(23)10-20-18(25)16-1...,CAOSCCRYLYQBES-UHFFFAOYSA-N,1,1,0,1,...,-0.008961,-0.007427,-0.003365,-0.002786,0.007068,-0.014306,-0.006787,0.005757,0.005799,0.006099
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2459,EOS102392,PD164271,PD164271,N#CCc1ccccn1,"InChI=1S/C7H6N2/c8-5-4-7-3-1-2-6-9-7/h1-3,6H,4H2",UKVQBONVSSLJBB-UHFFFAOYSA-N,0,0,0,1,...,-0.004179,-0.003555,-0.012376,0.000566,0.004126,-0.015840,0.002919,0.007134,0.004434,0.008699
2460,EOS102394,PD164272,ALPHA-HEXACHLOROCYCLOHEXANE,Cl[C@H]1[C@H](Cl)[C@H](Cl)[C@@H](Cl)[C@H](Cl)[...,InChI=1S/C6H6Cl6/c7-1-2(8)4(10)6(12)5(11)3(1)9...,JLYXXMFPNIAWKQ-LKPKBOIGSA-N,0,0,0,1,...,0.003403,-0.016317,0.010066,0.004196,0.000373,-0.019137,-0.002833,-0.007359,0.013387,-0.003956
2461,EOS102396,PD164274,PD164274,O=C(O)/C=C1/CCCc2ccccc2[C@@H]1O,InChI=1S/C13H14O3/c14-12(15)8-10-6-3-5-9-4-1-2...,UADPGHINQMWEAG-CHOZFAJLSA-N,0,0,0,1,...,-0.004886,-0.007972,-0.001957,-0.001061,0.006149,-0.013120,-0.006037,0.004062,0.007926,0.005134
2462,EOS102400,PD164275,PD164275,CC1CCC2C(=O)OC(=O)C2C1,InChI=1S/C9H12O3/c1-5-2-3-6-7(4-5)9(11)12-8(6)...,FKBMTBAXDISZGN-UHFFFAOYSA-N,0,0,0,1,...,-0.006399,-0.005893,-0.001018,0.002218,0.004813,-0.016106,-0.002260,0.003595,0.005777,0.004184


In [26]:
# write the combined table (metadata + fp_* columns) to a date-stamped CSV in save_path;
# the row index is not written
filename = f"{save_path}{date.today()}_ecbd_molcrl.csv"
df_ecbd_embedings.to_csv(filename, index=False)

## Summary
- Successfully generated 512-dimensional MolCLR embeddings for all valid compounds.
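- Stored the embeddings as `fp_0`–`fp_511` columns next to the original compound metadata (rows keep the integer index of the input dataframe) and saved the combined table to a date-stamped CSV file.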
- Output: high-dimensional molecular features ready for integration with other modalities (e.g., Cell Painting, MoA labels).