### Download training data from the web

Download a tab-separated (TSV) file containing molecular data for training.

In [33]:
import base64
import requests

# Source table: tab-separated text; the first row is a header
# (cmpdname, mf, isosmiles, xlogp).
input_data="https://raw.githubusercontent.com/dgront/chem-ml/refs/heads/main/INPUTS/xlogp_JChemEdu/xlogp.tsv"
# A timeout keeps the cell from hanging forever on a stalled connection.
req = requests.get(input_data, timeout=60)
# Fail loudly on HTTP errors (404, 500, ...) instead of silently parsing an error page.
req.raise_for_status()

# Keep only well-formed rows: exactly four tab-separated fields and a non-empty SMILES.
table = []
for row in req.text.splitlines():
  tokens = row.split("\t")
  if len(tokens) != 4: continue
  if len(tokens[2]) == 0: continue
  table.append(tokens)

print(table[0:5])
print(len(table))

[['cmpdname', 'mf', 'isosmiles', 'xlogp'], ['1-Aminopropan-2-ol', 'C3H9NO', 'CC(CN)O', '-1'], ['"1-Chloro-2,4-dinitrobenzene"', 'C6H3ClN2O4', 'C1=CC(=C(C=C1[N+](=O)[O-])[N+](=O)[O-])Cl', '2.3'], ['9-Ethyladenine', 'C7H9N5', 'CCN1C=NC2=C(N=CN=C21)N', '0.2'], ['"1,2-Dichloroethane"', 'C2H4Cl2', 'C(CCl)Cl', '1.5']]
266712


### Install dependencies

We need RDKit to process SMILES strings and TensorFlow to prepare the training data as tensors.

In [34]:
!pip3.11 install numpy rdkit-pypi
from rdkit import Chem
import tensorflow as tf
import numpy as np



### Extract features from SMILES

First we define a function that accepts a SMILES string and returns the data we need to train a GNN model. For a molecule of N atoms and M bonds, the function returns:

*   list of element names for all atoms - N strings like "C", "Fe", etc
*   list of hybridisation states for all atoms - N strings like "SP2", "SP3", etc.
*   list of bond types for all bonds - M tuples (int, int, str) for a bond that connects atoms given by the two indexes; the string may be "SINGLE", "DOUBLE" etc to denote the bond type



In [35]:
from typing import List, Tuple

def extract_atoms_and_bonds(smiles: str) -> Tuple[List[str], List[str], List[Tuple[int, int, str]]]:
    """Turn a SMILES string into per-atom and per-bond descriptors.

    The SMILES is parsed with RDKit and ``Chem.AddHs`` is applied before the
    atoms and bonds are listed.

    Args:
        smiles: SMILES representation of a molecule.

    Returns:
        A tuple ``(atom_list, hybrid_list, bond_list)``:
        ``atom_list`` holds the element symbol of every atom,
        ``hybrid_list`` holds the string form of every atom's hybridization
        (same order as ``atom_list``), and ``bond_list`` holds one
        ``(begin_atom_index, end_atom_index, bond_type_string)`` tuple per bond.

    Raises:
        ValueError: if RDKit cannot parse ``smiles``.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is None:
        raise ValueError("Invalid SMILES string.")

    molecule = Chem.AddHs(parsed)

    # Per-atom descriptors; both lists follow the molecule's atom order
    atom_list = [a.GetSymbol() for a in molecule.GetAtoms()]
    hybrid_list = [str(a.GetHybridization()) for a in molecule.GetAtoms()]

    # Per-bond descriptors: the two atom indexes plus the bond type as text
    # (e.g. SINGLE, DOUBLE, etc.)
    bond_list = [
        (b.GetBeginAtomIdx(), b.GetEndAtomIdx(), str(b.GetBondType()))
        for b in molecule.GetBonds()
    ]

    return atom_list, hybrid_list, bond_list

**Define the lists of elements, bond types and hybridization states that will be explicitly encoded** in a one-hot manner. Here we list only the most common elements. Every element that is not on the list will be encoded as *unknown*.

In [36]:
# Vocabularies for one-hot encoding; the position in each list is the one-hot slot.
allowed_bond_types = ['SINGLE', 'DOUBLE', 'TRIPLE', 'AROMATIC']
allowed_elements = [
    'C', 'N', 'O', 'H', 'Cl', 'P', 'S', 'As',
    'Br', 'Ca', 'Se', 'I', 'K', 'Mg', 'Na', 'Ni',
    'W', 'F', 'B', 'Hg', 'Al', 'Li', 'Zn', 'Si',
    'Co', 'Pb', 'Sn', 'Cu', 'Ba', 'Fe', 'Mn', 'Cr',
]
allowed_hybridization = ['SP', 'SP2', 'SP3', 'SP3D', 'SP3D2', 'S', 'UNSPECIFIED']

# token -> position lookup tables, one per vocabulary
allowed_bond_indexed = dict(zip(allowed_bond_types, range(len(allowed_bond_types))))
allowed_elements_indexed = dict(zip(allowed_elements, range(len(allowed_elements))))
allowed_hybridization_indexed = dict(zip(allowed_hybridization, range(len(allowed_hybridization))))

A function that provides **one-hot encoding of a string** by finding its position on a list of K allowed strings. If not found, the given `input_string` is encoded as an extra (K+1)-th category, so the output vector always has K+1 entries.

In [37]:
import numpy as np

def one_hot_encode(allowed_tokens, input_string):
    """
    Build a one-hot vector for a token using a token -> position lookup.

    Args:
        allowed_tokens: dict mapping every known token to its slot index.
        input_string: the token to encode.

    Returns:
        np.ndarray: 1D integer array of length ``len(allowed_tokens) + 1`` with
        a single 1. Tokens missing from ``allowed_tokens`` set the last slot.
    """
    # The slot right after the known tokens is reserved for "unknown"
    unknown_slot = len(allowed_tokens)
    hot_slot = allowed_tokens[input_string] if input_string in allowed_tokens else unknown_slot

    encoded = np.zeros(unknown_slot + 1, dtype=int)
    encoded[hot_slot] = 1
    return encoded

# ---- a short test: print the encoded vectors for two elements and one bond type
for lookup, sample_token in (
    (allowed_elements_indexed, "Cl"),
    (allowed_elements_indexed, "C"),
    (allowed_bond_indexed, "DOUBLE"),
):
    print(tf.convert_to_tensor(one_hot_encode(lookup, sample_token)))


tf.Tensor([0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], shape=(33,), dtype=int64)
tf.Tensor([1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], shape=(33,), dtype=int64)
tf.Tensor([0 1 0 0 0], shape=(5,), dtype=int64)


Load the SMILES dataset; use the extract_atoms_and_bonds() function to extract bonds and atoms

In [38]:
import tensorflow as tf

V_features = []       # features for atoms = element one-hot + hybridisation one-hot
E_indexing = []       # bond indexes: one (2, M) tensor per molecule (row 0 = begin atom, row 1 = end atom)
E_features = []       # bond features: bond type one-hot
Y_labels   = []       # log(p) - the value we predict
names      = []       # names of the compounds (just in case)
formulas   = []       # formulas of the compounds (just in case)

# Iterate over the first 999 data rows (table[0] is the header row).
# A record is stored only if BOTH its SMILES and its log(p) parse; every
# failure path ends in `continue`, so a bad record can never be stored with
# the atoms, bonds or label left over from the previous iteration.
for name, formula, smiles, logp in table[1:1000]:
    # get atoms and bonds data for that SMILES
    try:
        atoms, hybridization, bonds = extract_atoms_and_bonds(smiles)
    except Exception:
        # best effort: report the record and move on (KeyboardInterrupt still propagates)
        print("Problematic SMILES:",smiles)
        continue
    if len(atoms) < 3 or len(bonds) < 2: continue   # skip if the compound is too small

    # the label column may hold non-numeric placeholders such as NULL
    try:
        logp_value = float(logp)
    except (TypeError, ValueError):
        print("can't parse log(p):",logp)
        continue

    # atom features: [element one-hot | hybridisation one-hot], one row per atom
    encoded_elements = tf.stack([one_hot_encode(allowed_elements_indexed, a) for a in atoms])
    encoded_hybrid = tf.stack([one_hot_encode(allowed_hybridization_indexed, h) for h in hybridization])
    encoded_atoms = tf.concat([encoded_elements, encoded_hybrid], axis=1)

    # bond indexing: list of (begin, end) pairs transposed to shape (2, M)
    indexing = tf.convert_to_tensor([(bi, bj) for (bi, bj, _t) in bonds])
    indexing = tf.transpose(indexing)

    # bond features: bond type one-hot, one row per bond
    encoded_bonds = tf.stack([one_hot_encode(allowed_bond_indexed, b[2]) for b in bonds])
    logp_flt = tf.convert_to_tensor([logp_value], dtype=tf.float32)

    # append to all parallel lists together so they stay index-aligned
    V_features.append(encoded_atoms)
    E_indexing.append(indexing)
    E_features.append(encoded_bonds)
    Y_labels.append(logp_flt)
    names.append(name)
    formulas.append(formula)



can't parse log(p): NULL
can't parse log(p): NULL
can't parse log(p): NULL




can't parse log(p): NULL
can't parse log(p): NULL
can't parse log(p): NULL
can't parse log(p): NULL
can't parse log(p): NULL
can't parse log(p): NULL
can't parse log(p): NULL
can't parse log(p): NULL
can't parse log(p): NULL
can't parse log(p): NULL
can't parse log(p): NULL
can't parse log(p): NULL
can't parse log(p): NULL
can't parse log(p): NULL


In [39]:
# ---- a short test: show formula, atom features, bond features and bond indexing
# of the record stored at position 10
for stored in (formulas, V_features, E_features, E_indexing):
    print(stored[10])

C2H3ClO
tf.Tensor(
[[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
  0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
  0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
  0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
  0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 1 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 1 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 1 0]], shape=(7, 41), dtype=int32)
tf.Tensor(
[[1 0 0 0 0]
 [0 1 0 0 0]
 [1 0 0 0 0]
 [1 0 0 0 0]
 [1 0 0 0 0]
 [1 0 0 0 0]], shape=(6, 5), dtype=int32)
tf.Tensor(
[[0 1 0 0 0 1]
 [1 2 3 4 5 6]], shape=(2, 6), dtype=int32)


**Prepare training / validation / test data sets**

All the molecules are split in a 60% / 20% / 20% ratio.

In [40]:
import random

# Define the split ratios
train_ratio = 0.6
val_ratio = 0.2
test_ratio = 0.2
# The test set is simply "everything after the validation cut", so the three
# ratios are only meaningful if they add up to 1.
assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-9, "split ratios must sum to 1"

# Fixed seed: re-running the notebook reproduces exactly the same split
split_seed = 42

# Calculate the split indices
data_len = len(V_features)
train_split = int(train_ratio * data_len)
val_split = int((train_ratio + val_ratio) * data_len)

# All per-molecule lists must be index-aligned, otherwise the split would
# pair features with the wrong labels.
assert all(
    len(seq) == data_len
    for seq in (E_indexing, E_features, Y_labels, names, formulas)
), "per-molecule lists have different lengths"

# Generate shuffled indices with a private, seeded generator
# (the global `random` state is left untouched)
indices = list(range(data_len))
random.Random(split_seed).shuffle(indices)


def _split_by_indices(items, order, first_cut, second_cut):
    """Reorder `items` by `order` and cut into (train, val, test) lists."""
    picked = [items[i] for i in order]
    return picked[:first_cut], picked[first_cut:second_cut], picked[second_cut:]


# Split every list with the same shuffled indices so they stay aligned
V_train, V_val, V_test = _split_by_indices(V_features, indices, train_split, val_split)
E_i_train, E_i_val, E_i_test = _split_by_indices(E_indexing, indices, train_split, val_split)
E_f_train, E_f_val, E_f_test = _split_by_indices(E_features, indices, train_split, val_split)
Y_train, Y_val, Y_test = _split_by_indices(Y_labels, indices, train_split, val_split)
names_train, names_val, names_test = _split_by_indices(names, indices, train_split, val_split)
formulas_train, formulas_val, formulas_test = _split_by_indices(formulas, indices, train_split, val_split)
