In [1]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors

from chemprop.utils import make_mol

# Small, hand-picked set of molecules from which all test data files are generated.
smiles_list = [
    "[H][H]",  # contains neither carbon nor nitrogen
    "C",  # single heavy atom, so the molecule has no bonds
    "CN",
    "CN",  # duplicate of the previous entry; row index 3 is given missing targets in later cells
    "CC",
    "[CH2:3]=[N+:1]([H:4])[H:2]",  # atom-mapped, charged, with explicit hydrogens
    "CCCC",
    "CO",
    "CC#N",  # contains a triple bond
    "C1NN1",  # three-membered ring
    "c1cc[n-]c1",  # aromatic anion
]

# Parse every SMILES with the same options: hydrogens written in the SMILES are
# kept, no further hydrogens are added, and atom reordering is enabled
# (presumably by atom-map number -- confirm against chemprop's make_mol docs).
mols = []
for smiles in smiles_list:
    mols.append(make_mol(smiles, keep_h=True, add_h=False, reorder_atoms=True))

# Column layout shared by the CSV files written in the cells below.
columns = [
    "smiles",
    "mol_y1",
    "mol_y2",
    "atom_y1",
    "atom_y2",
    "bond_y1",
    "bond_y2",
    "weight",
]

In [2]:
# Extra inputs at three levels, all simple 0/1 indicator features:
#   X_d   -- per molecule: [has carbon, has nitrogen]
#   V_fds -- per atom:     [is carbon, is nitrogen]
#   E_fds -- per bond:     [is single bond, is double bond]
X_d, V_fds, E_fds = [], [], []

for mol in mols:
    atomic_nums = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
    bond_types = [bond.GetBondType() for bond in mol.GetBonds()]

    # Molecule-level descriptors
    X_d.append([int(6 in atomic_nums), int(7 in atomic_nums)])

    # Atom-level descriptors, one row per atom
    V_fds.append(np.array([[int(z == 6), int(z == 7)] for z in atomic_nums]))

    # Bond-level descriptors, one row per bond. The reshape keeps a (0, 2)
    # array for molecules without bonds instead of a flat empty array.
    bond_rows = [
        [
            int(bond_type == Chem.rdchem.BondType.SINGLE),
            int(bond_type == Chem.rdchem.BondType.DOUBLE),
        ]
        for bond_type in bond_types
    ]
    E_fds.append(np.array(bond_rows).reshape(len(bond_types), 2))

# Atom and bond arrays differ in length per molecule, so each molecule is
# stored as its own positional array; molecule descriptors form one 2D array.
np.savez("atom_features_descriptors.npz", *V_fds)
np.savez("bond_features_descriptors.npz", *E_fds)
np.savez("descriptors.npz", np.array(X_d))

In [3]:
# Display the molecule-, atom- and bond-level features built in the previous cell.
X_d, V_fds, E_fds

([[0, 0],
  [1, 0],
  [1, 1],
  [1, 1],
  [1, 0],
  [1, 1],
  [1, 0],
  [1, 0],
  [1, 1],
  [1, 1],
  [1, 1]],
 [array([[0, 0],
         [0, 0]]),
  array([[1, 0]]),
  array([[1, 0],
         [0, 1]]),
  array([[1, 0],
         [0, 1]]),
  array([[1, 0],
         [1, 0]]),
  array([[0, 1],
         [0, 0],
         [1, 0],
         [0, 0]]),
  array([[1, 0],
         [1, 0],
         [1, 0],
         [1, 0]]),
  array([[1, 0],
         [0, 0]]),
  array([[1, 0],
         [1, 0],
         [0, 1]]),
  array([[1, 0],
         [0, 1],
         [0, 1]]),
  array([[1, 0],
         [1, 0],
         [1, 0],
         [0, 1],
         [1, 0]])],
 [array([[1, 0]]),
  array([], shape=(0, 2), dtype=float64),
  array([[1, 0]]),
  array([[1, 0]]),
  array([[1, 0]]),
  array([[0, 1],
         [1, 0],
         [1, 0]]),
  array([[1, 0],
         [1, 0],
         [1, 0]]),
  array([[1, 0]]),
  array([[1, 0],
         [0, 0]]),
  array([[1, 0],
         [1, 0],
         [1, 0]]),
  array([[0, 0],
       

In [4]:
# Regression targets at molecule, atom and bond level, written to regression.csv.
data = []
n_mols = len(smiles_list)

for i, (mol, smiles) in enumerate(zip(mols, smiles_list)):
    # Molecule-level targets: [molecular weight, number of atoms]
    mol_targets = [Descriptors.MolWt(mol), mol.GetNumAtoms()]

    # Atom-level targets: atomic numbers and atomic masses
    atomic_nums = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
    atomic_masses = [atom.GetMass() for atom in mol.GetAtoms()]

    # Bond-level targets: sum of the two bonded atoms' atomic numbers, and its negative
    z_sums = [
        bond.GetBeginAtom().GetAtomicNum() + bond.GetEndAtom().GetAtomicNum()
        for bond in mol.GetBonds()
    ]
    neg_z_sums = [-z_sum for z_sum in z_sums]

    # Row index 3 gets a missing value in the second target of every level.
    if i == 3:
        mol_targets[1] = None
        atomic_masses[0] = None
        neg_z_sums[0] = None

    # Per-row weight increases linearly from 1/n to 1.
    data.append(
        [smiles, *mol_targets, atomic_nums, atomic_masses, z_sums, neg_z_sums, (i + 1) / n_mols]
    )

df = pd.DataFrame(data, columns=columns)
df.to_csv("regression.csv", index=False)
df

Unnamed: 0,smiles,mol_y1,mol_y2,atom_y1,atom_y2,bond_y1,bond_y2,weight
0,[H][H],2.016,2.0,"[1, 1]","[1.008, 1.008]",[2],[-2],0.090909
1,C,16.043,1.0,[6],[12.011],[],[],0.181818
2,CN,31.058,2.0,"[6, 7]","[12.011, 14.007]",[13],[-13],0.272727
3,CN,31.058,,"[6, 7]","[None, 14.007]",[13],[None],0.363636
4,CC,30.07,2.0,"[6, 6]","[12.011, 12.011]",[12],[-12],0.454545
5,[CH2:3]=[N+:1]([H:4])[H:2],30.05,4.0,"[7, 1, 6, 1]","[14.007, 1.008, 12.011, 1.008]","[13, 8, 8]","[-13, -8, -8]",0.545455
6,CCCC,58.124,4.0,"[6, 6, 6, 6]","[12.011, 12.011, 12.011, 12.011]","[12, 12, 12]","[-12, -12, -12]",0.636364
7,CO,32.042,2.0,"[6, 8]","[12.011, 15.999]",[14],[-14],0.727273
8,CC#N,41.053,3.0,"[6, 6, 7]","[12.011, 12.011, 14.007]","[12, 13]","[-12, -13]",0.818182
9,C1NN1,44.057,3.0,"[6, 7, 7]","[12.011, 14.007, 14.007]","[13, 14, 13]","[-13, -14, -13]",0.909091


In [5]:
# Bounded-regression targets: the same values as the regression data, but each
# value is turned into a string that is randomly prefixed with "<", "" or ">".
bounds = ["<", "", ">"]

# Use a locally seeded generator so that "Restart & Run All" regenerates an
# identical bounded.csv instead of a different random file on every run.
rng = np.random.default_rng(0)


def add_random_bound(value):
    """Return ``str(value)`` prefixed with a randomly drawn entry of ``bounds``."""
    return bounds[rng.integers(0, 3)] + str(value)


data = []
for i, (mol, smiles) in enumerate(zip(mols, smiles_list)):
    # Molecular properties
    mol_y1 = Descriptors.MolWt(mol)  # Molecular weight
    mol_y2 = mol.GetNumAtoms()  # Number of atoms

    # Atom properties
    atom_y1 = [atom.GetAtomicNum() for atom in mol.GetAtoms()]  # Atomic numbers
    atom_y2 = [atom.GetMass() for atom in mol.GetAtoms()]  # Atomic masses

    # Bond properties
    bond_y1 = [
        sum((bond.GetBeginAtom().GetAtomicNum(), bond.GetEndAtom().GetAtomicNum()))
        for bond in mol.GetBonds()
    ]  # Sum of atomic numbers of bonded atoms
    bond_y2 = [-x for x in bond_y1]  # Negative of bond_y1

    weight = (i + 1) / len(smiles_list)

    # Attach an independently drawn bound to every single target value.
    mol_y1 = add_random_bound(mol_y1)
    mol_y2 = add_random_bound(mol_y2)
    atom_y1 = [add_random_bound(x) for x in atom_y1]
    atom_y2 = [add_random_bound(x) for x in atom_y2]
    bond_y1 = [add_random_bound(x) for x in bond_y1]
    bond_y2 = [add_random_bound(x) for x in bond_y2]

    # Row index 3 gets a missing value in the second target of every level.
    # This happens after the bounds are drawn so the random stream does not
    # depend on which values are blanked out.
    if i == 3:
        mol_y2 = None
        atom_y2[0] = None
        bond_y2[0] = None

    data.append([smiles, mol_y1, mol_y2, atom_y1, atom_y2, bond_y1, bond_y2, weight])

df = pd.DataFrame(data, columns=columns)
df.to_csv("bounded.csv", index=False)
df

Unnamed: 0,smiles,mol_y1,mol_y2,atom_y1,atom_y2,bond_y1,bond_y2,weight
0,[H][H],<2.016,2,"[<1, >1]","[>1.008, 1.008]",[<2],[>-2],0.090909
1,C,<16.043,1,[6],[<12.011],[],[],0.181818
2,CN,<31.058,2,"[6, 7]","[>12.011, >14.007]",[<13],[<-13],0.272727
3,CN,>31.058,,"[<6, 7]","[None, 14.007]",[>13],[None],0.363636
4,CC,<30.07,>2,"[>6, <6]","[12.011, <12.011]",[>12],[<-12],0.454545
5,[CH2:3]=[N+:1]([H:4])[H:2],30.05,>4,"[>7, 1, 6, 1]","[>14.007, <1.008, >12.011, 1.008]","[<13, >8, >8]","[-13, -8, <-8]",0.545455
6,CCCC,58.123999999999995,4,"[6, <6, >6, <6]","[<12.011, >12.011, >12.011, 12.011]","[12, >12, >12]","[-12, >-12, >-12]",0.636364
7,CO,32.042,2,"[<6, <8]","[<12.011, <15.999]",[>14],[-14],0.727273
8,CC#N,41.053,>3,"[>6, 6, 7]","[>12.011, >12.011, >14.007]","[12, 13]","[<-12, >-13]",0.818182
9,C1NN1,<44.057,3,"[6, 7, 7]","[<12.011, 14.007, <14.007]","[<13, 14, >13]","[<-13, <-14, -13]",0.909091


In [6]:
# Binary classification targets at molecule, atom and bond level,
# written to classification.csv.
data = []
n_mols = len(smiles_list)

for i, (mol, smiles) in enumerate(zip(mols, smiles_list)):
    atomic_nums = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
    bond_types = [bond.GetBondType() for bond in mol.GetBonds()]

    # Molecule-level labels: has carbon / has nitrogen
    has_carbon = int(6 in atomic_nums)
    has_nitrogen = int(7 in atomic_nums)

    # Atom-level labels: is carbon / is nitrogen
    is_carbon = [int(z == 6) for z in atomic_nums]
    is_nitrogen = [int(z == 7) for z in atomic_nums]

    # Bond-level labels: is single bond / is double bond
    is_single = [int(bond_type == Chem.rdchem.BondType.SINGLE) for bond_type in bond_types]
    is_double = [int(bond_type == Chem.rdchem.BondType.DOUBLE) for bond_type in bond_types]

    # Row index 3 gets a missing value in the second label of every level.
    if i == 3:
        has_nitrogen = None
        is_nitrogen[0] = None
        is_double[0] = None

    # Per-row weight increases linearly from 1/n to 1.
    data.append(
        [
            smiles,
            has_carbon,
            has_nitrogen,
            is_carbon,
            is_nitrogen,
            is_single,
            is_double,
            (i + 1) / n_mols,
        ]
    )

df = pd.DataFrame(data, columns=columns)
df.to_csv("classification.csv", index=False)
df

Unnamed: 0,smiles,mol_y1,mol_y2,atom_y1,atom_y2,bond_y1,bond_y2,weight
0,[H][H],0,0.0,"[0, 0]","[0, 0]",[1],[0],0.090909
1,C,1,0.0,[1],[0],[],[],0.181818
2,CN,1,1.0,"[1, 0]","[0, 1]",[1],[0],0.272727
3,CN,1,,"[1, 0]","[None, 1]",[1],[None],0.363636
4,CC,1,0.0,"[1, 1]","[0, 0]",[1],[0],0.454545
5,[CH2:3]=[N+:1]([H:4])[H:2],1,1.0,"[0, 0, 1, 0]","[1, 0, 0, 0]","[0, 1, 1]","[1, 0, 0]",0.545455
6,CCCC,1,0.0,"[1, 1, 1, 1]","[0, 0, 0, 0]","[1, 1, 1]","[0, 0, 0]",0.636364
7,CO,1,0.0,"[1, 0]","[0, 0]",[1],[0],0.727273
8,CC#N,1,1.0,"[1, 1, 0]","[0, 0, 1]","[1, 0]","[0, 0]",0.818182
9,C1NN1,1,1.0,"[1, 0, 0]","[0, 1, 1]","[1, 1, 1]","[0, 0, 0]",0.909091


In [7]:
# Multiclass targets at molecule, atom and bond level, written to multiclass.csv.
# Every target has three classes (0, 1, 2); class 2 is the "everything else"
# fallback for values not listed in the lookup tables below.


def mol_classes1(x):
    """Bin a molecular weight: below 10 -> 0, from 10 up to and including 40 -> 1, above 40 -> 2."""
    if x < 10:
        return 0
    if x <= 40:
        return 1
    return 2


mol_classes2 = {2: 0, 3: 1}  # number of atoms -> class
atom_classes1 = {6: 0, 7: 1}  # atomic number (carbon, nitrogen) -> class
atom_classes2 = {Chem.rdchem.HybridizationType.SP3: 0, Chem.rdchem.HybridizationType.SP2: 1}
bond_classes1 = {Chem.rdchem.BondType.SINGLE: 0, Chem.rdchem.BondType.TRIPLE: 1}
bond_classes2 = {0: 0, 1: 1}  # number of carbon atoms in the bond -> class

data = []
for i, (mol, smiles) in enumerate(zip(mols, smiles_list)):
    # Molecule-level features
    mol_y1 = mol_classes1(Descriptors.MolWt(mol))  # Molecular weight class
    mol_y2 = mol_classes2.get(mol.GetNumAtoms(), 2)  # Number of atoms class

    # Atom-level features
    atom_y1 = [
        atom_classes1.get(atom.GetAtomicNum(), 2) for atom in mol.GetAtoms()
    ]  # Atomic number class
    atom_y2 = [
        atom_classes2.get(atom.GetHybridization(), 2) for atom in mol.GetAtoms()
    ]  # Hybridization class

    # Bond-level features
    bond_y1 = [
        bond_classes1.get(bond.GetBondType(), 2) for bond in mol.GetBonds()
    ]  # Bond type class
    bond_y2 = [
        bond_classes2.get(
            sum(a.GetAtomicNum() == 6 for a in (bond.GetBeginAtom(), bond.GetEndAtom())), 2
        )
        for bond in mol.GetBonds()
    ]  # Class of the number of carbon atoms in the bond (0, 1, or 2 carbons)

    weight = (i + 1) / len(smiles_list)

    # Row index 3 gets a missing value in the second target of every level.
    if i == 3:
        mol_y2 = None
        atom_y2[0] = None
        bond_y2[0] = None

    data.append([smiles, mol_y1, mol_y2, atom_y1, atom_y2, bond_y1, bond_y2, weight])

df = pd.DataFrame(data, columns=columns)
df.to_csv("multiclass.csv", index=False)
df

Unnamed: 0,smiles,mol_y1,mol_y2,atom_y1,atom_y2,bond_y1,bond_y2,weight
0,[H][H],0,0.0,"[2, 2]","[2, 2]",[0],[0],0.090909
1,C,1,2.0,[0],[0],[],[],0.181818
2,CN,1,0.0,"[0, 1]","[0, 0]",[0],[1],0.272727
3,CN,1,,"[0, 1]","[None, 0]",[0],[None],0.363636
4,CC,1,0.0,"[0, 0]","[0, 0]",[0],[2],0.454545
5,[CH2:3]=[N+:1]([H:4])[H:2],1,2.0,"[1, 2, 0, 2]","[1, 2, 1, 2]","[2, 0, 0]","[1, 0, 0]",0.545455
6,CCCC,2,2.0,"[0, 0, 0, 0]","[0, 0, 0, 0]","[0, 0, 0]","[2, 2, 2]",0.636364
7,CO,1,0.0,"[0, 2]","[0, 0]",[0],[1],0.727273
8,CC#N,2,1.0,"[0, 0, 1]","[0, 2, 2]","[0, 1]","[2, 1]",0.818182
9,C1NN1,2,1.0,"[0, 1, 1]","[0, 0, 0]","[0, 0, 0]","[1, 0, 1]",0.909091


In [8]:
# Constrained regression data: atom- and bond-level targets plus, per molecule,
# the totals that the atom/bond values sum to. Targets are written to
# constrained_regression.csv and the totals to
# constrained_regression_constraints.csv.

# Value assigned to each bond type for the bond_y2 target. The table does not
# depend on the molecule, so it is built once instead of on every iteration.
# Bond types missing from the table raise KeyError.
n_electrons = {
    Chem.rdchem.BondType.SINGLE: 2,
    Chem.rdchem.BondType.AROMATIC: 3,
    Chem.rdchem.BondType.DOUBLE: 4,
    Chem.rdchem.BondType.TRIPLE: 6,
}

data = []
constraints = []
for mol, smiles in zip(mols, smiles_list):
    # Molecular properties
    mol_y = Chem.GetFormalCharge(mol)  # Formal charge

    # Atom properties
    atom_y1 = [atom.GetFormalCharge() for atom in mol.GetAtoms()]  # Atomic charges
    atom_y2 = [atom.GetMass() for atom in mol.GetAtoms()]  # Atomic masses

    # Bond properties
    bond_y1 = [
        sum((bond.GetBeginAtom().GetAtomicNum(), bond.GetEndAtom().GetAtomicNum()))
        for bond in mol.GetBonds()
    ]  # Sum of atomic numbers of bonded atoms
    bond_y2 = [n_electrons[bond.GetBondType()] for bond in mol.GetBonds()]  # Electrons per bond

    data.append([smiles, mol_y, atom_y1, atom_y2, bond_y1, bond_y2])
    # Constraint columns, in order: molecular formal charge (used as the
    # atom_y1 constraint), sum of atomic masses, sum of bond electron counts.
    # bond_y1 has no constraint column.
    constraints.append([mol_y, sum(atom_y2), sum(bond_y2)])

df = pd.DataFrame(data, columns=["smiles", "mol_y", "atom_y1", "atom_y2", "bond_y1", "bond_y2"])
df.to_csv("constrained_regression.csv", index=False)
display(df)

df_constraints = pd.DataFrame(
    constraints, columns=["atom_y1_constraint", "atom_y2_constraint", "bond_y2_constraint"]
)
df_constraints.to_csv("constrained_regression_constraints.csv", index=False)
df_constraints

Unnamed: 0,smiles,mol_y,atom_y1,atom_y2,bond_y1,bond_y2
0,[H][H],0,"[0, 0]","[1.008, 1.008]",[2],[2]
1,C,0,[0],[12.011],[],[]
2,CN,0,"[0, 0]","[12.011, 14.007]",[13],[2]
3,CN,0,"[0, 0]","[12.011, 14.007]",[13],[2]
4,CC,0,"[0, 0]","[12.011, 12.011]",[12],[2]
5,[CH2:3]=[N+:1]([H:4])[H:2],1,"[1, 0, 0, 0]","[14.007, 1.008, 12.011, 1.008]","[13, 8, 8]","[4, 2, 2]"
6,CCCC,0,"[0, 0, 0, 0]","[12.011, 12.011, 12.011, 12.011]","[12, 12, 12]","[2, 2, 2]"
7,CO,0,"[0, 0]","[12.011, 15.999]",[14],[2]
8,CC#N,0,"[0, 0, 0]","[12.011, 12.011, 14.007]","[12, 13]","[2, 6]"
9,C1NN1,0,"[0, 0, 0]","[12.011, 14.007, 14.007]","[13, 14, 13]","[2, 2, 2]"


Unnamed: 0,atom_y1_constraint,atom_y2_constraint,bond_y2_constraint
0,0,2.016,2
1,0,12.011,0
2,0,26.018,2
3,0,26.018,2
4,0,24.022,2
5,1,28.034,8
6,0,48.044,6
7,0,28.01,2
8,0,38.029,8
9,0,40.025,6
