# Importing packages

In [5]:
import pandas as pd
import numpy as np

from chemprop import data, utils

# Change data inputs here

In [6]:
test_path = '../tests/data/regression.csv'
smiles_column = 'smiles'
target_columns = ['logSolubility']

period_table_path = '../tests/data/periodic_table_of_elements.csv'
feature_headings = ['AtomicRadius', 'Electronegativity', 'Metal', 'Nonmetal', 'Metalloid']

## Load data

In [7]:
df_test = pd.read_csv(test_path)
df_test

Unnamed: 0,smiles,logSolubility
0,OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)...,-0.770
1,Cc1occc1C(=O)Nc2ccccc2,-3.300
2,CC(C)=CCCC(C)=CC(=O),-2.060
3,c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43,-7.870
4,c1ccsc1,-1.330
...,...,...
495,Nc1cc(nc(N)n1=O)N2CCCCC2,-1.989
496,Nc2cccc3nc1ccccc1cc23,-4.220
497,c1ccc2cc3c4cccc5cccc(c3cc2c1)c45,-8.490
498,OC(c1ccc(Cl)cc1)(c2ccc(Cl)cc2)C(Cl)(Cl)Cl,-5.666


In [8]:
pt_fs = pd.read_csv(period_table_path)
pt_fs

Unnamed: 0,AtomicNumber,Element,Symbol,AtomicMass,NumberofNeutrons,NumberofProtons,NumberofElectrons,Period,Group,Phase,...,FirstIonization,Density,MeltingPoint,BoilingPoint,NumberOfIsotopes,Discoverer,Year,SpecificHeat,NumberofShells,NumberofValence
0,1,Hydrogen,H,1.007,0,1,1,1,1.0,gas,...,13.5984,0.000090,14.175,20.28,3.0,Cavendish,1766.0,14.304,1,1.0
1,2,Helium,He,4.002,2,2,2,1,18.0,gas,...,24.5874,0.000179,,4.22,5.0,Janssen,1868.0,5.193,1,
2,3,Lithium,Li,6.941,4,3,3,2,1.0,solid,...,5.3917,0.534000,453.850,1615.00,5.0,Arfvedson,1817.0,3.582,2,1.0
3,4,Beryllium,Be,9.012,5,4,4,2,2.0,solid,...,9.3227,1.850000,1560.150,2742.00,6.0,Vaulquelin,1798.0,1.825,2,2.0
4,5,Boron,B,10.811,6,5,5,2,13.0,solid,...,8.2980,2.340000,2573.150,4200.00,6.0,Gay-Lussac,1808.0,1.026,2,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,114,Flerovium,Fl,289.000,175,114,114,7,14.0,artificial,...,,,,,,,1999.0,,7,4.0
114,115,Moscovium,Mc,288.000,173,115,115,7,15.0,artificial,...,,,,,,,2010.0,,7,5.0
115,116,Livermorium,Lv,292.000,176,116,116,7,16.0,artificial,...,,,,,,,2000.0,,7,6.0
116,117,Tennessine,Ts,295.000,178,117,117,7,17.0,artificial,...,,,,,,,2010.0,,7,7.0


In [9]:
smis = df_test['smiles']

# Load atom features

In [10]:
features = pt_fs.loc[:, feature_headings]
features

Unnamed: 0,AtomicRadius,Electronegativity,Metal,Nonmetal,Metalloid
0,0.79,2.20,,1.0,
1,0.49,,,1.0,
2,2.10,0.98,1.0,,
3,1.40,1.57,1.0,,
4,1.20,2.04,,,1.0
...,...,...,...,...,...
113,,,1.0,,
114,,,1.0,,
115,,,1.0,,
116,,,,1.0,


# Generating features
Define a function to generate an array of atom (vertex) features ```vf``` and bond (edge) features ```ef``` given a SMILES molecule.

For each atom, we include its
- Atomic Radius
- Electronegativity
- A one-hot encoding for whether it is a metal, nonmetal or metalloid.

For each bond, we include the polarity of the bond, computed using the difference in electronegativity between the atoms.

In [11]:
def calculate_extra_features_from_smiles(smi, keep_h=False, add_h=False):
    mol = utils.make_mol(smi, keep_h, add_h)
    atoms = mol.GetAtoms()

    # Atom features denoted by V for vertex
    vf = []
    for atom in atoms:
        z = atom.GetAtomicNum()
        x_v = features[:].iloc[z - 1].values.tolist()
        vf.append([f if f != "nan" else 0 for f in x_v])

    # Bond features denoted by E for edges
    ef = np.empty((2 * mol.GetNumBonds(), 1))
    i = 0
    for u in range(len(atoms)):
        for v in range(u + 1, len(atoms)):
            bond = mol.GetBondBetweenAtoms(u, v)
            if bond is None:
                continue

            u_elec = pt_fs['Electronegativity'].iloc[atoms[u].GetAtomicNum()]
            v_elec = pt_fs['Electronegativity'].iloc[atoms[v].GetAtomicNum()]

            x_e = [abs(u_elec - v_elec)]

            ef[i : i + 2] = x_e

            i += 2

    vf = np.array(vf)
    return (vf, ef)

In [12]:
ys = df_test.loc[:, target_columns].values
fs = [calculate_extra_features_from_smiles(smi) for smi in smis]
fs

[(array([[0.65, 3.44,  nan, 1.  ,  nan],
         [0.91, 2.55,  nan, 1.  ,  nan],
         [0.91, 2.55,  nan, 1.  ,  nan],
         [0.65, 3.44,  nan, 1.  ,  nan],
         [0.91, 2.55,  nan, 1.  ,  nan],
         [0.65, 3.44,  nan, 1.  ,  nan],
         [0.91, 2.55,  nan, 1.  ,  nan],
         [0.91, 2.55,  nan, 1.  ,  nan],
         [0.65, 3.44,  nan, 1.  ,  nan],
         [0.91, 2.55,  nan, 1.  ,  nan],
         [0.65, 3.44,  nan, 1.  ,  nan],
         [0.91, 2.55,  nan, 1.  ,  nan],
         [0.91, 2.55,  nan, 1.  ,  nan],
         [0.75, 3.04,  nan, 1.  ,  nan],
         [0.91, 2.55,  nan, 1.  ,  nan],
         [0.91, 2.55,  nan, 1.  ,  nan],
         [0.91, 2.55,  nan, 1.  ,  nan],
         [0.91, 2.55,  nan, 1.  ,  nan],
         [0.91, 2.55,  nan, 1.  ,  nan],
         [0.91, 2.55,  nan, 1.  ,  nan],
         [0.91, 2.55,  nan, 1.  ,  nan],
         [0.65, 3.44,  nan, 1.  ,  nan],
         [0.91, 2.55,  nan, 1.  ,  nan],
         [0.65, 3.44,  nan, 1.  ,  nan],
         [0.91, 

In [13]:
all_data = [data.MoleculeDatapoint.from_smi(smi, y=y, V_f=f[0], E_f=f[1]) for smi, y, f in zip(smis, ys, fs)]
all_data

[MoleculeDatapoint(mol=<rdkit.Chem.rdchem.Mol object at 0x7f4bd824aab0>, y=array([-0.77]), weight=1, gt_mask=None, lt_mask=None, x_f=None, x_phase=None, V_f=array([[0.65, 3.44, 0.  , 1.  , 0.  ],
        [0.91, 2.55, 0.  , 1.  , 0.  ],
        [0.91, 2.55, 0.  , 1.  , 0.  ],
        [0.65, 3.44, 0.  , 1.  , 0.  ],
        [0.91, 2.55, 0.  , 1.  , 0.  ],
        [0.65, 3.44, 0.  , 1.  , 0.  ],
        [0.91, 2.55, 0.  , 1.  , 0.  ],
        [0.91, 2.55, 0.  , 1.  , 0.  ],
        [0.65, 3.44, 0.  , 1.  , 0.  ],
        [0.91, 2.55, 0.  , 1.  , 0.  ],
        [0.65, 3.44, 0.  , 1.  , 0.  ],
        [0.91, 2.55, 0.  , 1.  , 0.  ],
        [0.91, 2.55, 0.  , 1.  , 0.  ],
        [0.75, 3.04, 0.  , 1.  , 0.  ],
        [0.91, 2.55, 0.  , 1.  , 0.  ],
        [0.91, 2.55, 0.  , 1.  , 0.  ],
        [0.91, 2.55, 0.  , 1.  , 0.  ],
        [0.91, 2.55, 0.  , 1.  , 0.  ],
        [0.91, 2.55, 0.  , 1.  , 0.  ],
        [0.91, 2.55, 0.  , 1.  , 0.  ],
        [0.91, 2.55, 0.  , 1.  , 0.  ],
    