In [176]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import os
from scipy.spatial import distance_matrix
# Directory (relative to the notebook) that holds the preprocessed CSV files read below.
FOLDER_DIR = "./preprocessed/"

### Step 1 : create dataset

The goal is to create 5 numpy arrays : nodes_train, in_edges_train, out_edges_train, nodes_test, in_edges_test

In [177]:
# Read the six preprocessed tables in one pass, in the same order as the
# names on the left. The two bond tables (train_bonds / test_bonds) come
# from BondFeatures.ipynb.
(
    train_df,
    test_df,
    train_structures_df,
    test_structures_df,
    train_bonds,
    test_bonds,
) = (
    pd.read_csv(os.path.join(FOLDER_DIR, csv_name))
    for csv_name in (
        'train_df.csv',
        'test_df.csv',
        'train_structures_df.csv',
        'test_structures_df.csv',
        'train_bonds.csv',
        'test_bonds.csv',
    )
)

In [178]:
# Give every molecule a dense integer id (numbered in order of first
# appearance) so it can be used directly as an array index later on.
train_df["molecule_index"] = train_df["molecule_name"].factorize()[0]
test_df["molecule_index"] = test_df["molecule_name"].factorize()[0]
train_df

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x0,y0,z0,...,dist_z,1JHC,1JHN,2JHC,2JHH,2JHN,3JHC,3JHH,3JHN,molecule_index
0,0,dsgdb9nsd_000001,1,0,1JHC,84.80760,H,0.002150,-0.006031,0.001976,...,0.006025,True,False,False,False,False,False,False,False,0
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.25700,H,0.002150,-0.006031,0.001976,...,0.001700,False,False,False,True,False,False,False,False,0
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.25480,H,0.002150,-0.006031,0.001976,...,0.878620,False,False,False,True,False,False,False,False,0
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.25430,H,0.002150,-0.006031,0.001976,...,0.904421,False,False,False,True,False,False,False,False,0
4,4,dsgdb9nsd_000001,2,0,1JHC,84.80740,H,1.011731,1.463751,0.000277,...,0.007724,True,False,False,False,False,False,False,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3724006,4658993,dsgdb9nsd_133883,16,4,3JHC,6.18566,H,-0.254709,0.115179,-2.320941,...,2.766766,False,False,False,False,False,True,False,False,68008
3724007,4658994,dsgdb9nsd_133883,16,5,3JHC,5.27455,H,-0.254709,0.115179,-2.320941,...,3.282113,False,False,False,False,False,True,False,False,68008
3724008,4658995,dsgdb9nsd_133883,16,6,2JHC,1.52689,H,-0.254709,0.115179,-2.320941,...,2.139460,False,False,True,False,False,False,False,False,68008
3724009,4658996,dsgdb9nsd_133883,16,7,1JHC,92.46210,H,-0.254709,0.115179,-2.320941,...,0.838362,True,False,False,False,False,False,False,False,68008


In [179]:
# Index the structures with the SAME molecule ids as the coupling tables:
# the position of the molecule in order of first appearance in
# train_df / test_df (which is what pd.factorize assigns there).
# Factorizing each table on its own only agrees when every table lists the
# same molecules in the same order; any mismatch would silently shift rows
# in the arrays built later.
for structures, couplings, split_name in (
    (train_structures_df, train_df, "train"),
    (test_structures_df, test_df, "test"),
):
    molecule_order = pd.Index(couplings["molecule_name"].unique())
    molecule_ids = molecule_order.get_indexer(structures["molecule_name"])
    # get_indexer marks unknown names with -1, which would otherwise be
    # interpreted as "last molecule" when used as an array index.
    if (molecule_ids < 0).any():
        raise ValueError(
            f"{split_name} structures contain molecules that are absent from the {split_name} coupling table"
        )
    structures["molecule_index"] = molecule_ids
train_structures_df

Unnamed: 0,molecule_name,atom_index,atom,x,y,z,C,F,H,N,O,molecule_index
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001,True,False,False,False,False,0
1,dsgdb9nsd_000001,1,H,0.002150,-0.006031,0.001976,False,False,True,False,False,0
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277,False,False,True,False,False,0
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644,False,False,True,False,False,0
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397,False,False,True,False,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1226160,dsgdb9nsd_133883,12,H,0.167157,-2.642346,0.003546,False,False,True,False,False,68008
1226161,dsgdb9nsd_133883,13,H,2.336668,-1.165247,0.799579,False,False,True,False,False,68008
1226162,dsgdb9nsd_133883,14,H,1.287517,1.303344,1.376396,False,False,True,False,False,68008
1226163,dsgdb9nsd_133883,15,H,1.160599,1.078773,-1.801647,False,False,True,False,False,68008


In [180]:
# (column name, bond order) pairs of the one-hot bond encoding.
BOND_ORDER_COLUMNS = (
    ('nbond_1', 1.0),
    ('nbond_1.5', 1.5),
    ('nbond_2', 2.0),
    ('nbond_3', 3.0),
)

for bonds, couplings, split_name in (
    (train_bonds, train_df, "train"),
    (test_bonds, test_df, "test"),
):
    # Use the molecule ids of the coupling table (order of first appearance)
    # rather than factorizing the bond table on its own: a molecule missing
    # from the bond table would otherwise shift every following id.
    molecule_order = pd.Index(couplings["molecule_name"].unique())
    molecule_ids = molecule_order.get_indexer(bonds["molecule_name"])
    # get_indexer marks unknown names with -1.
    if (molecule_ids < 0).any():
        raise ValueError(
            f"{split_name} bonds contain molecules that are absent from the {split_name} coupling table"
        )
    bonds["molecule_index"] = molecule_ids

    # Build each indicator explicitly. Assigning pd.get_dummies(...) to four
    # fixed column names is positional, so it breaks (or mislabels columns)
    # as soon as one split does not contain exactly these four bond orders.
    for column, bond_order in BOND_ORDER_COLUMNS:
        bonds[column] = bonds['nbond'] == bond_order
train_bonds

Unnamed: 0,molecule_name,atom_index_0,atom_index_1,nbond,L2dist,error,bond_type,molecule_index,nbond_1,nbond_1.5,nbond_2,nbond_3
0,dsgdb9nsd_000001,0,1,1.0,1.091953,0,1.0CH,0,True,False,False,False
1,dsgdb9nsd_000001,0,2,1.0,1.091952,0,1.0CH,0,True,False,False,False
2,dsgdb9nsd_000001,0,3,1.0,1.091946,0,1.0CH,0,True,False,False,False
3,dsgdb9nsd_000001,0,4,1.0,1.091948,0,1.0CH,0,True,False,False,False
4,dsgdb9nsd_000002,0,1,1.0,1.017190,0,1.0HN,1,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
1268463,dsgdb9nsd_133883,2,6,1.0,1.541542,0,1.0CC,68008,True,False,False,False
1268464,dsgdb9nsd_133883,3,4,1.0,1.482432,0,1.0CC,68008,True,False,False,False
1268465,dsgdb9nsd_133883,4,5,1.0,1.510342,0,1.0CC,68008,True,False,False,False
1268466,dsgdb9nsd_133883,5,6,1.0,1.541538,0,1.0CC,68008,True,False,False,False


The first step is to find the size of the adjacency matrix, i.e. the largest number of atoms in any molecule of the dataset (the size of the biggest molecule).

In [181]:
# The padded arrays must have room for every atom index that is later used
# to address them: atoms listed in the structure tables (node features,
# pairwise distances) and both ends of every coupling. Looking only at
# atom_index_0 would miss a molecule whose highest-numbered atom never
# appears as the first atom of a coupling.
max_size_train = max(
    train_structures_df['atom_index'].max(),
    train_df['atom_index_0'].max(),
    train_df['atom_index_1'].max(),
)
max_size_test = max(
    test_structures_df['atom_index'].max(),
    test_df['atom_index_0'].max(),
    test_df['atom_index_1'].max(),
)

max_size = max(max_size_train, max_size_test) + 1 # Atom indices start at 0, so the size is the largest index plus one
print(max_size)

29


This means that :

nodes_train.shape = [nb_molecule_train, max_size, nb_features_nodes] = [68009, 29, 8]

nodes_test.shape = [nb_molecule_test, max_size, nb_features_nodes] = [17003, 29, 8]

in_edges_train.shape = [nb_molecule_train, max_size, max_size, nb_features_edges] = [68009, 29, 29, 16]

in_edges_test.shape = [nb_molecule_test, max_size, max_size, nb_features_edges] = [17003, 29, 29, 16]

out_edges_train.shape = [nb_molecule_train, max_size, max_size] = [68009, 29, 29] (one scalar coupling constant per pair of atoms)

The 8 node features are the position of the atom (x, y, z) followed by its one-hot encoded element (C, F, H, N, O).
The 16 edge features are the distance, dist_x, dist_y, dist_z, the one-hot encoded coupling type (8 types) and the one-hot encoded bond order (4 values).

In [182]:
# Number of distinct molecules in each split; used as the first axis of
# every array built below.
n_train, n_test = (
    split_df['molecule_name'].nunique() for split_df in (train_df, test_df)
)
n_train, n_test

(68009, 17003)

In [183]:
def make_nodes(train_structures_df, test_structures_df,
               n_molecules_train=None, n_molecules_test=None, n_atoms_max=None):
    """Build the zero-padded node-feature arrays for the train and test sets.

    Every atom listed in a structures table is written to
    ``nodes[molecule_index, atom_index]`` as the 8 values
    ``[x, y, z, C, F, H, N, O]``. Slots without an atom stay all-zero.

    Parameters
    ----------
    train_structures_df, test_structures_df : pandas.DataFrame
        Must provide the columns ``molecule_index``, ``atom_index``, ``x``,
        ``y``, ``z``, ``C``, ``F``, ``H``, ``N`` and ``O``.
    n_molecules_train, n_molecules_test : int, optional
        Length of the first axis of each array. Default to the notebook-level
        ``n_train`` / ``n_test``.
    n_atoms_max : int, optional
        Length of the atom axis. Defaults to the notebook-level ``max_size``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(nodes_train, nodes_test)``, float arrays of shape
        ``(n_molecules, n_atoms_max, 8)``.
    """
    # Fall back to the notebook globals so existing two-argument calls keep
    # working, while explicit sizes make the function usable on its own.
    if n_molecules_train is None:
        n_molecules_train = n_train
    if n_molecules_test is None:
        n_molecules_test = n_test
    if n_atoms_max is None:
        n_atoms_max = max_size

    feature_columns = ["x", "y", "z", "C", "F", "H", "N", "O"]
    nodes_train = np.zeros((n_molecules_train, n_atoms_max, len(feature_columns)))
    nodes_test = np.zeros((n_molecules_test, n_atoms_max, len(feature_columns)))

    for df, nodes in zip([train_structures_df, test_structures_df], [nodes_train, nodes_test]):
        molecule_indices = df["molecule_index"].to_numpy()
        atom_indices = df["atom_index"].to_numpy()
        # Cast explicitly: float coordinates mixed with boolean one-hot
        # columns would otherwise come out as an object-dtype array.
        features = df[feature_columns].to_numpy(dtype=np.float64)

        # One row per atom, scattered to its (molecule, atom) slot.
        nodes[molecule_indices, atom_indices] = features

    return nodes_train, nodes_test
    
def make_in_edges(train_df, test_df, train_structures_df, test_structures_df, train_bonds, test_bonds,
                  n_molecules_train=None, n_molecules_test=None, n_atoms_max=None):
    """Build the zero-padded edge-feature arrays for the train and test sets.

    ``in_edges[m, i, j]`` holds 16 values for the atom pair ``(i, j)`` of
    molecule ``m`` and is symmetric in ``i`` and ``j``:

    * ``0:4``   distance and per-axis absolute coordinate differences,
      computed from the structures table for EVERY pair of atoms of the
      molecule (this overwrites the ``dist*`` values copied from the
      coupling table in the first pass);
    * ``4:12``  coupling-type indicators, only set for pairs that have a row
      in the coupling table;
    * ``12:16`` bond-order indicators, only set for pairs that have a row in
      the bond table.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Coupling tables with ``molecule_index``, ``atom_index_0``,
        ``atom_index_1``, ``dist``, ``dist_x``, ``dist_y``, ``dist_z`` and the
        eight coupling-type columns.
    train_structures_df, test_structures_df : pandas.DataFrame
        Structure tables with ``molecule_index``, ``atom_index``, ``x``,
        ``y`` and ``z``.
    train_bonds, test_bonds : pandas.DataFrame
        Bond tables with ``molecule_index``, ``atom_index_0``,
        ``atom_index_1`` and the four ``nbond_*`` columns.
    n_molecules_train, n_molecules_test : int, optional
        Length of the first axis of each array. Default to the notebook-level
        ``n_train`` / ``n_test``.
    n_atoms_max : int, optional
        Length of both atom axes. Defaults to the notebook-level ``max_size``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(in_edges_train, in_edges_test)``, float arrays of shape
        ``(n_molecules, n_atoms_max, n_atoms_max, 16)``.
    """
    # Fall back to the notebook globals so existing six-argument calls keep
    # working, while explicit sizes make the function usable on its own.
    if n_molecules_train is None:
        n_molecules_train = n_train
    if n_molecules_test is None:
        n_molecules_test = n_test
    if n_atoms_max is None:
        n_atoms_max = max_size

    coupling_columns = ["dist", "dist_x", "dist_y", "dist_z",
                        '1JHC', '1JHN', '2JHC', '2JHH', '2JHN', '3JHC', '3JHH', '3JHN']
    bond_columns = ['nbond_1', 'nbond_1.5', 'nbond_2', 'nbond_3']
    n_coupling = len(coupling_columns)
    n_features = n_coupling + len(bond_columns)

    in_edges_train = np.zeros((n_molecules_train, n_atoms_max, n_atoms_max, n_features))
    in_edges_test = np.zeros((n_molecules_test, n_atoms_max, n_atoms_max, n_features))

    # First, copy the features of every coupled pair (in both directions).
    for df, in_edges in zip([train_df, test_df], [in_edges_train, in_edges_test]):
        molecule_indices = df["molecule_index"].to_numpy()
        atom_indices_0 = df["atom_index_0"].to_numpy()
        atom_indices_1 = df["atom_index_1"].to_numpy()
        # Explicit float cast: the type indicators may be boolean columns.
        features = df[coupling_columns].to_numpy(dtype=np.float64)

        in_edges[molecule_indices, atom_indices_0, atom_indices_1, :n_coupling] = features
        in_edges[molecule_indices, atom_indices_1, atom_indices_0, :n_coupling] = features

    # Then, fill the geometric features of ALL atom pairs from the structures.
    for df, in_edges in zip([train_structures_df, test_structures_df], [in_edges_train, in_edges_test]):
        for molecule_index, molecule_df in df.groupby("molecule_index"):
            atom_indices = molecule_df["atom_index"].to_numpy()
            coords = molecule_df[["x", "y", "z"]].to_numpy(dtype=np.float64)

            geometry = np.zeros((n_atoms_max, n_atoms_max, 4))
            # Address the block by atom_index (as make_nodes does) instead of
            # by row position, so the result does not depend on the rows of a
            # molecule being sorted and contiguous.
            rows, cols = np.ix_(atom_indices, atom_indices)
            geometry[rows, cols, 0] = distance_matrix(coords, coords)
            # |x_i - x_j|, |y_i - y_j|, |z_i - z_j| for all pairs at once;
            # equivalent to a 1-D distance matrix per axis.
            geometry[rows, cols, 1:] = np.abs(coords[:, np.newaxis, :] - coords[np.newaxis, :, :])

            in_edges[molecule_index, :, :, :4] = geometry

    # Finally, add the bond features (in both directions).
    for df, in_edges in zip([train_bonds, test_bonds], [in_edges_train, in_edges_test]):
        molecule_indices = df["molecule_index"].to_numpy()
        atom_indices_0 = df["atom_index_0"].to_numpy()
        atom_indices_1 = df["atom_index_1"].to_numpy()
        features = df[bond_columns].to_numpy(dtype=np.float64)

        in_edges[molecule_indices, atom_indices_0, atom_indices_1, n_coupling:] = features
        in_edges[molecule_indices, atom_indices_1, atom_indices_0, n_coupling:] = features

    return in_edges_train, in_edges_test
    
def make_out_edges(train_df, n_molecules_train=None, n_atoms_max=None):
    """Build the zero-padded target array of scalar coupling constants.

    ``out_edges[m, i, j]`` and ``out_edges[m, j, i]`` both receive the
    ``scalar_coupling_constant`` of the row of ``train_df`` describing the
    pair ``(i, j)`` of molecule ``m``. Pairs without a row keep the value 0,
    so a zero entry alone does not tell a missing pair from a zero coupling.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must provide the columns ``molecule_index``, ``atom_index_0``,
        ``atom_index_1`` and ``scalar_coupling_constant``.
    n_molecules_train : int, optional
        Length of the first axis. Defaults to the notebook-level ``n_train``.
    n_atoms_max : int, optional
        Length of both atom axes. Defaults to the notebook-level ``max_size``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_molecules_train, n_atoms_max, n_atoms_max)``.
    """
    # Fall back to the notebook globals so existing one-argument calls keep
    # working, while explicit sizes make the function usable on its own.
    if n_molecules_train is None:
        n_molecules_train = n_train
    if n_atoms_max is None:
        n_atoms_max = max_size

    out_edges_train = np.zeros((n_molecules_train, n_atoms_max, n_atoms_max))

    molecule_indices = train_df["molecule_index"].to_numpy()
    atom_indices_0 = train_df["atom_index_0"].to_numpy()
    atom_indices_1 = train_df["atom_index_1"].to_numpy()
    scc_values = train_df["scalar_coupling_constant"].to_numpy()

    # Write each coupling in both directions to keep the matrix symmetric.
    out_edges_train[molecule_indices, atom_indices_0, atom_indices_1] = scc_values
    out_edges_train[molecule_indices, atom_indices_1, atom_indices_0] = scc_values

    return out_edges_train


In [184]:
# Build the padded node-feature arrays, then peek at atom 1 of molecule 3.
nodes_train, nodes_test = make_nodes(
    train_structures_df=train_structures_df,
    test_structures_df=test_structures_df,
)
nodes_train[3, 1]

array([ 0.00231072, -0.01915859,  0.00192873,  0.        ,  0.        ,
        0.        ,  1.        ,  0.        ])

In [185]:
# Keep both results: in_edges_test is one of the arrays this step is meant
# to produce. Binding it to `_` would lose it, keep its memory alive anyway,
# and shadow IPython's last-output variable.
in_edges_train, in_edges_test = make_in_edges(
    train_df,
    test_df,
    train_structures_df,
    test_structures_df,
    train_bonds,
    test_bonds,
)
in_edges_train[0, 0, 1]

array([1.09195306, 0.01484855, 1.09183548, 0.00602488, 1.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 1.        , 0.        , 0.        ,
       0.        ])

In [187]:
# Spot-check molecule 0: the pair (2, 1), and the pair (0, 5) whose second
# atom index is not listed for this molecule in the structures table.
(in_edges_train[0, 2, 1], in_edges_train[0, 0, 5])

(array([1.78311976e+00, 1.00958043e+00, 1.46978248e+00, 1.69954560e-03,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]))

In [98]:
# Build the target array and spot-check a handful of entries.
out_edges_train = make_out_edges(train_df=train_df)
(
    out_edges_train[-1, 0, 9],
    out_edges_train[0, 1, 0],
    out_edges_train[0, 0, 2],
    out_edges_train[0, 3, 0],
    out_edges_train[0, 1, 4],
    out_edges_train[0, 3, 4],
)

(105.769, 84.8076, 84.8074, 84.8093, -11.2543, -11.2543)

### Step 2 : train MPNN