In [1]:
import pandas as pd
from tqdm.notebook import tqdm
import numpy as np
import os
from scipy.spatial import distance_matrix

import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.utils import shuffle
from torch.utils.data import Dataset, DataLoader

FOLDER_DIR = "./preprocessed/"

### Step 1 : create dataset

The goal is to create 5 numpy arrays : nodes_train, in_edges_train, out_edges_train, nodes_test, in_edges_test

In [2]:
# Coupling pairs: one row per atom pair of each molecule.
train_df = pd.read_csv(os.path.join(FOLDER_DIR, 'train_df.csv'))
test_df = pd.read_csv(os.path.join(FOLDER_DIR, 'test_df.csv'))

# Atom coordinates and one-hot element type, one row per atom.
train_structures_df = pd.read_csv(os.path.join(FOLDER_DIR, 'train_structures_df.csv'))
test_structures_df = pd.read_csv(os.path.join(FOLDER_DIR, 'test_structures_df.csv'))

# Bond tables produced by BondFeatures.ipynb.
train_bonds = pd.read_csv(os.path.join(FOLDER_DIR, 'train_bonds.csv'))
test_bonds = pd.read_csv(os.path.join(FOLDER_DIR, 'test_bonds.csv'))

# Angle tables produced by make_angles_dataframe.ipynb.
train_angles = pd.read_csv(os.path.join(FOLDER_DIR, 'train_angles_df.csv'))
test_angles = pd.read_csv(os.path.join(FOLDER_DIR, 'test_angles_df.csv'))

# Apply the same rescaling to both splits.
for angles_df in (train_angles, test_angles):
    # NOTE(review): 6 is presumably the longest bond path in the data - confirm.
    angles_df['shortest_path_n_bonds'] = angles_df['shortest_path_n_bonds'] / 6
    # Dihedral is divided by pi (assumes the column is in radians - TODO confirm).
    angles_df['dihedral'] = angles_df['dihedral'] / np.pi
    # Dense integer id per molecule, in order of first appearance.
    angles_df["molecule_index"] = pd.factorize(angles_df["molecule_name"])[0]

# Missing angle features become 0 (fillna returns new frames, hence the rebinding).
train_angles = train_angles.fillna(0)
test_angles = test_angles.fillna(0)

In [3]:
# Dense integer id per molecule (order of first appearance); used later as the
# first index of the padded numpy tensors.
for coupling_df in (train_df, test_df):
    coupling_df["molecule_index"] = pd.factorize(coupling_df["molecule_name"])[0]

In [4]:
# Centre the target on the middle of its range and divide by the range width,
# so the scaled values lie in [-0.5, 0.5]. scale_mid / scale_norm are kept to
# undo the scaling on predictions.
# NOTE: this cell rescales the column in place - run it once per kernel.
raw_scc = train_df['scalar_coupling_constant']
minimum, maximum = raw_scc.min(), raw_scc.max()
print(minimum, maximum)
scale_mid = (minimum + maximum) / 2
scale_norm = maximum - minimum
train_df['scalar_coupling_constant'] = (raw_scc - scale_mid) / scale_norm

-44.7605 207.709


In [5]:
# Same dense molecule id as for the coupling frames.
# NOTE(review): assumes molecules appear in the same order in the structure
# and coupling files, so both factorizations agree - confirm.
for structures_df in (train_structures_df, test_structures_df):
    structures_df["molecule_index"] = pd.factorize(structures_df["molecule_name"])[0]
train_structures_df

Unnamed: 0,molecule_name,atom_index,x,y,z,C,F,H,N,O,molecule_index
0,dsgdb9nsd_000001,0,-0.012698,1.085804,0.008001,True,False,False,False,False,0
1,dsgdb9nsd_000001,1,0.002150,-0.006031,0.001976,False,False,True,False,False,0
2,dsgdb9nsd_000001,2,1.011731,1.463751,0.000277,False,False,True,False,False,0
3,dsgdb9nsd_000001,3,-0.540815,1.447527,-0.876644,False,False,True,False,False,0
4,dsgdb9nsd_000001,4,-0.523814,1.437933,0.906397,False,False,True,False,False,0
...,...,...,...,...,...,...,...,...,...,...,...
1226160,dsgdb9nsd_133883,12,0.167157,-2.642346,0.003546,False,False,True,False,False,68008
1226161,dsgdb9nsd_133883,13,2.336668,-1.165247,0.799579,False,False,True,False,False,68008
1226162,dsgdb9nsd_133883,14,1.287517,1.303344,1.376396,False,False,True,False,False,68008
1226163,dsgdb9nsd_133883,15,1.160599,1.078773,-1.801647,False,False,True,False,False,68008


In [6]:
# Bond orders are one-hot encoded against a FIXED list of levels so that both
# splits always get the same four columns in the same order, even when one
# split lacks a bond order (a bare pd.get_dummies would then return fewer
# columns and the 4-column assignment would fail or be mislabelled).
NBOND_LEVELS = [1.0, 1.5, 2.0, 3.0]
NBOND_COLUMNS = ['nbond_1', 'nbond_1.5', 'nbond_2', 'nbond_3']

for bonds_df in (train_bonds, test_bonds):
    # Dense integer id per molecule, in order of first appearance.
    bonds_df["molecule_index"] = pd.factorize(bonds_df["molecule_name"])[0]

    nbond_categorical = bonds_df['nbond'].astype(pd.CategoricalDtype(categories=NBOND_LEVELS))
    nbond_one_hot = pd.get_dummies(nbond_categorical)
    nbond_one_hot.columns = NBOND_COLUMNS
    bonds_df[NBOND_COLUMNS] = nbond_one_hot
train_bonds

Unnamed: 0,molecule_name,atom_index_0,atom_index_1,nbond,L2dist,error,bond_type,molecule_index,nbond_1,nbond_1.5,nbond_2,nbond_3
0,dsgdb9nsd_000001,0,1,1.0,1.091953,0,1.0CH,0,True,False,False,False
1,dsgdb9nsd_000001,0,2,1.0,1.091952,0,1.0CH,0,True,False,False,False
2,dsgdb9nsd_000001,0,3,1.0,1.091946,0,1.0CH,0,True,False,False,False
3,dsgdb9nsd_000001,0,4,1.0,1.091948,0,1.0CH,0,True,False,False,False
4,dsgdb9nsd_000002,0,1,1.0,1.017190,0,1.0HN,1,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
1268463,dsgdb9nsd_133883,2,6,1.0,1.541542,0,1.0CC,68008,True,False,False,False
1268464,dsgdb9nsd_133883,3,4,1.0,1.482432,0,1.0CC,68008,True,False,False,False
1268465,dsgdb9nsd_133883,4,5,1.0,1.510342,0,1.0CC,68008,True,False,False,False
1268466,dsgdb9nsd_133883,5,6,1.0,1.541538,0,1.0CC,68008,True,False,False,False


The first step is to find the size of the adjacency matrix, i.e. the largest number of atoms in any molecule of the dataset (the size of the biggest molecule).

In [7]:
# Every atom index used to address the padded tensors must fit: indices come
# from both columns of the coupling pairs AND from the structure tables, so
# take the maximum over all of them (not only atom_index_0).
max_size_train = max(
    train_df[['atom_index_0', 'atom_index_1']].max().max(),
    train_structures_df['atom_index'].max(),
)
max_size_test = max(
    test_df[['atom_index_0', 'atom_index_1']].max().max(),
    test_structures_df['atom_index'].max(),
)

# Indices start at 0, so the number of slots is the largest index + 1.
max_size = int(max(max_size_train, max_size_test)) + 1
print(max_size)

29


This means that :

nodes_train.size = [nb_molecule_train, max_size, nb_features_nodes] = [68009, 29, 8]

nodes_test.size = [nb_molecule_test, max_size, nb_features_nodes] = [17003, 29, 8]

in_edges_train.size = [nb_molecule_train, max_size, max_size, nb_features_edges] = [68009, 29, 29, 19]

in_edges_test.size = [nb_molecule_test, max_size, max_size, nb_features_edges] = [17003, 29, 29, 19]

out_edges_train.size = [nb_molecule_train, max_size, max_size, 1] = [68009, 29, 29, 1]

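The 8 node features are the atom's position (x, y, z) and its one-hot encoded element (C, F, H, N, O).
The 19 edge features are: the distance and its per-axis components (dist, dist_x, dist_y, dist_z), the one-hot coupling type (8 values), the one-hot bond order (4 values) and the angle features (shortest_path_n_bonds, cosinus, dihedral).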

In [8]:
# Number of distinct molecules per split = first dimension of the padded tensors.
n_train, n_test = (coupling_df['molecule_name'].nunique() for coupling_df in (train_df, test_df))
n_train, n_test

(68009, 17003)

In [9]:
def make_nodes(train_structures_df, test_structures_df):
    """Build the zero-padded node-feature tensors for both splits.

    Each returned array has shape (n_molecules, max_size, 8) and dtype float32;
    entry [molecule_index, atom_index] holds x, y, z, C, F, H, N, O for that atom.
    Slots with no atom stay at zero.

    Reads the notebook-level globals n_train, n_test and max_size.

    Returns:
        (nodes_train, nodes_test)
    """
    feature_columns = ["x", "y", "z", "C", "F", "H", "N", "O"]

    def fill(structures_df, n_molecules):
        padded = np.zeros((n_molecules, max_size, 8), dtype=np.float32)
        molecule_ids = structures_df["molecule_index"].values
        atom_ids = structures_df["atom_index"].values
        # One vectorised scatter: each row goes to its (molecule, atom) slot.
        padded[molecule_ids, atom_ids] = structures_df[feature_columns].values
        return padded

    return fill(train_structures_df, n_train), fill(test_structures_df, n_test)
    
def make_in_edges(train_df, test_df, train_structures_df, test_structures_df, train_bonds, test_bonds,
                  train_angles_df=None, test_angles_df=None):
    """Build the zero-padded edge-feature tensors for both splits.

    Each returned array has shape (n_molecules, max_size, max_size, 19) and
    dtype float32. Features are written symmetrically for (i, j) and (j, i).
    Channel layout of the last axis:
        0:4    distance between the two atoms and the distance along x, y, z,
               computed from the structure coordinates for every atom pair
        4:12   one-hot coupling type (1JHC ... 3JHN), only for coupled pairs
        12:16  one-hot bond order, only for bonded pairs
        16:19  shortest_path_n_bonds, cosinus, dihedral from the angle frames

    Args:
        train_df, test_df: coupling-pair frames.
        train_structures_df, test_structures_df: atom coordinate frames.
        train_bonds, test_bonds: bond frames with the nbond_* one-hot columns.
        train_angles_df, test_angles_df: angle frames. When omitted, the
            notebook-level ``train_angles`` / ``test_angles`` are used, which
            keeps the original 6-argument call working.

    Reads the notebook-level globals n_train, n_test and max_size.

    Returns:
        (in_edges_train, in_edges_test)
    """
    # Backward-compatible fallback to the notebook globals.
    if train_angles_df is None:
        train_angles_df = train_angles
    if test_angles_df is None:
        test_angles_df = test_angles

    in_edges_train = np.zeros((n_train, max_size, max_size, 19), dtype=np.float32)
    in_edges_test = np.zeros((n_test, max_size, max_size, 19), dtype=np.float32)
    outputs = [in_edges_train, in_edges_test]

    def scatter_symmetric(df, in_edges, columns, channels):
        # Write df[columns] into `channels` for both orientations of each pair.
        molecule_indices = df["molecule_index"].values
        atom_indices_0 = df["atom_index_0"].values
        atom_indices_1 = df["atom_index_1"].values
        features = df[columns].values

        in_edges[molecule_indices, atom_indices_0, atom_indices_1, channels] = features
        in_edges[molecule_indices, atom_indices_1, atom_indices_0, channels] = features

    # 1) Coupling pairs: distances + one-hot coupling type.
    #    Channels 0:4 written here are overwritten by step 2 for every molecule
    #    present in the structure frames.
    coupling_columns = ["dist", "dist_x", "dist_y", "dist_z",
                        '1JHC', '1JHN', '2JHC', '2JHH', '2JHN', '3JHC', '3JHH', '3JHN']
    for df, in_edges in zip([train_df, test_df], outputs):
        scatter_symmetric(df, in_edges, coupling_columns, slice(None, 12))

    # 2) Full pairwise geometry from the structures, to complete the adjacency matrix.
    for df, in_edges in zip([train_structures_df, test_structures_df], outputs):
        for molecule_index, molecule_df in df.groupby("molecule_index"):
            atom_idx = molecule_df["atom_index"].values
            xyz = molecule_df[["x", "y", "z"]].values

            # Channel 0 uses the full coordinates; channels 1-3 use a single
            # coordinate kept as an (n, 1) matrix, as distance_matrix needs 2D input.
            coordinate_sets = [xyz, xyz[:, [0]], xyz[:, [1]], xyz[:, [2]]]
            pairwise = np.stack([distance_matrix(c, c) for c in coordinate_sets], axis=-1)

            # Place distances by atom_index (as make_nodes does) rather than by
            # row position, so unsorted rows cannot misalign nodes and edges.
            features = np.zeros((max_size, max_size, 4))
            features[np.ix_(atom_idx, atom_idx)] = pairwise

            in_edges[molecule_index, :, :, :4] = features

    # 3) Bond features.
    bond_columns = ['nbond_1', 'nbond_1.5', 'nbond_2', 'nbond_3']
    for df, in_edges in zip([train_bonds, test_bonds], outputs):
        scatter_symmetric(df, in_edges, bond_columns, slice(12, 16))

    # 4) Angle features.
    angle_columns = ['shortest_path_n_bonds', 'cosinus', 'dihedral']
    for df, in_edges in zip([train_angles_df, test_angles_df], outputs):
        scatter_symmetric(df, in_edges, angle_columns, slice(16, None))

    return in_edges_train, in_edges_test
    
def make_out_edges(train_df):
    """Build the zero-padded target tensor of scalar coupling constants.

    Returns a float32 array of shape (n_train, max_size, max_size) where both
    [m, i, j] and [m, j, i] hold the coupling constant of pair (i, j) in
    molecule m; every other entry is 0.

    Reads the notebook-level globals n_train and max_size.
    """
    molecule_ids = train_df["molecule_index"].values
    first_atoms = train_df["atom_index_0"].values
    second_atoms = train_df["atom_index_1"].values
    couplings = train_df["scalar_coupling_constant"].values

    targets = np.zeros((n_train, max_size, max_size), dtype=np.float32)

    # Write each value for both orientations so the matrix is symmetric.
    for rows, cols in ((first_atoms, second_atoms), (second_atoms, first_atoms)):
        targets[molecule_ids, rows, cols] = couplings

    return targets


In [10]:
# Build the padded node tensors, then display the features of atom 1 of molecule 3 as a spot check.
nodes_train, nodes_test = make_nodes(train_structures_df, test_structures_df)
nodes_train[3,1]

array([ 0.00231072, -0.01915859,  0.00192873,  0.        ,  0.        ,
        0.        ,  1.        ,  0.        ], dtype=float32)

In [11]:
# Build the padded edge tensors, then display the 19 features of pair (0, 1) in molecule 0.
in_edges_train, in_edges_test = make_in_edges(train_df, test_df, train_structures_df, test_structures_df, train_bonds, test_bonds)
in_edges_train[0,0,1]

array([1.091953  , 0.01484855, 1.0918355 , 0.00602488, 1.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 1.        , 0.        , 0.        ,
       0.        , 0.16666667, 0.        , 0.        ], dtype=float32)

In [12]:
# Spot check: a real pair (2, 1) and a pair whose second index (5) is a padding slot in molecule 0.
in_edges_train[0,2,1], in_edges_train[0,0,5], 

(array([ 1.7831198e+00,  1.0095804e+00,  1.4697825e+00,  1.6995457e-03,
         0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  1.0000000e+00,
         0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
         0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
         3.3333334e-01, -3.3329001e-01,  0.0000000e+00], dtype=float32),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.], dtype=float32))

In [13]:
# Build the scaled target tensor and display a few entries as a spot check.
out_edges_train = make_out_edges(train_df)
out_edges_train[-1,0,9], out_edges_train[0,1,0], out_edges_train[0,0,2], out_edges_train[0,3,0], out_edges_train[0,1,4], out_edges_train[0,3,4]

(0.09622846, 0.013202981, 0.013202189, 0.013209715, -0.36728615, -0.36728615)

### Step 2 : train MPNN

In [14]:
N_EPOCHS = 20          # number of passes over the training set
LEARNING_RATE = 0.001  # learning rate passed to the Adam optimizer
BATCH_SIZE = 16        # molecules per mini-batch in both data loaders

RATIO_TRAIN_VAL = 0.8  # fraction of the shuffled molecules used for training; the rest is validation

#### Data processing

In [15]:
# Shapes before flattening the (atom, atom) axes.
for array in (out_edges_train, in_edges_train, nodes_train, nodes_test, in_edges_test):
    print(array.shape)

(68009, 29, 29)
(68009, 29, 29, 19)
(68009, 29, 8)
(17003, 29, 8)
(17003, 29, 29, 19)


In [16]:
# Flatten the two atom axes into a single "pair" axis: (N, 29, 29, F) -> (N, 841, F).
# NOTE: this cell rebinds its own inputs - run it once per kernel.
out_edges_train = out_edges_train.reshape(-1,out_edges_train.shape[1]*out_edges_train.shape[2],1)
in_edges_train = in_edges_train.reshape(-1,in_edges_train.shape[1]*in_edges_train.shape[2],in_edges_train.shape[3])
in_edges_test  = in_edges_test.reshape(-1,in_edges_test.shape[1]*in_edges_test.shape[2],in_edges_test.shape[3])

# Shuffle the molecules consistently across the three arrays. The seed is fixed
# so the train/validation split taken from this order is reproducible.
nodes_train, in_edges_train, out_labels = shuffle(nodes_train, in_edges_train, out_edges_train, random_state=42)

In [17]:
# Shapes after flattening and shuffling.
for array in (nodes_train, in_edges_train, out_labels, nodes_test, in_edges_test):
    print(array.shape)

(68009, 29, 8)
(68009, 841, 19)
(68009, 841, 1)
(17003, 29, 8)
(17003, 841, 19)


#### Dataset Definition

In [18]:
class Set(Dataset):
    """Dataset yielding (nodes, in_edges, out_edges) for one molecule per index.

    The three containers are indexed in parallel; the length is that of the
    node container.
    """

    def __init__(self, in_nodes, in_edges, out_edges):
        self.nodes, self.in_edges, self.out_edges = in_nodes, in_edges, out_edges

    def __len__(self):
        return len(self.nodes)

    def __getitem__(self, idx):
        return self.nodes[idx], self.in_edges[idx], self.out_edges[idx]

# The arrays were shuffled above, so a contiguous cut gives a random split.
nb_train = int(len(nodes_train)*RATIO_TRAIN_VAL)
train_part, val_part = slice(None, nb_train), slice(nb_train, None)

train_set = Set(nodes_train[train_part], in_edges_train[train_part], out_labels[train_part])
val_set = Set(nodes_train[val_part], in_edges_train[val_part], out_labels[val_part])

train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=True)

#### Model

In [18]:
class Message_Passer_NNM(nn.Module):
    """Edge-conditioned message function.

    A linear layer + ReLU maps each edge feature vector to a
    (node_dim x node_dim) matrix, which is multiplied with the state of the
    node at the other end of the edge to produce the message.
    """

    def __init__(self, node_dim, nb_features):
        super().__init__()
        self.nb_features = nb_features
        self.node_dim = node_dim
        self.nn = nn.Sequential(
            nn.Linear(in_features=self.nb_features, out_features=self.node_dim ** 2),
            nn.ReLU(),
        )

    def forward(self, node_j, edge_ij):
        """Return one message per edge, shaped (-1, edge_ij.shape[1], node_dim).

        node_j and edge_ij must list node states and edge features in the same
        order along dim 1 (one node state per edge).
        """
        dim = self.node_dim
        n_edges = edge_ij.shape[1]

        # One (dim x dim) matrix per edge, flattened over batch and edges.
        edge_matrices = self.nn(edge_ij).view(-1, dim, dim)
        # Matching column vector per edge.
        sender_states = node_j.view(-1, dim, 1)

        # Batched matrix-vector product, then restore the per-molecule edge axis.
        return torch.matmul(edge_matrices, sender_states).view(-1, n_edges, dim)

In [19]:
class Message_Agg(nn.Module):
    """Aggregate incoming messages by summing over dimension 2."""

    def __init__(self):
        super().__init__()

    def forward(self, messages):
        return messages.sum(dim=2)

In [20]:
class Update_Func_GRU(nn.Module):
    """Node update: run a GRU over the 2-step sequence [old state, aggregated message]."""

    def __init__(self, state_dim):
        super().__init__()
        self.state_dim = state_dim
        self.GRU = nn.GRU(input_size=state_dim, hidden_size=state_dim, batch_first=True)

    def forward(self, old_state, agg_messages):
        """Return the new node states, shaped (-1, n_nodes, state_dim)."""
        n_nodes = old_state.shape[1]

        # Each node becomes its own length-2 sequence:
        # (batch * n_nodes, 2, state_dim), old state first, message second.
        previous = old_state.view(-1, 1, self.state_dim)
        incoming = agg_messages.view(-1, 1, self.state_dim)
        sequence = torch.cat((previous, incoming), dim=1)

        outputs, _ = self.GRU(sequence)

        # The output at the last time step is the updated node state.
        return outputs[:, -1, :].view(-1, n_nodes, self.state_dim)

In [21]:
# Final output layer: one regression value per ordered pair of nodes
class Edge_Regressor(nn.Module):
    """MLP that regresses a scalar for every (i, j) node pair.

    The input of the MLP for pair (i, j) is the concatenation
    [state of node i, edge features of (i, j), state of node j].
    """

    def __init__(self, state_dim, nb_features_edge, intermediate_dim):
        super().__init__()

        pair_dim = 2 * state_dim + nb_features_edge
        self.hidden_layer_1 = nn.Sequential(nn.Linear(pair_dim, intermediate_dim), nn.ReLU())
        self.hidden_layer_2 = nn.Sequential(nn.Linear(intermediate_dim, intermediate_dim), nn.ReLU())
        self.output_layer = nn.Linear(intermediate_dim, 1)

    def forward(self, nodes, edges):
        """Return predictions shaped like edges with a last dimension of 1.

        edges must be ordered along dim 1 as pair index i * n_nodes + j.
        """
        n_nodes, node_dim = nodes.shape[1], nodes.shape[2]

        # Node i repeated n_nodes times in a row: h0,h0,...,h1,h1,...
        state_i = nodes.unsqueeze(2).expand(-1, -1, n_nodes, -1).reshape(-1, n_nodes ** 2, node_dim)
        # Whole node list tiled n_nodes times: h0,h1,...,h0,h1,...
        state_j = nodes.repeat(1, n_nodes, 1)

        pair_features = torch.cat((state_i, edges, state_j), dim=-1)
        hidden = self.hidden_layer_2(self.hidden_layer_1(pair_features))

        return self.output_layer(hidden)

In [22]:
class MP_Layer(nn.Module):
    """One message-passing step: message -> mask -> sum -> GRU update."""

    def __init__(self, state_dim, nb_features_edge):
        super().__init__()

        self.message_passers = Message_Passer_NNM(node_dim=state_dim, nb_features=nb_features_edge)
        self.message_aggs = Message_Agg()
        self.update_functions = Update_Func_GRU(state_dim=state_dim)

    def forward(self, nodes, edges, mask):
        """Return the updated node states.

        mask is multiplied element-wise with the per-pair messages; it is
        expected to be 0 for pairs that must not exchange a message.
        """
        n_nodes, node_dim = nodes.shape[1], nodes.shape[2]

        # Tile the node list so that entry i * n_nodes + j holds the state of node j.
        sender_states = nodes.repeat(1, n_nodes, 1)
        messages = self.message_passers(sender_states, edges)

        # Zero out messages of masked pairs, then unfold the pair axis into
        # (receiver i, sender j) so that aggregation sums over the senders.
        per_pair = (messages * mask).view(messages.shape[0], n_nodes, n_nodes, node_dim)

        aggregated = self.message_aggs(per_pair)
        # A batch norm could be added here.
        return self.update_functions(nodes, aggregated)

In [23]:
class MPNN(nn.Module):
    """Message-passing network predicting one value per ordered atom pair.

    Nodes are embedded to state_dim, refined by T applications of the same
    (weight-shared) MP_Layer, then fed pairwise to the Edge_Regressor.
    """

    def __init__(self, nb_features_node, nb_features_edge, out_int_dim, state_dim, T):
        super().__init__()

        self.T = T
        self.embed = nn.Sequential(nn.Linear(nb_features_node, state_dim), nn.ReLU())
        self.MP = MP_Layer(state_dim, nb_features_edge)
        self.edge_regressor = Edge_Regressor(state_dim, nb_features_edge, out_int_dim)
        self.relu = nn.ReLU()

    def forward(self, edges, nodes):
        """Return the per-pair predictions (note the argument order: edges first)."""
        # The mask is built from the FIRST edge feature, which must therefore be
        # the inter-atomic distance: it is 0 wherever that feature is 0 and 1
        # elsewhere. A zero distance marks padding slots and also a node paired
        # with itself, so self-interactions are masked too.
        first_feature = edges[:, :, :1]
        mask = torch.where(first_feature == 0, first_feature, torch.ones_like(first_feature))

        # Embed nodes to the chosen state dimension.
        states = self.embed(nodes)

        # Run the T message-passing steps with shared weights.
        for _ in range(self.T):
            states = self.MP(states, edges, mask)

        # Regress the output value of every pair.
        return self.edge_regressor(states, edges)

### Training Loop

In [24]:
# 8 node features and 19 edge features match the tensors built in Step 1.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
mpnn = MPNN(nb_features_node=8, nb_features_edge=19, out_int_dim=512, state_dim=128, T=4).to(device)

In [26]:
def log_mae(orig, preds):
    """Log of the mean absolute error over the labelled entries.

    Entries where ``orig`` is exactly 0 are treated as "no scalar coupling"
    and ignored in both tensors.
    """
    has_label = orig != 0
    absolute_errors = (orig[has_label] - preds[has_label]).abs()
    return absolute_errors.mean().log()

In [27]:
optimizer = torch.optim.Adam(params = mpnn.parameters(), lr=LEARNING_RATE)

for i in range(N_EPOCHS):
    print(f"__________EPOCH {i+1}__________")

    # ---- training pass ----
    mpnn.train()
    total_loss = 0
    for batch in tqdm(train_loader):
        nodes, in_edges, out_edges = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        out = mpnn(in_edges, nodes)
        loss = log_mae(out_edges, out)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    # Mean of the per-batch log-MAE values (not the log of the epoch-wide MAE).
    average_loss = total_loss / len(train_loader)
    print("average train loss over an epoch :", average_loss)

    # ---- validation pass (no gradient tracking) ----
    mpnn.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_loader):
            nodes, in_edges, out_edges = (tensor.to(device) for tensor in batch)
            out = mpnn(in_edges, nodes)
            loss = log_mae(out_edges, out)
            val_loss += loss.item()
    average_loss = val_loss / len(val_loader)
    print("average val loss", average_loss)

__________EPOCH 1__________


  0%|          | 0/3401 [00:00<?, ?it/s]

average train loss over an epoch : -4.1611976817193295


  0%|          | 0/851 [00:00<?, ?it/s]

average val loss -4.311858524306821
__________EPOCH 2__________


  0%|          | 0/3401 [00:00<?, ?it/s]

average train loss over an epoch : -4.3090277665224335


  0%|          | 0/851 [00:00<?, ?it/s]

average val loss -4.3469884202287
__________EPOCH 3__________


  0%|          | 0/3401 [00:00<?, ?it/s]

average train loss over an epoch : -4.353612696623248


  0%|          | 0/851 [00:00<?, ?it/s]

average val loss -4.3875353420383085
__________EPOCH 4__________


  0%|          | 0/3401 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [36]:
# Create the target folder first: torch.save does not create missing directories.
os.makedirs('./models', exist_ok=True)
# Only the weights (state_dict) are stored, so the model must be rebuilt
# before the checkpoint can be loaded.
torch.save(mpnn.state_dict(), './models/mpnn.pt')

## Get score 

In [25]:
# Rebuild the architecture with the same hyper-parameters as the trained `mpnn`.
model = MPNN(nb_features_node = 8, nb_features_edge = 19, out_int_dim = 512, state_dim = 128, T = 4)

In [26]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# The checkpoint written above contains a state_dict, so torch.load returns a
# dict of tensors, not a module: the weights must be copied into the `model`
# built in the previous cell. map_location lets a GPU checkpoint load on CPU.
# NOTE: torch.load unpickles the file - only load checkpoints you trust.
checkpoint = torch.load('./models/mpnn.pt', map_location=device)
if isinstance(checkpoint, nn.Module):
    # Also accept a checkpoint that holds a fully pickled model.
    model = checkpoint
else:
    model.load_state_dict(checkpoint)

model.to(device)
model.eval()

MPNN(
  (embed): Sequential(
    (0): Linear(in_features=8, out_features=128, bias=True)
    (1): ReLU()
  )
  (MP): MP_Layer(
    (message_passers): Message_Passer_NNM(
      (nn): Sequential(
        (0): Linear(in_features=19, out_features=16384, bias=True)
        (1): ReLU()
      )
    )
    (message_aggs): Message_Agg()
    (update_functions): Update_Func_GRU(
      (GRU): GRU(128, 128, batch_first=True)
    )
  )
  (edge_regressor): Edge_Regressor(
    (hidden_layer_1): Sequential(
      (0): Linear(in_features=275, out_features=512, bias=True)
      (1): ReLU()
    )
    (hidden_layer_2): Sequential(
      (0): Linear(in_features=512, out_features=512, bias=True)
      (1): ReLU()
    )
    (output_layer): Linear(in_features=512, out_features=1, bias=True)
  )
  (relu): ReLU()
)

In [28]:
preds = []

# Inference only: no_grad avoids building an autograd graph for each molecule.
with torch.no_grad():
    # zip() has no length, so pass the total explicitly for the progress bar.
    for node, edge in tqdm(zip(nodes_test, in_edges_test), total=len(nodes_test)):
        # Add a batch dimension of 1: the model expects batched inputs.
        node = torch.tensor(node).unsqueeze(0).to(device)
        edge = torch.tensor(edge).unsqueeze(0).to(device)
        pred = model(edge, node).cpu().numpy()
        preds.append(pred)

0it [00:00, ?it/s]

AttributeError: 'list' object has no attribute 'view'

In [34]:
# Stack the per-molecule predictions into a single array.
preds = np.array(preds)

In [36]:
# Unfold the flattened pair axis back into (atom, atom) so predictions can be indexed by atom indices.
preds = preds.reshape(len(preds), max_size, max_size, 1)

In [38]:
# sort=False yields the groups in order of first appearance, which is the
# order pd.factorize used to assign molecule_index - and therefore the order
# of the molecules along the first axis of `preds`. The default (sorted by
# name) only matches when test_df happens to be sorted by molecule_name.
test_group = test_df.groupby('molecule_name', sort=False)

In [39]:
def make_outs(test_group, preds):
    """Collect one prediction per coupling row from the per-molecule matrices.

    Args:
        test_group: iterable of (key, frame) pairs, e.g. a pandas GroupBy; each
            frame needs the columns 'atom_index_0' and 'atom_index_1'.
        preds: per-molecule prediction matrices, iterated in the same order as
            test_group and indexable as [atom_i, atom_j].

    Returns:
        1-D float64 array with, for every row of every group in iteration
        order, the mean of the predictions for (i, j) and (j, i).
    """
    # The empty float64 seed reproduces the dtype and the empty-input result of
    # repeated np.append, while all chunks are joined in a single pass
    # (np.append inside the loop re-copied the whole array on every molecule).
    chunks = [np.array([])]

    for (_, gp), molecule_preds in zip(test_group, preds):
        idx_0 = gp['atom_index_0'].values
        idx_1 = gp['atom_index_1'].values

        # The model predicts both orientations of a pair; average them.
        symmetric_mean = (molecule_preds[idx_0, idx_1] + molecule_preds[idx_1, idx_0]) / 2.0
        chunks.append(np.ravel(symmetric_mean))

    return np.concatenate(chunks)

# Gather the per-row predictions and undo the target scaling applied to the
# training labels: scaled = (raw - scale_mid) / scale_norm.
# NOTE: `preds` is rebound here - run this cell once per prediction run.
preds = make_outs(test_group, preds) * scale_norm + scale_mid
targets = test_df['scalar_coupling_constant']

In [40]:
def score(preds, targets):
    """Return the natural log of the mean absolute error between targets and preds."""
    absolute_errors = np.abs(targets - preds)
    mean_absolute_error = np.mean(absolute_errors)
    return np.log(mean_absolute_error)

# Log-MAE over all held-out coupling rows, in the original (unscaled) units.
print("Score :", score(preds, targets))

Score : 1.1541991736195896
