# Prepare Data Set

First, a data set is loaded. The function `load_data_from_df` automatically saves the calculated features to the provided data directory (unless `use_data_saving` is set to `False`). Subsequent runs reuse the saved features instead of recomputing them.

In [1]:
import copy
import os

import pandas as pd
import torch

# The project modules are imported relative to the `src` directory, so the
# working directory is switched there. The guard keeps this cell idempotent:
# re-running it in the same kernel must not try to enter `src` a second time.
if os.path.basename(os.getcwd()) != 'src':
    os.chdir('src')

In [2]:
from featurization.data_utils import load_data_from_df, construct_loader
from tqdm import tqdm

In [3]:
batch_size = 64
n_train = 500  # Samples [0, n_train) are used for training; the remainder is held out for testing.

# Formal charges are NOT one-hot encoded here (one_hot_formal_charge=False).
# One-hot encoding is only needed for compatibility with the pre-trained weights,
# so set one_hot_formal_charge=True if you plan to load them.
X, y = load_data_from_df('../data/freesolv/freesolv.csv', one_hot_formal_charge=False)

# Positional split of the loaded samples (no shuffling is done in this cell).
train_x, test_x = X[:n_train], X[n_train:]
train_y, test_y = y[:n_train], y[n_train:]

data_loader = construct_loader(train_x, train_y, batch_size)
test_data_loader = construct_loader(test_x, test_y, batch_size)

You can use your own data, but the CSV file should contain two columns (here `smiles` and `y`), as shown below:

In [4]:
# Display the first five rows of the example CSV to illustrate the expected layout.
(
    pd.read_csv('../data/freesolv/freesolv.csv')
    .head(5)
)

Unnamed: 0,smiles,y
0,CN(C)C(=O)c1ccc(cc1)OC,-1.874467
1,CS(=O)(=O)Cl,-0.277514
2,CC(C)C=C,1.465089
3,CCc1cnccn1,-0.428367
4,CCCCCCCO,-0.105855


# Prepare Model

In [5]:
from transformer import make_model

In [13]:
# Second dimension of the first array of the first sample; the value depends
# on the featurization used (presumably the per-atom feature width -- confirm
# against load_data_from_df).
d_atom = X[0][0].shape[1]

# Hyperparameters forwarded to make_model as keyword arguments.
model_params = dict(
    d_atom=d_atom,
    d_model=1024,
    N=8,
    h=16,
    N_dense=1,
    lambda_attention=0.33,
    lambda_distance=0.33,
    leaky_relu_slope=0.1,
    dense_output_nonlinearity='relu',
    distance_matrix_kernel='exp',
    dropout=0.1,
    aggregation_type='mean',
)

model = make_model(**model_params)

(0.33, 0.33, 0.3399999999999999)


# Run Training/Evaluation Loop

In [16]:
# Second dimension of the first array of the first sample; the value depends
# on the featurization used.
d_atom = X[0][0].shape[1]

# Configuration 1: attention and distance terms both weighted 0.33.
distance_model_params = dict(
    d_atom=d_atom,
    d_model=1024,
    N=8,
    h=16,
    N_dense=1,
    lambda_attention=0.33,
    lambda_distance=0.33,
    leaky_relu_slope=0.1,
    dense_output_nonlinearity='relu',
    distance_matrix_kernel='exp',
    dropout=0.1,
    aggregation_type='mean',
)

# Configuration 2: identical, except the distance weight is zeroed and the
# attention weight is set to 1.0. All values are immutable scalars, so a
# shallow unpack-and-override yields an independent dict.
no_distance_model_params = {
    **distance_model_params,
    'lambda_attention': 1.0,
    'lambda_distance': 0.0,
}

n_trials = 5
n_epochs = 30

# Run both configurations with the same number of trials and epochs.
# NOTE(review): run_experiment is not defined in the visible cells -- it must
# already exist in the kernel namespace for this cell to run.
d_trial_results = run_experiment(
    distance_model_params, n_trials, epochs=n_epochs
)
nd_trial_results = run_experiment(
    no_distance_model_params, n_trials, epochs=n_epochs
)

  0%|          | 0/5 [00:00<?, ?it/s](0.33, 0.33, 0.3399999999999999)
 20%|██        | 1/5 [00:00<00:01,  2.32it/s](0.33, 0.33, 0.3399999999999999)
 40%|████      | 2/5 [00:00<00:01,  2.46it/s](0.33, 0.33, 0.3399999999999999)
 60%|██████    | 3/5 [00:01<00:00,  2.55it/s](0.33, 0.33, 0.3399999999999999)
 80%|████████  | 4/5 [00:01<00:00,  2.47it/s](0.33, 0.33, 0.3399999999999999)
100%|██████████| 5/5 [00:01<00:00,  2.51it/s]
  0%|          | 0/5 [00:00<?, ?it/s](1.0, 0.0, 0.0)
 20%|██        | 1/5 [00:00<00:01,  2.57it/s](1.0, 0.0, 0.0)
 40%|████      | 2/5 [00:00<00:01,  2.63it/s](1.0, 0.0, 0.0)
 60%|██████    | 3/5 [00:01<00:00,  2.69it/s](1.0, 0.0, 0.0)
 80%|████████  | 4/5 [00:01<00:00,  2.75it/s](1.0, 0.0, 0.0)
100%|██████████| 5/5 [00:01<00:00,  2.72it/s]


In [41]:
#0.55
# NOTE(review): the bare "0.55" above is unexplained -- confirm what it refers to or remove it.
# Rebinds both result names to empty lists. Any values previously assigned to
# d_trial_results / nd_trial_results by an earlier cell are no longer reachable
# through these names after this cell runs.
d_trial_results = []
nd_trial_results = []