### To do: 
- build Ball Tree for cosine similarity
- implement Bayesian optimisation 

In [None]:
# Install RDKit. Takes 2-3 minutes
# !wget -c https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
# !chmod +x Miniconda3-latest-Linux-x86_64.sh
# !time bash ./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local
# !time conda install -q -y -c conda-forge python=3.7
# !time conda install -q -y -c conda-forge rdkit

In [None]:
# from google.colab import drive
# drive.mount('/content/gdrive', force_remount=True)

In [3]:
# DRIVE_PATH_TO_PICKLE = '/content/gdrive/My Drive/rxn_ebm/USPTO_50k_Schneider/clean_rxn_50k_nomap_noreagent.pickle'
# VM_PATH_TO_PICKLE = '/content/'

# !cp '/content/gdrive/My Drive/rxn_ebm/USPTO_50k_Schneider/clean_rxn_50k_nomap_noreagent.pickle' '/content/'

In [1]:
import sys
# sys.path.append('/usr/local/lib/python3.7/site-packages/') 
# for Colab 
import os

import rdkit
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import AllChem
from rdkit.Chem.Draw import IPythonConsole
IPythonConsole.ipython_useSVG=True
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import rdChemReactions
from rdkit.Chem import rdqueries # faster than iterating atoms https://sourceforge.net/p/rdkit/mailman/message/34538007/ 
from rdkit.Chem.rdchem import Atom
from rdkit import DataStructs
import numpy as np

from itertools import chain
import random

from tqdm import tqdm
import csv
import re 
import pickle
import copy

import torch
import torch.nn as nn
import torch.nn.functional as F

### utils

In [2]:
import shutil
import torch
import torch.nn as nn

def get_activation_function(activation: str) -> nn.Module:
    """
    Builds the activation module that corresponds to a given name.

    Recognised names (case-sensitive):
    * :code:`ReLU`
    * :code:`LeakyReLU` (negative slope 0.1)
    * :code:`PReLU`
    * :code:`tanh`
    * :code:`SELU`
    * :code:`ELU`

    :param activation: The name of the activation function.
    :return: A newly constructed activation module.
    :raises ValueError: If the name is not one of the recognised names.
    """
    # (name, zero-argument factory) pairs: only the requested module is built
    known_activations = (
        ('ReLU', nn.ReLU),
        ('LeakyReLU', lambda: nn.LeakyReLU(0.1)),
        ('PReLU', nn.PReLU),
        ('tanh', nn.Tanh),
        ('SELU', nn.SELU),
        ('ELU', nn.ELU),
    )
    for name, build in known_activations:
        if activation == name:
            return build()
    raise ValueError(f'Activation "{activation}" not supported.')
    
def initialize_weights(model: nn.Module) -> None:
    """
    Re-initialises every parameter of a model in place.

    One-dimensional parameters (e.g. bias vectors) are set to zero; all other
    parameters are drawn from a Xavier (Glorot) normal distribution.

    :param model: A PyTorch model whose parameters are overwritten.
    """
    for weight in model.parameters():
        if weight.dim() != 1:
            nn.init.xavier_normal_(weight)
            continue
        nn.init.constant_(weight, 0)
            
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """
    Serialises ``state`` to ``filename`` with ``torch.save``.

    :param state: Object to save (the training loop passes a dict holding the
        epoch, model name, state dicts and stats).
    :param is_best: When truthy, the saved file is additionally copied to
        'model_best.pth.tar' (a relative path, so it lands in the current
        working directory rather than next to ``filename``).
    :param filename: Destination path of the checkpoint.
    """
    torch.save(state, filename)
    if not is_best:
        return
    shutil.copyfile(filename, 'model_best.pth.tar')

### model

In [3]:
import torch
import torch.nn as nn

class FF_ebm(nn.Module):
    '''
    Feed-forward energy-based model that maps a reaction fingerprint to an
    energy score.

    trainargs: dictionary containing hyperparameters to be optimised. Keys read:
        'model': 'FF_sep' (input = reactant FP concatenated with product FP)
                 or 'FF_diff' (input = difference FP)
        'rctfp_size', 'prodfp_size': fingerprint lengths
        'hidden_sizes': list of hidden layer widths, e.g. [1024, 512, 256]
                        (an empty list gives a single linear layer)
        'output_size': width of the final layer
        'dropout': dropout probability
        'activation': name understood by get_activation_function

    Raises ValueError if trainargs['model'] is neither 'FF_sep' nor 'FF_diff'.

    To do: bayesian optimisation
    '''
    def __init__(self, trainargs):
        super(FF_ebm, self).__init__()
        self.output_size = trainargs['output_size']
        self.num_layers = len(trainargs['hidden_sizes']) + 1

        if trainargs['model'] == 'FF_sep':
            # reactant and product fingerprints are concatenated
            self.input_dim = trainargs['rctfp_size'] + trainargs['prodfp_size']
        elif trainargs['model'] == 'FF_diff':
            assert trainargs['rctfp_size'] == trainargs['prodfp_size'], 'rctfp_size != prodfp_size, unable to make difference FPs!!!'
            self.input_dim = trainargs['rctfp_size']
        else:
            # Previously fell through silently and failed later with a missing
            # `input_dim` attribute; fail here with an actionable message.
            raise ValueError(
                'Model "{}" not supported, expected "FF_sep" or "FF_diff".'.format(trainargs['model']))

        self.create_ffn(trainargs)
        initialize_weights(self)  # is it necessary to initialize weights?? 
    
    def create_ffn(self, trainargs):
        '''
        Creates feed-forward network using trainargs dict and stores it in self.ffn

        Layout: dropout -> Linear, followed by (activation -> dropout -> Linear)
        for every further layer. No activation is applied after the last Linear.
        '''
        # The same dropout / activation module objects are reused at every
        # position, so a parametrised activation such as PReLU shares its
        # parameter across all layers.
        dropout = nn.Dropout(trainargs['dropout'])
        activation = get_activation_function(trainargs['activation'])

        # consecutive pairs of this list are the (in, out) widths of each Linear
        sizes = [self.input_dim, *trainargs['hidden_sizes'], self.output_size]

        ffn = [
            dropout,
            nn.Linear(sizes[0], sizes[1])
        ]
        for fan_in, fan_out in zip(sizes[1:-1], sizes[2:]):
            ffn.extend([
                activation,
                dropout,
                nn.Linear(fan_in, fan_out),
            ])

        self.ffn = nn.Sequential(*ffn)
        
    def forward(self, batch):
        '''
        Runs FF_ebm on input
        
        batch: tensor whose last dimension is self.input_dim. As supplied by
        DataLoader on the custom ReactionDataset it is N x K x input_dim: N training
        samples, each holding a positive rxn at index 0 and K-1 negative rxns after it.

        Returns the energy scores, with the last dimension replaced by
        output_size (N x K x 1 for output_size == 1)
        '''
        energy_scores = self.ffn(batch)
        return energy_scores 

### train

In [17]:
import torch
from torch.utils.data import DataLoader, Dataset
import numpy as np
from tqdm import tqdm
import torch.nn as nn
import time

class Run():
    '''
    Bundles a model with its optimizer and the train / valid / test DataLoaders,
    and provides training, evaluation and scoring routines.

    model: nn.Module mapping a batch of fingerprints to energy scores (e.g. FF_ebm)
    trainargs: dict of hyperparameters and paths (see the trainargs cells)
    optimizer: an already constructed optimizer, only used if load_checkpoint is True
    load_checkpoint: if True, `optimizer` is used as given; otherwise a new optimizer
        is built from trainargs['optimizer'] and trainargs['learning_rate']
    '''
    def __init__(self, model, trainargs,
                 optimizer=None, load_checkpoint=False):
        self.device = trainargs['device']
        model = model.to(self.device)
        self.model = model
        if load_checkpoint: 
            self.optimizer = optimizer
        else:
            self.optimizer = trainargs['optimizer'](model.parameters(), lr=trainargs['learning_rate'])
        self.trainargs = trainargs 

        train_dataset = ReactionDataset(trainargs['path_to_pickle'], 'train', trainargs)
        self.train_loader = DataLoader(train_dataset, trainargs['batch_size'], shuffle=True)
        
        # no gradients are kept during evaluation, so a 2x larger batch is used
        val_dataset = ReactionDataset(trainargs['path_to_pickle'], 'valid', trainargs)
        self.val_loader = DataLoader(val_dataset, 2 * trainargs['batch_size'], shuffle=False)
        
        test_dataset = ReactionDataset(trainargs['path_to_pickle'], 'test', trainargs)
        self.test_loader = DataLoader(test_dataset, 2 * trainargs['batch_size'], shuffle=False)

        self.mean_train_loss = []  # one entry per epoch
        self.min_val_loss = 1e9
        self.mean_val_loss = []    # one entry per epoch
        self.stats = {'trainargs': self.trainargs} # to store training statistics  

        torch.manual_seed(trainargs['model_seed'])
        random.seed(trainargs['random_seed'])
    
    def train_one(self, batch, val=False):
        '''
        Computes the loss on 1 batch and, unless val is True, takes one optimizer step

        batch: tensor accepted by the model; scores are expected as N x K x 1
        val: if True, only the loss is computed (no backward pass, no step)

        Returns the loss as a detached CPU tensor

        TO DO: learning rate scheduler + logger 
        '''
        self.model.zero_grad()
        scores = self.model.forward(batch).squeeze(dim=-1) # scores: size N x K x 1 --> N x K after squeezing

        softmax = nn.Softmax(dim=1)
        probs = softmax(scores) # size N x K

        # positives are the 0-th index of each sample, add a small epsilon 1e-9 to stabilise log 
        loss = -torch.log(probs[:, 0]+1e-9).mean() # probs[:, 0] has N entries --> mean to 1 value

        if not val:
            self.optimizer.zero_grad()
            loss.backward()
        #     if args.grad_clip:
        #         nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
            self.optimizer.step()

        return loss.detach().cpu()

    def train(self):
        '''
        Trains model for num_epochs provided in trainargs
        Currently supports feed-forward networks: 
            FF_diff: takes as input a difference FP of fp_size & fp_radius
            FF_sep: takes as input a concatenation of [reactants FP, product FP] 

        After each epoch the validation loss is computed; if trainargs['early_stop']
        is set, training stops once the improvement over the best validation loss
        has been below trainargs['min_delta'] for more than trainargs['patience']
        consecutive epochs. Training statistics are written to
        <checkpoint_path><model>_<expt_name>_stats.pkl at the end.
        '''
        start = time.time()
        wait = 0  # consecutive epochs without a sufficient decrease in val loss

        for epoch in np.arange(self.trainargs['epochs']):
            self.model.train() # set model to training mode
            train_loss = []
            for batch in tqdm(self.train_loader): 
                batch = batch.to(self.device)
                train_loss.append(self.train_one(batch, val=False))
            # recorded once per epoch, consistent with mean_val_loss
            # (was appended after every batch as a running mean)
            self.mean_train_loss.append(np.mean(train_loss))

            self.model.eval() # validation mode
            val_loss = []
            with torch.no_grad():
                for batch in tqdm(self.val_loader):
                    batch = batch.to(self.device)
                    val_loss.append(self.train_one(batch, val=True))

            if self.trainargs['early_stop'] and self.min_val_loss - np.mean(val_loss) < self.trainargs['min_delta']:
                if self.trainargs['patience'] <= wait:
                    print('Early stopped at the end of epoch: ', epoch)
                    print('mean_val_loss: ', np.mean(val_loss))
                    # was a bare `stats`, i.e. a NameError or a write to a notebook global
                    self.stats['early_stop_epoch'] = epoch 
                    break 
                else:
                    wait += 1
                    print('Decrease in val loss < min_delta, patience count: ', wait)
            else:
                wait = 0
                self.min_val_loss = min(self.min_val_loss, np.mean(val_loss))
            self.mean_val_loss.append(np.mean(val_loss))

            if self.trainargs['checkpoint']: # adapted from moco: main_moco.py
                save_checkpoint({
                        'epoch': epoch + 1,
                        'model': self.trainargs['model'],
                        'state_dict': self.model.state_dict(),
                        'optimizer' : self.optimizer.state_dict(),
                        'stats' : self.stats,
                    }, is_best=False, 
                    filename=self.trainargs['checkpoint_path']+'{}_{}_checkpoint_{:04d}.pth.tar'.format(
                        self.trainargs['model'], self.trainargs['expt_name'], epoch))

            print('Epoch: {}, train_loss: {}, val_loss: {}'.format(epoch, 
                                             np.around(np.mean(train_loss), decimals=4), 
                                             np.around(np.mean(val_loss), decimals=4)))

        self.stats['mean_train_loss'] = self.mean_train_loss
        self.stats['mean_val_loss'] = self.mean_val_loss
        self.stats['min_val_loss'] = self.min_val_loss
        self.stats['train_time'] = time.time() - start 
        # save training stats
        torch.save(self.stats, self.trainargs['checkpoint_path']+'{}_{}_stats.pkl'.format(
            self.trainargs['model'], self.trainargs['expt_name']))

    def test(self):
        '''
        Evaluates the model on the test set (self.test_loader), stores the per-batch
        and mean test loss in self.stats and re-saves the stats file
        '''
        test_loss = []
        self.model.eval()
        with torch.no_grad():
            # was a bare `test_loader`, which only resolved if a notebook global existed
            for batch in tqdm(self.test_loader):
                batch = batch.to(self.device)
                test_loss.append(self.train_one(batch, val=True))

        self.stats['test_loss'] = test_loss 
        self.stats['mean_test_loss'] = np.mean(test_loss)
        # train_time only exists if train() ran in this session (not after loading a checkpoint)
        if 'train_time' in self.stats:
            print('train_time: {}'.format(self.stats['train_time']))
        print('test_loss: {}'.format(self.stats['test_loss']))
        print('mean_test_loss: {}'.format(self.stats['mean_test_loss']))
        # overrides training stats w/ training + test stats
        torch.save(self.stats, self.trainargs['checkpoint_path']+'{}_{}_stats.pkl'.format(
            self.trainargs['model'], self.trainargs['expt_name'])) 

    def get_scores(self, dataloader):
        ''' 
        Gets raw energy values (scores) from a trained model on a given dataloader

        Returns one tensor with the per-batch scores concatenated along dim 0
        (N x K after squeezing the trailing dimension)
        '''
        scores = []
        self.model.eval()
        with torch.no_grad():
            for batch in tqdm(dataloader):
                batch = batch.to(self.device)
                self.model.zero_grad()
                # scores: size N x K x 1 --> N x K after squeezing
                scores.append(self.model.forward(batch).squeeze(dim=-1)) 

        return torch.cat(scores, dim=0)

    def get_topk_acc(self, dataloader, k=1):
        '''
        Computes top-k accuracy of trained model in classifying feasible vs infeasible chemical rxns:
        the fraction of samples for which index 0 (the positive rxn) is among the
        k highest energy scores
        '''
        self.model.eval()
        scores = self.get_scores(dataloader)
        predicted_labels = torch.topk(scores, k, dim=1)[1]  # indices of the k largest scores per row
        
        # top-k indices are distinct within a row, so each row contributes at most one hit
        return (predicted_labels == 0).sum().item() / predicted_labels.shape[0]

### data

In [5]:
# https://github.com/pytorch/tutorials/blob/master/beginner_source/data_loading_tutorial.py
import torch
from torch.utils.data import Dataset
import random
import pickle

import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdChemReactions
from rdkit import DataStructs
import numpy as np

def create_rxn_MorganFP(rxn_smi, fp_type='diff', radius=2, 
                        rctfp_size=16384, prodfp_size=16384, 
                        useChirality=True, dtype='int8'):
    '''
    Builds a Morgan-fingerprint representation of a reaction SMILES.

    rxn_smi: reaction SMILES; reactants (separated by '.') are read from before the
             first '>', the product from after the last '>'
    fp_type: 'diff' or 'sep', 
    'diff' (difference):
    Creates reaction MorganFP following Schneider et al in J. Chem. Inf. Model. 2015, 55, 1, 39–53
    reactionFP = productFP - sum(reactantFPs)
    
    'sep' (separate):
    Creates separate reactantsFP and productFP following Gao et al in ACS Cent. Sci. 2018, 4, 11, 1465–1476
    and returns them concatenated as [sum(reactantFPs), productFP]

    radius, useChirality: passed to the Morgan fingerprint
    rctfp_size, prodfp_size: number of bits of the reactant / product fingerprints
    dtype: numpy dtype of the returned array

    Returns a 1-D numpy array, or None (after printing a message) if fp_type is not
    recognised or a fingerprint could not be built.
    '''
    # Accumulators must start at zero: they are updated with += / -= below, and
    # np.empty would leave whatever happened to be in memory in the result.
    if fp_type == 'diff':
        diff_fp = np.zeros(rctfp_size, dtype = dtype)
    elif fp_type == 'sep':
        rcts_fp = np.zeros(rctfp_size, dtype = dtype)
    else:
        print('ERROR: fp_type not recognised!')
        return
    
    # create product FP
    prod_mol = Chem.MolFromSmiles(rxn_smi.split('>')[-1])
    try:
        prod_fp_bit = AllChem.GetMorganFingerprintAsBitVect(
                        mol=prod_mol, radius=radius, nBits=prodfp_size, useChirality=useChirality)

        prod_fp = np.zeros(prodfp_size, dtype = dtype)
        DataStructs.ConvertToNumpyArray(prod_fp_bit, prod_fp)
        if fp_type == 'diff':
            diff_fp += prod_fp
    except Exception as e:
        print("Cannot build product fp due to {}".format(e))
        return
                                  
    # create reactant FPs, subtracting each from product FP ('diff') or summing them ('sep')
    rcts_smi = rxn_smi.split('>')[0].split('.')
    for rct_smi in rcts_smi:
        rct_mol = Chem.MolFromSmiles(rct_smi)
        try:
            rct_fp_bit = AllChem.GetMorganFingerprintAsBitVect(
                            mol=rct_mol, radius=radius, nBits=rctfp_size, useChirality=useChirality)
            fp = np.zeros(rctfp_size, dtype = dtype)
            DataStructs.ConvertToNumpyArray(rct_fp_bit, fp)
            if fp_type == 'diff':
                diff_fp -= fp
            else:
                rcts_fp += fp
        except Exception as e:
            print("Cannot build reactant fp due to {}".format(e))
            return
    
    if fp_type == 'diff':
        return diff_fp
    return np.concatenate([rcts_fp, prod_fp])

    
class ReactionDataset(Dataset):
    '''
    The Dataset class ReactionDataset prepares training samples of length K: 
    [pos_rxn, neg_rxn_1, ..., neg_rxn_K-1], ... where K-1 = num_neg 

    Negative reactions are generated on the fly with Python's `random` module, so
    the same index yields different negatives on every access.

    TO DO: can this be further optimised? Augmentation is the clear bottleneck during training
    '''
    def __init__(self, path_to_pickle, key, trainargs):
        '''
        pickle is dict w/ keys 'train', 'valid', 'test' each storing a list of rxn_smiles (str)
        IMPORTANT: molAtomMapNumbers have been cleared during data pre-processing 

        key: which split to load
        trainargs: dict providing 'fp_radius', 'fp_type', 'rctfp_size', 'prodfp_size', 'num_neg'
        ''' 
        # feels like loading the entire pickle is not feasible when the dataset gets larger 
        # is there a more memory-efficient way to do this? 
        # NOTE: pickle.load can execute arbitrary code; only point this at trusted files.
        with open(path_to_pickle, 'rb') as handle: 
            self.rxn_smiles = pickle.load(handle)[key] 
        self.fp_radius = trainargs['fp_radius']
        self.fp_type = trainargs['fp_type']
        self.rctfp_size = trainargs['rctfp_size']
        self.prodfp_size = trainargs['prodfp_size']
        self.num_neg = trainargs['num_neg']
    
    def random_sample_negative(self, pos_rxn_smi, pos_rxn_idx):
        '''
        Generates 1 negative reaction given a positive reaction SMILES by swapping
        either one reactant or the product with a molecule from another reaction
        Returns neg_rxn_smi (str)

        Raises ValueError if the dataset holds fewer than 2 reactions, since no other
        reaction exists to draw a replacement from (the search would never terminate).
        '''
        num_rxns = len(self.rxn_smiles)
        if num_rxns < 2:
            raise ValueError('need at least 2 reactions to sample a negative reaction')

        rcts_smi = pos_rxn_smi.split('>')[0].split('.')
        prod_smi = pos_rxn_smi.split('>')[-1]       
            
        rct_or_prod = random.choice([0, 1])
        if rct_or_prod == 0: # randomly change one of the reactant(s)
            # randrange draws the index directly instead of building an index array per draw
            orig_idx = random.randrange(len(rcts_smi)) # randomly choose 1 reactant to be replaced
            
            while True: # searches randomly to find a different rct molecule to swap with 
                rdm_rxn_idx = random.randrange(num_rxns) # randomly choose 1 rxn
                if rdm_rxn_idx == pos_rxn_idx: continue # don't choose the original rxn
                        
                new_rcts_smi = self.rxn_smiles[rdm_rxn_idx].split('>')[0].split('.')
                candidate = random.choice(new_rcts_smi)
                if candidate != rcts_smi[orig_idx]:
                    rcts_smi[orig_idx] = candidate
                    break
            
        else: # randomly change the product            
            while True:  # searches randomly to find a different prod molecule to swap with 
                rdm_rxn_idx = random.randrange(num_rxns)
                if rdm_rxn_idx == pos_rxn_idx: continue # don't choose the original rxn
                        
                new_prod_smi = self.rxn_smiles[rdm_rxn_idx].split('>')[-1]
                if new_prod_smi != prod_smi:
                    prod_smi = new_prod_smi
                    break
        
        return '{}>>{}'.format('.'.join(rcts_smi), prod_smi)
    
    def __getitem__(self, idx):
        ''' 
        Returns 1 training sample in the form [pos_rxn, neg_rxn_1, ..., neg_rxn_K-1]
        as a tensor with one fingerprint per row (positive rxn in row 0)
        num_neg: a hyperparameter to be tuned

        Raises ValueError if any of the fingerprints could not be built.
        '''
        if torch.is_tensor(idx): # may not be needed, taken from data loading tutorial
            idx = idx.tolist() 

        assert self.num_neg > 0, 'num_neg must be a positive integer!'

        pos_rxn_smi = self.rxn_smiles[idx]
        neg_rxn_smis = [self.random_sample_negative(pos_rxn_smi, idx) for _ in range(self.num_neg)]

        rxn_fps = [create_rxn_MorganFP(rxn_smi, radius=self.fp_radius, 
                                       rctfp_size=self.rctfp_size, prodfp_size=self.prodfp_size,
                                       fp_type=self.fp_type)
                   for rxn_smi in [pos_rxn_smi, *neg_rxn_smis]]

        # create_rxn_MorganFP returns None when a fingerprint cannot be built
        if any(fp is None for fp in rxn_fps):
            raise ValueError('could not build all fingerprints for sample {}'.format(idx))

        # Stack into one array first: building a tensor from a list of arrays is much slower.
        return torch.as_tensor(np.stack(rxn_fps), dtype=torch.get_default_dtype())

    def __len__(self):
        return len(self.rxn_smiles)

### Train and Test

In [6]:
# Experiment configuration read by FF_ebm, Run and ReactionDataset.
trainargs = dict(
    # --- network ---
    model='FF_diff',  # 'FF_diff' or 'FF_sep'; fp_type below must be changed together with it
    hidden_sizes=[256],
    output_size=1,
    dropout=0.3,  # adapted from Reaction Condition Recommender

    # --- optimisation ---
    batch_size=512,
    activation='ReLU',  # ELU is a candidate alternative: differentiable everywhere, unlike ReLU at x=0
    optimizer=torch.optim.Adam,
    learning_rate=5e-5,  # to try: integrate w/ fast.ai lr_finder & lr_schedulers
    epochs=50,
    early_stop=True,
    min_delta=1e-5,
    patience=3,

    # --- checkpointing / reproducibility ---
    checkpoint=True,
    model_seed=1337,
    random_seed=0,  # affects neg rxn sampling since it is random

    # --- fingerprints ---
    rctfp_size=1024,  # if fp_type == 'diff', rctfp_size & prodfp_size must be identical
    prodfp_size=1024,
    fp_radius=2,
    fp_type='diff',

    num_neg=1,  # to be tuned; earlier note: 9 seemed superior to 5 (overfitting occurred quickly)

    # --- paths / run identity ---
    path_to_pickle=os.getcwd()+'/clean_rxn_50k_nomap_noreagent.pickle',
    checkpoint_path=os.getcwd()+'/checkpoints/',
    expt_name='test',
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)

In [11]:
# Builds the fingerprint-based feed-forward EBM from trainargs and wraps it in a Run.
# Run's constructor loads the train/valid/test splits and creates a new optimizer
# (load_checkpoint defaults to False).
model = FF_ebm(trainargs)
run = Run(model, trainargs)

In [None]:
# Trains for up to trainargs['epochs'] epochs; writes per-epoch checkpoints (if enabled)
# and a stats file under trainargs['checkpoint_path'].
run.train()

In [None]:
# Evaluates on the test split, prints the losses and re-saves the stats file.
run.test()

In [None]:
# Displays the statistics dict collected by the Run (trainargs plus recorded losses).
run.stats

In [None]:
# Plays a sound once the preceding cells have finished, as an audible "done" signal.
# The audio is fetched from an external URL, so this needs network access.
import IPython.display as display
display.Audio(url="https://static.sfdict.com/audio/C07/C0702600.mp3", autoplay=True)

### Load trained models from previous experiments and compute top-k accuracy

In [30]:
# Loads the statistics saved by a previous FF_sep experiment from ./checkpoints,
# mapping any stored tensors to the CPU.
# NOTE(review): recent PyTorch releases may default torch.load to weights_only=True,
# which can reject pickled non-tensor objects such as the optimizer class stored in
# trainargs — confirm against the installed version.
stats_filename = 'FF_sep_2048_1layer_9neg_rad2_ReLU_stats.pkl'
stats = torch.load(os.getcwd()+'/checkpoints/{}'.format(stats_filename),
          map_location=torch.device('cpu'))
stats  

{'trainargs': {'model': 'FF_sep',
  'hidden_sizes': [256],
  'output_size': 1,
  'dropout': 0.5,
  'batch_size': 512,
  'activation': 'ReLU',
  'optimizer': torch.optim.adam.Adam,
  'learning_rate': 1e-05,
  'epochs': 50,
  'early_stop': True,
  'min_delta': 1e-05,
  'patience': 3,
  'checkpoint': True,
  'model_seed': 1337,
  'random_seed': 0,
  'rctfp_size': 2048,
  'prodfp_size': 2048,
  'fp_radius': 2,
  'fp_type': 'sep',
  'num_neg': 9,
  'path_to_pickle': '/content/clean_rxn_50k_nomap_noreagent.pickle',
  'checkpoint_path': '/content/gdrive/My Drive/rxn_ebm/',
  'expt_name': '2048_1layer_9neg_rad2_ReLU'},
 'early_stop_epoch': 15,
 'mean_train_loss': [12.479294,
  11.597199,
  11.315463,
  11.207648,
  11.078769,
  10.98713,
  11.030012,
  11.107498,
  11.272827,
  11.2081175,
  11.163873,
  11.160275,
  11.181705,
  11.137172,
  11.099286,
  11.139931,
  11.090733,
  11.087778,
  11.052986,
  11.010392,
  10.987789,
  10.988465,
  10.971515,
  10.981416,
  10.945549,
  10.928935,

In [31]:
# Loads the checkpoint written at epoch index 11 of the FF_sep experiment; tensors are
# mapped to the CPU so the file can be opened without a GPU.
checkpoint_filename = 'FF_sep_2048_1layer_9neg_rad2_ReLU_checkpoint_0011.pth.tar'
checkpoint = torch.load(os.getcwd()+'/checkpoints/{}'.format(checkpoint_filename),
          map_location=torch.device('cpu'))

In [32]:
# Configuration matching the FF_sep checkpoint loaded above, with local paths.
# NOTE(review): expt_name says "1neg" while num_neg is 9 and the loaded files are
# named "..._9neg_..." — confirm which name is intended before writing new files.
trainargs = dict(
    # --- network ---
    model='FF_sep',
    hidden_sizes=[256],
    output_size=1,
    dropout=0.5,

    # --- optimisation ---
    batch_size=512,
    activation='ReLU',
    optimizer=torch.optim.Adam,
    learning_rate=1e-05,
    epochs=50,
    early_stop=True,
    min_delta=1e-05,
    patience=3,

    # --- checkpointing / reproducibility ---
    checkpoint=True,
    model_seed=1337,
    random_seed=0,

    # --- fingerprints ---
    rctfp_size=2048,
    prodfp_size=2048,
    fp_radius=2,
    fp_type='sep',
    num_neg=9,

    # --- paths / run identity ---
    path_to_pickle=os.getcwd()+'/clean_rxn_50k_nomap_noreagent.pickle',
    checkpoint_path=os.getcwd()+'/checkpoints/',
    expt_name='2048_1layer_1neg_rad2_ReLU',
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)

In [33]:
# Rebuild the network and optimizer, then restore both from the loaded checkpoint.
model = FF_ebm(trainargs)
# Move the model to the target device BEFORE the optimizer is created and its state
# is loaded, so the restored optimizer state ends up on the same device as the
# parameters. Run() moves the model as well, but only after this point, which would
# leave the optimizer state behind on the CPU when training is resumed on a GPU.
model = model.to(trainargs['device'])
optimizer = trainargs['optimizer'](model.parameters(), lr=trainargs['learning_rate'])

model.load_state_dict(checkpoint['state_dict'])
optimizer.load_state_dict(checkpoint['optimizer'])

# load_checkpoint=True makes Run use the restored optimizer instead of a new one
run = Run(model, trainargs, optimizer, load_checkpoint=True)

In [34]:
# Builds a test-split loader (batch size doubled, as no gradients are needed) and
# reports top-1 accuracy: the fraction of samples whose positive rxn scores highest.
# Negatives are re-sampled randomly on every access, so repeated calls can differ.
test_dataset = ReactionDataset(trainargs['path_to_pickle'], 'test', trainargs)
test_loader = DataLoader(test_dataset, 2 * trainargs['batch_size'], shuffle=False)

run.get_topk_acc(test_loader, k=1)

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:25<00:00, 17.12s/it]


0.11543838625923707

In [35]:
# Top-2 accuracy on the same loader (fresh random negatives are drawn for this pass).
run.get_topk_acc(test_loader, k=2)

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:45<00:00, 21.12s/it]


0.19412822049131218

### load best model (87% top-1 acc)

In [54]:
# can just load trainargs from checkpoint['stats']['trainargs'] also, 
# but most likely we want to change epochs or learning rate, so redefining it this way is better 

# Configuration for the FF_diff model (4096-bit, radius-3 difference fingerprints).
# NOTE(review): dropout is 0.3 here, while the stats saved with the checkpoint loaded
# below report 0.5 — irrelevant in eval mode, but confirm before resuming training.
trainargs = dict(
    # --- network ---
    model='FF_diff',  # 'FF_diff' or 'FF_sep'; fp_type below must be changed together with it
    hidden_sizes=[256],
    output_size=1,
    dropout=0.3,  # adapted from Reaction Condition Recommender

    # --- optimisation ---
    batch_size=512,
    activation='ReLU',  # ELU is a candidate alternative: differentiable everywhere, unlike ReLU at x=0
    optimizer=torch.optim.Adam,
    learning_rate=5e-5,  # to try: integrate w/ fast.ai lr_finder & lr_schedulers
    epochs=50,
    early_stop=True,
    min_delta=1e-5,
    patience=3,

    # --- checkpointing / reproducibility ---
    checkpoint=True,
    model_seed=1337,
    random_seed=0,  # affects neg rxn sampling since it is random

    # --- fingerprints ---
    rctfp_size=4096,  # if fp_type == 'diff', rctfp_size & prodfp_size must be identical
    prodfp_size=4096,
    fp_radius=3,
    fp_type='diff',

    num_neg=5,  # to be tuned; earlier note: 9 seemed superior to 5 (overfitting occurred quickly)

    # --- paths / run identity ---
    path_to_pickle=os.getcwd()+'/clean_rxn_50k_nomap_noreagent.pickle',
    checkpoint_path=os.getcwd()+'/checkpoints/',
    expt_name='test',
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)

In [50]:
# Loads the checkpoint written at epoch index 49 of the FF_diff experiment (tensors
# mapped to the CPU) and lists the entries it contains.
checkpoint = torch.load(os.getcwd()+'/checkpoints/FF_diff_DIFF_4096_1layer_5neg_rad3_ReLU_checkpoint_0049.pth.tar',
          map_location=torch.device('cpu'))
checkpoint.keys()

dict_keys(['epoch', 'model', 'state_dict', 'optimizer', 'stats'])

In [94]:
# Loads the training statistics saved by the FF_diff experiment (tensors mapped to the CPU).
# NOTE(review): recent PyTorch releases may default torch.load to weights_only=True,
# which can reject pickled non-tensor objects such as the optimizer class stored in
# trainargs — confirm against the installed version.
stats = torch.load(os.getcwd()+'/checkpoints/FF_diff_DIFF_4096_1layer_5neg_rad3_ReLU_stats.pkl',
          map_location=torch.device('cpu'))
stats  

{'trainargs': {'model': 'FF_diff',
  'hidden_sizes': [256],
  'output_size': 1,
  'dropout': 0.5,
  'batch_size': 512,
  'activation': 'ReLU',
  'optimizer': torch.optim.adam.Adam,
  'learning_rate': 5e-05,
  'epochs': 50,
  'early_stop': True,
  'min_delta': 1e-05,
  'patience': 3,
  'checkpoint': True,
  'model_seed': 1337,
  'random_seed': 0,
  'rctfp_size': 4096,
  'prodfp_size': 4096,
  'fp_radius': 3,
  'fp_type': 'diff',
  'num_neg': 5,
  'path_to_pickle': '/content/clean_rxn_50k_nomap_noreagent.pickle',
  'checkpoint_path': '/content/gdrive/My Drive/rxn_ebm/',
  'expt_name': 'DIFF_4096_1layer_5neg_rad3_ReLU'},
 'mean_train_loss': [5.541507,
  5.738795,
  5.986399,
  6.1413307,
  6.0781894,
  6.124495,
  6.146295,
  6.0141373,
  5.8681197,
  5.8325505,
  5.797415,
  5.7253375,
  5.736352,
  5.6945977,
  5.7136016,
  5.6857224,
  5.7454643,
  5.759552,
  5.749079,
  5.777142,
  5.755688,
  5.7428727,
  5.725483,
  5.7730203,
  5.8116364,
  5.8061557,
  5.791269,
  5.785485,
  5.7

In [91]:
# Rebuild the network and optimizer, then restore both from the loaded checkpoint.
model = FF_ebm(trainargs)
# Move the model to the target device BEFORE the optimizer is created and its state
# is loaded, so the restored optimizer state ends up on the same device as the
# parameters. Run() moves the model as well, but only after this point, which would
# leave the optimizer state behind on the CPU when training is resumed on a GPU.
model = model.to(trainargs['device'])
optimizer = trainargs['optimizer'](model.parameters(), lr=trainargs['learning_rate'])

model.load_state_dict(checkpoint['state_dict'])
optimizer.load_state_dict(checkpoint['optimizer'])

# load_checkpoint=True makes Run use the restored optimizer instead of a new one
run = Run(model, trainargs, optimizer, load_checkpoint=True)

In [60]:
# Builds a test-split loader (batch size doubled, as no gradients are needed) and
# reports top-1 accuracy: the fraction of samples whose positive rxn scores highest.
# Negatives are re-sampled randomly on every access, so repeated calls can differ.
test_dataset = ReactionDataset(trainargs['path_to_pickle'], 'test', trainargs)
test_loader = DataLoader(test_dataset, 2 * trainargs['batch_size'], shuffle=False)

run.get_topk_acc(test_loader, k=1)



  0%|                                                                                            | 0/5 [00:00<?, ?it/s][A[A

 20%|████████████████▊                                                                   | 1/5 [00:13<00:52, 13.23s/it][A[A

 40%|█████████████████████████████████▌                                                  | 2/5 [00:25<00:39, 13.09s/it][A[A

 60%|██████████████████████████████████████████████████▍                                 | 3/5 [00:39<00:26, 13.24s/it][A[A

 80%|███████████████████████████████████████████████████████████████████▏                | 4/5 [00:53<00:13, 13.47s/it][A[A

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:06<00:00, 13.34s/it][A[A


0.8765727980826843

In [92]:
# Top-2 accuracy on the same loader (fresh random negatives are drawn for this pass).
run.get_topk_acc(test_loader, k=2)



  0%|                                                                                            | 0/5 [00:00<?, ?it/s][A[A

 20%|████████████████▊                                                                   | 1/5 [00:09<00:37,  9.44s/it][A[A

 40%|█████████████████████████████████▌                                                  | 2/5 [00:18<00:28,  9.37s/it][A[A

 60%|██████████████████████████████████████████████████▍                                 | 3/5 [00:29<00:19,  9.90s/it][A[A

 80%|███████████████████████████████████████████████████████████████████▏                | 4/5 [00:40<00:10, 10.19s/it][A[A

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:50<00:00, 10.11s/it][A[A


0.9358897543439185

In [93]:
# Top-3 accuracy on the same loader (fresh random negatives are drawn for this pass).
run.get_topk_acc(test_loader, k=3)



  0%|                                                                                            | 0/5 [00:00<?, ?it/s][A[A

 20%|████████████████▊                                                                   | 1/5 [00:09<00:37,  9.31s/it][A[A

 40%|█████████████████████████████████▌                                                  | 2/5 [00:19<00:28,  9.55s/it][A[A

 60%|██████████████████████████████████████████████████▍                                 | 3/5 [00:30<00:19,  9.99s/it][A[A

 80%|███████████████████████████████████████████████████████████████████▏                | 4/5 [00:40<00:10, 10.14s/it][A[A

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:50<00:00, 10.14s/it][A[A


0.9658478130617136

In [85]:
# Check which idx has the lowest prob[:, 0], i.e. the most badly mispredicted sample.
# The structures cannot be retrieved from the fingerprint itself, because a Morgan FP loses structural information,
# but this is definitely an idea to explore in the future.

# torch.argmin(torch.topk(energy_scores, 1, dim=1)[0]) --> tensor(1285) 
# test_dataset.__getitem__(1285)

tensor(1285)

### archive

In [None]:
# Archived call to an earlier, function-style API.
# NOTE(review): no free function `test` is defined in the visible notebook (the current
# equivalent appears to be the method Run.test), so this cell would fail on a fresh kernel.
stats = test(model, stats, trainargs)

100%|██████████| 5/5 [00:10<00:00,  2.15s/it]

train_time: 593.6578595638275
test_loss: [tensor(2.1238), tensor(1.8693), tensor(1.9509), tensor(1.8589), tensor(2.0245)]
mean_test_loss: 1.9654802083969116





In [None]:
# Archived cell: builds a test-split loader and collects raw energy scores.
# NOTE(review): no free function `get_scores` is defined in the visible notebook (the
# current equivalent appears to be the method Run.get_scores), so the last line would
# fail on a fresh kernel.
test_dataset = ReactionDataset(trainargs['path_to_pickle'], 'test', trainargs)
test_loader = DataLoader(test_dataset, 2 * trainargs['batch_size'], shuffle=False)

test_scores = get_scores(model, test_loader)

100%|██████████| 5/5 [00:11<00:00,  2.26s/it]


In [None]:
# Shows the first 20 rows of the first entry of test_scores (archived output below).
test_scores[0][:20]

tensor([[ 31.7568,   0.6232],
        [  0.6059,   1.6602],
        [  0.5389,   0.5825],
        [  0.6551,   1.1973],
        [  0.6551,   0.2540],
        [  0.5131,   1.4135],
        [  0.7937,   0.3980],
        [  1.1794,   0.3391],
        [  0.9399,   1.2554],
        [  1.1911,   0.8047],
        [  0.3444,   1.1524],
        [  1.5007,   0.3585],
        [  1.1836,   1.3253],
        [  1.5233,   1.3346],
        [-13.7237,   1.2437],
        [  1.3159,   0.5960],
        [  1.3634,   0.2379],
        [  1.0724,   0.2915],
        [  0.3685,   1.1581],
        [  1.3275,   1.2685]], device='cuda:0')

In [None]:
# Displays the stats dict from the archived experiment (output below is truncated).
stats

{'early_stop_epoch': 5,
 'mean_test_loss': 1.9654802,
 'mean_train_loss': [3.79339,
  4.147327,
  4.1827908,
  4.0512996,
  4.0508604,
  4.0648336,
  4.0370502,
  4.0900755,
  4.050375,
  4.071214,
  4.116623,
  4.1053796,
  4.14862,
  4.1839542,
  4.147746,
  4.131441,
  4.1477966,
  4.174607,
  4.1787944,
  4.1921945,
  4.1756163,
  4.181274,
  4.191578,
  4.2076545,
  4.203568,
  4.2123814,
  4.213587,
  4.215596,
  4.2241488,
  4.2082777,
  4.2193255,
  4.2122393,
  4.200224,
  4.1940913,
  4.19744,
  4.1791553,
  4.1880274,
  4.198185,
  4.2058096,
  4.218196,
  4.205752,
  4.1958714,
  4.180777,
  4.176884,
  4.179717,
  4.1756716,
  4.186036,
  4.1780953,
  4.180181,
  4.1730504,
  4.171473,
  4.1738334,
  4.1639833,
  4.154662,
  4.146443,
  4.146165,
  4.142267,
  4.142902,
  4.1295094,
  4.1229115,
  4.1232905,
  4.1239963,
  4.118799,
  4.118685,
  4.1207676,
  4.1217036,
  4.121556,
  4.1220613,
  4.113785,
  4.1244617,
  4.1224647,
  4.113756,
  4.1132717,
  4.1098924,
  4

In [None]:
# Archived call to an earlier, function-style API.
# NOTE(review): no free function `test` is defined in the visible notebook (the current
# equivalent appears to be the method Run.test), so this cell would fail on a fresh kernel.
stats = test(model, stats, trainargs)

100%|██████████| 5/5 [00:48<00:00,  9.73s/it]

train_time: 7941.052913427353
test_loss: [tensor(3.6462), tensor(4.0916), tensor(3.9566), tensor(3.9063), tensor(3.8079)]
mean_test_loss: 3.8817272186279297



