### To do:  (latest notebook is here)
- build Ball Tree for cosine similarity
- implement Bayesian optimisation 

Qns: 
- are morganFP deterministically calculated --> yes. just be very careful when using np.empty()

In [None]:
# Install RDKit. Takes 2-3 minutes
# !wget -c https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
# !chmod +x Miniconda3-latest-Linux-x86_64.sh
# !time bash ./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local
# !time conda install -q -y -c conda-forge python=3.7
# !time conda install -q -y -c conda-forge rdkit

In [None]:
# from google.colab import drive
# drive.mount('/content/gdrive', force_remount=True)

In [3]:
# !cp '/content/gdrive/My Drive/rxn_ebm/USPTO_50k_Schneider/clean_rxn_50k_nomap_noreagent.pickle' '/content/'

In [54]:
LOCAL = True 
# change folders as needed
if LOCAL: 
    checkpoint_folder = 'checkpoints/'
    base_path = 'USPTO_50k_data/clean_rxn_50k_sparse_FPs_numrcts'
else: # colab 
    checkpoint_folder = '/content/gdrive/My Drive/rxn_ebm/checkpoints/' 
    base_path = '/content/clean_rxn_50k_sparse_FPs_numrcts'

In [92]:
import sys
# sys.path.append('/usr/local/lib/python3.7/site-packages/') 
# for Colab 
import os

import rdkit
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import AllChem
from rdkit.Chem.Draw import IPythonConsole
IPythonConsole.ipython_useSVG=True
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import rdChemReactions
from rdkit.Chem import rdqueries # faster than iterating atoms https://sourceforge.net/p/rdkit/mailman/message/34538007/ 
from rdkit.Chem.rdchem import Atom
from rdkit import DataStructs
import numpy as np

from itertools import chain
import random

from tqdm import tqdm
getattr(tqdm, '_instances', {}).clear()
import csv
import re 
import pickle
import copy

import torch
import torch.nn as nn
import torch.nn.functional as F

### utils

In [56]:
import shutil
import torch
import torch.nn as nn

def get_activation_function(activation: str) -> nn.Module:
    """
    Gets an activation function module given the name of the activation.
    Supports:
    * :code:`ReLU`
    * :code:`LeakyReLU`
    * :code:`PReLU`
    * :code:`tanh`
    * :code:`SELU`
    * :code:`ELU`
    :param activation: The name of the activation function.
    :return: The activation function module.
    """
    if activation == 'ReLU':
        return nn.ReLU()
    elif activation == 'LeakyReLU':
        return nn.LeakyReLU(0.1)
    elif activation == 'PReLU':
        return nn.PReLU()
    elif activation == 'tanh':
        return nn.Tanh()
    elif activation == 'SELU':
        return nn.SELU()
    elif activation == 'ELU':
        return nn.ELU()
    else:
        raise ValueError(f'Activation "{activation}" not supported.')
    
def initialize_weights(model: nn.Module) -> None:
    """
    Initializes the weights of a model in place.
    :param model: An PyTorch model.
    """
    for param in model.parameters():
        if param.dim() == 1:
            nn.init.constant_(param, 0)
        else:
            nn.init.xavier_normal_(param)
            
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')

### model

In [57]:
import torch
import torch.nn as nn

class FF_ebm(nn.Module):
    '''
    trainargs: dictionary containing hyperparameters to be optimised, 
    hidden_sizes must be a list e.g. [1024, 512, 256]
    
    To do: bayesian optimisation
    '''
    def __init__(self, trainargs):
        super(FF_ebm, self).__init__()
        self.output_size = trainargs['output_size']
        self.num_layers = len(trainargs['hidden_sizes']) + 1

        if trainargs['model'] == 'FF_sep':
          self.input_dim = trainargs['rctfp_size'] + trainargs['prodfp_size'] # will be rctfp_size + prodfp_size for FF_sep
        elif trainargs['model'] == 'FF_diff':
          self.input_dim = trainargs['rctfp_size']
          assert trainargs['rctfp_size'] == trainargs['prodfp_size'], 'rctfp_size != prodfp_size, unable to make difference FPs!!!'

        self.create_ffn(trainargs)
        initialize_weights(self)  # is it necessary to initialize weights?? 
    
    def create_ffn(self, trainargs):
        '''
        Creates feed-forward network using trainargs dict
        '''
        dropout = nn.Dropout(trainargs['dropout'])
        activation = get_activation_function(trainargs['activation'])

        if self.num_layers == 1:
            ffn = [
                dropout,
                nn.Linear(self.input_dim, self.output_size)
            ]
        else:
            ffn = [
                dropout,
                nn.Linear(self.input_dim, trainargs['hidden_sizes'][0])
            ]
            
            # intermediate hidden layers 
            for i, layer in enumerate(range(self.num_layers - 2)):
                ffn.extend([
                    activation,
                    dropout,
                    nn.Linear(trainargs['hidden_sizes'][i], trainargs['hidden_sizes'][i+1]),
                ])
                
            # last hidden layer
            ffn.extend([
                activation,
                dropout,
                nn.Linear(trainargs['hidden_sizes'][-1], self.output_size),
            ])

        self.ffn = nn.Sequential(*ffn)
        
    def forward(self, batch):
        '''
        Runs FF_ebm on input
        
        batch: a N x K x 1 tensor of N training samples, where each sample contains 
        a positive rxn on the first column, and K-1 negative rxn on subsequent columns 
        supplied by DataLoader on custom ReactionDataset 
        '''
        energy_scores = self.ffn(batch) # tensor of size N x K x 1
        return energy_scores.squeeze(dim=-1)  # scores: N x K after squeezing

### train

In [58]:
import torch
from torch.utils.data import DataLoader, Dataset
import numpy as np
from tqdm import tqdm
import torch.nn as nn
import time

class Run():
    '''
    epochs are 1-indexed (i.e. start from 1, 2, 3 ... not 0, 1, 2 ...)
    if load_checkpoint == True, load_optimizer, load_stats & begin_epoch must be provided 
    '''
    def __init__(self, model, trainargs,
                 load_optimizer=None, load_checkpoint=False, load_stats=None, begin_epoch=None):
        self.device = trainargs['device']
        model = model.to(self.device)
        self.model = model
        self.trainargs = trainargs 
        self.best_epoch = None # will be automatically assigned after 1 epoch
        
        if load_checkpoint: 
            assert load_optimizer is not None, 'load_checkpoint requires load_optimizer!'
            self.optimizer = load_optimizer # load optimizer w/ state dict from checkpoint
            
            assert load_stats is not None, 'load_checkpoint requires load_stats!'
            self.stats = load_stats
            self.mean_train_loss = self.stats['mean_train_loss']
            self.min_val_loss = self.stats['min_val_loss']
            self.mean_val_loss = self.stats['mean_val_loss']
            
            assert begin_epoch is not None, 'load_checkpoint requires begin_epoch!'
            self.begin_epoch = begin_epoch

        else: # init fresh optimizer 
            self.optimizer = trainargs['optimizer'](model.parameters(), lr=trainargs['learning_rate'])
            
            self.mean_train_loss = []
            self.min_val_loss = 1e9
            self.mean_val_loss = []
            self.begin_epoch = 1
            self.stats = {'trainargs': self.trainargs, 'train_time': 0} # to store training statistics  

        train_dataset = ReactionDataset(trainargs['base_path'], 'train', trainargs)
        self.train_loader = DataLoader(train_dataset, trainargs['batch_size'], shuffle=True)
        
        val_dataset = ReactionDataset(trainargs['base_path'], 'valid', trainargs)
        self.val_loader = DataLoader(val_dataset, 2 * trainargs['batch_size'], shuffle=False)
        
        test_dataset = ReactionDataset(self.trainargs['base_path'], 'test', self.trainargs)
        self.test_loader = DataLoader(test_dataset, 2 * self.trainargs['batch_size'], shuffle=False)
        del train_dataset, val_dataset, test_dataset # save memory

        torch.manual_seed(trainargs['model_seed'])
        random.seed(trainargs['random_seed'])
    
    def train_one(self, batch, val=False):
        '''
        Trains model for 1 epoch
        TO DO: learning rate scheduler + logger 
        '''
        self.model.zero_grad()
        scores = self.model(batch) # size N x K 

        softmax = nn.Softmax(dim=1) 
        probs = torch.clamp(softmax(scores), min=1e-12) # size N x K, clamped to >= 1e-12 for safe log 

        # positives are the 0-th index of each sample 
        loss = -torch.log(probs[:, 0]).mean() # probs[:, 0] is size N x 1 --> sum/mean to 1 value

        if not val:
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        return loss.item()

    def train(self):
        '''
        Trains model for epochs provided in trainargs
        Currently supports feed-forward networks: 
            FF_diff: takes as input a difference FP of fp_size & fp_radius
            FF_sep: takes as input a concatenation of [reactants FP, product FP] 
        '''
        start = time.time()

        for epoch in np.arange(self.begin_epoch, self.trainargs['epochs']): # epochs are 1-indexed (as of 27th Aug 2 am)
            self.model.train() # set model to training mode
            train_loss = []
            for batch in tqdm(self.train_loader): 
                batch = batch.to(self.device)
                train_loss.append(self.train_one(batch, val=False))
                self.mean_train_loss.append(np.mean(train_loss)) 

            self.model.eval() # validation mode
            val_loss = []
            with torch.no_grad():
                for batch in tqdm(self.val_loader):
                    batch = batch.to(self.device)
                    val_loss.append(self.train_one(batch, val=True))
                
                self.mean_val_loss.append(np.mean(val_loss))
                if self.trainargs['early_stop'] and \
                self.min_val_loss - self.mean_val_loss[-1] < self.trainargs['min_delta']:
                    if self.trainargs['patience'] <= wait:
                        print('Early stopped at the end of epoch: ', epoch)
                        print('mean_val_loss: ', np.mean(val_loss))
                        self.stats['early_stop_epoch'] = epoch 
                        break 
                    else:
                        wait += 1
                        print('Decrease in val loss < min_delta, patience count: ', wait)
                else:
                    wait = 0
                    self.min_val_loss = min(self.min_val_loss, self.mean_val_loss[-1])
                
                if self.mean_val_loss[-1] < self.min_val_loss:
                    self.best_epoch = epoch # track best_epoch to load best_checkpoint 

            if self.trainargs['checkpoint']: # adapted from moco: main_moco.py
                save_checkpoint({
                        'epoch': epoch, # epochs are 1-indexed
                        'model': self.trainargs['model'],
                        'state_dict': self.model.state_dict(),
                        'optimizer' : self.optimizer.state_dict(),
                        'stats' : self.stats,
                    }, is_best=False, 
                    filename=self.trainargs['checkpoint_path']+'{}_{}_checkpoint_{:04d}.pth.tar'.format(
                        self.trainargs['model'], self.trainargs['expt_name'], epoch))

            print('Epoch: {}, train_loss: {}, val_loss: {}'.format(epoch, 
                                             np.around(np.mean(train_loss), decimals=4), 
                                             np.around(np.mean(val_loss), decimals=4)))

        self.stats['mean_train_loss'] = self.mean_train_loss
        self.stats['mean_val_loss'] = self.mean_val_loss
        self.stats['min_val_loss'] = self.min_val_loss
        self.stats['best_epoch'] = self.best_epoch
        self.stats['train_time'] += (time.time() - start) / 60
        torch.save(self.stats, self.trainargs['checkpoint_path']+'{}_{}_stats.pkl'.format(
            self.trainargs['model'], self.trainargs['expt_name']))         # save training stats

    def test(self, load_stats=None):
        '''
        Evaluates the model on the test set
        '''
        test_loss = []
        
        self.model.eval()
        with torch.no_grad():
            for batch in tqdm(self.test_loader):
                batch = batch.to(self.device)
                test_loss.append(self.train_one(batch, val=True))
        
        if load_stats is not None: 
            self.stats = load_stats 
        assert len(self.stats.keys()) > 1, 'If loading checkpoint, you need to provide load_stats!'
        
        self.stats['test_loss'] = test_loss 
        self.stats['mean_test_loss'] = np.mean(test_loss)
        print('train_time: {}'.format(self.stats['train_time']))
        print('test_loss: {}'.format(self.stats['test_loss']))
        print('mean_test_loss: {}'.format(self.stats['mean_test_loss']))

        # overrides training stats w/ training + test stats
        torch.save(self.stats, self.trainargs['checkpoint_path']+'{}_{}_stats.pkl'.format(
            self.trainargs['model'], self.trainargs['expt_name']))

    def get_scores(self, dataloader, save_neg=False):
        ''' 
        Gets raw energy values (scores) from a trained model on a given dataloader,
        with the option to save pos_neg_smis to analyse model performance
        '''
        scores = []
        self.model.eval()
        with torch.no_grad():
            if save_neg:      # save neg rxn smis to analyse model performance           
                pos_neg_smis = []
                for pos_neg_smi, batch in tqdm(dataloader):
                    batch = batch.to(self.device)
                    scores.append(self.model(batch)) # scores: size N x K 
                    pos_neg_smis.append(pos_neg_smi)
                torch.save(pos_neg_smis, self.trainargs['checkpoint_path']+'{}_{}_posnegsmi.pkl'.format(
                        self.trainargs['model'], self.trainargs['expt_name']))
                
                return torch.cat(scores, dim=0), pos_neg_smis
            else:
                for batch in tqdm(dataloader):
                    batch = batch.to(self.device)
                    scores.append(self.model(batch))
                
                return torch.cat(scores, dim=0)

    def get_topk_acc(self, dataloader, k=1, repeats=1):
        '''
        Computes top-k accuracy of trained model in classifying feasible vs infeasible chemical rxns
        (i.e. maximum score assigned to label 0 of each training sample) 
        Returns: (list of accs, mean acc, variance of acc)
        '''
        accs = np.array([])
        for repeat in range(repeats):
            scores = self.get_scores(dataloader)
            predicted_labels = torch.topk(scores, k, dim=1)[1]
            accs = np.append(accs, torch.where(predicted_labels == 0)[0].shape[0] / predicted_labels.shape[0])

        return accs, accs.mean(), accs.var()

### data

In [59]:
# https://github.com/pytorch/tutorials/blob/master/beginner_source/data_loading_tutorial.py
import torch
from torch.utils.data import Dataset
import random
import pickle
from scipy import sparse 

import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdChemReactions
from rdkit import DataStructs
import numpy as np

def create_rxn_MorganFP_fromFP(raw_fp, num_rcts, fp_type='diff', 
                               rctfp_size=4096, prodfp_size=4096, dtype='int8'):
    '''
    fp_type: 'diff' or 'sep', 
    'diff' (difference):
    Creates reaction MorganFP following Schneider et al in J. Chem. Inf. Model. 2015, 55, 1, 39–53
    reactionFP = productFP - sum(reactantFPs)
    
    'sep' (separate):
    Creates separate reactantsFP and productFP following Gao et al in ACS Cent. Sci. 2018, 4, 11, 1465–1476
    '''
    # initialise empty fp numpy arrays
    if fp_type == 'diff':
        diff_fp = np.zeros(rctfp_size, dtype = dtype)
    elif fp_type == 'sep':
        rcts_fp = np.zeros(rctfp_size, dtype = dtype)
        prod_fp = np.zeros(prodfp_size, dtype = dtype)
    else:
        print('ERROR: fp_type not recognised!')
        return
    
    # create product FP
    try:
        fp = raw_fp[-1, :]
        if fp_type == 'diff':
            diff_fp += fp
        elif fp_type == 'sep':
            prod_fp = fp
    except Exception as e:
        print("Cannot build product fp due to {}".format(e))
        return
                                  
    # create reactant FPs, subtracting each from product FP
    for i in range(num_rcts):
        try:
            fp = raw_fp[i, :]
            if fp_type == 'diff':
                diff_fp -= fp
            elif fp_type == 'sep':
                rcts_fp += fp
        except Exception as e:
            print("Cannot build reactant fp due to {}".format(e))
            return
    
    if fp_type == 'diff':
        return diff_fp
    elif fp_type == 'sep':
        return np.concatenate([rcts_fp, prod_fp])
    
    
class ReactionDataset(Dataset):
    '''
    The Dataset class ReactionDataset prepares training samples of length K: 
    [pos_rxn, neg_rxn_1, ..., neg_rxn_K-1], ... where K-1 = num_neg 

    TO DO: can this be further optimised? Augmentation is the clear bottleneck during training
    '''
    def __init__(self, base_path, key, trainargs, save_neg=False):
        '''
        base_path is of the form: 'USPTO_50k_data/clean_rxn_50k_sparse_FPs', and according to key parameter,
        the correct full path will be used e.g. 'USPTO_50k_data/clean_rxn_50k_sparse_FPs_train.npz'
        ''' 
        self.fp_raw_num_rcts = sparse.load_npz(base_path + '_' + key + '.npz')  
        self.fp_type = trainargs['fp_type']
        
        self.fp_radius = trainargs['fp_radius'] # not needed if loading pre-computed fingerprints
        self.rctfp_size = trainargs['rctfp_size']
        self.prodfp_size = trainargs['prodfp_size']
        assert trainargs['rctfp_size'] == trainargs['prodfp_size']
        
        self.num_neg = trainargs['num_neg']
        self.save_neg = save_neg

    def random_sample_negative(self, raw_fp, raw_fp_idx, num_rcts):
        '''
        Randomly generates 1 negative rxn given a positive rxn fingerprint
        Returns neg_rxn_fp (fingerprint)
        ''' 
        rdm_rxn_idx = random.choice(np.arange(self.fp_raw_num_rcts.shape[0])) 
        new_fp_raw_num_rcts = self.fp_raw_num_rcts[rdm_rxn_idx].toarray()
        new_raw_fp, _ = np.split(new_fp_raw_num_rcts, [new_fp_raw_num_rcts.shape[-1]-1], axis=1)
        new_raw_fp = new_raw_fp.reshape(-1, self.rctfp_size) #.astype('int8')
        
        rct_or_prod = random.choice([0, 1])
        if rct_or_prod == 0: # randomly change one of the reactant(s)
            orig_idx = random.choice(np.arange(num_rcts)) # randomly choose 1 reactant to be replaced
            raw_fp[orig_idx, :] = new_raw_fp[orig_idx, :] 
        else:  # randomly change product 
            raw_fp[-1, :] = new_raw_fp[-1, :] 
        return raw_fp 

    def __getitem__(self, idx):
        ''' 
        Returns 1 training sample in the form [pos_rxn_fp, neg_rxn_1_fp, ..., neg_rxn_K-1_fp]
        num_neg: a hyperparameter to be tuned
        '''
        if torch.is_tensor(idx): # may not be needed 
            idx = idx.tolist() 

        fp_raw_num_rcts = self.fp_raw_num_rcts[idx].toarray() 
        pos_raw_fp, num_rcts = np.split(fp_raw_num_rcts, [fp_raw_num_rcts.shape[-1]-1], axis=1)
        pos_raw_fp = pos_raw_fp.reshape(-1, self.rctfp_size) #.astype('int8')
        num_rcts = num_rcts[0][0]
        pos_rxn_fp = create_rxn_MorganFP_fromFP(pos_raw_fp, num_rcts, fp_type=self.fp_type, 
                                                rctfp_size=self.rctfp_size, prodfp_size=self.prodfp_size)

        assert self.num_neg > 0, 'num_neg cannot be negative!'
        neg_raw_fps = [self.random_sample_negative(pos_raw_fp.copy(), idx, num_rcts) for i in range(self.num_neg)]
        neg_rxn_fps = [create_rxn_MorganFP_fromFP(neg_raw_fp, num_rcts, fp_type=self.fp_type, 
                                                  rctfp_size=self.rctfp_size, prodfp_size=self.prodfp_size)
                        for neg_raw_fp in neg_raw_fps]
        return torch.Tensor([pos_rxn_fp, *neg_rxn_fps])

    def __len__(self):
        return self.fp_raw_num_rcts.shape[0]

### Train and Test

In [101]:
trainargs = {
    'model': 'FF_diff', # must change both model & fp_type 
    'hidden_sizes': [256],  
    'output_size': 1,
    'dropout': 0.1,  
    
    'batch_size': 512,
    'activation': 'ReLU',  
    'optimizer': torch.optim.Adam,
    'learning_rate': 1e-2, # to try: lr_finder & lr_schedulers 
    'epochs': 20,
    'early_stop': True,
    'min_delta': 1e-4, 
    'patience': 1,

    'checkpoint': True,
    'model_seed': 1337,
    'random_seed': 0, # affects neg rxn sampling since it is random
    
    'rctfp_size': 4096, # if fp_type == 'diff', ensure that both rctfp_size & prodfp_size are identical!
    'prodfp_size': 4096,
    'fp_radius': 3,
    'fp_type': 'diff',
    
    'num_neg': 3, 
    
    'base_path': base_path, # refer to top of notebook 
    'checkpoint_path': checkpoint_folder,
    'expt_name': 'test3',
    'device': torch.device("cuda" if torch.cuda.is_available() else "cpu")
}

In [93]:
%%time 
# for num_neg = 5
# old: approx 3 min 
# new (less restrictive random replacements): approx 3 min too, no significant speed up
train_dataset = ReactionDataset(trainargs['base_path'], 'train', trainargs, save_neg=False)
train_loader = DataLoader(train_dataset, trainargs['batch_size'], shuffle=True)

for i in range(2):
    for batch in tqdm(train_loader):
        continue

  0%|                                                                                           | 0/79 [01:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 79/79 [02:52<00:00,  2.18s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 79/79 [02:47<00:00,  2.12s/it]

Wall time: 5min 39s





In [103]:
model = FF_ebm(trainargs)
run = Run(model, trainargs)

In [104]:
# training takes ~3 min 45.6 secs on CPU (avg of 50 epochs, total 188 min) but had bug, had to add .copy() --> running now
# ~2 min 7 secs on GPU (colab)

run.train()

100%|██████████████████████████████████████████████████████████████████████████████████| 79/79 [02:36<00:00,  1.98s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:16<00:00,  3.23s/it]
  0%|                                                                                           | 0/79 [00:00<?, ?it/s]

Epoch: 1, train_loss: 0.11599999666213989, val_loss: 0.017899999395012856


100%|██████████████████████████████████████████████████████████████████████████████████| 79/79 [02:35<00:00,  1.96s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:15<00:00,  3.01s/it]
  0%|                                                                                           | 0/79 [00:00<?, ?it/s]

Decrease in val loss < min_delta, patience count:  1
Epoch: 2, train_loss: 0.021800000220537186, val_loss: 0.019899999722838402


100%|██████████████████████████████████████████████████████████████████████████████████| 79/79 [02:30<00:00,  1.90s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:14<00:00,  2.91s/it]
  0%|                                                                                           | 0/79 [00:00<?, ?it/s]

Epoch: 3, train_loss: 0.01730000041425228, val_loss: 0.006200000178068876


100%|██████████████████████████████████████████████████████████████████████████████████| 79/79 [02:38<00:00,  2.01s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:15<00:00,  3.01s/it]
  0%|                                                                                           | 0/79 [00:00<?, ?it/s]

Decrease in val loss < min_delta, patience count:  1
Epoch: 4, train_loss: 0.014000000432133675, val_loss: 0.009399999864399433


100%|██████████████████████████████████████████████████████████████████████████████████| 79/79 [02:28<00:00,  1.88s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:14<00:00,  2.89s/it]

Early stopped at the end of epoch:  5
mean_val_loss:  0.0074282503





In [105]:
run.test()

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:14<00:00,  2.84s/it]

train_time: 14.08382214307785
test_loss: [tensor(0.0141), tensor(0.0090), tensor(0.0081), tensor(0.0034), tensor(0.0073)]
mean_test_loss: 0.008360333740711212





In [None]:
run.stats

In [107]:
test_dataset = ReactionDataset(trainargs['base_path'], 'test', trainargs)
test_loader = DataLoader(test_dataset, 2 * trainargs['batch_size'], shuffle=False)

run.get_topk_acc(test_loader, 1, repeats=3)

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:14<00:00,  2.96s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:14<00:00,  2.95s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:14<00:00,  2.84s/it]


(array([0.99580587, 0.99740363, 0.99760336]),
 0.9969376206644031,
 6.470758083569257e-07)

In [108]:
scores = run.get_scores(test_loader)

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:14<00:00,  2.97s/it]


In [109]:
scores[0:20]

tensor([[ -9.9653, -44.2560, -25.9069, -20.5546],
        [ -5.7980, -40.8563, -39.7291, -30.0718],
        [ -8.8300, -47.4825, -34.1496, -46.5821],
        [ -4.5160, -28.2090, -38.2848, -21.6615],
        [ -4.6532, -27.9080, -22.7084, -18.6033],
        [ -9.4715, -24.4788, -19.3430, -38.8233],
        [ -6.1885, -42.0814, -18.7048, -13.1000],
        [ -4.7134, -35.5632, -28.3772, -46.0047],
        [ -5.1582, -39.6972, -23.9009, -39.5617],
        [ -9.3213, -33.2027, -30.8462, -46.5106],
        [ -8.4624, -56.0819, -42.1808, -58.8606],
        [-10.4581, -17.2481, -12.1642, -31.1431],
        [ -9.3195, -35.4154, -57.5583, -44.4843],
        [ -5.3882, -20.2598, -14.7274, -12.4818],
        [ -7.4859, -28.6547, -35.7549, -37.5613],
        [ -6.9860, -33.6282, -18.8835, -28.9305],
        [ -6.6793, -49.7304, -22.6668, -49.4053],
        [-10.6576, -46.4669, -27.0369, -43.0573],
        [ -5.2685, -14.6970, -15.1971, -21.8713],
        [ -6.4010, -13.1828, -28.0627, -22.0230]])

In [101]:
import IPython.display as display
display.Audio(url="https://static.sfdict.com/audio/C07/C0702600.mp3", autoplay=True)

### load checkpoint and resume training

In [94]:
LOCAL = True 

In [None]:
# change folders as needed
if LOCAL: 
    checkpoint_folder = 'checkpoints/'
    base_path = 'USPTO_50k_data/clean_rxn_50k_sparse_FPs_numrcts'
else: # colab 
    checkpoint_folder = '/content/gdrive/My Drive/rxn_ebm/checkpoints/' 
    base_path = '/content/clean_rxn_50k_sparse_FPs_numrcts'

filenames = [filename for filename in os.listdir(checkpoint_folder) 
             if 'test' in filename] # narrow down list 
filenames

In [98]:
opt = 'Adam' # needed to fix bug in name of optimizer when saving checkpoint
stats_filename = 'FF_diff_test2_stats.pkl' # copy & paste from list above 
# filenames must end w/ stats.pkl

curr_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
stats = torch.load(checkpoint_folder + stats_filename, 
          map_location=torch.device('cpu'))
stats['trainargs']['base_path'] = base_path
stats['trainargs']['checkpoint_path'] = checkpoint_folder

if opt == 'Adam':
    stats['trainargs']['optimizer'] = torch.optim.Adam # fix bug in name of optimizer when saving checkpoint

stats['best_epoch'] = stats['mean_val_loss'].index(stats['min_val_loss']) + 1  # 1-index 
stats['trainargs']['device'] = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
try: 
    checkpoint_filename = stats_filename[:-9]+'checkpoint_{}.pth.tar'.format(str(stats['best_epoch']).zfill(4)) 
    checkpoint = torch.load(checkpoint_folder + checkpoint_filename,
          map_location=torch.device(curr_device))

    model = FF_ebm(stats['trainargs'])
    optimizer = stats['trainargs']['optimizer'](model.parameters(), lr=stats['trainargs']['learning_rate'])

    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])

    if torch.cuda.is_available(): # move optimizer tensors to gpu  https://github.com/pytorch/pytorch/issues/2830
      for state in optimizer.state.values():
        for k, v in state.items():
            if torch.is_tensor(v):
                state[k] = v.cuda()
except Exception as e:
    print(e)
    print('best_epoch: {}'.format(stats['best_epoch']))

In [99]:
trainargs = {'activation': 'ReLU',
 'batch_size': 512,
 'checkpoint': True,
 'checkpoint_path': checkpoint_folder,
 'device': curr_device,
 'dropout': 0.5,
 'early_stop': True,
 'epochs': 50,
 'expt_name': 'test2', # change this if needed
 'fp_radius': 3,
 'fp_type': 'diff',
 'hidden_sizes': [256],
 'learning_rate': 5e-05,
 'min_delta': 1e-05,
 'model': 'FF_diff',
 'model_seed': 1337,
 'num_neg': 5,
 'optimizer': torch.optim.Adam,
 'output_size': 1,
 'base_path': base_path, 
 'patience': 5,
 'prodfp_size': 4096,
 'random_seed': 0,
 'rctfp_size': 4096}   

In [100]:
run = Run(model, trainargs, optimizer, load_checkpoint=True, load_stats=stats, begin_epoch=50)
run.train()