In [1]:
from __future__ import print_function

import os
import re
import cPickle, gzip
import progressbar
from rdkit.Chem import AllChem

In [4]:
# Read the gzip-compressed pickle holding every reaction string.
# NOTE(review): unpickling can execute arbitrary code -- only load this
# file if it comes from a trusted source.
with gzip.open('gen_rxn/data/all_rxns.pkl.gz', 'rb') as rxn_file:
    rxn_all = cPickle.load(rxn_file)

# Number of reactions; reused later as the progress-bar maximum.
data_length = len(rxn_all)
print(data_length)

865118


In [6]:
import parser.Smipar as Smipar

# Special tokens placed at the front of every vocabulary list, and the
# integer ids that correspond to their positions in _START_VOCAB.
_PAD = "_PAD"
_GO = "_GO"
_EOS = "_EOS"
_START_VOCAB = [_PAD, _GO, _EOS]

PAD_ID = 0
GO_ID = 1
EOS_ID = 2

# token -> number of occurrences over the whole data set
vocab_reactants = {}
vocab_products = {}


def _tokenize_molecules(molecules):
    """Return the tokens of all ``molecules`` joined by '.' separator tokens.

    Each molecule string is tokenized with ``Smipar.parser_list`` (assumed to
    return an iterable of token strings -- TODO confirm against the parser
    module) and a '.' token is inserted between consecutive molecules.
    """
    tokens = []
    for molecule in molecules:
        tokens.extend(Smipar.parser_list(molecule))
        tokens.append('.')
    # Drop the trailing '.' added after the last molecule, if any.
    if tokens:
        tokens.pop()
    return tokens


def _count_tokens(tokens, counts):
    """Increment ``counts[token]`` once for every token in ``tokens``."""
    for token in tokens:
        counts[token] = counts.get(token, 0) + 1


bar = progressbar.ProgressBar(max_value=data_length)

for i, rsmi in enumerate(rxn_all):

    # Reaction string layout: reactants>agents>products, molecules within a
    # section separated by '.'.
    split_rsmi = rsmi.split('>')
    reactants = split_rsmi[0].split('.')
    agents = split_rsmi[1].split('.')
    products = split_rsmi[2].split('.')

    reactant_list = _tokenize_molecules(reactants)
    agent_list = _tokenize_molecules(agents)
    product_list = _tokenize_molecules(products)

    # The source-side sequence is the reactants, a '>' token, then the agents.
    reactant_list.append('>')
    reactant_list.extend(agent_list)

    _count_tokens(reactant_list, vocab_reactants)
    _count_tokens(product_list, vocab_products)

    # i is 0-based: report the number of reactions completed so the bar can
    # actually reach max_value.
    bar.update(i + 1)

# Render the final 100% state and terminate the progress line.
bar.finish()

 99% (865116 of 865118) |################## | Elapsed Time: 1:35:56 ETA: 0:00:00

In [7]:
# Build the final vocabularies: the special tokens first, followed by the
# observed tokens ordered from most to least frequent.
reactants_token_list = list(_START_VOCAB)
reactants_token_list.extend(
    sorted(vocab_reactants, key=vocab_reactants.get, reverse=True))

products_token_list = list(_START_VOCAB)
products_token_list.extend(
    sorted(vocab_products, key=vocab_products.get, reverse=True))

In [8]:
def _show_token_list(token_list, limit=100):
    """Print the size of ``token_list`` followed by its first ``limit`` items."""
    print(len(token_list))
    print(token_list[:limit])


# Vocabulary size and leading entries: reactant side, then product side.
_show_token_list(reactants_token_list)

print('--------')

_show_token_list(products_token_list)

57
['_PAD', '_GO', '_EOS', u'C', u'1', u'c', u')', u'(', u'O', u'2', '.', u'=', '>', u'Cl', u'3', u'Br', u'[Na+]', u'N', u'[O-]', u'[P+]', u'I', u'[H]', u'[H-]', u'S', u'[OH-]', u'B', u'[Br-]', u'#', u'[K+]', u'[Na]', u'[K]', u'[BH4-]', u'[I-]', u'[Cl-]', u'[Hg+2]', u'[O+]', u'4', u'[Mg+]', u'[Li]', u'[Cr]', u'P', u'[Pd]', u'[Mn]', u'[Zn]', u'n', u'[Pt]', u'[Ni]', u'[Al+3]', u'[Li+]', u'[Os]', u'[Hg]', u'[C-]', u'[Ba+2]', u'5', u'[N-]', u'[N+]', u'6']
--------
25
['_PAD', '_GO', '_EOS', u'C', u'1', u')', u'(', u'2', u'O', u'=', u'3', u'c', u'Br', u'N', u'Cl', u'4', u'I', '.', u'5', u'B', u'S', u'#', u'6', u'7', u'8']


In [9]:
# Compare with the vocabulary built from another database.
# The pickle is expected to hold exactly two items: the reactant-side token
# list followed by the product-side token list.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
with gzip.open('data/vocab/vocab_list.pkl.gz', 'rb') as vocab_file:
    big_vocab_lists = cPickle.load(vocab_file)

big_reactants_token_list, big_products_token_list = big_vocab_lists

In [10]:
# List every token of this vocabulary that the reference vocabulary lacks,
# reactant side first, then product side. Nothing printed around the
# separator means the reference vocabulary covers this one completely.
missing_reactant_tokens = [
    tok for tok in reactants_token_list
    if tok not in big_reactants_token_list
]
for token in missing_reactant_tokens:
    print(token)
print('--------')
missing_product_tokens = [
    tok for tok in products_token_list
    if tok not in big_products_token_list
]
for token in missing_product_tokens:
    print(token)

--------
