In [None]:
# script to collect all of the kinetic calculations for this node in Disproportionation:
# Root_Ext-2R!H-R_2R!H->C_4R->C 
# and put them together into a database


In [1]:
import os
import re
import copy
import glob
import itertools

from rmgpy.exceptions import ActionError
import rmgpy.reaction
import rmgpy.chemkin
import rmgpy.data.kinetics

import numpy as np
import matplotlib.pyplot as plt
from collections import OrderedDict

In [2]:
# helper for relabeling/incrementing species labels (used below when adding species to the dictionary)
def increment_label(old_label):
    """Return ``old_label`` with its trailing ``-N`` counter bumped by one.

    Parameters
    ----------
    old_label : str or object with a ``label`` attribute
        The label itself, or an object whose ``label`` attribute holds it.

    Returns
    -------
    str
        ``'<label>-2'`` when the label has no hyphen or when the text after
        the last hyphen is not an integer (e.g. ``'C3H5-A'``); otherwise the
        label with that integer incremented (``'C6H5-2'`` -> ``'C6H5-3'``).
    """
    if not isinstance(old_label, str):
        old_label = old_label.label

    base, hyphen, suffix = old_label.rpartition('-')
    if not hyphen:
        # no hyphen at all: start the counter at 2
        return f'{old_label}-2'

    try:
        counter = int(suffix)
    except ValueError:
        # The hyphen belongs to the name itself (e.g. 'C3H5-A'), not to a
        # counter, so append a fresh counter instead of crashing.
        return f'{old_label}-2'

    return f'{base}-{counter + 1}'

# # tests should print 'C6H5-3' and HO2-2
# print(increment_label(species_dict['C6H5-2']))
# print(increment_label(reaction_list[213].reactants[0].label))

In [3]:
def valid_labels(rxn):
    """Check that a reaction carries exactly the labels *1..*4 on each side.

    The labeled atoms of the first molecule of the first two reactants are
    pooled and compared against ``['*1', '*2', '*3', '*4']``; the same is
    then done for the first two products.

    Returns ``True`` only when both sides match, ``False`` otherwise.
    """
    expected = ['*1', '*2', '*3', '*4']

    def pooled_labels(species_pair):
        # labels found on the first molecule of each of the two species, sorted
        first = species_pair[0].molecule[0].get_all_labeled_atoms()
        second = species_pair[1].molecule[0].get_all_labeled_atoms()
        return sorted(list(first.keys()) + list(second.keys()))

    if pooled_labels(rxn.reactants) != expected:
        return False
    return pooled_labels(rxn.products) == expected

In [4]:
def relabel(input_r):
    """Return a deep copy of ``input_r`` with family atom labels on its molecules.

    Every combination of reactant/product resonance structures (built from
    each species' SMILES) is offered to the ``family`` template in ``ref_db``
    until one can be labeled. The labeled molecules then replace
    ``molecule[0]`` of the first two reactants and first two products of the
    copy.

    Parameters
    ----------
    input_r : reaction whose ``reactants``/``products`` expose ``smiles`` and ``molecule``
        Left untouched; the work is done on a deep copy.

    Returns
    -------
    The relabeled copy, or ``False`` when no combination could be labeled.
    A copy whose labels fail ``valid_labels`` is still returned, after
    printing a 'Bad labeling' message.

    Notes
    -----
    Relies on the notebook globals ``ref_db`` and ``family``.
    """
    input_reaction = copy.deepcopy(input_r)

    # copied from AutoTST.autotst.reaction.py
    def get_rmg_mol(smile):
        smiles_conversions = {
            "[CH]": "[CH...]",
            "CARBONMONOXIDE": "[C-]#[O+]",
        }
        if smile.upper() in smiles_conversions:
            smile = smiles_conversions[smile.upper()]
        return rmgpy.molecule.Molecule(smiles=smile).generate_resonance_structures()

    reactant_structures = [get_rmg_mol(sp.smiles) for sp in input_reaction.reactants]
    product_structures = [get_rmg_mol(sp.smiles) for sp in input_reaction.products]

    # every pairing of one resonance structure per species, on both sides
    combos_to_try = list(itertools.product(
        list(itertools.product(*reactant_structures)),
        list(itertools.product(*product_structures)),
    ))

    for reactant_combo, product_combo in combos_to_try:
        test_reaction = rmgpy.reaction.Reaction(
            reactants=list(reactant_combo),
            products=list(product_combo),
        )

        try:
            labeled_r, labeled_p = ref_db.kinetics.families[family].get_labeled_reactants_and_products(
                test_reaction.reactants,
                test_reaction.products,
            )
        except ActionError:
            # this resonance combination does not fit the template; try the next
            continue

        # RMG documents a (None, None) return when it cannot find labels;
        # indexing that would raise TypeError, so treat it like a failed combo.
        if not (labeled_r and labeled_p):
            continue

        # Put the labeled molecules back in the species order of the input.
        # The else branches assume the labeled pair simply came back swapped.
        if input_reaction.reactants[0].molecule[0].is_isomorphic(labeled_r[0]):
            input_reaction.reactants[0].molecule[0] = labeled_r[0]
            input_reaction.reactants[1].molecule[0] = labeled_r[1]
        else:
            input_reaction.reactants[0].molecule[0] = labeled_r[1]
            input_reaction.reactants[1].molecule[0] = labeled_r[0]

        if input_reaction.products[0].molecule[0].is_isomorphic(labeled_p[0]):
            input_reaction.products[0].molecule[0] = labeled_p[0]
            input_reaction.products[1].molecule[0] = labeled_p[1]
        else:
            input_reaction.products[0].molecule[0] = labeled_p[1]
            input_reaction.products[1].molecule[0] = labeled_p[0]

        if not valid_labels(input_reaction):
            print(f'Bad labeling on {input_reaction}')
        return input_reaction

    return False

In [5]:
# # reload
# # load the new training reactions
# new_training_rxns = output_path
# ark_kinetics_database = rmgpy.data.kinetics.KineticsDatabase()
# ark_kinetics_database.load_libraries(new_training_rxns)
# print(f'{len(ark_kinetics_database.libraries["kinetics"].entries)} new reactions loaded')

In [6]:
# ark_kinetics_database.libraries["kinetics"].entries[427].item.reactants[0].molecule[0].get_all_labeled_atoms()

In [7]:
# relabel(ark_kinetics_database.libraries["kinetics"].entries[427].item)

In [8]:
def print_labels(rxn):
    """Print the labeled-atom mapping of the first molecule of each species.

    Reactants are printed first, then products, one mapping per line.
    """
    every_species = rxn.reactants + rxn.products
    # lazy generator so each mapping is fetched right before it is printed
    label_maps = (spec.molecule[0].get_all_labeled_atoms() for spec in every_species)
    for label_map in label_maps:
        print(label_map)

In [9]:
def duplicate_exists(test_entry):
    """Tell whether the training depository already holds an isomorphic reaction.

    Compares ``test_entry.item`` against the ``item`` of every entry in the
    notebook-global ``training_depo`` and returns ``True`` on the first match,
    ``False`` if none matches.
    """
    return any(
        training_depo.entries[key].item.is_isomorphic(test_entry.item)
        for key in training_depo.entries
    )

In [10]:
# Root of the DFT results, plus every RMG_libraries directory found under
# kinetics/reaction*/arkane/ inside it.
DFT_DIR = "/work/westgroup/harris.se/autoscience/autoscience/butane/dft/"
kinetics_lib_pattern = os.path.join(DFT_DIR, 'kinetics', 'reaction*', 'arkane', 'RMG_libraries')
kinetics_libs = glob.glob(kinetics_lib_pattern)

In [11]:
# Load the base model. Its reaction_list is used further down to keep only
# the calculations that belong to the Disproportionation family.
basedir = '/work/westgroup/harris.se/autoscience/autoscience/butane/models/rmg_model'
base_chemkin, dictionary, transport = (
    os.path.join(basedir, filename)
    for filename in ('chem_annotated.inp', 'species_dictionary.txt', 'tran.dat')
)
species_list, reaction_list = rmgpy.chemkin.load_chemkin_file(
    base_chemkin,
    dictionary_path=dictionary,
    transport_path=transport,
)
print(f'{len(species_list)} species, {len(reaction_list)} reactions')



110 species, 1822 reactions


In [12]:
# Build the reference database: selected thermo libraries plus the
# Disproportionation kinetics family, both read from the RMG database directory.
family = 'Disproportionation'
thermo_libs = [
    'BurkeH2O2',
    'primaryThermoLibrary',
    'FFCM1(-)',
    'CurranPentane',
    'Klippenstein_Glarborg2016',
    'thermo_DFT_CCSDTF12_BAC',
    'DFT_QCI_thermo',
    'CBS_QB3_1dHR',
]

# thermo
thermo_library_path = os.path.join(rmgpy.settings['database.directory'], 'thermo')
thermo_database = rmgpy.data.thermo.ThermoDatabase()
thermo_database.load(thermo_library_path, libraries=thermo_libs)

# kinetics: only the one family, no libraries
ref_library_path = os.path.join(rmgpy.settings['database.directory'], 'kinetics')
kinetics_database = rmgpy.data.kinetics.KineticsDatabase()
kinetics_database.load(ref_library_path, libraries=[], families=[family])

# bundle both into a single RMGDatabase object
ref_db = rmgpy.data.rmg.RMGDatabase()
ref_db.kinetics, ref_db.thermo = kinetics_database, thermo_database

In [13]:
# Load the Arkane kinetics: one library per reaction directory, keeping only
# reactions that the base model assigns to the Disproportionation family.
entries = []
# glob returns paths in arbitrary order; sort so that the order of `entries`
# (and anything derived from it) is the same on every run
for lib_path in sorted(kinetics_libs):
    matches = re.search(r'reaction_([0-9]{4})', lib_path)
    if matches is None:
        # the 'reaction*' wildcard can match names without a 4-digit index
        print(f'skipping {lib_path}: no 4-digit reaction index in path')
        continue
    reaction_index = int(matches[1])
    # skip entries not in Disproportionation
    if reaction_list[reaction_index].family != 'Disproportionation':
        continue
    ark_kinetics_database = rmgpy.data.kinetics.KineticsDatabase()
    ark_kinetics_database.load_libraries(lib_path)

    # TODO fix bug related to load_libraries not getting the actual name
    for entry in ark_kinetics_database.libraries[''].entries.values():
        # tag each entry with the index of its reaction in the base model
        entry.index = reaction_index
        entries.append(entry)


In [14]:
# Gather every collected entry into one library called harris_butane and
# register it under the 'kinetics' key of a fresh kinetics database.
harris_butane_library = rmgpy.data.kinetics.KineticsLibrary()
harris_butane_library.label = 'harris_butane'
harris_butane_library.name = 'harris_butane'
# keyed by entry label; a repeated label keeps only the last entry seen
harris_butane_library.entries = OrderedDict(
    (collected.label, collected) for collected in entries
)

ark_kinetics_database = rmgpy.data.kinetics.KineticsDatabase()
ark_kinetics_database.libraries['kinetics'] = harris_butane_library

In [15]:
# save the results
output_path = os.path.join(DFT_DIR, 'disproportionation_kinetics')
ark_kinetics_database.save_libraries(output_path, reindex=False)
# Report the size of the library that was written rather than len(entries):
# the library is keyed by label, so entries sharing a label collapse into one.
print(len(ark_kinetics_database.libraries['kinetics'].entries), 'saved')

64 saved


In [16]:
# Read the library back from disk so the rest of the notebook works on what was saved.
new_training_rxns = output_path
ark_kinetics_database = rmgpy.data.kinetics.KineticsDatabase()
ark_kinetics_database.load_libraries(new_training_rxns)
n_loaded_reactions = len(ark_kinetics_database.libraries["kinetics"].entries)
print(f'{n_loaded_reactions} new reactions loaded')

64 new reactions loaded


In [17]:
# Spot check before relabeling: show the atom labels on the entry keyed 213.
# NOTE(review): 213 is presumably the base-model reaction index assigned when
# the entries were collected - confirm.
print_labels(ark_kinetics_database.libraries["kinetics"].entries[213].item)

{}
{}
{}
{}


In [18]:
# Replace each reaction with an atom-labeled copy.
kinetics_entries = ark_kinetics_database.libraries["kinetics"].entries
for entry in kinetics_entries:
    relabeled = relabel(kinetics_entries[entry].item)
    if relabeled is False:
        # relabel() returns False when no labeling was found. Storing that as
        # the entry's item would break every later cell that reads
        # item.reactants, so keep the unlabeled reaction and report it.
        print(f'Could not relabel entry {entry}; leaving it unlabeled')
        continue
    kinetics_entries[entry].item = relabeled

Bad labeling on CH2CHO(21) + CH2CHO(21) <=> CH3CHO(35) + CH2CO(24)
Bad labeling on C3H5-A(94) + NC3H7(92) <=> C3H6(12) + C3H6(12)
Bad labeling on SC4H9(183) + C4H8(748) <=> SC4H9(183) + C4H8(188)
Bad labeling on C4H7(190) + SC4H9(183) <=> C4H8(188) + C4H8(188)
Bad labeling on C4H7(190) + C4H7(190) <=> C4H8(189) + C4H6(194)
Bad labeling on SC4H9(183) + C4H7(191) <=> C4H8(188) + C4H8(188)
Bad labeling on C4H7(191) + C4H8(748) <=> C4H8(188) + C4H7(191)
Bad labeling on C4H7(190) + C4H6(2534) <=> C4H7(190) + C4H6(194)


In [19]:
# Spot check after relabeling: the same entry (key 213) should now show *1..*4 labels.
print_labels(ark_kinetics_database.libraries["kinetics"].entries[213].item)

{'*2': <Atom 'C'>, '*3': <Atom 'C.'>, '*4': <Atom 'H'>}
{'*1': <Atom 'O.'>}
{'*1': <Atom 'O'>, '*4': <Atom 'H'>}
{'*2': <Atom 'C'>, '*3': <Atom 'C'>}


## Redo the species dictionary to include everything we'll need

In [20]:
# Fetch the family's training depository and read its species dictionary file.
training_depo = ref_db.kinetics.families[family].get_training_depository()
print(len(training_depo.entries), 'training reactions')
training_depo_dict = os.path.join(
    rmgpy.settings['database.directory'],
    'kinetics',
    'families',
    family,
    'training',
    'dictionary.txt',
)
species_dict = training_depo.get_species(training_depo_dict)

137 training reactions


In [21]:
def equivalent_labels(dict1, dict2):
    """Compare two label -> atom mappings by key set and ``atomtype``.

    Returns ``True`` when both mappings have the same labels and the atoms
    under each label have equal ``atomtype``; ``False`` when a label is
    missing from either side or any atomtype differs.
    """
    try:
        # Walk dict1's labels, then dict2's. A label absent from the other
        # mapping surfaces as KeyError on the lookup.
        labels = list(dict1.keys()) + list(dict2.keys())
        mismatch = any(
            dict2[label].atomtype != dict1[label].atomtype
            for label in labels
        )
    except KeyError:
        return False
    return not mismatch

# Sanity check, should return True: the relabeled first reactant of entry 213
# is compared against the labeled atoms of the training dictionary's 'C2H5'.
equivalent_labels(
    ark_kinetics_database.libraries["kinetics"].entries[213].item.reactants[0].molecule[0].get_all_labeled_atoms(),
    species_dict['C2H5'].molecule[0].get_all_labeled_atoms()
)

True

In [22]:
def get_species(sp):
    """Find the ``species_dict`` species matching ``sp`` in structure and labeling.

    Returns the first dictionary species that is isomorphic to ``sp`` and
    whose first molecule has equivalent atom labels (see
    ``equivalent_labels``); returns ``None`` when there is no such species.
    """
    wanted_labels_of = lambda spec: spec.molecule[0].get_all_labeled_atoms()
    for candidate in species_dict.values():
        if not sp.is_isomorphic(candidate):
            continue
        if equivalent_labels(wanted_labels_of(sp), wanted_labels_of(candidate)):
            return candidate
    return None

In [23]:
# Collect every species (repeats included) that has no identically labeled
# counterpart in the training species dictionary yet.
entries_to_add = []
for kinetics_entry in ark_kinetics_database.libraries["kinetics"].entries.values():
    rxn_item = kinetics_entry.item
    for sp in rxn_item.reactants + rxn_item.products:
        if not get_species(sp):
            entries_to_add.append(sp)

In [24]:
# List the species that still need a dictionary entry, one per line.
if entries_to_add:
    print('\n'.join(str(missing_sp) for missing_sp in entries_to_add))

IC3H7(93)
SC4H9(183)
C4H8(188)
SC4H9(183)
C4H8(188)
CH3CO(20)
CH2CO(24)
CH2CHO(21)
CH2CO(24)
SC4H9(183)
C4H8(189)
SC4H9(183)
C4H8(188)
CH2CHO(21)
CH2CHO(21)
CH3CHO(35)
CH2CO(24)
SC4H9(183)
C4H8(188)
IC3H7(93)
CH2CHO(21)
CH3CHO(35)
SC4H9(183)
CH3CO(20)
CH3CHO(35)
C4H8(188)
SC4H9(183)
C4H8(188)
CH2CHO(21)
CH2CO(24)
IC3H7(93)
C4H8(748)
SC4H9(183)
SC4H9(183)
C4H8(748)
SC4H9(183)
C4H8(188)
C4H7(190)
C4H8(189)
CH2CO(24)
CH2CHO(21)
CH2CO(24)
C4H8(189)
SC4H9(183)
C4H8(189)
SC4H9(183)
C4H8(188)
C4H8(188)
C4H8(189)
SC4H9(183)
butane(1)
CH2CHO(21)
CH3CHO(35)
S(777)
S(252)
CH2CHO(21)
C4H8(748)
PC4H9(182)
CH2CO(24)
C4H8(748)
PC4H9(182)
SC4H9(183)
C4H7(191)
C4H8(188)
C4H8(188)
SC4H9(183)
C4H8(748)
butane(1)
C4H7(191)
PC4H9(182)
C4H8(748)
butane(1)
C4H7(191)
CH3O2(45)
CH2CHO(21)
CH3O2H(46)
CH2CO(24)
CH3O2(45)
C2H4O(703)
CH3O2H(46)
CH2CHO(21)
CH3O2(45)
SC4H9(183)
CH3O2H(46)
C4H8(189)
C4H7(192)
C4H8(188)
C3H5O(129)
C3H4O(74)
C4H8(748)
C4H7(191)
C4H7(191)
C4H8(748)
C4H7(191)
CH3CO(20)
C4H8(748)
CH3CHO(3

In [25]:
# add the new species to the species dictionary
for sp in entries_to_add:
    # entries_to_add holds repeats: skip anything an earlier pass already stored
    if get_species(sp):
        continue

    if sp.label == '':
        print(f'empty species label for {sp}')

    # bump the label's counter until it no longer clashes with a dictionary key
    basename = sp.label
    while basename in species_dict:
        basename = increment_label(basename)
    sp.label = basename

    species_dict[basename] = sp

In [26]:
# Re-run the lookup: every species should now resolve to a dictionary entry.
entries_to_add = [
    sp
    for kinetics_entry in ark_kinetics_database.libraries["kinetics"].entries.values()
    for sp in kinetics_entry.item.reactants + kinetics_entry.item.products
    if not get_species(sp)
]
assert len(entries_to_add) == 0

In [27]:
# Write the extended species dictionary to training_depo_dict, replacing the
# file's previous contents.
sd = list(species_dict.values())

rmgpy.chemkin.save_species_dictionary(training_depo_dict, sd)

In [28]:
# Fetch the depository again and re-read the species dictionary that was just saved.
training_depo = ref_db.kinetics.families[family].get_training_depository()
print(len(training_depo.entries), 'training reactions')
training_depo_dict = os.path.join(
    rmgpy.settings['database.directory'],
    'kinetics',
    'families',
    family,
    'training',
    'dictionary.txt',
)
species_dict = training_depo.get_species(training_depo_dict)

137 training reactions


In [29]:
# Swap every species in the new training reactions for its species-dictionary
# counterpart and rebuild each reaction label from the dictionary labels.
for entry in ark_kinetics_database.libraries["kinetics"].entries:
    rxn_item = ark_kinetics_database.libraries["kinetics"].entries[entry].item

    label_pieces = []
    # reactants end with ' <=> ', products end with nothing; ' + ' sits in between
    for side, terminator in ((rxn_item.reactants, ' <=> '), (rxn_item.products, '')):
        for position, current_sp in enumerate(side):
            official_sp = get_species(current_sp)
            if not official_sp:
                raise ValueError('did not successfully add to training depo species dictionary')
            # it already exists, so point the reaction at the dictionary's species
            side[position] = official_sp

            is_last = position + 1 >= len(side)
            label_pieces.append(official_sp.label + (terminator if is_last else ' + '))

    new_label = ''.join(label_pieces).strip()

    # completely regenerate the reaction label
    rxn_item.label = new_label

    # debug check for one specific species label turning up in the new label
    if 'C2H5(33)' in new_label:
        print('problem', entry)
    

In [30]:
# Splice the new training reactions into the training depo, skipping
# duplicates and badly labeled reactions.
index_start = 100
new_kinetics_entries = ark_kinetics_database.libraries['kinetics'].entries

for i, entry in enumerate(new_kinetics_entries):
    candidate = new_kinetics_entries[entry]

    if duplicate_exists(candidate):
        continue
    if not valid_labels(candidate.item):
        print(f'bad labels on {entry}, skipping...')
        continue

    if candidate.item.elementary_high_p:
        # manual fix to this issue
        candidate.item.elementary_high_p = False

    # first free key at or after index_start + i
    j = index_start + i
    while j in training_depo.entries:
        j += 1

    # NOTE(review): candidate.index is not updated to j here - confirm that
    # saving the depository does not rely on entry.index being unique.
    training_depo.entries[j] = candidate
    candidate.label = candidate.item.label
print(len(training_depo.entries), 'training reactions')

bad labels on 427, skipping...
bad labels on 674, skipping...
bad labels on 731, skipping...
bad labels on 795, skipping...
bad labels on 1016, skipping...
bad labels on 1254, skipping...
bad labels on 1398, skipping...
190 training reactions


In [31]:
# Debug scan: print any depository key or species that still carries the label 'C2H5(33)'.
for entry in training_depo.entries:
    depo_entry = training_depo.entries[entry]
    if 'C2H5(33)' in depo_entry.label:
        print(entry)
    rxn_item = depo_entry.item
    for sp in rxn_item.reactants + rxn_item.products:
        matches_by_str = str(sp) == 'C2H5(33)'
        if matches_by_str or sp.label == 'C2H5(33)':
            print(sp)

In [None]:
# Leftover debug lookup; this cell has no execution count, so it was not run.
# NOTE(review): presumably raises KeyError when 'C2H5(33)' is absent from
# species_dict, which would stop a Run All - consider removing this cell.
species_dict['C2H5(33)']

In [32]:
# Save the results into the family's training directory inside the RMG
# database (dictionary.txt and reactions.py).
training_dir = os.path.join(rmgpy.settings['database.directory'], 'kinetics', 'families', family, 'training')
for save_method, target_name in (
    (training_depo.save_dictionary, 'dictionary.txt'),
    (training_depo.save, 'reactions.py'),
):
    save_method(os.path.join(training_dir, target_name))

In [33]:
# Test that the modified family can be loaded again: rebuild the reference
# database (thermo libraries + Disproportionation family) from scratch.
family = 'Disproportionation'
thermo_libs = [
    'BurkeH2O2',
    'primaryThermoLibrary',
    'FFCM1(-)',
    'CurranPentane',
    'Klippenstein_Glarborg2016',
    'thermo_DFT_CCSDTF12_BAC',
    'DFT_QCI_thermo',
    'CBS_QB3_1dHR',
]

# thermo
thermo_library_path = os.path.join(rmgpy.settings['database.directory'], 'thermo')
thermo_database = rmgpy.data.thermo.ThermoDatabase()
thermo_database.load(thermo_library_path, libraries=thermo_libs)

# kinetics: only the revised family, no libraries
ref_library_path = os.path.join(rmgpy.settings['database.directory'], 'kinetics')
kinetics_database = rmgpy.data.kinetics.KineticsDatabase()
kinetics_database.load(ref_library_path, libraries=[], families=[family])

# bundle both into a single RMGDatabase object
ref_db = rmgpy.data.rmg.RMGDatabase()
ref_db.kinetics, ref_db.thermo = kinetics_database, thermo_database

