# Get a Single Dataset from Benchmarks
Gather the output files from all of the benchmark checks which have been run and combine them into a single dataset

In [1]:
from examol.store.models import MoleculeRecord
from examol.store.recipes.redox import RedoxEnergy, SolvationEnergy
from base64 import b64decode
from pathlib import Path
from tqdm import tqdm
import pickle as pkl
import json

In [2]:
# Turn off logging for RDKit's `rdApp.*` channels before any molecules are parsed
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*') 

Configuration

In [3]:
# Settings shared by every recipe
CONFIG_NAME = 'cp2k_blyp_dzvp'
SOLVENT = 'acn'

# Which recipes we wish to pre-compute
recipes = [
    RedoxEnergy(1, CONFIG_NAME, solvent=SOLVENT),
    SolvationEnergy(CONFIG_NAME, solvent=SOLVENT),
]

## Make Records from the Computations
Loop over all computations we've performed and collect them into one record per molecule, which we write to a summary file at the end

In [4]:
# In-memory collection of records, keyed by the SMILES string read from each result line.
# The loading cells below add new records to it and attach energies to existing ones.
database: dict[str, MoleculeRecord] = {}  # SMILES -> record

Start with the optimizations

In [5]:
for json_path in Path().rglob('output.json'):
    with json_path.open() as fp:
        # Each line of the file is one JSON document describing a single computation
        for line in tqdm(fp, desc=str(json_path)):
            entry = json.loads(line)

            # Look up the record for this molecule, creating it the first time we see the SMILES
            smiles = entry['smiles']
            record = database.get(smiles)
            if record is None:
                try:
                    record = MoleculeRecord.from_identifier(smiles)
                except ValueError:
                    # No record could be built from this identifier; skip the line
                    continue
                database[smiles] = record

            # Decode the result payload and attach it to the record
            # NOTE(review): unpickling executes arbitrary code; only run this on files you produced yourself
            sim_result, opt_steps, _ = pkl.loads(b64decode(entry['result']))
            record.add_energies(sim_result, opt_steps)

output.json: 30568it [13:50, 36.82it/s]


In [6]:
# Report how many distinct SMILES strings ended up with a record
print(f'Database has {len(database)} molecules')

Database has 3402 molecules


Now do the solvation energies

In [7]:
for json_path in Path().rglob('solvation.json'):
    with json_path.open() as fp:
        # Each line of the file is one JSON document describing a single computation
        for line in tqdm(fp, desc=str(json_path)):
            entry = json.loads(line)

            # Only molecules which already have a record are updated; others are skipped
            smiles = entry['smiles']
            if smiles not in database:
                continue
            record = database[smiles]

            # Decode the result payload and attach it to the record
            # NOTE(review): unpickling executes arbitrary code; only run this on files you produced yourself
            sim_result, _ = pkl.loads(b64decode(entry['result']))
            record.add_energies(sim_result)

solvation.json: 24538it [00:41, 591.49it/s]


In [8]:
# The solvation pass only updates records that already exist, so this count should match the one printed earlier
print(f'Database has {len(database)} molecules')

Database has 3402 molecules


## Compute the recipes
Evaluate each of the configured recipes (`RedoxEnergy` and `SolvationEnergy`) for every record. We'll use these properties for our example case

In [9]:
# Evaluate every configured recipe on every record, counting how many records it succeeded for
for recipe in recipes:
    hits = 0
    for record in tqdm(database.values(), desc=recipe.name):
        try:
            recipe.update_record(record)
        except ValueError:
            # Presumably raised when the record lacks the data this recipe needs -- TODO confirm
            continue
        else:
            hits += 1
    # Report the recipe by name; formatting the object itself only prints its default repr
    print(f'Computed {recipe.name} for {hits} molecules')

oxidation_potential: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 3402/3402 [00:00<00:00, 5460.69it/s]


Computed <examol.store.recipes.RedoxEnergy object at 0x7f2d41f9a650> for 3399 molecules


solvation_energy: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 3402/3402 [00:00<00:00, 5957.83it/s]

Computed <examol.store.recipes.SolvationEnergy object at 0x7f2d41f98820> for 3402 molecules





Save it to disk

In [10]:
# Write the database as JSON lines: one serialized record per line
with open('database.json', 'w') as out_file:
    for mol_record in tqdm(database.values()):
        print(mol_record.json(), file=out_file)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3402/3402 [00:03<00:00, 1055.02it/s]
