In [1]:
from dataset_selection import optimization_datasets, dataset_type
import pandas as pd
import qcportal
from openforcefield.topology import Molecule
client = qcportal.FractalClient()



In [2]:
from typing import List, Dict, Tuple
def compare_smiles(
    df: pd.DataFrame,
    ds: qcportal.collections.optimization_dataset.OptimizationDataset,
    record_names: List[str]
) -> Tuple[Dict[str, Tuple[str, str]], Dict[str, Exception]]:
    """loop over all record_names, and compare:
    * the offmol that was fetched remotely and saved in df.offmol[record_name]
    * the offmol that is returned by Molecule.from_qcschema(ds.get_entry(record_name))
    by checking that offmol.to_smiles() is the same string in both cases
    """
    
    mismatches = dict()
    exceptions = dict()
    
    for record_name in record_names:
        offmol_fetched_remotely = df.offmol[record_name]
        offmol_fetched_locally = Molecule.from_qcschema(ds.get_entry(record_name))
    
        try:
            match = (offmol_fetched_remotely == offmol_fetched_remotely)
            if not match:
                mismatches[record_name] = (offmol_fetched_remotely, offmol_fetched_remotely)
        except Exception as e:
            exceptions[record_name] = e
    return mismatches, exceptions

In [3]:
all_mismatches_and_exceptions = dict()
for name in optimization_datasets:
    print(name)
    path_to_h5 = f'../../espaloma/data/qca/{name}.h5'
    df = pd.read_hdf(path_to_h5)
    ds = client.get_collection(dataset_type, optimization_datasets[name])
    m, e = compare_smiles(df, ds, df.index)
    all_mismatches_and_exceptions[name] = (m, e)
    print(f'# mismatches: {len(m)}')
    print(f'# exceptions: {len(e)}\n')

Roche
# mismatches: 0
# exceptions: 0

Coverage
# mismatches: 0
# exceptions: 2

Pfizer
# mismatches: 0
# exceptions: 0

eMolecules
# mismatches: 0
# exceptions: 0

Bayer
# mismatches: 0
# exceptions: 0



In [4]:
for name in optimization_datasets:
    print(name)
    mismatches = all_mismatches_and_exceptions[name][0]
    
    unique_mismatches = set(mismatches.values())
    print(f'{name} set ({len(unique_mismatches)} unique problematic mols, affecting {len(mismatches)} total records)')
    
    associated_records = {m: [r for r in mismatches if mismatches[r] == m] for m in unique_mismatches}

    for mismatch in unique_mismatches:
        records = associated_records[mismatch]
        print(f'molecule associated with the following {len(records)} records:')
        print(records)
        remote, local = mismatch
        print(f'\n\tsmiles from remote (RDKit 2020.03.6): {remote}')
        print(f'\tsmiles from local (OpenEye 2020.1.0): {local}')
        print('\n')
    
    print('-'*100)
    print('\n')

Roche
Roche set (0 unique problematic mols, affecting 0 total records)
----------------------------------------------------------------------------------------------------


Coverage
Coverage set (0 unique problematic mols, affecting 0 total records)
----------------------------------------------------------------------------------------------------


Pfizer
Pfizer set (0 unique problematic mols, affecting 0 total records)
----------------------------------------------------------------------------------------------------


eMolecules
eMolecules set (0 unique problematic mols, affecting 0 total records)
----------------------------------------------------------------------------------------------------


Bayer
Bayer set (0 unique problematic mols, affecting 0 total records)
----------------------------------------------------------------------------------------------------


