# Gather Test Structures
Get a list of structures to evaluate from those which Sarah provided

In [None]:
from jitterbug.utils import read_from_string, write_to_string
from rdkit import Chem
from rdkit.Chem import AllChem
from pathlib import Path
from ase import Atoms
import pandas as pd
import gzip
import json

Configuration

In [None]:
# Upper bound on how many structures get written to disk at the end of the notebook
num_to_save: int = 128  # How many molecules to save to XYZ-format

## Load in the Molecules
The molecules come from three different files, each of which is a nested dictionary:
- First key: InChI
- Second key: Conformer identifier
- Third key: Either geometry (xyz), or some other bits about the molecule

In [None]:
# Flatten every nested {InChI: {conformer id: {...}}} file into one row per geometry
rows = []
for set_path in Path('raw/combustion/').glob('set*'):
    with gzip.open(set_path, 'rt') as fp:
        by_inchi = json.load(fp)

    # Only the geometry of each conformer is kept; other fields are ignored
    rows.extend(
        {'inchi': inchi, 'geom_id': geom_id, 'xyz': geom_data['geometry']}
        for inchi, geoms in by_inchi.items()
        for geom_id, geom_data in geoms.items()
    )
records = pd.DataFrame(rows)
print(f'Loaded {len(records)} geometries')

In [None]:
# Preview the first few rows of the geometry table
records.head()

## Make a uniform name
Compute the InChIKey of each molecule so that every structure gets a uniform name free of filesystem-unfriendly characters, such as slashes.

In [None]:
# Parse each InChI with RDKit and derive the InChIKey from the parsed molecule
records['inchi_key'] = [
    Chem.MolToInchiKey(Chem.MolFromInchi(inchi_str))
    for inchi_str in records['inchi']
]

In [None]:
# Name of each geometry: InChIKey and conformer identifier joined by an underscore
records['name'] = [
    f'{key}_{conformer}'
    for key, conformer in zip(records['inchi_key'], records['geom_id'])
]

## Assign Electronic States
Get the charge on the molecule and assign magnetic moments and charges as appropriate.

In [None]:
def assign_atomic_states(xyz: str, inchi: str) -> str:
    """Attach initial charges and magnetic moments to the atoms of a geometry

    The formal charge and radical electron count of every atom are read from
    the RDKit molecule built from the InChI (after adding hydrogens) and are
    stored on the structure in the order RDKit lists the atoms.

    Our quantum chemistry methods do not use this information as initial
    guesses for atomic charges, but only as a way of determining multiplicity,
    so we just place it on any atom.
    NOTE(review): the RDKit atom order is not matched against the atom order
    of the XYZ; confirm that no consumer relies on per-atom placement.

    Args:
        xyz: Structure in XYZ format
        inchi: InChI string of the same molecule
    Returns:
        Updated structure in extxyz format, which preserves this information
    """

    # Parse the geometry, then build the RDKit molecule used for electron counting
    atoms = read_from_string(xyz, 'xyz')
    rd_mol = Chem.AddHs(Chem.MolFromInchi(inchi))
    AllChem.AssignRadicals(rd_mol)

    # Collect the radical electron count and formal charge of each RDKit atom
    radical_counts, formal_charges = [], []
    for rd_atom in rd_mol.GetAtoms():
        radical_counts.append(rd_atom.GetNumRadicalElectrons())
        formal_charges.append(rd_atom.GetFormalCharge())

    # Store them on the structure as initial charges and magnetic moments
    atoms.set_initial_charges(formal_charges)
    atoms.set_initial_magnetic_moments(radical_counts)

    # Serialize in extxyz so the per-atom values are written alongside the geometry
    return write_to_string(atoms, 'extxyz')

In [None]:
# Build the extxyz text of every geometry from its XYZ string and InChI
records['extxyz'] = [
    assign_atomic_states(xyz_str, inchi_str)
    for xyz_str, inchi_str in zip(records['xyz'], records['inchi'])
]

## Save Some to Disk
Sort, shuffle, then save a sample to disk to use for validation

In [None]:
# Order by name first so the seeded shuffle starts from a fixed ordering
records = records.sort_values('name').sample(frac=1., random_state=1)

In [None]:
# Show the InChI strings of the first few rows of the table
records['inchi'].head().tolist()

Write some XYZ files

In [None]:
# Folder which receives the structure files; created together with missing parents
out_dir = Path('xyzs').joinpath('combustion')
out_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Write the leading `num_to_save` rows, one file per geometry named after the row;
#  the file content is the extxyz text even though the suffix is .xyz
to_write = records.head(num_to_save)
for geom_name, extxyz_text in zip(to_write['name'], to_write['extxyz']):
    out_dir.joinpath(f'{geom_name}.xyz').write_text(extxyz_text)