# Gather Example Molecules
Gather a set of molecules spanning a range of sizes, suitable for use in scaling tests

In [1]:
from moldesign.simulate.qcfractal import GeometryDataset
from rdkit import Chem
import pandas as pd
import shutil
import os



Get the "small basis" dataset

In [2]:
# Handle used below to pull the completed records.
# NOTE(review): presumably the first argument names the collection and the second the
# specification within it -- confirm against the GeometryDataset signature
data = GeometryDataset('Electrolyte Geometry NWChem', 'small_basis')

Get the records for all of the completed calculations

In [3]:
%%time
# Retrieve the completed records from the dataset and report how many were loaded.
# The cell is timed so readers know what this step costs (about 11 s wall time in the saved run)
records = data.get_complete_records()
print(f'Loaded {len(records)} completed calculations')

Loaded 9258 completed calculations
CPU times: user 4.49 s, sys: 84.1 ms, total: 4.57 s
Wall time: 11.2 s


In [4]:
# One row per completed calculation: the index label of each record goes in `name`
# and the record object itself goes in `record`
record_df = pd.DataFrame({
    'name': records.index,
    'record': records.values,
})

Get the state and number of electrons

In [5]:
# Split each name on "_" once: the last field is stored as the state, the first as the InChI
name_parts = record_df['name'].str.split("_")
record_df['state'] = name_parts.str[-1]
record_df['inchi'] = name_parts.str[0]

In [6]:
def count_electrons(inchi: str) -> int:
    """Count the electrons in the molecule described by an InChI string.

    The count is the sum of the atomic numbers of every atom, with hydrogens
    made explicit first so that they are included. No correction for formal
    charge is applied, so the result is the total nuclear charge, which equals
    the electron count for a neutral species.

    Args:
        inchi: InChI string of the molecule
    Returns:
        Sum of the atomic numbers over all atoms, hydrogens included
    Raises:
        ValueError: If RDKit cannot parse the InChI string
    """
    mol = Chem.MolFromInchi(inchi)
    if mol is None:
        # RDKit reports a parse failure by returning None rather than raising;
        # fail here with the offending string instead of an opaque error in AddHs
        raise ValueError(f'Could not parse InChI: {inchi!r}')

    # Hydrogens are implicit after parsing, so add them before summing
    mol = Chem.AddHs(mol)
    return sum(atom.GetAtomicNum() for atom in mol.GetAtoms())

In [7]:
# Electron count of each molecule, computed from its InChI
record_df['num_electrons'] = record_df['inchi'].map(count_electrons)



Shuffle the dataset, keep only the neutral molecules, and then pick a single example for each distinct number of electrons

In [8]:
# Shuffle the rows. The seed is fixed so that the shuffle, and therefore which molecule
# is later kept for each electron count, is the same on every Restart & Run All
record_df = record_df.sample(frac=1.0, random_state=1)

In [9]:
# Keep only the rows whose state is "neutral"
record_df = record_df.query('state=="neutral"')

In [10]:
# Retain the first row encountered for every distinct electron count
record_df = record_df.drop_duplicates(subset='num_electrons', keep='first')
print(f'Reduced to {len(record_df)} examples')

Reduced to 113 examples


In [11]:
# Summary statistics of the electron counts covered by the selected examples
record_df.num_electrons.describe()

count    113.000000
mean      92.176991
std       68.370476
min        8.000000
25%       40.000000
50%       75.000000
75%      126.000000
max      312.000000
Name: num_electrons, dtype: float64

Get the final molecule from each record and convert it to XYZ format

In [12]:
# Final molecule object of each record
record_df['molecule'] = record_df['record'].map(lambda rec: rec.get_final_molecule())

In [13]:
# Render each molecule as an XYZ-format string
record_df['xyz'] = record_df['molecule'].map(lambda mol: mol.to_string('xyz'))

Save to disk

In [14]:
# Drop the `record` and `molecule` object columns and write the remaining columns to CSV
examples_out = record_df.drop(['record', 'molecule'], axis=1)
examples_out.to_csv('example_molecules.csv', index=False)