This notebook serves as an example of how to load and manipulate the [QM9 dataset](https://figshare.com/collections/Quantum_chemistry_structures_and_properties_of_134_kilo_molecules/978904) using a `Dataset` object.

# Imports

In [1]:
import os
import numpy as np

from ase import Atoms

from colabfit.tools.dataset import Dataset, load_data
from colabfit.tools.property_settings import PropertySettings

# Data loading

## Define the properties and reader functions

In [2]:
# Custom property definition for the QM9 scalar properties.
# Each entry describes one field: its type, whether it carries a unit, its
# extent ([] means scalar), whether it is required, and a human-readable
# description. The keys here must match the keys used in `property_map`.
qm9_property_definition = {
    'property-id': 'qm9-property',
    'property-title': 'A, B, C, mu, alpha, homo, lumo, gap, r2, zpve, U0, U, H, G, Cv',
    'property-description': 'Geometries minimal in energy, corresponding harmonic frequencies, dipole moments, polarizabilities, along with energies, enthalpies, and free energies of atomization',
    'a':     {'type': 'float', 'has-unit': True, 'extent': [], 'required': True, 'description': 'Rotational constant A'},
    'b':     {'type': 'float', 'has-unit': True, 'extent': [], 'required': True, 'description': 'Rotational constant B'},
    'c':     {'type': 'float', 'has-unit': True, 'extent': [], 'required': True, 'description': 'Rotational constant C'},
    'mu':    {'type': 'float', 'has-unit': True, 'extent': [], 'required': True, 'description': 'Dipole moment'},
    'alpha': {'type': 'float', 'has-unit': True, 'extent': [], 'required': True, 'description': 'Isotropic polarizability'},
    'homo':  {'type': 'float', 'has-unit': True, 'extent': [], 'required': True, 'description': 'Energy of Highest occupied molecular orbital (HOMO)'},
    # LUMO is the lowest *unoccupied* molecular orbital
    'lumo':  {'type': 'float', 'has-unit': True, 'extent': [], 'required': True, 'description': 'Energy of Lowest unoccupied molecular orbital (LUMO)'},
    'gap':   {'type': 'float', 'has-unit': True, 'extent': [], 'required': True, 'description': 'Gap, difference between LUMO and HOMO'},
    'r2':    {'type': 'float', 'has-unit': True, 'extent': [], 'required': True, 'description': 'Electronic spatial extent'},
    'zpve':  {'type': 'float', 'has-unit': True, 'extent': [], 'required': True, 'description': 'Zero point vibrational energy'},
    'u0':    {'type': 'float', 'has-unit': True, 'extent': [], 'required': True, 'description': 'Internal energy at 0 K'},
    'u':     {'type': 'float', 'has-unit': True, 'extent': [], 'required': True, 'description': 'Internal energy at 298.15 K'},
    'h':     {'type': 'float', 'has-unit': True, 'extent': [], 'required': True, 'description': 'Enthalpy at 298.15 K'},
    'g':     {'type': 'float', 'has-unit': True, 'extent': [], 'required': True, 'description': 'Free energy at 298.15 K'},
    'cv':    {'type': 'float', 'has-unit': True, 'extent': [], 'required': True, 'description': 'Heat capacity at 298.15 K'},
    'smiles-relaxed':    {'type': 'string', 'has-unit': False, 'extent': [], 'required': True, 'description': 'SMILES for relaxed geometry'},
    'inchi-relaxed':     {'type': 'string', 'has-unit': False, 'extent': [], 'required': True, 'description': 'InChI for relaxed geometry'},
}

In [3]:
# Maps each field of the 'qm9-property' definition to the key under which
# `reader` stores the value in `atoms.info`, together with its units.
property_map = {
    'qm9-property': {
        # Property Definition field: {'field': ASE field, 'units': ASE-readable units}
        'a':     {'field': 'A',     'units': 'GHz'},
        'b':     {'field': 'B',     'units': 'GHz'},
        'c':     {'field': 'C',     'units': 'GHz'},
        'mu':    {'field': 'mu',    'units': 'Debye'},
        'alpha': {'field': 'alpha', 'units': 'Bohr*Bohr*Bohr'},
        'homo':  {'field': 'homo',  'units': 'Hartree'},
        'lumo':  {'field': 'lumo',  'units': 'Hartree'},
        'gap':   {'field': 'gap',   'units': 'Hartree'},
        'r2':    {'field': 'r2',    'units': 'Bohr*Bohr'},
        'zpve':  {'field': 'zpve',  'units': 'Hartree'},
        'u0':    {'field': 'U0',    'units': 'Hartree'},
        'u':     {'field': 'U',     'units': 'Hartree'},
        'h':     {'field': 'H',     'units': 'Hartree'},
        'g':     {'field': 'G',     'units': 'Hartree'},
        'cv':    {'field': 'Cv',    'units': 'cal/mol/K'},
        'smiles-relaxed': {'field': 'SMILES_relaxed', 'units': None},
        # Must point at the InChI key set by `reader`, not the SMILES one
        'inchi-relaxed':  {'field': 'InChI_relaxed',  'units': None},
    }
}

In [4]:
def reader(file_path):
    """Read one QM9 ``.xyz`` file and return it as a one-element list of ASE ``Atoms``.

    File layout, as indexed by this function (after dropping trailing blank lines):

    * line 0: number of atoms ``na``
    * line 1: tag, index, then the 15 scalar properties named in ``properties_order``
    * lines 2 .. na+1: element symbol, x, y, z, partial charge
    * third-to-last line: frequencies
    * second-to-last line: two SMILES strings (the second is stored as ``SMILES_relaxed``)
    * last line: two InChI strings (the second is stored as ``InChI_relaxed``)

    Everything except symbols and positions is stored in ``atoms.info``:
    ``mulliken_partial_charges``, ``name`` (file name without extension),
    the scalar properties, ``frequencies``, ``SMILES``, ``SMILES_relaxed``,
    ``InChI`` and ``InChI_relaxed``.

    Args:
        file_path: path to a single QM9 ``.xyz`` file.

    Returns:
        A list containing exactly one ``Atoms`` object.

    Raises:
        ValueError: if a numeric field cannot be parsed.
        IndexError: if the file has fewer lines/columns than the layout above.
    """

    properties_order = [
        'tag', 'index', 'A', 'B', 'C', 'mu', 'alpha', 'homo', 'lumo', 'gap', 'r2', 'zpve', 'U0', 'U', 'H', 'G', 'Cv'
    ]

    def as_float(token):
        # Some values use Mathematica-style exponents (e.g. "1.9*^-6");
        # rewrite them into Python-readable scientific notation first.
        return float(token.replace('*^', 'e'))

    # Read everything up front so the file handle is released before parsing
    with open(file_path, 'r') as f:
        lines = [line.strip() for line in f]

    # The footer is addressed with negative indices, so trailing blank lines
    # would otherwise shift the frequency/SMILES/InChI lines.
    while lines and not lines[-1]:
        lines.pop()

    na = int(lines[0])
    properties = lines[1].split()

    symbols = []
    positions = []
    partial_charges = []

    for line in lines[2:2 + na]:
        # Line order: symbol, x, y, z, charge
        split = line.split()
        symbols.append(split[0])
        positions.append([as_float(coord) for coord in split[1:4]])
        partial_charges.append(as_float(split[-1]))

    positions = np.array(positions, dtype=float).reshape(-1, 3)
    partial_charges = np.array(partial_charges, dtype=float)

    atoms = Atoms(symbols=symbols, positions=positions)

    atoms.info['mulliken_partial_charges'] = partial_charges

    # Configuration name = file name without directory or extension
    atoms.info['name'] = os.path.splitext(os.path.basename(file_path))[0]

    # Skip 'tag' and 'index'; the remaining columns are the scalar properties
    for pname, val in zip(properties_order[2:], properties[2:]):
        atoms.info[pname] = as_float(val)

    atoms.info['frequencies'] = np.array(
        [as_float(freq) for freq in lines[-3].split()], dtype=float
    )

    smiles = lines[-2].split()
    inchi = lines[-1].split()

    atoms.info['SMILES'] = smiles[0]
    atoms.info['SMILES_relaxed'] = smiles[1]
    atoms.info['InChI'] = inchi[0]
    atoms.info['InChI_relaxed'] = inchi[1]

    return [atoms]

## Load the full dataset

In [5]:
# Create the Dataset for the full QM9 set and fill in its metadata.
dataset = Dataset('dsgdb9nsd')

dataset.authors = [
    'Raghunathan Ramakrishnan',
    'Pavlo Dral',
    'Matthias Rupp',
    'O. Anatole von Lilienfeld',
]

# Paper and data-hosting page
dataset.links = [
    'https://www.nature.com/articles/sdata201422',
    'https://figshare.com/collections/Quantum_chemistry_structures_and_properties_of_134_kilo_molecules/978904',
]

dataset.description = (
    "These molecules correspond to the subset of all 133,885 "
    "species with up to nine heavy atoms (CONF) out of the GDB-17 "
    "chemical universe of 166 billion organic molecules."
)

In [6]:
# Load the configurations from the QM9 folder, handing `load_data` the custom
# `reader` function and the '*.xyz' glob pattern.
dataset.configurations = load_data(
    file_path='../../../colabfit/data/quantum-machine/qm9/dsgdb9nsd/',
    file_format='folder',
    name_field='name',  # key in ase.Atoms.info to use as the Configuration name
    elements=['H', 'C', 'N', 'O', 'F'],    # order matters for CFG files, but not others
    default_name='qm9',  # default name used when `name_field` is not found
    reader=reader,
    glob_string='*.xyz',
    verbose=True
)

Loading data: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 133885/133885 [00:39<00:00, 3347.39it/s]


In [7]:
# Attach the field/unit mapping and register the custom property definition
# under the same 'qm9-property' id that `property_map` uses as its key.
dataset.property_map = property_map

dataset.custom_definitions = {'qm9-property': qm9_property_definition}

In [8]:
# NOTE(review): with convert_units=False the values presumably stay in the
# source units listed in `property_map` — confirm against the colabfit docs.
dataset.parse_data(convert_units=False, verbose=True)

Parsing data: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 133885/133885 [00:57<00:00, 2338.44it/s]


In [9]:
# Calculation settings keyed by a regex.
# NOTE(review): the '.*' key presumably applies these settings to every
# entry — confirm what the regex is matched against.
dataset.property_settings_regexes = {
    '.*':
        PropertySettings(
            method='DFT/B3LYP/6-31G(2df,p)',
            description='QM9 property settings calculation',
            files=[],
            labels=['DFT', 'B3LYP', '6-31G(2df,p)'],
        )
}

In [10]:
# NOTE(review): presumably refreshes the dataset's internal state after the
# attribute assignments above — confirm against the colabfit docs.
dataset.resync()

In [11]:
# Export a README.md summary and a '<dataset name>.extxyz' data file into the
# dataset folder.
# NOTE(review): the keyword is `html_file_name` but a Markdown file name is
# passed, and data_format='xyz' is paired with an '.extxyz' extension — confirm
# both are what `to_markdown` expects.
dataset.to_markdown(
    base_folder='../../../colabfit/data/quantum-machine/qm9/dsgdb9nsd/',
    html_file_name='README.md',
    data_file_name=dataset.name+'.extxyz',
    data_format='xyz',
)

## Load the smaller dataset

In [12]:
# Create the Dataset for the C7H10O2 isomer subset and fill in its metadata.
small_dataset = Dataset('dsC7O2H10nsd')

small_dataset.authors = [
    'Raghunathan Ramakrishnan',
    'Pavlo Dral',
    'Matthias Rupp',
    'O. Anatole von Lilienfeld',
]

# Paper and data-hosting page
small_dataset.links = [
    'https://www.nature.com/articles/sdata201422',
    'https://figshare.com/collections/Quantum_chemistry_structures_and_properties_of_134_kilo_molecules/978904',
]

small_dataset.description = (
    "For the predominant stoichiometry, C7H10O2, there are 6,095 "
    "constitutional isomers among the 134k molecules. We report "
    "energies, enthalpies, and free energies of atomization at the "
    "more accurate G4MP2 level of theory for all of them"
)

In [13]:
# Load the configurations from the C7H10O2 folder, handing `load_data` the same
# custom `reader` function and '*.xyz' glob pattern as for the full dataset.
small_dataset.configurations = load_data(
    file_path='../../../colabfit/data/quantum-machine/qm9/dsC7O2H10nsd/',
    file_format='folder',
    name_field='name',  # key in ase.Atoms.info to use as the Configuration name
    elements=['H', 'C', 'N', 'O', 'F'],    # order matters for CFG files, but not others
    default_name='dsC7O2H10nsd',  # default name used when `name_field` is not found
    reader=reader,
    glob_string='*.xyz',
    verbose=True
)

Loading data: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6095/6095 [00:01<00:00, 3309.30it/s]


In [14]:
# Reuse the same field/unit mapping and custom property definition as the
# full dataset.
small_dataset.property_map = property_map

small_dataset.custom_definitions = {'qm9-property': qm9_property_definition}

In [15]:
# NOTE(review): with convert_units=False the values presumably stay in the
# source units listed in `property_map` — confirm against the colabfit docs.
small_dataset.parse_data(convert_units=False, verbose=True)

Parsing data: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6095/6095 [00:02<00:00, 2467.38it/s]


In [16]:
# Take the first configuration of the small dataset as the reference molecule
# for the comparison against the full dataset below.
atoms = small_dataset.configurations[0]

In [17]:
# Look through the full dataset for entries that share the reference
# molecule's SMILES string, or whose coordinates agree with it to within 1e-2.
for ii, atoms2 in enumerate(dataset.configurations):

    if atoms.info['SMILES'] == atoms2.info['SMILES']:
        print('SMILES from', atoms.info['name'], 'matches', atoms2.info['name'])

    # Coordinates can only be compared element-wise when the atom counts agree
    if atoms.positions.shape != atoms2.positions.shape:
        continue

    if np.abs(atoms.positions - atoms2.positions).max() < 1e-2:
        print('Positions from', atoms.info['name'], 'match', atoms2.info['name'])

SMILES from dsC7O2H10nsd_1430 matches dsgdb9nsd_062738


In [18]:
# Calculation settings for the small dataset (G4MP2), keyed by a regex.
# NOTE(review): the '.*' key presumably applies these settings to every
# entry — confirm what the regex is matched against.
small_dataset.property_settings_regexes = {
    '.*':
        PropertySettings(
            method='G4MP2',
            description='QM9 property settings calculation',
            files=[],
            labels=['G4MP2'],
        )
}

In [19]:
# NOTE(review): presumably refreshes the dataset's internal state after the
# attribute assignments above — confirm against the colabfit docs.
small_dataset.resync()

# Exploration

Results of running command:

`dataset.plot_histograms(['a', 'b', 'c', 'mu', 'alpha', 'homo', 'lumo', 'r2', 'zpve', 'u0', 'u', 'h', 'g', 'cv'])`

<img src="./qm9_histograms.png" width="800" height="800"/>

In [20]:
# Summary statistics of the three rotational constants, one dict per line
print(
    *(dataset.get_statistics(constant) for constant in ('a', 'b', 'c')),
    sep='\n'
)

{'average': 9.814382088508795, 'std': 1809.4589082320583, 'min': 0.0, 'max': 619867.68314, 'average_abs': 9.814382088508795}
{'average': 1.4060972645920007, 'std': 1.5837889998648809, 'min': 0.33712, 'max': 437.90386, 'average_abs': 1.4060972645920007}
{'average': 1.1249210272988013, 'std': 1.0956136904779636, 'min': 0.33118, 'max': 282.94545, 'average_abs': 1.1249210272988013}


In [21]:
# Build a filtered copy that drops the extreme rotational-constant outliers
# visible in the statistics above: the predicate is True only for entries
# with a < 20 and b < 10 (compared on each property's 'source-value').
clean = dataset.filter(
    'data',
    lambda p: (p['a']['source-value'] < 20) and (p['b']['source-value'] < 10),
    verbose=True
)

Filtering data: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 133885/133885 [00:02<00:00, 58895.64it/s]


Results of running command:

`clean.plot_histograms(['a', 'b', 'c', 'mu', 'alpha', 'homo', 'lumo', 'r2', 'zpve', 'u0', 'u', 'h', 'g', 'cv'])`

<img src="./qm9_clean_histograms.png" width="800" height="800"/>

In [22]:
# Same rotational-constant statistics, now on the filtered dataset
print(
    *(clean.get_statistics(constant) for constant in ('a', 'b', 'c')),
    sep='\n'
)

{'average': 3.407053427070018, 'std': 1.3368223663235594, 'min': 0.0, 'max': 19.99697, 'average_abs': 3.407053427070018}
{'average': 1.3966863945821093, 'std': 0.4581379707257539, 'min': 0.33712, 'max': 9.93509, 'average_abs': 1.3966863945821093}
{'average': 1.1177706236464615, 'std': 0.3287984573560261, 'min': 0.33118, 'max': 6.46247, 'average_abs': 1.1177706236464615}
