# Validate Datasets
Read the JSON-format datasets to test for inconsistencies and errors

In [1]:
from hydronet.data import graph_from_dict, atoms_from_dict
from ttm.ase import TTMCalculator
from glob import glob
from tqdm import tqdm
import networkx as nx
import gzip
import json

Configuration

In [2]:
# Upper bound on the number of entries checked in each dataset file
audit_size = 100_000

## Test the Geometry Dataset
Ensure that the energy evaluated based on the structure is within $10^{-4}$ kcal/mol of the value reported in the record

In [3]:
# Single calculator instance, reused to re-evaluate the energy of every audited structure
calc = TTMCalculator()

In [4]:
# Collect the geometry files to audit.
#  glob() returns matches in arbitrary order, so sort them to keep the audit order reproducible
files = sorted(glob('./data/output/geom*json.gz'))

# Fail loudly if nothing matched (e.g., wrong working directory) instead of auditing nothing
assert files, "No geometry files matched './data/output/geom*json.gz'"

In [5]:
# Parse the first entry of the first geometry file
first_file = files[0]
with gzip.open(first_file, 'rb') as handle:
    record = json.loads(handle.readline())

Make sure the energies are the same when evaluated with TTM

In [6]:
# Re-evaluate the energy of each audited structure and compare it with the stored value
for file in files:
    with gzip.open(file) as fp:
        # Pairing the lines with a range caps the audit at `audit_size` entries per file
        for line, index in tqdm(zip(fp, range(audit_size)), desc=file, total=audit_size):
            record = json.loads(line)
            atoms = atoms_from_dict(record)
            energy_diff = abs(calc.get_potential_energy(atoms) - record['energy'])

            # Name the file and the (0-based) entry so a failure can be located without a re-run
            assert energy_diff < 1e-4, (
                f"Energy difference too large: {energy_diff:.3e} kcal/mol "
                f"({file}, entry {index})"
            )

./data/output/geom_valid.json.gz: 100%|██████████| 100000/100000 [19:16<00:00, 86.44it/s]
./data/output/geom_test.json.gz: 100%|██████████| 100000/100000 [19:33<00:00, 85.24it/s] 
./data/output/geom_train.json.gz: 100%|██████████| 100000/100000 [19:19<00:00, 86.21it/s]


## Test the Atomic Networks
We will check that the networks parsed correctly by checking the network for:
- The number of waters and bonds matches up with the reported network size
- Every oxygen has exactly 2 covalent bonds to two hydrogens and no more than 4 hydrogen bonds (allowing for atypical structures)
- Every hydrogen has exactly 1 covalent bond to an oxygen and no more than 1 hydrogen bond

In [7]:
# Collect the atomic-network files to audit.
#  glob() returns matches in arbitrary order, so sort them to keep the audit order reproducible
files = sorted(glob('./data/output/atomic*json.gz'))

# An empty list would make the audit loop pass without checking anything
assert files, "No atomic network files matched './data/output/atomic*json.gz'"

Check network properties

In [8]:
# Audit the size and bonding pattern of up to `audit_size` atomic graphs from each file
for file in files:
    with gzip.open(file) as fp:
        for line, _ in tqdm(zip(fp, range(audit_size)), desc=file, total=audit_size):
            # Load the graph data
            record = json.loads(line)
            graph = graph_from_dict(record)
            node_attrs = nx.get_node_attributes(graph, 'label')

            # Check network size
            #  Compare against exactly 3 nodes per water: floor division would also
            #  accept node counts that are not a multiple of 3
            assert len(graph) == 3 * record['n_waters'], 'Node count mismatch'
            assert record['n_bonds'] == graph.number_of_edges() * 2, 'Edge count mismatch'

            # Check the bonding properties
            for node, data in graph.nodes(data=True):
                # Get edges by type
                neighbors = graph[node]
                covalents = [i for i in neighbors if neighbors[i]['label'] == 'covalent']
                hydrogens = [i for i in neighbors if neighbors[i]['label'] == 'hydrogen']
                assert len(covalents) + len(hydrogens) == len(neighbors), "Unexplained bonds"

                if data['label'] == 'oxygen':
                    # Check covalent bonds
                    assert len(covalents) == 2, "Incorrect number of covalent bonds"
                    assert all(node_attrs[i] == 'hydrogen' for i in covalents), "Covalent bonds to non-hydrogens"

                    # Check hydrogen bonds
                    assert len(hydrogens) <= 4, "Way too many hydrogen bonds"
                    assert all(node_attrs[i] == 'hydrogen' for i in hydrogens), "Hydrogen bonds to non-hydrogens"
                else:
                    # Any node not labelled 'oxygen' is held to the hydrogen rules
                    # Check covalent bonds
                    assert len(covalents) == 1, "Incorrect number of covalent bonds"
                    assert all(node_attrs[i] == 'oxygen' for i in covalents), "Covalent bonds to a non-oxygen"

                    # Check hydrogen bonds
                    assert len(hydrogens) <= 1, "Way too many hydrogen bonds"
                    assert all(node_attrs[i] == 'oxygen' for i in hydrogens), "Hydrogen bonds to non-oxygen"

./data/output/atomic_valid.json.gz: 100%|██████████| 100000/100000 [01:52<00:00, 886.74it/s]
./data/output/atomic_train.json.gz: 100%|██████████| 100000/100000 [01:52<00:00, 889.05it/s]
./data/output/atomic_test.json.gz: 100%|██████████| 100000/100000 [01:52<00:00, 885.98it/s]


## Test the Coarse Networks
We will check that the networks parsed correctly by checking the network for:
- The number of waters and bonds matches up with the reported network size
- Every oxygen donates no more than 2 hydrogen bonds
- Every oxygen accepts no more than 4 hydrogen bonds
- All donations are paired with an accepting bond

In [9]:
# Collect the coarse-network files to audit.
#  glob() returns matches in arbitrary order, so sort them to keep the audit order reproducible
files = sorted(glob('./data/output/coarse*json.gz'))

# An empty list would make the audit loop pass without checking anything
assert files, "No coarse network files matched './data/output/coarse*json.gz'"

Check network properties

In [10]:
# Audit the size and donate/accept pattern of up to `audit_size` coarse graphs from each file
for file in files:
    with gzip.open(file) as fp:
        for line, _ in tqdm(zip(fp, range(audit_size)), desc=file, total=audit_size):
            # Load the graph data
            record = json.loads(line)
            graph = graph_from_dict(record)
            node_attrs = nx.get_node_attributes(graph, 'label')
            assert all(i == 'oxygen' for i in node_attrs.values()), "Non-oxygen node in coarse network"

            # Check network size
            assert record['n_waters'] == len(graph), 'Node count mismatch'
            assert record['n_bonds'] == graph.number_of_edges(), 'Edge count mismatch'

            # Check the bonding properties
            for node in graph.nodes():
                # Get edges by type
                neighbors = graph[node]
                donate = [i for i in neighbors if neighbors[i]['label'] == 'donate']
                accept = [i for i in neighbors if neighbors[i]['label'] == 'accept']
                assert len(donate) + len(accept) == len(neighbors), "Unexplained bonds"

                # Make sure there are no more than 2 donated and 4 accepted bonds
                assert len(donate) <= 2, "Too many donated bonds"
                assert len(accept) <= 4, "Way too many accepted bonds"

                # Make sure each donate is paired with an accept
                #  A missing reverse edge is reported as non-reciprocal bonding
                #  rather than surfacing as a KeyError from the edge lookup
                assert all(
                    graph.has_edge(i, node) and graph[i][node]['label'] == 'accept' for i in donate
                ), "Non-reciprocal bonding"
                assert all(
                    graph.has_edge(i, node) and graph[i][node]['label'] == 'donate' for i in accept
                ), "Non-reciprocal bonding"

./data/output/coarse_valid.json.gz: 100%|██████████| 100000/100000 [00:57<00:00, 1736.46it/s]
./data/output/coarse_train.json.gz: 100%|██████████| 100000/100000 [00:59<00:00, 1687.41it/s]
./data/output/coarse_test.json.gz: 100%|██████████| 100000/100000 [00:57<00:00, 1744.56it/s]
