# Pull Test Datasets
Create training and test datasets from the initial datasets.

In [1]:
from sklearn.model_selection import train_test_split
from examol.simulate.initialize import add_initial_conformer
from examol.store.models import MoleculeRecord
from pathlib import Path
from hashlib import md5
from tqdm import tqdm
import numpy as np
import json
import gzip

Configuration

In [2]:
# Name of the source dataset; selects the input file and the output directory
dataset_name: str = 'mdf-mos'

# Property to select, and the level at which it must be available for a record to be kept
target_prop: str = 'oxidation_potential'
# NOTE(review): this level name is not among the levels listed in the saved output of the
#  cell that prints the available levels (which shows `cp2k_b3lyp_svp-acn-adiabatic`) — confirm
level: str = 'cp2k_b3lyp_s-acn-adiabatic'

Defining where to look

In [3]:
# Input: gzipped, line-delimited JSON for the chosen dataset
data_path = Path('../2_initial-data/datasets') / f'{dataset_name}.json.gz'

# Output: one directory per (dataset, property, level) combination
out_path = Path('datasets') / dataset_name / f'{target_prop}-{level}'

## Load the Data
Load only records with the target level of fidelity into memory

In [4]:
# Raw JSON lines for each split; lines are kept serialized so they can be written back verbatim
train_records, test_records = [], []
hasher = md5()  # Fingerprint of every line in the source file, not only the matching records
rng = np.random.RandomState(1)  # Fixed seed so the split is reproducible
all_levels = set()  # Every level at which `target_prop` appears anywhere in the dataset
with gzip.open(data_path, 'rt') as fp:
    for line in tqdm(fp):
        # Update the hash with this record, whether or not it is kept
        hasher.update(line.encode())

        # Draw the split before filtering so that one random number is consumed per line
        #  regardless of whether the record matches; draws above 0.9 go to the test set
        in_test = rng.random() > 0.9

        # Track which levels are available for the target property
        record = json.loads(line)
        prop_levels = record['properties'].get(target_prop, {})
        all_levels.update(prop_levels.keys())

        # Skip records that lack the target property at the requested level
        if level not in prop_levels:
            continue

        if in_test:
            test_records.append(line)
        else:
            train_records.append(line)
data_hash = hasher.hexdigest()
print(f'Loaded {len(test_records) + len(train_records)} matching records. Data Hash: {data_hash}')

1115110it [01:01, 18008.22it/s]

Loaded 93 matching records. Data Hash: b499f5d21c60b5930ec8b9a780050c8a





In [5]:
# Summarize every level at which the target property was found in the dataset
level_list = ", ".join(sorted(all_levels))
print(f'{target_prop} available at levels: {level_list}')

oxidation_potential available at levels: cp2k_b3lyp_svp-acn-adiabatic, cp2k_b3lyp_svp-acn-vertical, cp2k_b3lyp_svp-adiabatic, cp2k_b3lyp_svp-vertical, cp2k_b3lyp_tzvpd-acn-adiabatic, cp2k_b3lyp_tzvpd-acn-vertical, cp2k_b3lyp_tzvpd-adiabatic, cp2k_b3lyp_tzvpd-vertical, cp2k_wb97x-d3_tzvpd-acn-adiabatic, cp2k_wb97x-d3_tzvpd-acn-vertical, cp2k_wb97x-d3_tzvpd-adiabatic, cp2k_wb97x-d3_tzvpd-vertical, mopac_pm7-acn-adiabatic, mopac_pm7-acn-vertical, mopac_pm7-adiabatic, mopac_pm7-vertical, xtb-acn-adiabatic, xtb-acn-vertical, xtb-adiabatic, xtb-vertical


## Split then save to disk
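Record the hash of the source data, then save the matching records to disk. A notice is printed if the source data is unchanged since the last write, but the files are rewritten either way.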

In [6]:
# Make sure the output directory (and any missing parents) exists; no error if it already does
out_path.mkdir(exist_ok=True, parents=True)

In [7]:
# Compare the current hash against the one stored by a previous run, if there is one
md5_path = out_path / 'dataset.md5'
previous_hash = md5_path.read_text() if md5_path.is_file() else None
if previous_hash == data_hash:
    print('FYI: Nothing has changed since we wrote this data last')

# The hash file is (re)written unconditionally; the trailing `;` hides the return value
md5_path.write_text(data_hash);

Save each to disk

In [8]:
# Write each split to its own gzipped file, one stored line per record
splits = {'train': train_records, 'test': test_records}
for label, records in splits.items():
    split_path = out_path / f'{label}.json.gz'
    with gzip.open(split_path, 'wt') as fp:
        # Lines are written exactly as they were read from the source file
        fp.writelines(tqdm(records, desc=label))

train: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 82/82 [00:00<00:00, 128.00it/s]
test: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 129.60it/s]
