# Pull Test Datasets
Create training and test datasets from the initial datasets.

In [1]:
from sklearn.model_selection import train_test_split
from pathlib import Path
from hashlib import md5
from tqdm import tqdm
import json
import gzip

Configuration

In [2]:
# Name of the dataset; selects `../2_initial-data/datasets/<dataset_name>.json.gz` as the input
dataset_name: str = 'mdf-mos'
# Key looked up under each record's `properties` dictionary
target_prop: str = 'reduction_potential'
# Key that must be present under the target property for a record to be kept
level: str = 'mopac_pm7-acn-adiabatic'

Defining where to look

In [3]:
# Input: gzip-compressed file in the `2_initial-data` directory, read one record per line
data_path = Path(f'../2_initial-data/datasets/{dataset_name}.json.gz')
# Output directory, specific to this dataset / property / level combination
out_path = Path(f'datasets/{dataset_name}/{target_prop}-{level}/')

## Load the Data
Load only records with the target level of fidelity into memory

In [4]:
all_records = []  # Raw (unparsed) text lines of every matching record
hasher = md5()  # Running fingerprint of the matching records, in file order
with gzip.open(data_path, 'rt') as fp:
    for line in tqdm(fp):
        # Skip the record unless it has the target property at the requested level
        record = json.loads(line)
        if level not in record['properties'].get(target_prop, {}):
            continue

        # Update the hash and list with this record.
        # The original text line is kept (not the parsed dict) so it can be written back verbatim.
        hasher.update(line.encode())
        all_records.append(line)
data_hash = hasher.hexdigest()
print(f'Loaded {len(all_records)} matching records. Hash: {data_hash}')

1115110it [00:31, 35706.54it/s]

Loaded 150585 matching records. Hash: b29db032ead2c10d02d23947a5f452c8





## Split then save to disk
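Record a hash of the matching records (noting whether they changed since the last run), then split them into training and test sets and save both to disk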

In [5]:
# Create the output directory and any missing parents; do nothing if it already exists
out_path.mkdir(parents=True, exist_ok=True)

In [6]:
# Compare the fingerprint stored by a previous run (if any) with the current one
md5_path = out_path.joinpath('dataset.md5')
previous_hash = md5_path.read_text() if md5_path.is_file() else None
if previous_hash == data_hash:
    print('FYI: Nothing has changed since we wrote this data last')
# The current fingerprint is written in every case; the trailing `;` hides the return value
md5_path.write_text(data_hash);

In [7]:
# No `test_size` is passed, so scikit-learn's default split fraction applies;
# the fixed `random_state` makes the shuffle repeatable from run to run
train_records, test_records = train_test_split(
    all_records,
    shuffle=True,
    random_state=1,
)
print(f'Split off {len(train_records)} for a training set')

Split off 112938 for a training set


Save each split to disk

In [8]:
# Write one gzip-compressed file per split, named after the split's label
for label, records in (('train', train_records), ('test', test_records)):
    split_file = out_path / f'{label}.json.gz'
    with gzip.open(split_file, 'wt') as fp:
        # Lines are written exactly as they were read, including their line endings
        for line in tqdm(records):
            fp.write(line)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 112938/112938 [06:07<00:00, 306.95it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 37647/37647 [02:01<00:00, 310.27it/s]
