In [None]:
# https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/molnet_publish/qm9.zip
# https://ndownloader.figshare.com/files/3195404
# https://www.dropbox.com/s/mtgh331m6k6gbw2/qm9-smi.zip?dl=1

In [1]:
import tarfile
import hashlib
import random
import os
import numpy as np
import h5py


def get_MD5(file_path, chunk_size=4096):
    """Return the hexadecimal MD5 digest of a file's contents.

    The file is opened in binary mode and consumed in fixed-size chunks, so
    the whole file never has to be held in memory at once.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes requested per read. Must be a positive
            integer; defaults to 4096.

    Returns:
        The MD5 digest as a hexadecimal string.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
        OSError: If the file cannot be opened or read.
    """
    if chunk_size <= 0:
        # read(0) returns b"" immediately, which would end the loop at once
        # and silently report the digest of an empty file.
        raise ValueError('chunk_size must be a positive integer')
    hash_md5 = hashlib.md5()
    with open(file_path, 'rb') as f:
        # iter(callable, sentinel) keeps reading until read() returns b"" (EOF).
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

def create_tar(input_file, output_file):
    """Pack ``input_file`` into a gzip-compressed tar archive.

    The entry is stored under the base name of ``input_file`` only, so the
    directory part of the path does not appear inside the archive.

    Args:
        input_file: Path of the file to add to the archive.
        output_file: Path of the archive to create; it is opened in 'w:gz'
            mode.
    """
    with tarfile.open(output_file, mode='w:gz') as archive:
        member_name = os.path.basename(input_file)
        archive.add(input_file, arcname=member_name)

# Unit-conversion factors. Going by the names these are Hartree -> eV and
# kcal/mol -> eV (NOTE(review): confirm the reference values used).
HAR2EV = 27.211386246
KCALMOL2EV = 0.04336414

# Seed for the shuffle and the fractions used to size the splits.
seed = 1234
train, val, test = 0.7, 0.15, 0.15

# One multiplicative factor per target column, 19 in total. A factor of 1.
# leaves the column unchanged.
conversion = np.array(
    [1.] * 2
    + [HAR2EV] * 3
    + [1.]
    + [HAR2EV] * 5
    + [1.]
    + [KCALMOL2EV] * 4
    + [1.] * 3
)

# Targets: fields 1..19 of every selected line of gdb9.sdf.csv, parsed as floats.
# The [1:-1] slice skips the first line and the last split element
# (presumably a header row and the empty string left by a trailing newline
# -- TODO confirm against the file).
with open('gdb9.sdf.csv', 'r') as f:
    target = [list(map(float, row.split(',')[1:20]))
              for row in f.read().split('\n')[1:-1]]
y = np.array(target, dtype=float)
# Move the first three target columns to the end, then scale each column by
# its factor; `conversion` is therefore expressed in the rotated column order.
y = np.concatenate((y[:, 3:], y[:, :3]), axis=1)
y = y * conversion[np.newaxis, :]

# Row indices to drop: the first whitespace-separated token of each selected
# line, minus one. The [9:-2] slice skips the first nine lines and the last
# two split elements (NOTE(review): assumes a fixed header/footer layout in
# uncharacterized.txt -- confirm).
with open('uncharacterized.txt', 'r') as f:
    skip = [int(tokens[0]) - 1
            for tokens in map(str.split, f.read().split('\n')[9:-2])]

# "GDB_SMILES" (Col. number: 20), "B3LYP_SMILES" (Col. number: 21)
# Field index 21 of each comma-split line is used below
# (NOTE(review): confirm whether the column numbers above are zero-based).
with open('smi_qm9.csv', 'r') as f:
    lines = f.read().split('\n')[1:-1]
smiles = [fields[21] for fields in (row.split(',') for row in lines)]

# Remove the rows listed in `skip` from both arrays so SMILES strings and
# targets stay row-aligned.
smiles = np.delete(smiles, skip)
y = np.delete(y, skip, axis=0)

# Shuffle with a fixed seed so the split is reproducible. `seed` is the value
# set with the other configuration constants; it is intentionally not
# re-assigned here so that there is a single place to change it.
np.random.seed(seed)
idxs = np.random.permutation(len(smiles))

# Apply the same permutation to both arrays to keep them aligned.
smiles = smiles[idxs]
y = y[idxs]

# Split boundaries: the first `train` fraction, then the next `val` fraction;
# whatever remains becomes the test split.
num_lines = len(smiles)
train_num = int(num_lines * train)
val_num = train_num + int(num_lines * val)

train_smiles = smiles[:train_num]
val_smiles = smiles[train_num:val_num]
test_smiles = smiles[val_num:]

train_y = y[:train_num]
val_y = y[train_num:val_num]
test_y = y[val_num:]

# Write each split to its own HDF5 file holding two datasets:
# 'SMILES' (the strings, passed as a plain Python list) and 'y' (the targets).
for split_file, split_smiles, split_y in (
        ('train.HDF5', train_smiles, train_y),
        ('val.HDF5', val_smiles, val_y),
        ('test.HDF5', test_smiles, test_y)):
    with h5py.File(split_file, 'w') as hf:
        hf.create_dataset('SMILES', data=split_smiles.tolist())
        hf.create_dataset('y', data=split_y)

# Once all files exist, print each file's MD5 and pack it into
# '<name>.tar.gz' next to the original.
for input_file in ['train.HDF5', 'val.HDF5', 'test.HDF5']:
    label = '{} [MD5]:'.format(input_file)
    print(label, get_MD5(input_file))
    create_tar(input_file, '{}.tar.gz'.format(input_file))

train.HDF5 [MD5]: 006ec4a6d0d05f6f6c13273d1a528167
val.HDF5 [MD5]: ebf75b4958a1ac28edf539669093afce
test.HDF5 [MD5]: 16c8cf41f417a843066617162434f0af
