In [1]:
from rdkit import Chem
from rdkit import RDLogger
import numpy as np
import augchem

# Turn off the RDKit log channels matching 'rdApp.*' for the rest of the notebook.
RDLogger.DisableLog('rdApp.*')

In [2]:
# Build the augchem loader for 'QM9' and keep the loaded dataset as a NumPy array.
loader = augchem.Loader('QM9')
data = np.array(loader.loadDataset('QM9', []))

print(data)

[[['C' 'C']]

 [['N' 'N']]

 [['O' 'O']]

 [['C#C' 'C#C']]

 [['C#N' 'C#N']]

 [['C=O' 'C=O']]

 [['CC' 'CC']]

 [['CO' 'CO']]

 [['CC#C' 'CC#C']]

 [['CC#N' 'CC#N']]

 [['CC=O' 'CC=O']]

 [['NC=O' 'NC=O']]

 [['CCC' 'CCC']]

 [['CCO' 'CCO']]

 [['COC' 'COC']]

 [['C1CC1' 'C1CC1']]

 [['C1CO1' 'C1CO1']]

 [['CC(C)=O' 'CC(=O)C']]

 [['CC(N)=O' 'CC(=O)N']]

 [['NC(N)=O' 'NC(=O)N']]

 [['CC(C)C' 'CC(C)C']]

 [['CC(C)O' 'CC(C)O']]

 [['C#CC#C' 'C(#C)C#C']]

 [['C#CC#N' 'C(#C)C#N']]

 [['N#CC#N' 'N#CC#N']]

 [['O=CC#C' 'O=CC#C']]

 [['O=CC#N' 'O=CC#N']]

 [['O=CC=O' 'O=CC=O']]

 [['CC#CC' 'CC#CC']]

 [['CCC#C' 'CCC#C']]

 [['CCC#N' 'CCC#N']]

 [['NCC#N' 'NCC#N']]

 [['OCC#C' 'OCC#C']]

 [['OCC#N' 'OCC#N']]

 [['CCC=O' 'CCC=O']]

 [['CNC=O' 'CNC=O']]

 [['COC=O' 'COC=O']]

 [['OCC=O' 'OCC=O']]

 [['CCCC' 'CCCC']]

 [['CCCO' 'CCCO']]

 [['CCOC' 'CCOC']]

 [['OCCO' 'OCCO']]

 [['CC1CC1' 'CC1CC1']]

 [['CC1CO1' 'C[C@H]1CO1']]

 [['CN1CC1' 'CN1CC1']]

 [['OC1CC1' 'OC1CC1']]

 [['C1CCC1' 'C1CCC1'

In [3]:
# Split the loaded entries into the two collections returned by the loader
# and report how many entries each one holds.
valid_mols, invalid_mols = loader.verifyMolecules(data)
for label, molecules in (
    ('Valid molecules:', valid_mols),
    ('Invalid molecules:', invalid_mols),
):
    print(label, len(molecules))

Valid molecules: 150
Invalid molecules: 0


In [4]:
# Augmentation object, constructed with a fixed seed (432).
aug = augchem.Augmentator(seed=432)

# Accumulator for the augmented results gathered by the augmentation cell.
augment_set = list()

In [5]:

# Augment the dataset.
#
# Every SMILES string found in `data` goes through the same four augmentation
# operations, always in the order mask -> delete -> swap -> enumerateSmiles,
# so the calls made on the seeded `aug` object happen in a fixed sequence.
# A result is kept only if it is neither in `valid_mols` nor already collected.

# Start from an empty collection so that re-running this cell does not keep
# appending to results left over from a previous execution.
augment_set = []

# (operation, keyword arguments) pairs, applied in this order to each SMILES.
augment_ops = (
    (aug.mask, {'mask_ratio': 0.3}),
    (aug.delete, {'delete_ratio': 0.3}),
    (aug.swap, {}),
    (aug.enumerateSmiles, {}),
)

for item in data:
    for unit in item:
        for smiles in unit:
            for augment, kwargs in augment_ops:
                mol = augment(smiles, **kwargs)

                # Skip anything already verified or already collected.
                if mol not in valid_mols and mol not in augment_set:
                    augment_set.append(mol)

print('Augmented dataset:', len(augment_set) + len(data))
print('Augmented dataset:', augment_set)


Augmented dataset: 1207
Augmented dataset: [['C'], ['N'], ['O'], ['C#C'], ['C#N'], ['N#C', 'C#N'], ['C=O'], ['C=O', 'O=C'], ['O=C', 'C=O'], ['CC'], ['CO'], ['OC', 'CO'], ['C[M]#C', 'CC#[M]', 'CC[M]C', '[M]C#C'], ['CCC', 'CC#', 'C#C'], ['CC#C'], ['C(#C)C', 'CC#C', 'C(C)#C', 'C#CC'], ['C[M]#C', 'CC[M]C', '[M]C#C'], ['CC#', 'C#C'], ['CC[M]N', 'CC#[M]', 'C[M]#N', '[M]C#N'], ['C#N', 'CC#', 'CCN'], ['CN#C', 'NC#C'], ['N#CC', 'CC#N', 'C(#N)C', 'C(C)#N'], ['CC[M]N', 'CC#[M]', '[M]C#N'], ['CC#N'], ['[M]C=O', 'CC[M]O', 'CC=[M]', 'C[M]=O'], ['C=O', 'CCO'], ['CO=C', 'CC=O', 'OC=C'], ['O=CC', 'CC=O', 'C(C)=O', 'C(=O)C'], ['CC[M]O', 'C[M]=O'], ['CO=C', 'CC=O'], ['O=CC', 'C(C)=O', 'C(=O)C'], ['[M]C=O', 'NC[M]O', 'N[M]=O', 'NC=[M]'], ['C=O', 'NCO'], ['CN=O', 'ON=C', 'OC=N', 'NC=O'], ['C(N)=O', 'O=CN', 'NC=O', 'C(=O)N'], ['[M]C=O', 'NC=[M]'], ['C=O', 'NC=', 'NCO'], ['ON=C', 'OC=N'], ['O=CN', 'NC=O', 'C(=O)N'], ['CCC'], ['CCC', 'C(C)C'], ['CCO'], ['COC', 'OCC', 'CCO'], ['C(O)C', 'OCC', 'CCO', 'C(C)O'], 

# Dúvida acerca da função:
- Para a string `N[C]1C(=C([NH])ON=C1)O`, a tokenização gerada foi `['N', '[C]1', 'C', '(=C([NH])ON=C1)', 'O']`. Tenho dúvida se essa tokenização está correta ou se seria necessário tokenizar também o conteúdo dentro dos parênteses, isto é, `(=C([NH])ON=C1)`.