In [1]:
from rdkit import Chem
from rdkit import RDLogger
import numpy as np
import augchem

# Disable RDKit log output for every logger matching 'rdApp.*'.
RDLogger.DisableLog('rdApp.*')

['NC1=CN[M][M]N1C1CC1', 'NC1=CN=[M]N1C1C[M]', '[M]C1=CN=[M]N1C1CC1', 'NC1=[M]N=NN1C1[M]C1', 'N[M]=C[M]=NN1C1CC1', 'N[M]=CN=NN1C1C[M]', 'NC1[M]CN=NN1C1C[M]', 'NC1=CN=NN1[M][M]C1', 'NC1=C[M]=NN1C1[M]C1', 'NC1=C[M][M]NN1C1CC1']


In [2]:
# Build the augchem loader for 'QM9' and read the dataset into a NumPy array.
# NOTE(review): the meaning of the empty list passed to loadDataset is not
# visible in this notebook -- confirm against the augchem documentation.
loader = augchem.Loader('QM9')
data = np.array(loader.loadDataset('QM9', []))

# Print the loaded array for inspection.
print(data)

[[['C' 'C']]

 [['N' 'N']]

 [['O' 'O']]

 [['C#C' 'C#C']]

 [['C#N' 'C#N']]

 [['C=O' 'C=O']]

 [['CC' 'CC']]

 [['CO' 'CO']]

 [['CC#C' 'CC#C']]

 [['CC#N' 'CC#N']]

 [['CC=O' 'CC=O']]

 [['NC=O' 'NC=O']]

 [['CCC' 'CCC']]

 [['CCO' 'CCO']]

 [['COC' 'COC']]

 [['C1CC1' 'C1CC1']]

 [['C1CO1' 'C1CO1']]

 [['CC(C)=O' 'CC(=O)C']]

 [['CC(N)=O' 'CC(=O)N']]

 [['NC(N)=O' 'NC(=O)N']]

 [['CC(C)C' 'CC(C)C']]

 [['CC(C)O' 'CC(C)O']]

 [['C#CC#C' 'C(#C)C#C']]

 [['C#CC#N' 'C(#C)C#N']]

 [['N#CC#N' 'N#CC#N']]

 [['O=CC#C' 'O=CC#C']]

 [['O=CC#N' 'O=CC#N']]

 [['O=CC=O' 'O=CC=O']]

 [['CC#CC' 'CC#CC']]

 [['CCC#C' 'CCC#C']]

 [['CCC#N' 'CCC#N']]

 [['NCC#N' 'NCC#N']]

 [['OCC#C' 'OCC#C']]

 [['OCC#N' 'OCC#N']]

 [['CCC=O' 'CCC=O']]

 [['CNC=O' 'CNC=O']]

 [['COC=O' 'COC=O']]

 [['OCC=O' 'OCC=O']]

 [['CCCC' 'CCCC']]

 [['CCCO' 'CCCO']]

 [['CCOC' 'CCOC']]

 [['OCCO' 'OCCO']]

 [['CC1CC1' 'CC1CC1']]

 [['CC1CO1' 'C[C@H]1CO1']]

 [['CN1CC1' 'CN1CC1']]

 [['OC1CC1' 'OC1CC1']]

 [['C1CCC1' 'C1CCC1'

In [3]:
# Run the loader's molecule verification on the loaded data; it yields two
# collections, unpacked here as the valid and the invalid molecules.
valid_mols, invalid_mols = loader.verifyMolecules(data)

# Report how many entries ended up in each collection.
for _label, _mols in (('Valid molecules:', valid_mols),
                      ('Invalid molecules:', invalid_mols)):
    print(_label, len(_mols))

Valid molecules: 150
Invalid molecules: 0


In [4]:
# Accumulator for the augmented entries collected by the augmentation loop.
augment_set = list()

# Augmentator created with a fixed seed (432).
aug = augchem.Augmentator(seed=432)

In [5]:

# Augment the dataset.
#
# Every SMILES string found in `data` goes through four augmentations, always
# in the same order: mask, delete, swap, enumerateSmiles. A result is kept only
# when it is not already in `valid_mols` and has not been collected before.

# Start from an empty list so that re-running this cell does not keep
# accumulating results left over from a previous run.
augment_set = []

# (bound method, keyword arguments) pairs, applied in this order to each SMILES.
# NOTE(review): `aug` was built with a fixed seed, so the original call order
# is kept exactly in case the results depend on it.
augmentations = (
    (aug.mask, {'mask_ratio': 0.3}),
    (aug.delete, {'delete_ratio': 0.3}),
    (aug.swap, {}),
    (aug.enumerateSmiles, {}),
)

for item in data:
    for unit in item:
        for smiles in unit:
            for augment, kwargs in augmentations:
                mol = augment(smiles, **kwargs)

                # Keep only results that are new with respect to both the
                # verified molecules and what has been collected so far.
                if mol not in valid_mols and mol not in augment_set:
                    augment_set.append(mol)

# First line: number of collected entries plus len(data).
# Second line: the collected entries themselves.
print('Augmented dataset:', len(augment_set) + len(data))
print('Augmented dataset:', augment_set)


Augmented dataset: 1215
Augmented dataset: [['C'], ['N'], ['O'], ['C#C'], ['C#N'], ['C#N', 'N#C'], ['N#C', 'C#N'], ['C=O'], ['O=C', 'C=O'], ['CC'], ['CO'], ['OC', 'CO'], ['CC[M]C', 'CC#[M]', 'C[M]#C', '[M]C#C'], ['CCC', 'C#C', 'CC#'], ['CC#C'], ['CC#C', 'C#CC', 'C(C)#C', 'C(#C)C'], ['CC[M]C', 'C[M]#C', '[M]C#C'], ['C#C', 'CC#'], ['CC[M]N', 'CC#[M]', 'C[M]#N', '[M]C#N'], ['CCN', 'CC#', 'C#N'], ['NC#C', 'CC#N'], ['C(#N)C', 'C(C)#N', 'N#CC', 'CC#N'], ['CC[M]N', 'CC#[M]', '[M]C#N'], ['NC#C', 'CN#C', 'CC#N'], ['CC[M]O', 'CC=[M]', 'C[M]=O', '[M]C=O'], ['C=O', 'CCO'], ['OC=C', 'CO=C', 'CC=O'], ['C(=O)C', 'O=CC', 'C(C)=O', 'CC=O'], ['CC[M]O', 'C[M]=O'], ['C(C)=O', 'O=CC', 'C(=O)C'], ['NC[M]O', 'N[M]=O', '[M]C=O', 'NC=[M]'], ['NCO', 'C=O'], ['CO=N', 'CN=O', 'NC=O'], ['C(=O)N', 'NC=O', 'O=CN', 'C(N)=O'], ['[M]C=O', 'NC=[M]'], ['NCO', 'C=O', 'NC='], ['CO=N', 'CN=O', 'NC=O', 'OC=N'], ['C(=O)N', 'NC=O', 'O=CN'], ['CCC'], ['C(C)C', 'CCC'], ['CCO'], ['COC'], ['C(C)O', 'C(O)C', 'OCC', 'CCO'], ['OCC', 

# Possíveis problemas:
## `slice_smiles`:
- Para certas moléculas, a função apresenta comportamento instável, principalmente para SMILES com letras minúsculas (átomos aromáticos), como `Nc1cnnn1C1CC1`.
### Erro parcialmente solucionado: para a string citada, ainda não consigo chegar a um resultado correto — em vez de separar em `N`, `c1`, a função separa em `Nc1`.


In [7]:
# Show how slice_smiles splits the SMILES string discussed in the notes above
# (per those notes, the expected split starts with 'N', 'c1' rather than 'Nc1').
print(aug.slice_smiles('Nc1cnnn1C1CC1'))

['Nc1', 'c', 'n', 'n', 'n1', 'C1', 'C', 'C1']
