In [1]:
# Load packages
import re

import numpy as np
import pandas as pd

# From SMILES to tokens
def smi_tokenizer(smi):
    """
    Tokenize a SMILES molecule or reaction.

    The input is scanned left to right with a regular expression whose
    alternatives are, in order of priority: a whole bracketed expression
    (``[...]``), ``B``/``Br``, ``C``/``Cl``, the single letters
    ``N O S P F I b c n o s p``, the punctuation symbols listed in the
    pattern, a ``%`` followed by exactly two digits, and a single digit.

    Parameters
    ----------
    smi : str
        SMILES string without whitespace.

    Returns
    -------
    str
        The matched tokens joined by single spaces (an empty string for an
        empty input).

    Raises
    ------
    AssertionError
        If concatenating the tokens does not reproduce ``smi``, i.e. the
        input contains characters that no alternative of the pattern matches
        (whitespace, for instance).
    """
    # Raw string so that the backslashes reach the regex engine untouched and
    # Python does not warn about invalid string escapes. `\\` matches one
    # literal backslash character in the input.
    pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
    regex = re.compile(pattern)
    # With a single capture group, findall() already returns a list of strings.
    tokens = regex.findall(smi)
    # findall() silently skips unmatched characters; make sure nothing was lost.
    assert smi == ''.join(tokens), "Tokenization dropped characters from SMILES: %r" % (smi,)
    return ' '.join(tokens)

# From tokens to SMILES
def smi_detokenizer(smi):
    """
    Rebuild a SMILES string from its space-separated token form.

    Every space character in ``smi`` is removed; all other characters are
    kept in their original order.
    """
    pieces = smi.split(' ')
    return ''.join(pieces)

# Load tgt and src files
# Each file is read as headerless, tab-separated text and flattened into a
# plain Python list (presumably one tokenized SMILES per line -- TODO confirm).
path_src_train  = '../data/MIT_active_learning/src-train.txt'
path_tgt_train  = '../data/MIT_active_learning/tgt-train.txt'
path_src_active = '../data/MIT_active_learning/src-active.txt'
path_tgt_active = '../data/MIT_active_learning/tgt-active.txt'
src_train  = pd.read_csv(path_src_train, sep='\t', header=None).values.flatten().tolist()
tgt_train  = pd.read_csv(path_tgt_train, sep='\t', header=None).values.flatten().tolist()
src_active = pd.read_csv(path_src_active, sep='\t', header=None).values.flatten().tolist()
tgt_active = pd.read_csv(path_tgt_active, sep='\t', header=None).values.flatten().tolist()

# Source and target entries are paired by position. A length mismatch would
# otherwise truncate or misalign the reactions built below, so fail loudly.
assert len(src_train) == len(tgt_train), \
    "src-train has %d entries but tgt-train has %d" % (len(src_train), len(tgt_train))
assert len(src_active) == len(tgt_active), \
    "src-active has %d entries but tgt-active has %d" % (len(src_active), len(tgt_active))

# Generate reactions: detokenized source and target joined by a single '>'
reac_active = [smi_detokenizer(src) + '>' + smi_detokenizer(tgt) for src, tgt in zip(src_active, tgt_active)]
reac_train  = [smi_detokenizer(src) + '>' + smi_detokenizer(tgt) for src, tgt in zip(src_train, tgt_train)]

In [3]:
# Save datasets
# Write the active-learning and training reactions to text files,
# one reaction per line.
path_to_save = '../data/MIT_active_learning/'

active_out = path_to_save + 'active_learning_reactions.txt'
with open(active_out, 'w+') as handle:
    handle.writelines("%s\n" % reaction for reaction in reac_active)

train_out = path_to_save + 'train_reactions.txt'
with open(train_out, 'w+') as handle:
    handle.writelines("%s\n" % reaction for reaction in reac_train)

In [4]:
# Load tgt and src files for val and test
# Each file is read as headerless, tab-separated text and flattened into a
# plain Python list (presumably one tokenized SMILES per line -- TODO confirm).
path_src_val  = '../data/MIT_active_learning/src-val.txt'
path_tgt_val  = '../data/MIT_active_learning/tgt-val.txt'
path_src_test = '../data/MIT_active_learning/src-test.txt'
path_tgt_test = '../data/MIT_active_learning/tgt-test.txt'
src_val  = pd.read_csv(path_src_val, sep='\t', header=None).values.flatten().tolist()
tgt_val  = pd.read_csv(path_tgt_val, sep='\t', header=None).values.flatten().tolist()
src_test = pd.read_csv(path_src_test, sep='\t', header=None).values.flatten().tolist()
tgt_test = pd.read_csv(path_tgt_test, sep='\t', header=None).values.flatten().tolist()

# Source and target entries are paired by position. A length mismatch would
# otherwise truncate or misalign the reactions built below, so fail loudly.
assert len(src_val) == len(tgt_val), \
    "src-val has %d entries but tgt-val has %d" % (len(src_val), len(tgt_val))
assert len(src_test) == len(tgt_test), \
    "src-test has %d entries but tgt-test has %d" % (len(src_test), len(tgt_test))

# Generate reactions: detokenized source and target joined by a single '>'
reac_val = [smi_detokenizer(src) + '>' + smi_detokenizer(tgt) for src, tgt in zip(src_val, tgt_val)]
reac_test  = [smi_detokenizer(src) + '>' + smi_detokenizer(tgt) for src, tgt in zip(src_test, tgt_test)]

In [10]:
# Save datasets
# Write the validation and test reactions to text files,
# one reaction per line.
path_to_save = '../data/MIT_active_learning/'

validation_out = path_to_save + 'validation_reactions.txt'
with open(validation_out, 'w+') as handle:
    handle.writelines("%s\n" % reaction for reaction in reac_val)

test_out = path_to_save + 'test_reactions.txt'
with open(test_out, 'w+') as handle:
    handle.writelines("%s\n" % reaction for reaction in reac_test)