(transformation_product_prediction)=

# Transformation Product Prediction (TPP)

In [1]:
from enviPath_python.enviPath import *
from enviPath_python.objects import *

In [5]:
# Base URL of the enviPath instance; passed to enviPath() below
INSTANCE_HOST = 'https://envipath.org'
# Identifier (URL) of the relative-reasoning model; passed as `id` to RelativeReasoning() below
EP_MODEL_ID = 'https://envipath.org/package/32de3cf4-e3e6-4168-956e-32fa5ddb0ce1/relative-reasoning/23e1b2ec-dcc0-4389-9b65-afd52bd72e27'

# Parent compound the prediction starts from: its SMILES and a display name
compound_input = {"smiles": "CCN1CCN(CC1)CC2=CN=C(C=C2)NC3=NC=C(C(=N3)C4=CC5=C(C(=C4)F)N=C(N5C(C)C)C)F", "name": "Abe"}

# Maximum number of TPs to predict (the prediction loop runs while num_TP < MAX_TP)
MAX_TP = 50

# Lower probability threshold, compared against the per-rule 'probability' of each
# predicted TP in update_queue (not against the combined probability)
PROBABILITY_THRESHOLD = 0 # any value equal to or lower than the threshold will be excluded

# To prioritize small compounds in the queue
# NOTE(review): this flag is not read by any code visible in this section - confirm it is still used
SORT_TPS_BY_SIZE = False

#---------------------------#
# CONNECT TO ENVIPATH       #
#---------------------------#
eP = enviPath(INSTANCE_HOST)
# Optional login, disabled by default; uncomment the three lines below to use it.
# NOTE(review): `getpass` is not explicitly imported in the cells above - confirm it is in scope
# USERNAME = <your-username>
# password = getpass.getpass()
# eP.login(USERNAME, password)

# Model handle used by the prediction loop (rr.classify_smiles)
rr = RelativeReasoning(eP.requester, id=EP_MODEL_ID)

In [6]:
def update_compound_entry(compound_entry, this_combined_probability, rules, rule_IDs, this_generation, parent_smiles,
                            size_metric, size_value):
    """
    Merge a newly found route to a compound into its existing entry.

    The entry is modified in place and also returned:
      - a strictly higher combined probability replaces the stored route
      - an equal combined probability appends the new rules, rule IDs and
        parent to the stored comma-separated strings
      - a lower combined probability leaves the entry untouched

    :param compound_entry: dictionary of compound information
    :param this_combined_probability: combined probability of the new route
    :param rules: rule name(s) of the new route
    :param rule_IDs: rule ID(s) of the new route
    :param this_generation: generation of the compound along the new route
    :param parent_smiles: SMILES of the parent compound of the new route
    :param size_metric: key under which the size value is stored
    :param size_value: size value stored when the route is replaced
    :return: the (same) compound entry
    """
    known_probability = compound_entry['combined_probability']

    # Better route found: overwrite everything that describes the route
    if known_probability < this_combined_probability:
        compound_entry.update({
            'combined_probability': this_combined_probability,
            'rules': rules,
            'rule_IDs': rule_IDs,
            'generation': this_generation,
            'parent_smiles': parent_smiles,
            size_metric: size_value,
        })
        return compound_entry

    # Equally good route: keep the stored one and record the alternative next to it
    if known_probability == this_combined_probability:
        alternatives = (('rules', rules), ('rule_IDs', rule_IDs), ('parent_smiles', parent_smiles))
        for field, extra in alternatives:
            compound_entry[field] += ',{}'.format(extra)

    return compound_entry

def update_queue(_queue, _queued_items, _validated_TPs, _TPs, _parent_data):
    """
    Update queue with TPs predicted in current iteration.

    Each predicted TP whose 'probability' is above the module-level
    PROBABILITY_THRESHOLD is either merged into its existing entry (if the
    compound is already validated or already queued) or appended to the queue
    as a new item. The queue is then re-sorted by combined probability,
    highest first, and printed.

    _queue, _queued_items, the entries of _validated_TPs and the dictionaries
    in _TPs are modified in place.

    :param _queue: ordered list of smiles to explore
    :param _queued_items: ordered list of compound dictionaries, same order as _queue
    :param _validated_TPs: dictionary (smiles -> compound dictionary) of already validated TPs
    :param _TPs: predicted TPs from current iteration (smiles -> compound dictionary),
                 to be evaluated and added to queue
    :param _parent_data: compound dictionary of the parent compound of _TPs
    :return: new_queue: new ordered list of smiles to explore
    :return _queued_items: new ordered list of compound dictionaries
    :return: _validated_TPs: updated dictionary of already validated TPs
    """
    parent_probability = _parent_data['combined_probability']
    parent_generation = _parent_data['generation']
    parent_smiles = _parent_data['smiles']
    queue_before = len(_queue)
    for smiles, data in _TPs.items():
        # TPs at or below the probability threshold are not considered further
        this_probability = data['probability']
        if this_probability <= PROBABILITY_THRESHOLD:
            continue
        # probability of the whole route from the source compound to this TP
        this_combined_probability = parent_probability * this_probability
        this_generation = parent_generation + 1
        rules = data['rules']
        rule_IDs = data['rule_IDs']
        # first, check if compound already in validated. if yes, update
        if smiles in _validated_TPs:
            _validated_TPs[smiles] = update_compound_entry(_validated_TPs[smiles],
                                                           this_combined_probability, rules, rule_IDs,
                                                           this_generation, parent_smiles, size_metric='size',
                                                           size_value=len(smiles))
        # next, check if compound is already in queue. if yes, update
        elif smiles in _queue:
            index = _queue.index(smiles)
            # _queue and _queued_items must be index-aligned for this lookup to be valid
            assert smiles == _queued_items[index]['smiles'], \
                'smiles {} does not match smiles in {}'.format(smiles, _queued_items[index])
            _queued_items[index] = update_compound_entry(_queued_items[index],
                                                         this_combined_probability, rules, rule_IDs,
                                                         this_generation, parent_smiles, size_metric='size',
                                                         size_value=len(smiles))
        # else, add new item to queue
        else:
            data['combined_probability'] = this_combined_probability
            data['generation'] = this_generation
            data['parent_smiles'] = parent_smiles
            _queued_items.append(data)
            _queue.append(smiles)

    # order queued items by combined probability, highest first (stable for ties)
    _queued_items.sort(reverse=True, key=lambda x: x['combined_probability'])
    print('Added {} smiles to queue'.format(len(_queue) - queue_before))
    # _queue itself is not re-ordered; the sorted smiles list is rebuilt from the items
    new_queue = [x['smiles'] for x in _queued_items]
    print ('New queue for compound', parent_smiles)
    # walk the sorted items directly instead of looking each smiles up again by index
    for item in _queued_items:
        print(item['smiles'], item['combined_probability'])

    return new_queue, _queued_items, _validated_TPs

In [7]:
print('\n### PREDICT TPs FOR COMPOUND {} ###\n'.format(compound_input['name']))
num_TP = -1 # counter starts at -1, because source compound is also in the TP list
validated_TPs = {}  # container for resulting predictions, smiles -> compound dictionary
# the source compound is the only queued item at the start (generation 0, probability 1)
queued_items = [{'probability': 1, 'combined_probability': 1, 'smiles': compound_input['smiles'], 'generation': 0, 'parent_smiles': '',
                    'rules': '', 'rule_IDs': '', 'name': compound_input['name']}]
queue = [compound_input['smiles']]  # queue is updated after each cycle to have top TP first, list of smiles
while num_TP < MAX_TP:
    if not queue:
        print('\nEmpty queue - The exploration has converged at {} predicted TPs'.format(num_TP))
        break # stop TP prediction
    smiles = queue.pop(0) # get top item in queue
    data = queued_items.pop(0) # remove data from queued items

    prediction_data = rr.classify_smiles(smiles)
    # sort by probability; coerce to float here as well, since the value is
    # coerced with float() below before it is used
    prediction_data.sort(reverse=True, key=lambda x: float(x['probability']))

    # collect the products of all rules, keeping for each product only the
    # rule with the highest probability
    TP_dict = {}
    for prediction in prediction_data:
        probability = float(prediction['probability'])
        for product_smiles in prediction['products']:
            known = TP_dict.get(product_smiles)
            if known is None or probability > known['probability']:
                TP_dict[product_smiles] = {'rules' : prediction['name'], 'rule_IDs': prediction['id'], 'probability': probability, 'smiles': product_smiles}

    # Register the expanded compound as validated BEFORE updating the queue:
    # if one of its products is the compound itself, update_queue then treats
    # it as already validated instead of re-queuing it as its own child and
    # later overwriting this entry.
    validated_TPs[smiles] = data
    queue, queued_items, validated_TPs = update_queue(queue, queued_items, validated_TPs, TP_dict, data)
    num_TP += 1
# validated_TPs can be sorted and named for output with clean_result (defined further below)


### PREDICT TPs FOR COMPOUND Abe ###

Added 12 smiles to queue
New queue for compound CCN1CCN(CC1)CC2=CN=C(C=C2)NC3=NC=C(C(=N3)C4=CC5=C(C(=C4)F)N=C(N5C(C)C)C)F
CCN1CCNCC1 0.4
CC(=O)C 0.4
CC(C)N1C(=NC2=C1C=C(C=C2F)C3=NC(=NC=C3F)NC4=CC=C(C=N4)CN5CCNCC5)C 0.4
CC(C)N1C(=NC2=C1C=C(C=C2F)C3=NC(=NC=C3F)NC4=CC=C(C=N4)C=O)C 0.4
CCN1CCN(CC1)CC2=CC=C(N=C2)NC3=NC=C(C(=N3)C4=CC5=C(C(=C4)F)N=C(C)N5)F 0.4
CCN(CCNCC1=CC=C(N=C1)NC2=NC=C(C(=N2)C3=CC4=C(C(=C3)F)N=C(C)N4C(C)C)F)CC=O 0.4
CC=O 0.4
CCNCCN(CC=O)CC1=CC=C(N=C1)NC2=NC=C(C(=N2)C3=CC4=C(C(=C3)F)N=C(C)N4C(C)C)F 0.4
CCN1CCN(CC1)CC2=CC=C(N=C2)NC3=NC=C(C(=N3)C4=CC5=C(C(=C4)F)N=C(CO)N5C(C)C)F 0.05
CC(C)N1C(=NC2=C1C=C(C=C2F)C3=NC(=NC=C3F)NC4=CC=C(C=N4)CN5CCN(CC5)CCO)C 0.0
CCN1CCN(CC1)CC2=CC=C(N=C2)NC3=NC=C(C(=N3)C4=CC5=C(C(=C4)F)N=C(C)N5C(C)CO)F 0.0
CCN1CCN(CC1)CC2=CC=C(N=C2O)NC3=NC=C(C(=N3)C4=CC5=C(C(=C4)F)N=C(C)N5C(C)C)F 0.0
Added 4 smiles to queue
New queue for compound CCN1CCNCC1
CC(=O)C 0.4
CC(C)N1C(=NC2=C1C=C(C=C2F)C3=NC(=NC=C3F)NC4=CC=C(C=N4)CN5

In [8]:
# Show the entry stored under the first key of validated_TPs
print(validated_TPs[list(validated_TPs)[0]])

TypeError: unhashable type: 'list'

In [9]:
# Display the complete dictionary of validated TPs (keyed by SMILES)
validated_TPs

{'CCN1CCN(CC1)CC2=CN=C(C=C2)NC3=NC=C(C(=N3)C4=CC5=C(C(=C4)F)N=C(N5C(C)C)C)F': {'probability': 1,
  'combined_probability': 1,
  'smiles': 'CCN1CCN(CC1)CC2=CN=C(C=C2)NC3=NC=C(C(=N3)C4=CC5=C(C(=C4)F)N=C(N5C(C)C)C)F',
  'generation': 0,
  'parent_smiles': '',
  'rules': '',
  'rule_IDs': '',
  'name': 'Abe',
  'size': 73},
 'CCN1CCNCC1': {'rules': 'bt0063',
  'rule_IDs': 'https://envipath.org/package/32de3cf4-e3e6-4168-956e-32fa5ddb0ce1/parallel-rule/fbbba937-dd70-43a4-be8c-7de8fcc0f0df',
  'probability': 0.4,
  'smiles': 'CCN1CCNCC1',
  'combined_probability': 0.4,
  'generation': 1,
  'parent_smiles': 'CCN1CCN(CC1)CC2=CN=C(C=C2)NC3=NC=C(C(=N3)C4=CC5=C(C(=C4)F)N=C(N5C(C)C)C)F',
  'carbon_count': 6},
 'CC(=O)C': {'rules': 'bt0063',
  'rule_IDs': 'https://envipath.org/package/32de3cf4-e3e6-4168-956e-32fa5ddb0ce1/parallel-rule/fbbba937-dd70-43a4-be8c-7de8fcc0f0df',
  'probability': 0.4,
  'smiles': 'CC(=O)C',
  'combined_probability': 0.4,
  'generation': 1,
  'parent_smiles': 'CCN1CCN(CC1)

In [26]:
def clean_result(result_dict):
    """
    Sorts TP list for output and names every TP.

    Entries are ordered by generation (ascending) and, within a generation,
    by combined probability (descending). The first entry after sorting is
    taken as the source compound and keeps its own 'name'; every other entry
    is named 'TP_<source name>_<running number>' and gets a 'parent_name'.
    For an entry with several comma-separated parents in 'parent_smiles',
    'parent_name' holds the parents' names, comma-separated in the same order.

    The compound dictionaries are updated in place ('name', 'parent_name'),
    so the entries of result_dict are changed as well.

    :param result_dict: result dictionary (smiles -> compound dictionary)
    :return: sorted and named list of TPs; empty list for an empty result_dict
    :raises KeyError: if a parent smiles does not belong to any entry of result_dict
    """
    if not result_dict:
        return []

    result_list = list(result_dict.values())
    # two stable sorts: the second one keeps the probability order within a generation
    result_list.sort(reverse=True, key=lambda x: x['combined_probability'])
    result_list.sort(key=lambda x: x['generation']) # make sure that source compound is first

    source = result_list[0]
    source_name = source['name']
    names_by_smiles = {source['smiles']: source_name}

    # First pass: name every TP. This is done before any parent is resolved,
    # because a parent may be sorted after its child (its generation can be
    # changed when a more probable route to it is found later).
    for TP_count, res in enumerate(result_list[1:], start=1):
        res['name'] = 'TP_{}_{}'.format(source_name, TP_count)
        names_by_smiles[res['smiles']] = res['name']

    # Second pass: translate the parent smiles of every TP into parent names
    for res in result_list[1:]:
        parents = res['parent_smiles'].split(',')
        res['parent_name'] = ','.join(names_by_smiles[p] for p in parents)

    return result_list
# Sort and name the validated TPs; as the last expression of the cell, the resulting list is displayed
clean_result(validated_TPs)

[{'probability': 1,
  'combined_probability': 1,
  'smiles': 'CCN1CCN(CC1)CC2=CN=C(C=C2)NC3=NC=C(C(=N3)C4=CC5=C(C(=C4)F)N=C(N5C(C)C)C)F',
  'generation': 0,
  'parent_smiles': '',
  'rules': '',
  'rule_IDs': '',
  'name': 'Abe',
  'size': 73},
 {'rules': 'bt0063',
  'rule_IDs': 'https://envipath.org/package/32de3cf4-e3e6-4168-956e-32fa5ddb0ce1/parallel-rule/fbbba937-dd70-43a4-be8c-7de8fcc0f0df',
  'probability': 0.4,
  'smiles': 'CCN1CCNCC1',
  'combined_probability': 0.4,
  'generation': 1,
  'parent_smiles': 'CCN1CCN(CC1)CC2=CN=C(C=C2)NC3=NC=C(C(=N3)C4=CC5=C(C(=C4)F)N=C(N5C(C)C)C)F',
  'carbon_count': 6,
  'name': 'TP_Abe_1',
  'parent_name': 'Abe'},
 {'rules': 'bt0063',
  'rule_IDs': 'https://envipath.org/package/32de3cf4-e3e6-4168-956e-32fa5ddb0ce1/parallel-rule/fbbba937-dd70-43a4-be8c-7de8fcc0f0df',
  'probability': 0.4,
  'smiles': 'CC(=O)C',
  'combined_probability': 0.4,
  'generation': 1,
  'parent_smiles': 'CCN1CCN(CC1)CC2=CN=C(C=C2)NC3=NC=C(C(=N3)C4=CC5=C(C(=C4)F)N=C(N5C(C)