In [None]:
from litellm import completion
from typing import Optional
import litellm
from dotenv import load_dotenv
import rootutils

# Register the project root (the directory marked by ".project-root") with
# rootutils. Kept as the last expression of the cell so the returned value is
# still displayed as the cell output.
rootutils.setup_root(
    ".",
    indicator=".project-root",
    pythonpath=True,
)

PosixPath('/Users/shreyasv/Desktop/research/deepchem/retrosynthesis/prod')

In [12]:
from src.variables import SYS_PROMPT, USER_PROMPT

In [13]:
# Load variables from a local .env file before litellm is configured.
load_dotenv()

# Report successful completions to the "langfuse" callback.
litellm.success_callback = ["langfuse"]
# NOTE(review): presumably makes litellm drop request parameters a provider
# does not support -- confirm against the litellm documentation.
litellm.drop_params = True

# Langfuse metadata passed along with the completion request in call_LLM.
metadata = dict(
    generation_name="sv_testing",  # set langfuse generation name
    project="Retrosynthesis",  # set langfuse project name
    version="0.0.2",  # set langfuse version
    trace_name="sv_testing",  # set langfuse Trace Name
    trace_user_id="sv",  # set langfuse Trace User ID
    session_id="session-1",  # set langfuse Session ID
)


def call_LLM(molecule: str,
             LLM: str = "claude-3-opus-20240229",
             temperature: float = 0.0,
             messages: Optional[list[dict]] = None):
    """Send a chat request through litellm and return the first choice's text.

    Args:
        molecule: String substituted for the ``{target_smiles}`` placeholder
            in USER_PROMPT. Only used when ``messages`` is None.
        LLM: Model identifier forwarded to ``completion``.
        temperature: Sampling temperature forwarded to ``completion``.
        messages: Ready-made chat messages. When None, a system/user pair is
            built from SYS_PROMPT and USER_PROMPT.

    Returns:
        ``response.choices[0].message.content`` of the completion response.

    Note:
        The first attempt passes the module-level ``metadata``. If that
        attempt raises any ``Exception``, the same request is retried once
        without metadata; an exception raised by the retry propagates.
    """
    if messages is None:
        system_message = {"role": "system", "content": SYS_PROMPT}
        user_message = {
            "role": "user",
            "content": USER_PROMPT.replace('{target_smiles}', molecule),
        }
        messages = [system_message, user_message]

    # Arguments shared by the first attempt and the fallback attempt.
    request_args = dict(
        model=LLM,
        messages=messages,
        max_completion_tokens=4096,
        temperature=temperature,
        seed=42,
        top_p=0.9,
    )

    try:
        response = completion(**request_args, metadata=metadata)
        reply_text = response.choices[0].message.content
    except Exception:
        # Fallback: identical request, but without the langfuse metadata.
        response = completion(**request_args)
        reply_text = response.choices[0].message.content

    return reply_text


# Example run for the SMILES string "CC(=O)CCC", using the default model and
# temperature of call_LLM.
res = call_LLM(molecule="CC(=O)CCC")

In [14]:
def _extract_tag(text: str, tag: str) -> str:
    """Return the text between the first ``<tag>`` and the next ``</tag>``.

    A single newline directly after the opening tag is dropped. Returns an
    empty string when the opening tag is missing, and everything after the
    opening tag when the closing tag is missing. (Offsets computed from
    ``str.find`` would silently slice the wrong region in both cases,
    because ``find`` returns -1 for a missing tag.)
    """
    _, opening, remainder = text.partition(f"<{tag}>")
    if not opening:
        return ""
    inner = remainder.partition(f"</{tag}>")[0]
    return inner.removeprefix("\n")


# extract the content within <cot> </cot> tags as thinking content
thinking_content = _extract_tag(res, "cot")
# split the thinking content into individual steps based on the <thinking> </thinking> tags
# (partition keeps an unclosed step whole instead of chopping its last character)
thinking_steps = [
    chunk.removeprefix("\n").partition("</thinking>")[0]
    for chunk in thinking_content.split("<thinking>")[1:]
]
print(thinking_content)
print("-------------------")
print(thinking_steps)
print("-------------------")
# extract the content within <json> </json> tags as json content
json_content = _extract_tag(res, "json")
print(json_content)

<thinking>
The target molecule CC(=O)CCC is a ketone with 5 carbon atoms. Possible retrosynthetic disconnections to consider:

1. Disconnecting the C-C bond adjacent to the ketone, which could arise from an aldol condensation reaction between an enolate and an aldehyde.

2. Disconnecting the C-C bond on the other side of the ketone, which could come from a Grignard addition to a nitrile followed by hydrolysis.

3. Treating the ketone as an electrophile in a Grignard addition reaction with an alkyl halide.

4. Oxidizing a secondary alcohol to form this ketone.
</thinking>

<thinking>
For the aldol condensation approach, we would need an enolate precursor like a ketone or ester with 3 carbons (e.g. acetone or methyl acetate) and an aldehyde with 2 carbons like acetaldehyde. The enolate would add to the aldehyde, followed by dehydration to give the target ketone.
</thinking>

<thinking>
For the Grignard addition to a nitrile, the precursors would be acetonitrile and ethyl magnesium bromid

In [43]:
# read json file into dict

import json

# Resolve the results file relative to the project root (the directory marked
# by ".project-root") instead of an absolute path that only exists on one
# machine.
project_root = rootutils.find_root(search_from=".", indicator=".project-root")
results_path = project_root / "results" / "mols_small" / "Zonisamide.json"

# Read with an explicit encoding so the result does not depend on the
# platform's default locale.
with open(results_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

print(data['dependencies'])

{'1': ['2'], '2': ['3', '4'], '3': [], '4': []}


In [44]:
# Build a lookup in which index i holds the parent step id of step i.
# Index 0 is a placeholder (None) and step 1 is given parent 0; entries that
# stay -1 were never listed as the child of any step.
parent_id_list = [-1 for _ in range(len(data['dependencies']) + 1)]
parent_id_list[0], parent_id_list[1] = None, 0
for step, children in data['dependencies'].items():
    for child in children:
        parent_id_list[int(child)] = int(step)
print(parent_id_list)

[None, 0, 1, 2, 2]


In [45]:
# Annotate each step record in place with its child ids and its parent id.
# NOTE(review): step k is looked up at position k - 1, which assumes
# data['steps'] is ordered by step id -- confirm against the file format.
data_steps = data['steps']
for step, children in data['dependencies'].items():
    print(step)
    record = data_steps[int(step) - 1]
    record['child_id'] = children
    record['parent_id'] = parent_id_list[int(step)]

print(data_steps)

1
2
3
4
[{'step': '1', 'reactants': [{'smiles': 'COc1ccc(CN(Cc2ccc(OC)cc2)S(=O)(=O)Cc2noc3ccccc23)cc1', 'reactant_metadata': {'name': '', 'chemical_formula': 'C24H24N2O5S', 'mass': 452.14059286799994}}], 'reagents': [], 'products': [{'smiles': 'NS(=O)(=O)Cc1noc2ccccc12', 'product_metadata': {'name': '', 'chemical_formula': 'C8H8N2O3S', 'mass': 212.025563116}}], 'conditions': [], 'reactionmetrics': [{'scalabilityindex': '8', 'confidenceestimate': 0.99, 'closestliterature': ''}], 'child_id': ['2'], 'parent_id': 0}, {'step': '2', 'reactants': [{'smiles': 'O=S(=O)(Cl)Cc1noc2ccccc12', 'reactant_metadata': {'name': '', 'chemical_formula': 'C8H6ClNO3S', 'mass': 230.975691732}}, {'smiles': 'COc1ccc(CNCc2ccc(OC)cc2)cc1', 'reactant_metadata': {'name': '', 'chemical_formula': 'C16H19NO2', 'mass': 257.141578848}}], 'reagents': [], 'products': [{'smiles': 'COc1ccc(CN(Cc2ccc(OC)cc2)S(=O)(=O)Cc2noc3ccccc23)cc1', 'product_metadata': {'name': '', 'chemical_formula': 'C24H24N2O5S', 'mass': 452.140592867

In [None]:
# parent_id_list[i] is the parent step id of step i; index 0 is a None
# placeholder and step 1 is given parent 0.
parent_id_list = [-1 for _ in range(len(data['dependencies']) + 1)]
parent_id_list[0], parent_id_list[1] = None, 0
for step, children in data['dependencies'].items():
    for child in children:
        parent_id_list[int(child)] = int(step)

# Attach child and parent ids to every step record (mutates data['steps']).
data_steps = data['steps']
for step, children in data['dependencies'].items():
    record = data_steps[int(step) - 1]
    record['child_id'] = children
    record['parent_id'] = parent_id_list[int(step)]


# convert the json file into a nested tree structure
def build_tree(data_steps, parent_id):
    """Recursively nest step records under their parent step.

    Args:
        data_steps: Iterable of step dicts, each with a 'step' key
            (convertible with ``int``) and a 'parent_id' key.
        parent_id: Parent id whose direct children should be collected.

    Returns:
        A dict mapping ``int(step['step'])`` to
        ``{'step': <record>, 'children': <subtree>}`` for every record whose
        'parent_id' equals ``parent_id``; empty when there is none.
    """
    return {
        int(record['step']): {
            'step': record,
            # Recurse using this record's own id as the next parent id.
            'children': build_tree(data_steps, int(record['step'])),
        }
        for record in data_steps
        if record['parent_id'] == parent_id
    }


import json

# Nest the annotated steps starting from the records whose parent_id is 0,
# then pretty-print the result as JSON.
tree = build_tree(data_steps, 0)
print(json.dumps(tree, indent=4))

{'step': '1', 'reactants': [{'smiles': 'COc1ccc(CN(Cc2ccc(OC)cc2)S(=O)(=O)Cc2noc3ccccc23)cc1', 'reactant_metadata': {'name': '', 'chemical_formula': 'C24H24N2O5S', 'mass': 452.14059286799994}}], 'reagents': [], 'products': [{'smiles': 'NS(=O)(=O)Cc1noc2ccccc12', 'product_metadata': {'name': '', 'chemical_formula': 'C8H8N2O3S', 'mass': 212.025563116}}], 'conditions': [], 'reactionmetrics': [{'scalabilityindex': '8', 'confidenceestimate': 0.99, 'closestliterature': ''}], 'child_id': ['2'], 'parent_id': 0}
{'step': '1', 'reactants': [{'smiles': 'COc1ccc(CN(Cc2ccc(OC)cc2)S(=O)(=O)Cc2noc3ccccc23)cc1', 'reactant_metadata': {'name': '', 'chemical_formula': 'C24H24N2O5S', 'mass': 452.14059286799994}}], 'reagents': [], 'products': [{'smiles': 'NS(=O)(=O)Cc1noc2ccccc12', 'product_metadata': {'name': '', 'chemical_formula': 'C8H8N2O3S', 'mass': 212.025563116}}], 'conditions': [], 'reactionmetrics': [{'scalabilityindex': '8', 'confidenceestimate': 0.99, 'closestliterature': ''}], 'child_id': ['2']

In [None]:
from collections import defaultdict

# Group the step records by parent id: tree[parent_id][step] -> record.
# Records whose parent_id is None are filed under key 0.
tree = defaultdict(dict)
for step in data_steps:
    parent_key = 0 if step['parent_id'] is None else step['parent_id']
    # Records are keyed by their 'step' field in both cases; the records carry
    # no 'step_id' key, so reading it raised KeyError for parentless records.
    tree[parent_key][step['step']] = step

# convert the tree into a json file
