In [None]:
from litellm import completion
from typing import Optional
import litellm
from dotenv import load_dotenv
import rootutils

rootutils.setup_root(".", indicator=".project-root", pythonpath=True)

WindowsPath('C:/Users/irahu/git_workspace/recursiveLLM')

In [None]:
from src.variables import SYS_PROMPT, USER_PROMPT

In [None]:
import os

os.environ["ENABLE_LOGGING"] = "False"

# Testing

In [3]:
# Load variables from a local .env file into the process environment
# (presumably the provider / langfuse API keys -- confirm against the project's .env).
load_dotenv()
# Report every successful litellm call to langfuse.
litellm.success_callback = ["langfuse"]
# NOTE(review): per the litellm docs this drops request parameters that the
# selected provider does not support instead of raising -- confirm.
litellm.drop_params = True

# Tracing metadata passed as `metadata=` to each `completion(...)` call below.
metadata = {
    "generation_name": "sv_testing",  # set langfuse generation name
    "project": "Retrosynthesis",  # set langfuse project name
    "version": "0.0.2",  # set langfuse version
    "trace_name": "sv_testing",  # set langfuse Trace Name
    "trace_user_id": "sv",  # set langfuse Trace User ID
    "session_id": "session-1",  # set langfuse Session ID
}


def call_LLM(molecule: str,
             LLM: str = "claude-3-opus-20240229",
             temperature: float = 0.0,
             messages: Optional[list[dict]] = None):
    """Ask ``LLM`` for the next retrosynthesis step of ``molecule``.

    Parameters
    ----------
    molecule : str
        Target SMILES; substituted for ``{target_smiles}`` in ``USER_PROMPT``
        when no ``messages`` are supplied.
    LLM : str
        Model identifier forwarded to ``litellm.completion``.
    temperature : float
        Sampling temperature forwarded to the model.
    messages : Optional[list[dict]]
        Ready-made chat messages. When ``None`` a system + user pair is built
        from ``SYS_PROMPT`` and ``USER_PROMPT``.

    Returns
    -------
    str
        Text content of the first choice in the response.

    Notes
    -----
    The request is first sent with the module-level ``metadata``. If anything
    in that attempt raises, the same request is sent once more without
    ``metadata``; an error in the second attempt propagates to the caller.
    """
    if messages is None:
        system_turn = {"role": "system", "content": SYS_PROMPT}
        user_turn = {
            "role": "user",
            "content": USER_PROMPT.replace('{target_smiles}', molecule),
        }
        messages = [system_turn, user_turn]

    # Arguments shared by the traced attempt and the untraced fallback.
    request_kwargs = dict(model=LLM,
                          messages=messages,
                          max_completion_tokens=4096,
                          temperature=temperature,
                          seed=42,
                          top_p=0.9)

    try:
        # Reading the content stays inside the try so that a malformed
        # response also triggers the fallback, exactly as a failed call does.
        return completion(**request_kwargs,
                          metadata=metadata).choices[0].message.content
    except Exception:
        return completion(**request_kwargs).choices[0].message.content


res = call_LLM("CC(=O)CCC")

In [4]:
res

'Here is the single-step retrosynthesis for the molecule CC(=O)CCC:\n\n<cot>\n<thinking>\nThe target molecule CC(=O)CCC is a ketone with the formula C5H10O. It has a methyl ketone functional group and a propyl alkyl chain.\n\nPossible retrosynthetic disconnections to consider:\n1. Disconnecting the C-C bond between the carbonyl and the propyl group via a Grignard addition \n2. Oxidizing an alcohol precursor\n3. Hydrating an alkyne precursor\n4. Oxidizing an alkene precursor\n</thinking>\n\n<thinking>\nFor the Grignard addition, the precursors would be propylmagnesium bromide and acetaldehyde. The reaction would involve nucleophilic addition of the Grignard reagent to the carbonyl.\nPrecursors: CCC[Mg]Br and CC=O\n</thinking>\n\n<thinking>\nOxidizing a secondary alcohol with the formula C5H12O could yield the target ketone. This would require a strong oxidizing agent like PCC or PDC.\nPrecursor: CC(O)CCC\n</thinking>\n\n<thinking>\nHydrating an alkyne with the formula C5H8 (CC#CCC) usin

In [14]:
import re


def _tag_bodies(text: str, tag: str) -> list[str]:
    """Return the body of every ``<tag ...>`` element found in ``text``.

    The opening tag may carry attributes (e.g. ``<thinking type="...">``) and
    one newline directly after it is dropped. If the closing tag is missing
    (for instance because the response was truncated) the body runs to the end
    of ``text``. An empty list is returned when the opening tag never occurs.
    """
    name = re.escape(tag)
    pattern = rf"<{name}(?:\s[^>]*)?>\n?(.*?)(?:</{name}>|\Z)"
    return re.findall(pattern, text, flags=re.DOTALL)


# extract the content within <cot> </cot> tags as thinking content
# (empty string when the response has no <cot> section, instead of the
# arbitrary slice that `str.find() == -1` used to produce)
cot_bodies = _tag_bodies(res, "cot")
thinking_content = cot_bodies[0] if cot_bodies else ""
# split the thinking content into individual steps based on the <thinking> </thinking> tags
thinking_steps = _tag_bodies(thinking_content, "thinking")
print(thinking_content)
print("-------------------")
print(thinking_steps)
print("-------------------")
# extract the content within <json> </json> tags as json content
json_bodies = _tag_bodies(res, "json")
json_content = json_bodies[0] if json_bodies else ""
print(json_content)

<thinking>
The target molecule CC(=O)CCC is a ketone with 5 carbon atoms. Possible retrosynthetic disconnections to consider:

1. Disconnecting the C-C bond adjacent to the ketone, which could arise from an aldol condensation reaction between an enolate and an aldehyde.

2. Disconnecting the C-C bond on the other side of the ketone, which could come from a Grignard addition to a nitrile followed by hydrolysis.

3. Treating the ketone as an electrophile in a Grignard addition reaction with an alkyl halide.

4. Oxidizing a secondary alcohol to form this ketone.
</thinking>

<thinking>
For the aldol condensation approach, we would need an enolate precursor like a ketone or ester with 3 carbons (e.g. acetone or methyl acetate) and an aldehyde with 2 carbons like acetaldehyde. The enolate would add to the aldehyde, followed by dehydration to give the target ketone.
</thinking>

<thinking>
For the Grignard addition to a nitrile, the precursors would be acetonitrile and ethyl magnesium bromid

In [1]:
from litellm import completion
from typing import Optional
import litellm
from dotenv import load_dotenv
import rootutils

rootutils.setup_root(".", indicator=".project-root", pythonpath=True)
from src.metadata import reagent_agent, conditions_agent, literature_agent

In [2]:
from src.cache import clear_cache

clear_cache()

In [3]:
# read the json content
import json

# NOTE(review): absolute path on one developer's machine -- this cell fails
# everywhere else. Point it at a location relative to the project root once
# the results directory's place in the repository is confirmed.
RESULT_JSON_PATH = "/Users/shreyasv/Desktop/research/deepchem/retrosynthesis/prod/results/mols_small/Zonisamide.json"

# JSON is UTF-8 by specification; passing the encoding explicitly keeps
# non-ASCII text from being mis-decoded where the platform default differs.
with open(RESULT_JSON_PATH, encoding="utf-8") as f:
    output_data = json.load(f)

# output_data = {
#     "dependencies": {
#         "1": ["2"],
#         "2": ["3", "4"],
#         "3": [],
#         "4": []
#     },
#     "steps": [{
#         "step":
#         "1",
#         "reactants": [{
#             "smiles": "COc1ccc(CN(Cc2ccc(OC)cc2)S(=O)(=O)Cc2noc3ccccc23)cc1",
#             "reactant_metadata": {
#                 "name": "",
#                 "chemical_formula": "C24H24N2O5S",
#                 "mass": 452.14059286799994
#             }
#         }],
#         "reagents": [{
#             "smiles": "O=S(=O)(Cl)Cc1noc2ccccc12",
#             "product_metadata": {
#                 "name": "",
#                 "chemical_formula": "C8H6ClNO3S",
#                 "mass": 230.975691732
#             }
#         }],
#         "products": [{
#             "smiles": "NS(=O)(=O)Cc1noc2ccccc12",
#             "product_metadata": {
#                 "name": "",
#                 "chemical_formula": "C8H8N2O3S",
#                 "mass": 212.025563116
#             }
#         }],
#         "conditions": {
#             "temperature":
#             "Room temperature (approximately 20-25\u00b0C)",
#             "pressure":
#             "The reaction is carried out at atmospheric pressure, as no special pressure conditions are typically required for this type of transformation.",
#             "solvent":
#             "This reaction does not require a solvent as the product and reactant are the same molecule. No actual chemical transformation is taking place.",
#             "time":
#             "As no reaction is occurring, the concept of reaction time is not applicable in this case."
#         },
#         "reactionmetrics": [{
#             "scalabilityindex": "8",
#             "confidenceestimate": 0.99,
#             "closestliterature": ""
#         }]
#     }, {
#         "step":
#         "2",
#         "reactants": [{
#             "smiles": "O=S(=O)(Cl)Cc1noc2ccccc12",
#             "reactant_metadata": {
#                 "name": "",
#                 "chemical_formula": "C8H6ClNO3S",
#                 "mass": 230.975691732
#             }
#         }, {
#             "smiles": "COc1ccc(CNCc2ccc(OC)cc2)cc1",
#             "reactant_metadata": {
#                 "name": "",
#                 "chemical_formula": "C16H19NO2",
#                 "mass": 257.141578848
#             }
#         }],
#         "reagents": [{
#             "smiles": "Cc1ccccc1N",
#             "product_metadata": {
#                 "name": "",
#                 "chemical_formula": "C7H9N",
#                 "mass": 107.073499288
#             }
#         }, {
#             "smiles": "Cl[Mg]I",
#             "product_metadata": {
#                 "name": "",
#                 "chemical_formula": "ClIMg",
#                 "mass": 185.85836738
#             }
#         }, {
#             "smiles": "C1CCOC1",
#             "product_metadata": {
#                 "name": "",
#                 "chemical_formula": "C4H8O",
#                 "mass": 72.057514876
#             }
#         }],
#         "products": [{
#             "smiles": "COc1ccc(CN(Cc2ccc(OC)cc2)S(=O)(=O)Cc2noc3ccccc23)cc1",
#             "product_metadata": {
#                 "name": "",
#                 "chemical_formula": "C24H24N2O5S",
#                 "mass": 452.14059286799994
#             }
#         }],
#         "conditions": {
#             "temperature":
#             "Room temperature (20-25 \u00b0C)",
#             "pressure":
#             "The reaction likely proceeds at atmospheric pressure, as no special pressure conditions are typically required for sulfonylation reactions of this type.",
#             "solvent":
#             "A polar aprotic solvent such as dichloromethane (DCM), acetonitrile (MeCN), or N,N-dimethylformamide (DMF)",
#             "time":
#             "1-24 hours, depending on the specific reactivity of the substrates and other reaction conditions"
#         },
#         "reactionmetrics": [{
#             "scalabilityindex": "8",
#             "confidenceestimate": 0.86,
#             "closestliterature": ""
#         }]
#     }, {
#         "step":
#         "3",
#         "reactants": [{
#             "smiles": "O=P(Cl)(Cl)Cl",
#             "reactant_metadata": {
#                 "name": "",
#                 "chemical_formula": "Cl3OP",
#                 "mass": 151.87523428999998
#             }
#         }, {
#             "smiles": "O=S(=O)(O)Cc1noc2ccccc12",
#             "reactant_metadata": {
#                 "name": "",
#                 "chemical_formula": "C8H7NO4S",
#                 "mass": 213.009578704
#             }
#         }],
#         "reagents": [{
#             "smiles": "C1CCOC1",
#             "product_metadata": {
#                 "name": "",
#                 "chemical_formula": "C4H8O",
#                 "mass": 72.057514876
#             }
#         }],
#         "products": [{
#             "smiles": "O=S(=O)(Cl)Cc1noc2ccccc12",
#             "product_metadata": {
#                 "name": "",
#                 "chemical_formula": "C8H6ClNO3S",
#                 "mass": 230.975691732
#             }
#         }],
#         "conditions": {
#             "temperature": "0-25 \u00b0C",
#             "pressure": "1 atm",
#             "solvent":
#             "Dichloromethane (CH2Cl2) or other non-polar aprotic solvent",
#             "time": "1-4 hours"
#         },
#         "reactionmetrics": [{
#             "scalabilityindex": "9",
#             "confidenceestimate": 0.93,
#             "closestliterature": ""
#         }]
#     }, {
#         "step":
#         "4",
#         "reactants": [{
#             "smiles": "COc1ccc(C=O)cc1",
#             "reactant_metadata": {
#                 "name": "",
#                 "chemical_formula": "C8H8O2",
#                 "mass": 136.052429496
#             }
#         }, {
#             "smiles": "COc1ccc(CN)cc1",
#             "reactant_metadata": {
#                 "name": "",
#                 "chemical_formula": "C8H11NO",
#                 "mass": 137.084063972
#             }
#         }],
#         "reagents": [{
#             "smiles": "[H][O][H]",
#             "product_metadata": {
#                 "name": "",
#                 "chemical_formula": "H2O",
#                 "mass": 18.010564684
#             }
#         }, {
#             "smiles": "[H+]",
#             "product_metadata": {
#                 "name": "",
#                 "chemical_formula": "H+",
#                 "mass": 1.00727645209
#             }
#         }],
#         "products": [{
#             "smiles": "COc1ccc(CNCc2ccc(OC)cc2)cc1",
#             "product_metadata": {
#                 "name": "",
#                 "chemical_formula": "C16H19NO2",
#                 "mass": 257.141578848
#             }
#         }],
#         "conditions": {
#             "temperature": "Room temperature (20-25 \u00b0C)",
#             "pressure": "Atmospheric pressure",
#             "solvent": "Ethanol or methanol",
#             "time": "2-4 hours"
#         },
#         "reactionmetrics": [{
#             "scalabilityindex": "9",
#             "confidenceestimate": 0.88,
#             "closestliterature": ""
#         }]
#     }]
# }

In [4]:
# Enrich every step of the loaded route with reagents, conditions and the
# closest literature reaction. `step` is the very dict stored in
# output_data['steps'][idx], so writing through `step` updates output_data.
# NOTE(review): the `status` returned by each agent is never inspected, and
# `.extend` appends to the existing reagents, so re-running this cell adds the
# reagents again -- confirm both are intended.
for idx, step in enumerate(output_data['steps']):
    reactants, products = step['reactants'], step['products']

    status, reagents = reagent_agent(reactants,
                                     products,
                                     LLM="o1-preview-2024-09-12")
    step['reagents'].extend(reagents)

    status, conditions = conditions_agent(reactants, products,
                                          step['reagents'])
    step['conditions'] = conditions

    status, literature = literature_agent(reactants, products,
                                          step['reagents'],
                                          step['conditions'])
    step['reactionmetrics'][0]['closestliterature'] = literature


LookupError: <ContextVar name='logger' at 0x176672570>

In [5]:
print(json.dumps(output_data))

{"dependencies": {"1": ["2"], "2": ["3", "4"], "3": [], "4": []}, "steps": [{"step": "1", "reactants": [{"smiles": "COc1ccc(CN(Cc2ccc(OC)cc2)S(=O)(=O)Cc2noc3ccccc23)cc1", "reactant_metadata": {"name": "", "chemical_formula": "C24H24N2O5S", "mass": 452.14059286799994}}], "reagents": [{"smiles": "O=S(=O)(O)O", "product_metadata": {"name": "", "chemical_formula": "H2O4S", "mass": 97.96737954400001}}, {"smiles": "C1CCOC1", "product_metadata": {"name": "", "chemical_formula": "C4H8O", "mass": 72.057514876}}], "products": [{"smiles": "NS(=O)(=O)Cc1noc2ccccc12", "product_metadata": {"name": "", "chemical_formula": "C8H8N2O3S", "mass": 212.025563116}}], "conditions": {"temperature": "Reflux", "pressure": "Ambient pressure", "solvent": "1,4-Dioxane", "time": "2-4 hours"}, "reactionmetrics": [{"scalabilityindex": "8", "confidenceestimate": 0.99, "closestliterature": "Sulfonylation of amines with sulfonyl chlorides"}]}, {"step": "2", "reactants": [{"smiles": "O=S(=O)(Cl)Cc1noc2ccccc12", "reactant

In [None]:
out = {
    "dependencies": {
        "1": ["2"],
        "2": ["3", "4"],
        "3": [],
        "4": []
    },
    "steps": [{
        "step":
        "1",
        "reactants": [{
            "smiles": "COc1ccc(CN(Cc2ccc(OC)cc2)S(=O)(=O)Cc2noc3ccccc23)cc1",
            "reactant_metadata": {
                "name": "",
                "chemical_formula": "C24H24N2O5S",
                "mass": 452.14059286799994
            }
        }],
        "reagents": [{
            "smiles": "O=S(=O)(O)O",
            "product_metadata": {
                "name": "",
                "chemical_formula": "H2O4S",
                "mass": 97.96737954400001
            }
        }, {
            "smiles": "C1CCOC1",
            "product_metadata": {
                "name": "",
                "chemical_formula": "C4H8O",
                "mass": 72.057514876
            }
        }],
        "products": [{
            "smiles": "NS(=O)(=O)Cc1noc2ccccc12",
            "product_metadata": {
                "name": "",
                "chemical_formula": "C8H8N2O3S",
                "mass": 212.025563116
            }
        }],
        "conditions": {
            "temperature": "Reflux",
            "pressure": "Ambient pressure",
            "solvent": "1,4-Dioxane",
            "time": "2-4 hours"
        },
        "reactionmetrics": [{
            "scalabilityindex":
            "8",
            "confidenceestimate":
            0.99,
            "closestliterature":
            "Sulfonylation of amines with sulfonyl chlorides"
        }]
    }, {
        "step":
        "2",
        "reactants": [{
            "smiles": "O=S(=O)(Cl)Cc1noc2ccccc12",
            "reactant_metadata": {
                "name": "",
                "chemical_formula": "C8H6ClNO3S",
                "mass": 230.975691732
            }
        }, {
            "smiles": "COc1ccc(CNCc2ccc(OC)cc2)cc1",
            "reactant_metadata": {
                "name": "",
                "chemical_formula": "C16H19NO2",
                "mass": 257.141578848
            }
        }],
        "reagents": [{
            "smiles": "[Na+].[OH-]",
            "product_metadata": {
                "name": "",
                "chemical_formula": "HNaO",
                "mass": 39.99250893200001
            }
        }],
        "products": [{
            "smiles": "COc1ccc(CN(Cc2ccc(OC)cc2)S(=O)(=O)Cc2noc3ccccc23)cc1",
            "product_metadata": {
                "name": "",
                "chemical_formula": "C24H24N2O5S",
                "mass": 452.14059286799994
            }
        }],
        "conditions": {
            "temperature":
            "Room temperature (20-25 \u00b0C)",
            "pressure":
            "Atmospheric pressure",
            "solvent":
            "Polar aprotic solvent such as DMF, DMSO, or acetonitrile",
            "time":
            "2-24 hours, depending on the specific substrates and scale of the reaction"
        },
        "reactionmetrics": [{
            "scalabilityindex":
            "8",
            "confidenceestimate":
            0.86,
            "closestliterature":
            "Sulfonamide formation via nucleophilic substitution of a sulfonyl chloride with a secondary amine"
        }]
    }, {
        "step":
        "3",
        "reactants": [{
            "smiles": "O=P(Cl)(Cl)Cl",
            "reactant_metadata": {
                "name": "",
                "chemical_formula": "Cl3OP",
                "mass": 151.87523428999998
            }
        }, {
            "smiles": "O=S(=O)(O)Cc1noc2ccccc12",
            "reactant_metadata": {
                "name": "",
                "chemical_formula": "C8H7NO4S",
                "mass": 213.009578704
            }
        }],
        "reagents": [{
            "smiles": "O",
            "product_metadata": {
                "name": "",
                "chemical_formula": "H2O",
                "mass": 18.010564684
            }
        }],
        "products": [{
            "smiles": "O=S(=O)(Cl)Cc1noc2ccccc12",
            "product_metadata": {
                "name": "",
                "chemical_formula": "C8H6ClNO3S",
                "mass": 230.975691732
            }
        }],
        "conditions": {
            "temperature": "0-25 \u00b0C",
            "pressure": "1 atm",
            "solvent":
            "Dichloromethane (CH2Cl2) or other non-polar aprotic solvent",
            "time": "1-4 hours"
        },
        "reactionmetrics": [{
            "scalabilityindex":
            "9",
            "confidenceestimate":
            0.93,
            "closestliterature":
            "Chlorination of sulfonic acids using phosphorus oxychloride (POCl3)"
        }]
    }, {
        "step":
        "4",
        "reactants": [{
            "smiles": "COc1ccc(C=O)cc1",
            "reactant_metadata": {
                "name": "",
                "chemical_formula": "C8H8O2",
                "mass": 136.052429496
            }
        }, {
            "smiles": "COc1ccc(CN)cc1",
            "reactant_metadata": {
                "name": "",
                "chemical_formula": "C8H11NO",
                "mass": 137.084063972
            }
        }],
        "reagents": [{
            "smiles": "[H][H]",
            "product_metadata": {
                "name": "",
                "chemical_formula": "H2",
                "mass": 2.015650064
            }
        }, {
            "smiles": "[Na+].[BH4-]",
            "product_metadata": {
                "name": "",
                "chemical_formula": "H4BNa",
                "mass": 38.030374808000005
            }
        }, {
            "smiles": "CC(=O)O.[H][H]",
            "product_metadata": {
                "name": "",
                "chemical_formula": "C2H6O2",
                "mass": 62.036779432
            }
        }],
        "products": [{
            "smiles": "COc1ccc(CNCc2ccc(OC)cc2)cc1",
            "product_metadata": {
                "name": "",
                "chemical_formula": "C16H19NO2",
                "mass": 257.141578848
            }
        }],
        "conditions": {
            "temperature": "Room temperature (20-25\u00b0C)",
            "pressure": "Atmospheric pressure",
            "solvent": "Methanol (MeOH) or Ethanol (EtOH)",
            "time": "1-3 hours"
        },
        "reactionmetrics": [{
            "scalabilityindex":
            "9",
            "confidenceestimate":
            0.88,
            "closestliterature":
            "Reductive amination of an aldehyde with a primary amine using sodium borohydride"
        }]
    }]
}

# Deepseek

In [2]:
from src.utils.llm import call_LLM, llm_pipeline
from src.cache import clear_cache_for_molecule

In [3]:
mol2 = "NC(=O)C1=NC(F)=CN=C1O"

In [4]:
mol = "ClC(C(OC)=C1)=CC2=C1[C@@H]3[C@@H](N(C)CC2)CCC4=CC=CC=C43"

In [8]:
clear_cache_for_molecule(mol)

In [9]:
out = call_LLM(mol, LLM="deepinfra/deepseek-ai/DeepSeek-R1")

Calling deepinfra/deepseek-ai/DeepSeek-R1 with molecule: ClC(C(OC)=C1)=CC2=C1[C@@H]3[C@@H](N(C)CC2)CCC4=CC=CC=C43
Detected seven member ring in molecule: ClC(C(OC)=C1)=CC2=C1[C@@H]3[C@@H](N(C)CC2)CCC4=CC=CC=C43
Advanced Prompt: False
Received response from LLM: <think>
Okay, let's tackle this retrosynthesis problem. The target molecule is ClC(C(OC)=C1)=CC2=C1[C@@H]3[C@@H](N(C)CC2)CCC4=CC=CC=C43. First, I need to analyze the structure to identify possible disconnections. 

Looking at the molecule, there's a complex fused ring system. The chlorine substituent is on a benzene ring, which might suggest a Friedel-Crafts alkylation or a nucleophilic aromatic substitution, but the presence of an electron-withdrawing group like Cl could make that less likely. Alternatively, the ester group (OC) attached to the ring could be introduced via an esterification reaction. 

The bicyclic structure with a nitrogen atom suggests a possible Mannich reaction or some kind of cyclization. The seven-membere

In [7]:
print(out[1])

<think>
Okay, let's tackle this retrosynthesis problem. The target molecule is ClC(C(OC)=C1)=CC2=C1[C@@H]3[C@@H](N(C)CC2)CCC4=CC=CC=C43. Hmm, that's a bit complex. Let me break it down.

First, I'll look for functional groups and possible disconnections. The molecule has a chloro group, an ether (OC), a tertiary amine (N(C)), and a fused ring system. The fused rings might suggest a Diels-Alder reaction or some cycloaddition. Alternatively, maybe a ring-forming alkylation or amination.

Looking at the structure, there's a bicyclic system with a bridge. The chlorine is attached to a benzene ring, which might have been introduced via electrophilic substitution. The ether group (OC) could be from an alkylation of a phenol. The tertiary amine suggests a possible Mannich reaction or some kind of nucleophilic substitution where an amine attacks an electrophilic carbon.

Let me consider the bridge formation. The bridge between the two rings could be formed via a Friedel-Crafts alkylation if th

In [8]:
# Third test molecule (SMILES).
mol3 = "Oc1cc2c(cc1Cl)[C@@H]1[C@@H](N(C)CC2)CCC2=CC=CC=C21"
# Presumably removes any cached result for this molecule so the call below
# reaches the model -- verify against src.cache.
clear_cache_for_molecule(mol3)
# call_LLM is unpacked as a 2-tuple here; the first element is discarded and
# the second kept as `out2`. No LLM argument is passed, so the default is used.
_, out2 = call_LLM(mol3)

Calling claude-3-opus-20240229 with molecule: Oc1cc2c(cc1Cl)[C@@H]1[C@@H](N(C)CC2)CCC2=CC=CC=C21
Detected seven member ring in molecule: Oc1cc2c(cc1Cl)[C@@H]1[C@@H](N(C)CC2)CCC2=CC=CC=C21
Received response from LLM: Here is my retrosynthetic analysis for the target molecule with SMILES Oc1cc2c(cc1Cl)[C@@H]1[C@@H](N(C)CC2)CCC2=CC=CC=C21:

<cot>
<thinking type="initial_assessment">
Initial structural analysis:
- The molecule has a fused tetracyclic ring system
- It contains a phenol, an aromatic chloride, a tertiary amine, and a cyclohexene ring
- There are two stereogenic centers, both with defined absolute stereochemistry
- The amine is methylated and forms part of a tetrahydroisoquinoline substructure
- The phenol and chloride are para to each other on the benzene ring

Reflection: The structure is relatively complex with multiple ring fusions and heteroatom substitution. The two stereocenters will be important to consider in the retrosynthesis. There are no obvious symmetry elements 

In [1]:
# Assume openai>=1.0.0
import os

from dotenv import load_dotenv
from openai import OpenAI

# Make the cell runnable on a fresh kernel: pull secrets from the local .env.
load_dotenv()

# Create an OpenAI client with your deepinfra token and endpoint.
# SECURITY: the token must come from the environment (DEEPINFRA_API_KEY) and
# never be written into the notebook. The key that was previously hardcoded
# in this cell has been committed and should be revoked.
openai = OpenAI(
    api_key=os.environ["DEEPINFRA_API_KEY"],
    base_url="https://api.deepinfra.com/v1/openai",
)

# Minimal smoke test of the DeepSeek-R1 endpoint.
chat_completion = openai.chat.completions.create(
    model="deepseek-ai/DeepSeek-R1",
    messages=[{
        "role": "user",
        "content": "Hello"
    }],
)

# Show the reply followed by the prompt / completion token usage.
print(chat_completion.choices[0].message.content)
print(chat_completion.usage.prompt_tokens,
      chat_completion.usage.completion_tokens)

# Hello! It's nice to meet you. Is there something I can help you with, or would you like to chat?
# 11 25


<think>

</think>

Hello! How can I assist you today? 😊
4 16


# Test: new exception handling

In [None]:
import ast
import litellm
from typing import Optional
from dotenv import load_dotenv
from litellm import completion
from src.variables import USER_PROMPT, SYS_PROMPT
from src.variables import USER_PROMPT_OPENAI, SYS_PROMPT_OPENAI, OPENAI_MODELS, DEEPSEEK_MODELS
from src.cache import cache_results
# from src.utils.utils_molecule import calc_mol_wt, validity_check
from src.utils.job_context import logger as context_logger

# Load variables from a local .env file into the process environment
# (presumably the provider / langfuse API keys -- confirm against the project's .env).
load_dotenv()

# set the success callback to langfuse for logging
litellm.success_callback = ["langfuse"]
# NOTE(review): per the litellm docs this drops request parameters that the
# selected provider does not support instead of raising -- confirm.
litellm.drop_params = True

# Tracing metadata passed as `metadata=` to `completion(...)` in call_LLM.
metadata = {
    "generation_name": "prod",  # set langfuse generation name
    "project": "Retrosynthesis",  # set langfuse project name
    "version": "0.0.3",  # set langfuse version
    "trace_name": "prod",  # set langfuse Trace Name
    "trace_user_id": "sv",  # set langfuse Trace User ID
    "session_id": "prod",  # set langfuse Session ID
}


@cache_results
def call_LLM(molecule: str,
             LLM: str = "claude-3-opus-20240229",
             temperature: float = 0.0,
             messages: Optional[list[dict]] = None):
    """Calls the LLM model to predict the next step.

    Parameters
    ----------
    molecule : str
        Target SMILES; substituted for ``{target_smiles}`` in the user prompt
        when no ``messages`` are supplied.
    LLM : str
        Model identifier forwarded to ``litellm.completion``. Models listed in
        ``OPENAI_MODELS`` or ``DEEPSEEK_MODELS`` get a user-only prompt
        (``USER_PROMPT_OPENAI``) and an 8192-token budget; every other model
        gets ``SYS_PROMPT`` + ``USER_PROMPT`` and a 4096-token budget.
    temperature : float
        Sampling temperature forwarded to the model.
    messages : Optional[list[dict]]
        Ready-made chat messages; when given, no prompt is built.

    Returns
    -------
    tuple[int, str]
        ``(200, text)`` on success, ``(404, "")`` when both attempts fail.

    Notes
    -----
    The first attempt passes the langfuse ``metadata``; if it raises, the
    request is retried once without ``metadata``. Both failures are printed.
    """
    # logger = context_logger.get() # NOTE: cmnt logger while testing
    # logger.info(f"Calling {LLM} with molecule: {molecule}")

    if LLM in OPENAI_MODELS or LLM in DEEPSEEK_MODELS:
        if messages is None:
            messages = [{
                "role":
                "user",
                "content":
                USER_PROMPT_OPENAI.replace('{target_smiles}', molecule)
            }]
        max_completion_tokens = 8192
    else:
        if messages is None:
            messages = [{
                "role": "system",
                "content": SYS_PROMPT
            }, {
                "role":
                "user",
                "content":
                USER_PROMPT.replace('{target_smiles}', molecule)
            }]
        max_completion_tokens = 4096

    # Shared by both attempts so the retry keeps the model-specific token
    # budget (it used to fall back to a hardcoded 4096 and could truncate
    # OpenAI / DeepSeek answers).
    request_kwargs = dict(model=LLM,
                          messages=messages,
                          max_completion_tokens=max_completion_tokens,
                          temperature=temperature,
                          seed=42,
                          top_p=0.9)

    try:
        response = completion(
            **request_kwargs,
            metadata=metadata)  # NOTE: Why it is necessary to pass metadata
        res_text = response.choices[0].message.content
    except Exception as first_error:
        # logger.info(f"Error in calling {LLM}: {first_error}")
        # logger.info(f"Retrying call to {LLM}")
        print(first_error)
        try:
            # Retry without the tracing metadata.
            response = completion(**request_kwargs)
            res_text = response.choices[0].message.content
        except Exception as second_error:
            # logger.info(f"2nd Error in calling {LLM}: {second_error}")
            # logger.info(f"Exiting call to {LLM}")
            # Surface the reason instead of failing silently.
            print(second_error)
            return 404, ""
    # logger.info(f"Received response from LLM: {res_text}")
    return 200, res_text


In [None]:
# NOTE(review): this cell redefines call_LLM from the previous cell; whichever
# cell ran last wins. The only intended difference is that the retry here
# keeps passing `metadata`.
@cache_results
def call_LLM(molecule: str,
             LLM: str = "claude-3-opus-20240229",
             temperature: float = 0.0,
             messages: Optional[list[dict]] = None):
    """Calls the LLM model to predict the next step.

    Parameters
    ----------
    molecule : str
        Target SMILES; substituted for ``{target_smiles}`` in the user prompt
        when no ``messages`` are supplied.
    LLM : str
        Model identifier forwarded to ``litellm.completion``. Models listed in
        ``OPENAI_MODELS`` or ``DEEPSEEK_MODELS`` get a user-only prompt
        (``USER_PROMPT_OPENAI``) and an 8192-token budget; every other model
        gets ``SYS_PROMPT`` + ``USER_PROMPT`` and a 4096-token budget.
    temperature : float
        Sampling temperature forwarded to the model.
    messages : Optional[list[dict]]
        Ready-made chat messages; when given, no prompt is built.

    Returns
    -------
    tuple[int, str]
        ``(200, text)`` on success, ``(404, "")`` when both attempts fail.

    Notes
    -----
    The identical request (including ``metadata``) is attempted at most
    twice. Both failures are printed.
    """
    # logger = context_logger.get() # NOTE: cmnt logger while testing
    # logger.info(f"Calling {LLM} with molecule: {molecule}")

    if LLM in OPENAI_MODELS or LLM in DEEPSEEK_MODELS:
        if messages is None:
            messages = [{
                "role":
                "user",
                "content":
                USER_PROMPT_OPENAI.replace('{target_smiles}', molecule)
            }]
        max_completion_tokens = 8192
    else:
        if messages is None:
            messages = [{
                "role": "system",
                "content": SYS_PROMPT
            }, {
                "role":
                "user",
                "content":
                USER_PROMPT.replace('{target_smiles}', molecule)
            }]
        max_completion_tokens = 4096

    # Shared by both attempts so the retry keeps the model-specific token
    # budget (it used to fall back to a hardcoded 4096 and could truncate
    # OpenAI / DeepSeek answers).
    request_kwargs = dict(
        model=LLM,
        messages=messages,
        max_completion_tokens=max_completion_tokens,
        temperature=temperature,
        seed=42,
        top_p=0.9,
        metadata=metadata)  # NOTE: Why it is necessary to pass metadata

    try:
        response = completion(**request_kwargs)
        res_text = response.choices[0].message.content
    except Exception as first_error:
        # logger.info(f"Error in calling {LLM}: {first_error}")
        # logger.info(f"Retrying call to {LLM}")
        print(first_error)
        try:
            response = completion(**request_kwargs)
            res_text = response.choices[0].message.content
        except Exception as second_error:
            # logger.info(f"2nd Error in calling {LLM}: {second_error}")
            # logger.info(f"Exiting call to {LLM}")
            # Surface the reason instead of failing silently.
            print(second_error)
            return 404, ""
    # logger.info(f"Received response from LLM: {res_text}")
    return 200, res_text


In [None]:
help(completion)

Help on function completion in module litellm.main:

completion(model: str, messages: List = [], timeout: Union[float, str, openai.Timeout, NoneType] = None, temperature: Optional[float] = None, top_p: Optional[float] = None, n: Optional[int] = None, stream: Optional[bool] = None, stream_options: Optional[dict] = None, stop=None, max_completion_tokens: Optional[int] = None, max_tokens: Optional[int] = None, modalities: Optional[List[typing_extensions.Literal['text', 'audio']]] = None, prediction: Optional[openai.types.chat.chat_completion_prediction_content_param.ChatCompletionPredictionContentParam] = None, audio: Optional[openai.types.chat.chat_completion_audio_param.ChatCompletionAudioParam] = None, presence_penalty: Optional[float] = None, frequency_penalty: Optional[float] = None, logit_bias: Optional[dict] = None, user: Optional[str] = None, reasoning_effort: Optional[Literal['low', 'medium', 'high']] = None, response_format: Union[dict, Type[pydantic.main.BaseModel], NoneType] =

#### utils_molecule

In [None]:
import os
from rdkit import Chem
from rdkit.Chem import AllChem, rdMolDescriptors
from rdkit.Chem.rdMolDescriptors import CalcMolFormula
from rdkit.Chem.Descriptors import ExactMolWt
# import rootutils

import joblib
import rootutils
from src.variables import REACTION_ENCODING_NAMES, ENCODING_SCALABILITY
from src.cache import cache_results
# from src.utils.job_context import logger as context_logger

# root_dir = rootutils.setup_root(__file__,
#                                 indicator=".project-root",
#                                 pythonpath=True)
# Resolve the project root starting from the current working directory (a
# notebook has no __file__), using the ".project-root" marker file as the
# indicator. pythonpath=True asks rootutils to put that root on sys.path.
root_dir = rootutils.setup_root(".",
                                indicator=".project-root",
                                pythonpath=True)

# Location of the reaction-classification model, built relative to root_dir.
# NOTE(review): os.getenv returns None when RXN_CLASSIFICATION_MODEL_PATH is
# unset, which silently produces a path ending in "/None" — confirm the
# variable is exported (e.g. through load_dotenv) before this cell runs.
RXN_CLASSIFICATION_MODEL_PATH = f"{root_dir}/{os.getenv('RXN_CLASSIFICATION_MODEL_PATH')}"


def is_valid_smiles(smiles: str) -> bool:
    """Check if the SMILES string is valid

    Parameters
    ----------
    smiles : str
        smiles string

    Returns
    -------
    bool
        True if RDKit parses the SMILES into a molecule, False otherwise
        (including when parsing raises, e.g. for non-string input)
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
    except Exception:
        # Catch Exception instead of using a bare except so that
        # KeyboardInterrupt / SystemExit still propagate to the caller.
        return False
    # MolFromSmiles signals an unparsable string by returning None.
    return mol is not None


def substructure_matching(target_smiles: str, query_smiles: str) -> int:
    """Check if the query substructure is present in the target molecule

    Parameters
    ----------
    target_smiles : str
        SMILES string of the target molecule
    query_smiles : str
        SMILES string of the query molecule

    Returns
    -------
    int
        1 if the query substructure is present in the target molecule, 0
        otherwise (also 0 when either SMILES cannot be parsed)
    """
    # logger = context_logger.get()
    # Convert SMILES to RDKit molecule objects. A parse that raises is printed
    # and recorded as None so the outcome no longer depends on an unbound
    # variable being caught further down.
    molecules = []
    for smiles in (target_smiles, query_smiles):
        try:
            molecules.append(Chem.MolFromSmiles(smiles))
        except Exception as e:
            # logger.info(f"Error in parsing molecule: {smiles}")
            print(e)
            molecules.append(None)
    target_molecule, query_molecule = molecules

    # MolFromSmiles returns None for an unparsable string: nothing to match.
    if target_molecule is None or query_molecule is None:
        return 0

    # Check if the query substructure is present in the target molecule
    try:
        return 1 if target_molecule.HasSubstructMatch(query_molecule) else 0
    except Exception:
        return 0


@cache_results
def validity_check(molecule, res_molecules, res_explanations, res_confidence):
    """Check the validity of the molecules obtained from LLM

    A candidate precursor is discarded when it cannot be parsed, when it is
    the same molecule as the target, or when the target is found as a
    substructure of the candidate.

    Parameters
    ----------
    molecule : str
        Target molecule for retrosynthesis
    res_molecules : list
        List of molecules obtained from LLM; each entry is either a list of
        precursor SMILES (one pathway) or a single SMILES string
    res_explanations : list
        List of explanations obtained from LLM
    res_confidence : list
        List of confidence scores obtained from LLM

    Returns
    -------
    list
        List of valid pathways
    list
        List of valid explanations
    list
        List of valid confidence scores
    """
    # logger = context_logger.get()
    valid_pathways = []
    valid_explanations = []
    valid_confidence = []
    for idx, smile_list in enumerate(res_molecules):
        if isinstance(smile_list, list):
            valid = []
            for smiles in smile_list:
                if not is_valid_smiles(smiles):
                    # Was a plain string without the f-prefix, so the
                    # placeholder was printed literally; report the candidate
                    # that actually failed to parse.
                    print(
                        f"Molecule : {smiles} is invalid or cannot be parsed")
                elif are_molecules_same(molecule, smiles):
                    print(
                        f"Molecule : {molecule} is same as target molecule"
                    )
                elif substructure_matching(smiles, molecule):
                    # Was logger.info(...) although no logger is defined in
                    # this function, which raised NameError on this branch.
                    print(
                        f"Molecule : {molecule} is substructure of target molecule"
                    )
                else:
                    valid.append(smiles)
            # NOTE(review): a list pathway is kept only when at least two
            # precursors survive, and survivors of a partially rejected
            # pathway are kept as-is — confirm both are intended.
            if len(valid) >= 2:
                valid_pathways.append(valid)
                valid_explanations.append(res_explanations[idx])
                valid_confidence.append(res_confidence[idx])
        else:
            # Single precursor given as a bare value. The original compared
            # the loop variable `smiles` from the list branch here, which is
            # either undefined or left over from an earlier pathway.
            if not is_valid_smiles(smile_list):
                print("Molecule is invalid or cannot be parsed")
            elif are_molecules_same(molecule, smile_list):
                print("Molecule is same as target molecule")
            elif substructure_matching(smile_list, molecule):
                print(
                    f"Molecule : {molecule} is substructure of target molecule {smile_list}"
                )
            else:
                valid_pathways.append([smile_list])
                valid_explanations.append(res_explanations[idx])
                valid_confidence.append(res_confidence[idx])
    # logger.info(
    #     f"Obtained {len(valid_pathways)} valid pathways after validity test: {valid_pathways}"
    # )
    return valid_pathways, valid_explanations, valid_confidence


def calc_mol_wt(mol: str) -> float:
    """Calculate the molecular weight of a molecule

    The value comes from RDKit's ExactMolWt.

    Parameters
    ----------
    mol : str
        SMILES string of the molecule

    Returns
    -------
    float
        molecular weight of the molecule, or 0.0 when the SMILES cannot be
        parsed or the calculation fails
    """
    # logger = context_logger.get()
    try:
        parsed = Chem.MolFromSmiles(mol)
        if parsed is None:
            # Unparsable SMILES: handle it explicitly instead of relying on
            # ExactMolWt raising for a None argument.
            return 0.0
        return ExactMolWt(parsed)
    except Exception:
        # logger.info(f"Error in calculating molecular weight: {mol}")
        return 0.0


def calc_chemical_formula(mol: str):
    """Calculate the chemical formula of a molecule

    Parameters
    ----------
    mol : str
        SMILES string of the molecule

    Returns
    -------
    str
        molecular formula of the molecule, or "N/A" when the SMILES cannot
        be parsed or the calculation fails
    """
    # logger = context_logger.get()
    try:
        parsed = Chem.MolFromSmiles(mol)
        if parsed is None:
            # Unparsable SMILES: handle it explicitly instead of relying on
            # CalcMolFormula raising for a None argument.
            return "N/A"
        return CalcMolFormula(parsed)
    except Exception:
        # logger.info(f"Error in calculating formula: {mol}")
        return "N/A"


def are_molecules_same(smiles1: str, smiles2: str) -> bool:
    """Check whether two SMILES strings describe the same molecule

    Parameters
    ----------
    smiles1 : str
        SMILES string of the first molecule
    smiles2 : str
        SMILES string of the second molecule

    Returns
    -------
    bool
        True if both molecules have the same canonical SMILES, False otherwise

    Raises
    ------
    ValueError
        If either SMILES string cannot be parsed
    """
    # Convert SMILES strings to RDKit molecule objects
    mol1 = Chem.MolFromSmiles(smiles1)
    mol2 = Chem.MolFromSmiles(smiles2)

    if mol1 is None or mol2 is None:
        raise ValueError("Invalid SMILES string provided.")

    # Canonical SMILES equality is the identity test. The previous fallback
    # that also accepted equal 1024-bit Morgan bit vectors was removed: a bit
    # vector only records which atom environments occur (not how often) and
    # is folded, so distinct molecules such as chain homologues can share it
    # and were wrongly reported as the same molecule.
    canonical_smiles1 = Chem.MolToSmiles(mol1, canonical=True)
    canonical_smiles2 = Chem.MolToSmiles(mol2, canonical=True)
    return canonical_smiles1 == canonical_smiles2


def compute_fingerprint(smiles, radius=2, nBits=2048):
    """Build the Morgan fingerprint of a molecule as a plain list of bits

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule
    radius : int, optional
        Radius passed to the Morgan fingerprint, by default 2
    nBits : int, optional
        Length of the bit vector, by default 2048

    Returns
    -------
    list or None
        The fingerprint bits as a list, or None if the SMILES cannot be parsed
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is not None:
        bit_vector = AllChem.GetMorganFingerprintAsBitVect(molecule,
                                                           radius,
                                                           nBits=nBits)
        return list(bit_vector)
    return None


def sub_structure_matching(target_smiles: str, query_smiles: str) -> bool:
    """Check if the query substructure is present in the target molecule

    Parameters
    ----------
    target_smiles : str
        SMILES string of the target molecule
    query_smiles : str
        SMILES string of the query molecule

    Returns
    -------
    bool
        True if the query is a substructure of the target, False otherwise
        (also False when either SMILES cannot be parsed)
    """
    target_molecule = Chem.MolFromSmiles(target_smiles)
    query_molecule = Chem.MolFromSmiles(query_smiles)

    # MolFromSmiles returns None for an unparsable string; treat that as
    # "no match" (like substructure_matching does) rather than failing with
    # an AttributeError on None.
    if target_molecule is None or query_molecule is None:
        return False

    return bool(target_molecule.HasSubstructMatch(query_molecule))


def get_reaction_type(mol1, mol2, model_path):
    """Get the reaction type of a reaction

    Parameters
    ----------
    mol1 : str
        SMILES string of the first molecule
    mol2 : str
        SMILES string of the second molecule
    model_path : str
        Path of the classifier file loaded with joblib

    Returns
    -------
    tuple
        The entry of REACTION_ENCODING_NAMES for the predicted class, and the
        predicted class itself

    Raises
    ------
    ValueError
        If a fingerprint cannot be computed for either molecule
    """
    mol1_fingerprint = compute_fingerprint(mol1)
    mol2_fingerprint = compute_fingerprint(mol2)
    # compute_fingerprint returns None for an unparsable SMILES; fail with a
    # clear message before loading the model instead of hitting an opaque
    # "NoneType + list" TypeError afterwards.
    if mol1_fingerprint is None or mol2_fingerprint is None:
        raise ValueError(
            f"Cannot compute fingerprints for reaction: {mol1!r}, {mol2!r}")

    # NOTE(security): joblib.load unpickles the file, which can execute
    # arbitrary code — only load model files from a trusted location.
    clf = joblib.load(model_path)
    # The classifier receives one row: both fingerprints concatenated.
    reaction_type = clf.predict([mol1_fingerprint + mol2_fingerprint])
    return REACTION_ENCODING_NAMES[reaction_type[0]], reaction_type[0]


def calc_confidence_estimate(probability: float) -> float:
    """Turn a prediction probability into a confidence estimate

    Low probabilities are lifted before rounding: values below 0.3 become
    ``1 - p``, values in [0.3, 0.45) gain 0.5 and values in [0.45, 0.6) gain
    0.3. Anything from 0.6 upwards is left as is.

    Parameters
    ----------
    probability : float
        Probability of the prediction; when a list is given its first
        element is used

    Returns
    -------
    float
        Confidence estimate, rounded to 2 decimal places and capped at 0.99
    """
    value = probability[0] if isinstance(probability, list) else probability

    if value < 0.3:
        value = 1 - value
    elif value < 0.45:
        value = value + 0.5
    elif value < 0.6:
        value = value + 0.3

    # two decimal places, never above 0.99
    return min(round(value, 2), 0.99)


def calc_scalability_index(mol1, mol2):
    """Look up the scalability entry for the reaction type of two molecules

    The reaction class is predicted with the model at
    RXN_CLASSIFICATION_MODEL_PATH and the matching ENCODING_SCALABILITY entry
    is returned as a string.
    """
    _reaction_name, reaction_code = get_reaction_type(
        mol1, mol2, RXN_CLASSIFICATION_MODEL_PATH)
    scalability = ENCODING_SCALABILITY[reaction_code]
    return str(scalability)


def calc_yield(mol1, mol2):
    """Calculate the yield of a reaction

    Placeholder: both arguments are ignored and the constant string "#" is
    returned.

    Parameters
    ----------
    mol1 :
        First molecule of the reaction (unused)
    mol2 :
        Second molecule of the reaction (unused)

    Returns
    -------
    str
        Always "#"
    """
    return "#"


In [6]:
res = call_LLM("CC(=O)CCC")

In [7]:
res

(200,
 'Here is the single-step retrosynthesis for the molecule CC(=O)CCC:\n\n<cot>\n<thinking>\nThe target molecule CC(=O)CCC is a ketone with the formula C5H10O. It has a methyl ketone functional group and a propyl alkyl chain.\n\nPossible retrosynthetic disconnections to consider:\n1. Oxidation of a secondary alcohol \n2. Grignard addition to an ester\n3. Oxidation of an alkene\n4. Friedel-Crafts acylation\n5. Wacker oxidation of a terminal alkene\n</thinking>\n\n<thinking>\nOxidation of a secondary alcohol:\nThe ketone could be formed by oxidation of the corresponding secondary alcohol CC(O)CCC. This would be a reliable transformation.\nPrecursor: CC(O)CCC\nReaction: Alcohol oxidation \nReagents: PCC, PDC, Swern, Dess-Martin periodinane\nConfidence: High\n</thinking>\n\n<thinking>\nGrignard addition to an ester:\nThe ketone could be formed by Grignard addition of ethyl magnesium bromide to ethyl propionate, followed by hydrolysis. A good yield is likely.\nPrecursors: CCC(=O)OCC and

In [25]:
res

(200,
 'Here is the single-step retrosynthesis for the molecule CC(=O)CCC:\n\n<cot>\n<thinking>\nThe target molecule CC(=O)CCC is a ketone with the formula C5H10O. It has a methyl ketone functional group and a propyl alkyl chain.\n\nPossible retrosynthetic disconnections to consider:\n1. Oxidation of a secondary alcohol \n2. Grignard addition to an ester\n3. Oxidation of an alkene\n4. Friedel-Crafts acylation\n5. Wacker oxidation of a terminal alkene\n</thinking>\n\n<thinking>\nOxidation of a secondary alcohol:\nThe ketone could be formed by oxidation of the corresponding secondary alcohol CC(O)CCC. This would be a reliable transformation.\nPrecursor: CC(O)CCC\nReaction: Alcohol oxidation \nReagents: PCC, PDC, Swern, Dess-Martin periodinane\nConfidence: High\n</thinking>\n\n<thinking>\nGrignard addition to an ester:\nThe ketone could be formed by Grignard addition of ethyl magnesium bromide to ethyl propionate, followed by hydrolysis. A good yield is likely.\nPrecursors: CCC(=O)OCC and

In [11]:
type(res)

tuple

In [12]:
from src.utils.llm import llm_pipeline

In [None]:
def split_cot_json(res_text: str) -> tuple[int, list[str], str]:
    """Parse the LLM response to extract the thinking steps and json content

    Parameters
    ----------
    res_text : str
        The response text from the LLM model

    Returns
    -------
    tuple[int, list[str], str]
        The status code, thinking steps and json content. Status 200 on
        success; 501 with an empty step list and an error message when the
        input is not a string or has no complete <json>...</json> block.
        A response without <cot>/<thinking> tags yields an empty step list.
    """
    # logger = context_logger.get()
    error = (
        501, [],
        "ValueError: Error occurred while extracting content from LLM Model's response"
    )
    if not isinstance(res_text, str):
        return error

    # str.find returns -1 instead of raising, so the former slice-based
    # extraction reported 200 with garbage when a tag was missing. partition
    # makes a missing opening or closing tag explicit.
    _, json_open, after_json_open = res_text.partition("<json>")
    json_content, json_close, _ = after_json_open.partition("</json>")
    if not json_open or not json_close:
        # logger.info("Error in parsing LLM response: no <json> block")
        return error
    # Drop the newline that directly follows the opening tag, if any.
    if json_content.startswith("\n"):
        json_content = json_content[1:]

    # extract the content within <cot> </cot> tags as thinking content
    _, cot_open, after_cot_open = res_text.partition("<cot>")
    cot_content = after_cot_open.partition("</cot>")[0] if cot_open else ""

    # split the thinking content into individual steps based on the
    # <thinking> </thinking> tags; a step without a closing tag is kept
    # whole instead of losing its last character
    thinking_steps = []
    for chunk in cot_content.split("<thinking>")[1:]:
        step = chunk.partition("</thinking>")[0]
        thinking_steps.append(step[1:] if step.startswith("\n") else step)

    return 200, thinking_steps, json_content


def split_json_openAI(res_text: str) -> tuple[int, str]:
    """Split the response text from OpenAI models to extract the molecules
    Note: OpenAI O-series models do not provide Chain of Thoughts (COT) in the response

    Unlike the other split_* helpers this function returns two values, not
    three, so callers must unpack it accordingly.

    Parameters
    ----------
    res_text : str
        The response text from the OpenAI model

    Returns
    -------
    tuple[int, str]
        the status code and json content. Status 200 on success; 502 with an
        error message when the input is not a string or has no complete
        <json>...</json> block.
    """
    # logger = context_logger.get()
    error = (
        502,
        "ValueError: Error occurred while extracting content from LLM response from OpenAI"
    )
    if not isinstance(res_text, str):
        return error

    # str.find returns -1 instead of raising, so the former slice-based
    # extraction reported 200 with garbage when a tag was missing.
    _, json_open, after_json_open = res_text.partition("<json>")
    json_content, json_close, _ = after_json_open.partition("</json>")
    if not json_open or not json_close:
        # logger.info("Error in parsing LLM response: no <json> block")
        return error
    # Drop the newline that directly follows the opening tag, if any.
    if json_content.startswith("\n"):
        json_content = json_content[1:]
    return 200, json_content


def split_json_deepseek(res_text: str) -> tuple[int, list[str], str]:
    """Parse the LLM response to extract the thinking content and json content

    Parameters
    ----------
    res_text : str
        The response text from the LLM model

    Returns
    -------
    tuple[int, list[str], str]
        The status code, thinking content and json content. On success the
        thinking content is the single string found between <think> and
        </think> (empty when the tag is absent); on failure the tuple is
        (500, [], "").
    """
    # logger = context_logger.get()
    if not isinstance(res_text, str):
        return 500, [], ""

    # str.find returns -1 instead of raising, so the former slice-based
    # extraction reported 200 with garbage when a tag was missing.
    _, json_open, after_json_open = res_text.partition("<json>")
    json_content, json_close, _ = after_json_open.partition("</json>")
    if not json_open or not json_close:
        # logger.info("Error in parsing LLM response: no <json> block")
        return 500, [], ""
        # return 503, [], "ValueError: Error occurred while extracting content from LLM response from DeepSeek"
    if json_content.startswith("\n"):
        json_content = json_content[1:]

    # extract the content within <think> </think> tags as thinking content.
    # The old offset of 6 was shorter than "<think>\n" (8 characters) and
    # left ">\n" at the start of the extracted text.
    _, think_open, after_think_open = res_text.partition("<think>")
    thinking_content = after_think_open.partition(
        "</think>")[0] if think_open else ""
    if thinking_content.startswith("\n"):
        thinking_content = thinking_content[1:]

    return 200, thinking_content, json_content


def validate_split_json(
        json_content: str) -> tuple[int, list[str], list[str], list[float]]:
    """Validate the split json content from LLM response

    Parameters
    ----------
    json_content : str
        The json content from the LLM response

    Returns
    -------
    tuple[int, list[str], list[str], list[float]]
        The status code, list of molecules, list of explanations and list of
        confidence scores. Status 200 on success; (504, [], [], []) when the
        content cannot be parsed or a required key is missing.
    """
    # Imported locally so the function does not depend on another cell
    # having imported these modules.
    import ast
    import json

    # logger = context_logger.get()
    try:
        try:
            # Proper JSON first: ast.literal_eval alone rejects valid JSON
            # that contains true / false / null.
            result_list = json.loads(json_content)
        except (TypeError, ValueError):
            # Fall back to Python-literal syntax (e.g. single-quoted keys),
            # which is what the previous implementation accepted.
            result_list = ast.literal_eval(json_content)
        res_molecules = result_list['data']
        res_explanations = result_list['explanation']
        res_confidence = result_list['confidence_scores']
    except Exception:
        # logger.info(f"Error in parsing response: {e}")
        return 504, [], [], []
    return 200, res_molecules, res_explanations, res_confidence


In [31]:
status_code, json_content = split_json_openAI(res)

In [32]:
print(res)

(200, 'Here is the single-step retrosynthesis for the molecule CC(=O)CCC:\n\n<cot>\n<thinking>\nThe target molecule CC(=O)CCC is a ketone with the formula C5H10O. It has a methyl ketone functional group and a propyl alkyl chain.\n\nPossible retrosynthetic disconnections to consider:\n1. Oxidation of a secondary alcohol \n2. Grignard addition to an ester\n3. Oxidation of an alkene\n4. Friedel-Crafts acylation\n5. Wacker oxidation of a terminal alkene\n</thinking>\n\n<thinking>\nOxidation of a secondary alcohol:\nThe ketone could be formed by oxidation of the corresponding secondary alcohol CC(O)CCC. This would be a reliable transformation.\nPrecursor: CC(O)CCC\nReaction: Alcohol oxidation \nReagents: PCC, PDC, Swern, Dess-Martin periodinane\nConfidence: High\n</thinking>\n\n<thinking>\nGrignard addition to an ester:\nThe ketone could be formed by Grignard addition of ethyl magnesium bromide to ethyl propionate, followed by hydrolysis. A good yield is likely.\nPrecursors: CCC(=O)OCC and 

In [33]:
status_code, json_content

(500, [])

In [34]:
status_code, thinking_steps, json_content = split_cot_json(res)

In [37]:
status_code, thinking_steps, json_content

(500, [], '')

In [35]:
thinking_steps

[]

In [36]:
json_content

''

In [None]:
def llm_pipeline(
    molecule: str,
    LLM: str = "claude-3-opus-20240229",
    messages: Optional[list[dict]] = None
) -> tuple[int, list[str], list[str], list[float]]:
    """Pipeline to call LLM and validate the results

    The LLM is queried up to six times, raising the temperature by 0.1 on
    every attempt (0.0 to 0.5), until an attempt yields at least one valid
    pathway.

    Parameters
    ----------
    molecule : str
        The target molecule for retrosynthesis
    LLM : str, optional
        LLM to be used for retrosynthesis , by default "claude-3-opus-20240229"
    messages : Optional[list[dict]], optional
        Conversation history, by default None

    Returns
    -------
    tuple[int, list[str], list[str], list[float]]
        The status code, output pathways, explanations and confidence scores.
        The status code is the one of the last attempt made; 403 (with empty
        lists) is returned when validation itself raises.
    """

    # logger = context_logger.get()
    # Initialise all outputs so the final return never hits an unbound name.
    output_pathways, output_explanations, output_confidence = [], [], []
    status_code = 200
    max_attempts = 6
    temperature_step = 0.1

    # A bounded for-loop replaces the float counter that was advanced only
    # after a fully successful attempt: any failing step used to `continue`
    # without progress, so a persistent failure looped forever.
    for attempt in range(max_attempts):
        run = round(attempt * temperature_step, 1)
        # logger.info(f"Calling LLM with molecule: {molecule} and run: {run}")
        status_code, res_text = call_LLM(molecule,
                                         LLM,
                                         messages=messages,
                                         temperature=run)
        if status_code != 200:
            print(f"Error in calling LLM: {res_text}")
            # logger.info(f"Error in calling LLM: {res_text}")
            continue

        # split_json_openAI returns two values, the other helpers three.
        if LLM in OPENAI_MODELS:
            status_code, json_content = split_json_openAI(res_text)
        elif LLM in DEEPSEEK_MODELS:
            status_code, _, json_content = split_json_deepseek(res_text)
        else:
            status_code, _, json_content = split_cot_json(res_text)
        if status_code != 200:
            print(f"Error in splitting cot json: {res_text}")
            # logger.info(f"Error in splitting cot json: {res_text}")
            continue

        try:
            status_code, res_molecules, res_explanations, res_confidence = validate_split_json(
                json_content)
        except Exception as e:
            print(e)
            print("validate_split_json failed.")
            # Keep the 4-value shape callers unpack (was a 2-tuple).
            return 403, [], [], []
        if status_code != 200:
            print(f"Error in validating split json content: {res_text}")
            # logger.info(
            #     f"Error in validating split json content: {res_text}")
            continue

        try:
            output_pathways, output_explanations, output_confidence = validity_check(
                molecule, res_molecules, res_explanations, res_confidence)
        except Exception as e:
            print(e)
            print("validity_check failed.")
            # Keep the 4-value shape callers unpack (was a 2-tuple).
            return 403, [], [], []
        # logger.info(f"Output Pathways: {output_pathways},\n\
        #         Output Explanations: {output_explanations},\n\
        #             Output Confidence: {output_confidence}")

        if output_pathways != []:
            break

    return status_code, output_pathways, output_explanations, output_confidence


In [None]:
status_code, output_pathways, output_explanations, output_confidence = llm_pipeline(
    molecule="CC(=O)CC")

In [30]:
status_code, output_pathways, output_explanations, output_confidence

(200,
 [['CC(C)=O', 'CC=O'], ['CCOC(C)=O', 'BrCC']],
 ['Aldol condensation of acetone and acetaldehyde under basic conditions',
  'Grignard addition of ethylmagnesium bromide to methyl acetate, followed by oxidation'],
 [0.8, 0.7])

## test cases

In [34]:
molecule = "CC(=O)CC"

In [62]:
molecule_F = "CC(=O)CkcncnC"

In [None]:
status_code, res_text = call_LLM(molecule, temperature=0.0)

In [36]:
status_code

200

In [37]:
print(res_text)

Here is the single-step retrosynthesis analysis for the molecule CC(=O)CC:

<cot>
<thinking>
The target molecule CC(=O)CC contains a ketone functional group. Possible retrosynthetic disconnections to consider are:
1) Disconnection of the C-C bond adjacent to the ketone, which could arise from an aldol condensation reaction.
2) Disconnection of the C-C bond on the other side of the ketone, which could come from a Grignard addition to a carboxylic acid derivative like an ester.
3) Reduction of the ketone to a secondary alcohol, which could then be derived from reduction of the corresponding carboxylic acid or ester.
</thinking>

<thinking>
For the aldol disconnection, the precursors would be acetone (CC(=O)C) and acetaldehyde (CC=O). The reaction would proceed via enolate formation of the acetone, followed by nucleophilic addition to the acetaldehyde. A base like NaOH or LDA would be needed.
Confidence: 0.8
</thinking>

<thinking>
For the Grignard addition, the precursors would be propan

In [38]:
LLM = "claude-3-opus-20240229"

In [None]:
status_code, res_text_F = call_LLM(molecule_F, temperature=0.0)

In [64]:
print(res_text_F)

Here is the retrosynthetic analysis for the target molecule CC(=O)CkcncnC:

<cot>
<thinking>
The target molecule contains an acetyl group (CC(=O)-) attached to a substituted pyrazole ring. Possible disconnections to consider are:
1. Disconnecting the C-C bond between the acetyl group and pyrazole ring 
2. Functionalizing the pyrazole ring
3. Synthesizing the pyrazole ring from acyclic precursors
</thinking>

<thinking>
Disconnecting the acetyl group C-C bond would give an acetyl electrophile and a pyrazole nucleophile. The acetyl electrophile could be acetic anhydride or acetyl chloride. The pyrazole would need an appropriate nucleophilic functional group, likely at the carbon adjacent to the two nitrogens. A hydroxy group could work well.
</thinking>

<thinking>
The pyrazole ring could potentially be functionalized via electrophilic aromatic substitution, such as a Friedel-Crafts acylation to install the acetyl group. This would require the pyrazole and an acylating agent like acetyl 

In [None]:
status_code, thinking_steps, json_content = split_cot_json(res_text_F)

In [50]:
status_code, thinking_steps, json_content

(200,
 ['The target molecule CC(=O)CWRKNVKC contains a ketone functional group (C=O) and appears to be a peptide based on the single letter amino acid sequence WRKNVKC. \n\nPotential retrosynthetic disconnections to consider:\n1. Disconnection of the ketone C-C bond to give an organometallic reagent and a carboxylic acid derivative \n2. Sequential disconnection of the peptide bonds starting from the C-terminus to give individual amino acid building blocks\n3. Disconnection of the ketone C-C bond and simultaneous disconnection of a peptide bond to give two peptide fragments\n',
  'Disconnection 1 would give an organometallic reagent like a Grignard or organolithium and a carboxylic acid derivative like an acid chloride or ester. The organometallic would need to be formed from the corresponding alkyl halide. This is a reasonable approach but the peptide functionality could interfere.\n',
  'Disconnection 2 is the most straightforward and common approach to peptide synthesis. Sequential d

In [51]:
print(json_content)

{
  "data": [
    ["CC(=O)Cl", "CWRKNVKC"],
    ["CC(=O)C", "WRKNVKC", "K", "N", "V", "K", "R", "W", "C"],
    ["CC(=O)CWR", "KNVKC"]
  ],
  "explanation": [
    "Disconnection of ketone C-C bond to give an acid chloride and the peptide. The acid chloride would be made from the corresponding carboxylic acid. The peptide could be made via SPPS.",
    "Sequential disconnection of peptide bonds from C to N terminus to give individual amino acids, which could be coupled together via SPPS. The N-terminal ketone would likely need to be protected during synthesis.", 
    "Disconnection of the ketone C-C bond and a peptide bond to give two peptide fragments, which could be joined via a final peptide coupling. Synthesis and coupling of the ketone-containing fragment could be challenging."
  ],
  "confidence_scores": [
    0.6,
    0.9,
    0.3
  ]
}



In [None]:
status_code, res_molecules, res_explanations, res_confidence = validate_split_json(
    json_content)

In [59]:
status_code, res_molecules, res_explanations, res_confidence

(200,
 [['CC(=O)Cl', 'CWRKNVKC'],
  ['CC(=O)C', 'WRKNVKC', 'K', 'N', 'V', 'K', 'R', 'W', 'C'],
  ['CC(=O)CWR', 'KNVKC']],
 ['Disconnection of ketone C-C bond to give an acid chloride and the peptide. The acid chloride would be made from the corresponding carboxylic acid. The peptide could be made via SPPS.',
  'Sequential disconnection of peptide bonds from C to N terminus to give individual amino acids, which could be coupled together via SPPS. The N-terminal ketone would likely need to be protected during synthesis.',
  'Disconnection of the ketone C-C bond and a peptide bond to give two peptide fragments, which could be joined via a final peptide coupling. Synthesis and coupling of the ketone-containing fragment could be challenging.'],
 [0.6, 0.9, 0.3])

In [None]:
output_pathways, output_explanations, output_confidence = validity_check(
    molecule, res_molecules, res_explanations, res_confidence)

In [61]:
output_pathways, output_explanations, output_confidence

([['CC(=O)C', 'N', 'C']],
 ['Sequential disconnection of peptide bonds from C to N terminus to give individual amino acids, which could be coupled together via SPPS. The N-terminal ketone would likely need to be protected during synthesis.'],
 [0.9])

In [None]:
""" Recursive function to run Prithvi on a molecule """

from src.utils.llm import llm_pipeline
from src.utils.az import run_az
from src.utils.job_context import logger as context_logger


def rec_run_prithvi(molecule: str,
                    job_id,
                    llm: str = "claude-3-opus-20240229"):
    """Recursive function to run Prithvi on a molecule

    run_az is tried first. When it does not solve the molecule, the LLM
    pipeline proposes precursor pathways and every precursor is processed
    recursively; the search stops at the first pathway whose precursors are
    all solved.

    Parameters
    ----------
    molecule : str
        Molecule SMILES
    job_id : _type_
        Job ID, passed through unchanged to the recursive calls
    llm : str, optional
        LLM to be used, by default "claude-3-opus-20240229"

    Returns
    -------
    tuple
        The result tree (dict) for the molecule and a flag telling whether
        the molecule was solved
    """
    solved, result_dict = run_az(molecule)
    result_dict = result_dict[0]
    logger = context_logger.get()
    if not solved:
        # logger.info(f"AZ failed for {molecule}, running LLM")
        # Take the last three values so that both a plain
        # (pathways, explanations, confidence) result and one prefixed with a
        # status code can be unpacked.
        out_pathways, out_explained, out_confidence = llm_pipeline(
            molecule, llm)[-3:]
        result_dict = {
            'type':
            'mol',
            'smiles':
            molecule,
            # 'confidence': out_confidence,
            "is_chemical":
            True,
            "in_stock":
            False,
            'children': [{
                "type": "reaction",
                "is_reaction": True,
                "metadata": {
                    "policy_probability": out_confidence,
                },
                "children": []
            }]
        }
        logger.info(f"LLM returned {out_pathways}")
        logger.info(f"LLM explained {out_explained}")
        # NOTE(review): solved children of a pathway that is later abandoned
        # stay attached to the single reaction node — confirm this is wanted.
        for pathway in out_pathways:
            if isinstance(pathway, list):
                temp_stat = []
                for mol in pathway:
                    res, stat = rec_run_prithvi(mol, job_id, llm)
                    # Record every outcome. Only True used to be appended, so
                    # all(temp_stat) was always true and a pathway counted as
                    # solved even when some precursor was not.
                    temp_stat.append(bool(stat))
                    if stat:
                        result_dict['children'][0]['children'].append(res)
                logger.info(f"temp_stat: {temp_stat}")
                # An empty pathway must not count as solved (all([]) is True).
                if temp_stat and all(temp_stat):
                    solved = True
            else:
                res, solved = rec_run_prithvi(pathway, job_id, llm)
                result_dict['children'][0]['children'].append(res)
            if solved:
                logger.info('breaking')
                break
    else:
        logger.info(f"AZ solved {molecule}")
    # print(f"Solved : {solved}, Returning {result_dict}")
    return result_dict, solved


In [None]:
# Dry run of the retry loop used by llm_pipeline: the LLM call is replaced by
# a hard-coded failure so that only the loop control is exercised.
output_pathways = []
run = 0.0

while (output_pathways == [] and run < 0.6):
    # logger.info(f"Calling LLM with molecule: {molecule} and run: {run}")
    # status_code, res_text = call_LLM(molecule,
    #                                  LLM,
    #                                  messages=messages,
    #                                  temperature=run)
    status_code = 404
    print(run)
    # Advance the counter on every iteration, before any `continue`. It used
    # to be incremented only on the fully successful path, so with a failing
    # status this cell never terminated.
    run += 0.1
    if status_code == 200:
        # The status overrides below force each stage to fail in turn.
        if LLM in OPENAI_MODELS:
            status_code, json_content = split_json_openAI(res_text)
            status_code = 500
        elif LLM in DEEPSEEK_MODELS:
            status_code, thinking_steps, json_content = split_json_deepseek(
                res_text)
            status_code = 501
        else:
            status_code, thinking_steps, json_content = split_cot_json(
                res_text)
            status_code = 502

        if status_code == 200:
            status_code, res_molecules, res_explanations, res_confidence = validate_split_json(
                json_content)
            status_code = 503
            if status_code == 200:
                output_pathways, output_explanations, output_confidence = validity_check(
                    molecule, res_molecules, res_explanations, res_confidence)
                logger.info(f"Output Pathways: {output_pathways},\n\
                        Output Explanations: {output_explanations},\n\
                            Output Confidence: {output_confidence}")
            else:
                print("30")
                # logger.info(
                #     f"Error in validating split json content: {res_text}")
                continue
        else:
            print("35")
            # logger.info(f"Error in splitting cot json: {res_text}")
            continue
    else:
        print("39")
        # logger.info(f"Error in calling LLM: {res_text}")
        continue

## 

In [None]:
# import rootutils

# root_dir = rootutils.setup_root(,
#                                 indicator=".project-root",
#                                 pythonpath=True)

from src.prithvi import run_prithvi

run_prithvi(molecule="CC(=O)CC")

Traceback (most recent call last):
  File "c:\Users\irahu\.vscode\extensions\ms-python.python-2025.0.0-win32-x64\python_files\python_server.py", line 133, in exec_user_input
    retval = callable_(user_input, user_globals)
  File "<string>", line 9, in <module>
  File "c:\Users\irahu\git_workspace\recursiveLLM\src\prithvi.py", line 46, in run_prithvi
    handler = add_job_specific_handler(log, job_id)
  File "c:\Users\irahu\git_workspace\recursiveLLM\src\utils\custom_logging.py", line 66, in add_job_specific_handler
    logger._logger.addHandler(handler)
AttributeError: 'PrintLogger' object has no attribute 'addHandler'

