# Test Custom Error Code

In [43]:
import rootutils
rootutils.setup_root(".", indicator=".project-root", pythonpath=True)

WindowsPath('C:/Users/irahu/git_workspace/recursiveLLM')

## llm.py

In [76]:
import os
import ast
import litellm
from typing import Optional
from dotenv import load_dotenv
from litellm import completion
from src.variables import OPENAI_MODELS, DEEPSEEK_MODELS
from src.variables import USER_PROMPT, SYS_PROMPT
from src.variables import USER_PROMPT_V4, SYS_PROMPT_V4
from src.variables import USER_PROMPT_OPENAI, SYS_PROMPT_OPENAI
from src.variables import USER_PROMPT_DEEPSEEK, SYS_PROMPT_DEEPSEEK
from src.variables import ADDON_PROMPT_7_MEMBER, USER_PROMPT_DEEPSEEK_V4
from src.variables import ERROR_MAP
from src.cache import cache_results
from src.utils.utils_molecule import validity_check, detect_seven_member_rings
from src.utils.job_context import logger as context_logger

load_dotenv()

# set the success callback to langfuse for logging
litellm.success_callback = ["langfuse"]
litellm.drop_params = True

metadata = {
    "generation_name": "prod",  # set langfuse generation name
    "project": "Retrosynthesis",  # set langfuse project name
    "version": "0.0.3",  # set langfuse version
    "trace_name": "prod",  # set langfuse Trace Name
    "trace_user_id": "sv",  # set langfuse Trace User ID
    "session_id": "prod",  # set langfuse Session ID
}
ENABLE_LOGGING = False if os.getenv("ENABLE_LOGGING",
                                    "true").lower() == "false" else True


def log_message(message: str, logger=None):
    """Log the message

    Parameters
    ----------
    message : str
        The message to be logged
    logger : _type_, optional
        The logger object, by default None

    Returns
    -------
    None
    """
    if logger is not None:
        logger.info(message)
    else:
        print(message)


def obtain_prompt(LLM: str):
    """Obtain the prompt based on the LLM model

    Parameters
    ----------
    LLM : str
        The LLM model to be used

    Returns
    -------
    str, str, int
        The system prompt, user prompt and max completion tokens
    """
    advanced_prompt = False
    detector = LLM.split(":")
    if len(detector) > 1 and detector[1] == "adv":
        advanced_prompt = True
    print(f"Advanced Prompt: {advanced_prompt}")
    if advanced_prompt:
        if LLM in DEEPSEEK_MODELS:
            sys_prompt_final = SYS_PROMPT_V4
            user_prompt_final = USER_PROMPT_DEEPSEEK_V4
            max_completion_tokens = 8192 * 2
        elif LLM in OPENAI_MODELS:
            sys_prompt_final = SYS_PROMPT_OPENAI
            user_prompt_final = USER_PROMPT_OPENAI
            max_completion_tokens = 8192
        else:
            sys_prompt_final = SYS_PROMPT_V4
            user_prompt_final = USER_PROMPT_V4
            max_completion_tokens = 4096
    else:
        if LLM in DEEPSEEK_MODELS:
            sys_prompt_final = SYS_PROMPT_DEEPSEEK
            user_prompt_final = USER_PROMPT_DEEPSEEK
            max_completion_tokens = 8192
        elif LLM in OPENAI_MODELS:
            sys_prompt_final = SYS_PROMPT_OPENAI
            user_prompt_final = USER_PROMPT_OPENAI
            max_completion_tokens = 8192
        else:
            sys_prompt_final = SYS_PROMPT
            user_prompt_final = USER_PROMPT
            max_completion_tokens = 4096
    return sys_prompt_final, user_prompt_final, max_completion_tokens


@cache_results
def call_LLM(molecule: str,
             LLM: str = "claude-3-opus-20240229",
             temperature: float = 0.0,
             messages: Optional[list[dict]] = None) -> tuple[int, str]:
    """Calls the LLM model to predict the next step

    Parameters
    ----------
    molecule : str
        The target molecule for retrosynthesis
    LLM : str, optional
        The LLM model to be used, by default "claude-3-opus-20240229"
    temperature : float, optional
        The temperature for sampling, by default 0.0
    messages : Optional[list[dict]], optional
        The conversation history, by default None

    Returns
    -------
    tuple[int, str]
        The status code and the response text
    """
    logger = context_logger.get() if ENABLE_LOGGING else None
    log_message(f"Calling {LLM} with molecule: {molecule}", logger)

    if detect_seven_member_rings(molecule):
        log_message(f"Detected seven member ring in molecule: {molecule}",
                    logger)
        add_on = ADDON_PROMPT_7_MEMBER
    else:
        add_on = ""

    sys_prompt_final, user_prompt_final, max_completion_tokens = obtain_prompt(
        LLM)
    LLM = LLM.split(":")[0]
    if LLM in DEEPSEEK_MODELS:
        user_prompt_final += add_on
    if messages is None:
        messages = [{
            "role": "system",
            "content": sys_prompt_final + add_on
        }, {
            "role":
            "user",
            "content":
            user_prompt_final.replace('{target_smiles}', molecule)
        }]

    try:
        response = completion(model=LLM,
                              messages=messages,
                              max_completion_tokens=max_completion_tokens,
                              temperature=temperature,
                              seed=42,
                              top_p=0.9,
                              metadata=metadata)
        res_text = response.choices[0].message.content
    except Exception as e:
        log_message(f"Error in calling {LLM}: {e}", logger)
        log_message(f"Retrying call to {LLM}", logger)
        try:
            response = completion(model=LLM,
                                  messages=messages,
                                  max_completion_tokens=4096,
                                  temperature=temperature,
                                  seed=42,
                                  top_p=0.9,
                                  metadata=metadata)
            res_text = response.choices[0].message.content
        except Exception as e:
            log_message(f"2nd Error in calling {LLM}: {e}", logger)
            log_message(f"Exiting call to {LLM}", logger)
            return 400, ""
    log_message(f"Received response from LLM: {res_text}", logger)
    return 200, res_text


def split_cot_json(res_text: str) -> tuple[int, list[str], str]:
    """Parse the LLM response to extract the thinking steps and json content

    Parameters
    ----------
    res_text : str
        The response text from the LLM model

    Returns
    -------
    tuple[int, list[str], str]
        The status code, thinking steps and json content
    """
    logger = context_logger.get() if ENABLE_LOGGING else None
    try:
        # extract the content within <cot> </cot> tags as thinking content
        thinking_content = res_text[res_text.find("<cot>\n") +
                                    6:res_text.find("</cot>")]
        if not thinking_content:
            return 501, [], ""
        
        # split the thinking content into individual steps based on the <thinking> </thinking> tags
        thinking_steps = thinking_content.split("<thinking")[1:]
        thinking_steps = [
            step[:step.find("</thinking>")] for step in thinking_steps
        ]
        if not thinking_steps:
            return 501, [], ""
    except Exception as e:
        log_message(f"Error in parsing obtaining COT: {e}", logger)
        return 501, [], ""

    try:
        json_content = res_text[res_text.find("<json>\n") +
                                7:res_text.find("</json>")]
        if not json_content:
            return 501, [], ""
    except Exception as e:
        log_message(f"Error in parsing LLM response: {e}", logger)
        return 501, [], ""
    return 200, thinking_steps, json_content


def split_json_openAI(res_text: str) -> tuple[int, str]:
    """Split the response text from OpenAI models to extract the molecules
    Note: OpenAI O-series models do not provide Chain of Thoughts (COT) in the response

    Parameters
    ----------
    res_text : str
        The response text from the OpenAI model

    Returns
    -------
    tuple[int, str]
        the status code and json content
    """
    logger = context_logger.get() if ENABLE_LOGGING else None
    try:
        json_content = res_text[res_text.find("<json>\n") +
                                7:res_text.find("</json>")]
        if not json_content:
            return 502, ""
        
    except Exception as e:
        log_message(f"Error in parsing LLM response: {e}", logger)
        return 502, ""
    return 200, json_content


def split_json_deepseek(res_text: str) -> tuple[int, list[str], str]:
    """Parse the LLM response to extract the thinking steps and json content

    Parameters
    ----------
    res_text : str
        The response text from the LLM model

    Returns
    -------
    tuple[int, list[str], str]
        The status code, thinking steps and json content
    """
    logger = context_logger.get() if ENABLE_LOGGING else None

    try:
        # extract the content within <cot> </cot> tags as thinking content
        thinking_content = res_text[res_text.find("<think>\n") +
                                    6:res_text.find("</think>")]
        if not thinking_content:
            return 503, [], ""
        
        json_content = res_text[res_text.find("<json>\n") +
                                7:res_text.find("</json>")]
        if not json_content:
            return 503, [], ""
        
    except Exception as e:
        log_message(f"Error in parsing LLM response: {e}", logger)
        return 503, [], ""
    return 200, [thinking_content], json_content


def split_json_master(res_text: str, model: str) -> tuple[int, list[str], str]:
    """Split the response text based on the model

    Parameters
    ----------
    res_text : str
        The response text from the LLM model
    model : str
        The LLM model used

    Returns
    -------
    tuple[int, list[str], str]
        The status code, thinking steps and json content
    """
    try:
        if model in DEEPSEEK_MODELS:
            status_code, thinking_steps, json_content = split_json_deepseek(
                res_text)
        elif model in OPENAI_MODELS:
            status_code, json_content = split_json_openAI(res_text)
            thinking_steps = []
        else:
            status_code, thinking_steps, json_content = split_cot_json(res_text)
    except Exception as e:
        return 505, [], ""

    return status_code, thinking_steps, json_content


def validate_split_json(
        json_content: str) -> tuple[int, list[str], list[str], list[int]]:
    """Validate the split json content from LLM response

    Parameters
    ----------
    json_content : str
        The json content from the LLM response

    Returns
    -------
    tuple[int, list[str], list[str], list[int]]
        The status code, list of molecules, list of explanations and list of confidence scores
    """
    logger = context_logger.get() if ENABLE_LOGGING else None
    try:
        result_list = ast.literal_eval(json_content)
        res_molecules = result_list['data']
        res_explanations = result_list['explanation']
        res_confidence = result_list['confidence_scores']
    except Exception as e:
        logger.info(f"Error in parsing response: {e}")
        return 504, [], [], []
    return 200, res_molecules, res_explanations, res_confidence


def llm_pipeline(
    molecule: str,
    LLM: str = "claude-3-opus-20240229",
    messages: Optional[list[dict]] = None
) -> tuple[list[list[str]], list[str], list[float]]:
    """Pipeline to call LLM and validate the results

    Parameters
    ----------
    molecule : str
        The target molecule for retrosynthesis
    LLM : str, optional
        LLM to be used for retrosynthesis , by default "claude-3-opus-20240229"
    messages : Optional[list[dict]], optional
        Conversation history, by default None

    Returns
    -------
    tuple[list[list[str]], list[str], list[float]]
        The output pathways, explanations and confidence scores
    """
    logger = context_logger.get() if ENABLE_LOGGING else None
    output_pathways: list[list[str]] = []
    output_explanations: list[str] = []
    output_confidence: list[float] = []
    run = 0.0
    while (output_pathways == [] and run < 0.6):
        log_message(f"Calling LLM with molecule: {molecule} and run: {run}",
                    logger)

        # Selecting the model based on the run number
        current_model = LLM
        if LLM in DEEPSEEK_MODELS and run > 0.0:
            current_model = "claude-3-opus-20240229"  # NOTE: if we are going use deepseek model why set it to claude-3-opus-20240229

        # --------------------
        # Call LLM
        status_code, res_text = call_LLM(molecule,
                                         current_model,
                                         messages=messages,
                                         temperature=run)
        if status_code != 200:
            log_message(f"Error in calling LLM: {res_text}", logger)
            run += 0.1
            get_error_log(status_code)
            continue

        # --------------------
        # Split the response text
        status_code, thinking_steps, json_content = split_json_master(
            res_text, current_model)
        if status_code != 200:
            log_message(f"Error in splitting cot json: {res_text}", logger)
            run += 0.1
            get_error_log(status_code)
            continue

        # --------------------
        # Validate the split json content
        status_code, res_molecules, res_explanations, res_confidence = validate_split_json(
            json_content)
        if status_code != 200:
            log_message(f"Error in validating split json content: {res_text}",
                        logger)
            run += 0.1
            get_error_log(status_code)
            continue

        # --------------------
        # Check the validity of the molecules obtained from LLM
        output_pathways, output_explanations, output_confidence = validity_check(
            molecule, res_molecules, res_explanations, res_confidence)
        log_message(
            f"Output Pathways: {output_pathways},\n\
                Output Explanations: {output_explanations},\n\
                    Output Confidence: {output_confidence}", logger)
        run += 0.1

    return output_pathways, output_explanations, output_confidence

def get_error_log(status_code: int) -> str:
    """Prints error message based on the status code.

    Parameters
    ----------
    status_code : int
        Status Code

    Returns
    -------
    str
        Error message associated with the status code.
    """
    if status_code in ERROR_MAP:
        description = ERROR_MAP[status_code]
        print(f"Error Code: {status_code},\n Description: {description}")
    else:
        print(f"Error Code: {status_code} is not recognized.")

def check_empty_content(content: any) -> bool:
    """Check if the json content is empty or not

    Parameters
    ----------
    content : any
        The json content from the OpenAI model

    Returns
    -------
    bool
        True if the json content is not empty, False otherwise
    """
    return not content

## utils_molecule.py

In [77]:
import os
from rdkit import Chem
from rdkit.Chem import AllChem, rdMolDescriptors
from rdkit.Chem.rdMolDescriptors import CalcMolFormula
from rdkit.Chem.Descriptors import ExactMolWt

import joblib
import rootutils
from src.variables import REACTION_ENCODING_NAMES, ENCODING_SCALABILITY
from src.cache import cache_results
from src.utils.job_context import logger as context_logger

root_dir = rootutils.setup_root(".", # __file__,
                                indicator=".project-root",
                                pythonpath=True)

RXN_CLASSIFICATION_MODEL_PATH = f"{root_dir}/{os.getenv('RXN_CLASSIFICATION_MODEL_PATH')}"
ENABLE_LOGGING = False if os.getenv("ENABLE_LOGGING",
                                    "true").lower() == "false" else True


def log_message(message: str, logger=None):
    """Log the message"""
    if logger is not None:
        log_message(message)
    else:
        print(message)


def is_valid_smiles(smiles: str) -> bool:
    """Check if the SMILES string is valid

    Parameters
    ----------
    smiles : str
        smiles string

    Returns
    -------
    bool
        True if the smiles is valid, False otherwise
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
    except:
        return False
    if mol is None:
        return False
    return True


def substructure_matching(target_smiles: str, query_smiles: str) -> int:
    """Check if the query substructure is present in the target molecule

    Parameters
    ----------
    target_smiles : str
        SMILES string of the target molecule
    query_smiles : str
        SMILES string of the query molecule

    Returns
    -------
    int
        1 if the query substructure is present in the target molecule, 0 otherwise
    """
    logger = context_logger.get() if ENABLE_LOGGING else None

    # Convert SMILES to RDKit molecule objects
    try:
        target_molecule = Chem.MolFromSmiles(target_smiles)
    except:
        log_message(f"Error in parsing target molecule: {target_smiles}",
                    logger)

    try:
        query_molecule = Chem.MolFromSmiles(query_smiles)
    except:
        log_message(f"Error in parsing query molecule: {query_smiles}", logger)

    # Check if the query substructure is present in the target molecule
    try:
        if target_molecule.HasSubstructMatch(query_molecule):
            return 1
        else:
            return 0
    except:
        return 0


@cache_results
def validity_check(molecule, res_molecules, res_explanations, res_confidence):
    """Check the validity of the molecules obtained from LLM

    Parameters
    ----------
    molecule : str
        Target molecule for retrosynthesis
    res_molecules : list
        List of molecules obtained from LLM
    res_explanations : list
        List of explanations obtained from LLM
    res_confidence : list
        List of confidence scores obtained from LLM

    Returns
    -------
    list
        List of valid pathways
    list
        List of valid explanations
    list
        List of valid confidence scores
    """
    logger = context_logger.get() if ENABLE_LOGGING else None
    valid_pathways = []
    valid_explanations = []
    valid_confidence = []
    for idx, smile_list in enumerate(res_molecules):
        valid = []
        if isinstance(smile_list, list):
            for smiles in smile_list:
                if is_valid_smiles(smiles):
                    if are_molecules_same(molecule, smiles):
                        log_message(
                            f"Molecule : {molecule} is same as target molecule",
                            logger)
                    elif substructure_matching(smiles, molecule):
                        log_message(
                            f"Molecule : {molecule} is substructure of target molecule",
                            logger)
                    else:
                        valid.append(smiles)
                else:
                    log_message(
                        f"Molecule : {molecule} is invalid or cannot be parsed",
                        logger)
            if len(valid) >= 2:
                valid_pathways.append(valid)
                valid_explanations.append(res_explanations[idx])
                valid_confidence.append(res_confidence[idx])
        else:
            if is_valid_smiles(smile_list):
                if are_molecules_same(molecule, smiles):
                    log_message("Molecule is same as target molecule", logger)
                elif substructure_matching(smiles, molecule):
                    log_message(
                        f"Molecule : {molecule} is substructure of target molecule {smiles}",
                        logger)
                else:
                    valid_pathways.append([smile_list])
                    valid_explanations.append(res_explanations[idx])
                    valid_confidence.append(res_confidence[idx])
            else:
                log_message("Molecule is invalid or cannot be parsed", logger)
    log_message(
        f"Obtained {len(valid_pathways)} valid pathways after validity test: {valid_pathways}",
        logger)
    return valid_pathways, valid_explanations, valid_confidence


def calc_mol_wt(mol: str) -> float:
    """Calculate the molecular weight of a molecule

    Parameters
    ----------
    mol : str
        SMILES string of the molecule

    Returns
    -------
    float
        molecular weight of the molecule
    """
    logger = context_logger.get() if ENABLE_LOGGING else None
    try:
        mol_wt = ExactMolWt(Chem.MolFromSmiles(mol))
    except:
        mol_wt = 0.0
        log_message(f"Error in calculating molecular weight: {mol}", logger)
    return mol_wt


def calc_chemical_formula(mol: str):
    """Calculate the chemical formula of a molecule

    Parameters
    ----------
    mol : str
        SMILES string of the molecule

    Returns
    -------
    str
        molecular formula of the molecule
    """
    logger = context_logger.get() if ENABLE_LOGGING else None
    try:
        formula = CalcMolFormula(Chem.MolFromSmiles(mol))
    except:
        formula = "N/A"
        log_message(f"Error in calculating formula: {mol}", logger)
    return formula


def are_molecules_same(smiles1: str, smiles2: str) -> bool:
    # Convert SMILES strings to RDKit molecule objects
    mol1 = Chem.MolFromSmiles(smiles1)
    mol2 = Chem.MolFromSmiles(smiles2)

    if mol1 is None or mol2 is None:
        raise ValueError("Invalid SMILES string provided.")

    # Get canonical SMILES for both molecules
    canonical_smiles1 = Chem.MolToSmiles(mol1, canonical=True)
    canonical_smiles2 = Chem.MolToSmiles(mol2, canonical=True)

    # Alternatively, compare molecular fingerprints
    fingerprint1 = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol1,
                                                                  radius=2,
                                                                  nBits=1024)
    fingerprint2 = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol2,
                                                                  radius=2,
                                                                  nBits=1024)

    # Check if canonical SMILES or fingerprints match
    if canonical_smiles1 == canonical_smiles2:
        return True
    elif fingerprint1 == fingerprint2:
        return True
    else:
        return False


def compute_fingerprint(smiles, radius=2, nBits=2048):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol,
                                                        radius,
                                                        nBits=nBits)
    return list(fingerprint)


def sub_structure_matching(target_smiles: str, query_smiles: str) -> bool:
    """Check if the query substructure is present in the target molecule"""
    target_molecule = Chem.MolFromSmiles(target_smiles)
    query_molecule = Chem.MolFromSmiles(query_smiles)

    if target_molecule.HasSubstructMatch(query_molecule):
        return True
    else:
        return False


def get_reaction_type(mol1, mol2, model_path):
    """Get the reaction type of a reaction"""
    clf = joblib.load(model_path)
    mol1_fingerprint = compute_fingerprint(mol1)
    mol2_fingerprint = compute_fingerprint(mol2)
    reaction_type = clf.predict([mol1_fingerprint + mol2_fingerprint])
    return REACTION_ENCODING_NAMES[reaction_type[0]], reaction_type[0]


def calc_confidence_estimate(probability: float) -> float:
    """Calculate the confidence estimate based on the probability

    Parameters
    ----------
    probability : float
        Probability of the prediction

    Returns
    -------
    float
        Confidence estimate
    """
    if isinstance(probability, list):
        probability = probability[0]
    if probability < 0.3:
        probability = 1 - probability
    elif probability < 0.45 and probability >= 0.3:
        probability += 0.5
    elif probability < 0.6 and probability >= 0.45:
        probability += 0.3

    # limit the confidence estimate to 2 decimal places, round to the
    # nearest 0.01
    probability = round(probability, 2)
    if probability > 0.99:
        probability = 0.99
    return probability


def calc_scalability_index(mol1, mol2):
    """Calculate the scalability index of a reaction"""
    _, type = get_reaction_type(mol1, mol2, RXN_CLASSIFICATION_MODEL_PATH)
    return str(ENCODING_SCALABILITY[type])


def calc_yield(mol1, mol2):
    """Calculate the yield of a reaction"""
    return "#"


def detect_seven_member_rings(smiles) -> bool:
    """
    Detects 7-member rings in a molecule given its SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule.

    Returns
    -------
    bool
        True if 7-member rings are present, False otherwise.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError("Invalid SMILES string provided.")

    # Retrieve ring information as tuples of atom indices.
    ring_info = mol.GetRingInfo()
    atom_rings = ring_info.AtomRings()

    # Filter rings by the number of atoms.
    rings_7 = [ring for ring in atom_rings if len(ring) == 7]

    if len(rings_7) > 0:
        return True
    return False


def detect_eight_member_rings(smiles) -> bool:
    """
    Detects 8-member rings in a molecule given its SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule.

    Returns
    -------
    bool
        True if 8-member rings are present, False otherwise.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError("Invalid SMILES string provided.")

    # Retrieve ring information as tuples of atom indices.
    ring_info = mol.GetRingInfo()
    atom_rings = ring_info.AtomRings()

    # Filter rings by the number of atoms.
    rings_8 = [ring for ring in atom_rings if len(ring) == 8]

    if len(rings_8) > 0:
        return True
    return False

### 200 test

#### correct molecule

In [78]:
status_code, res_text = call_LLM(molecule="CC(=O)CC")

In [79]:
status_code, res_text

(200,
 'Here is the single-step retrosynthesis analysis for the molecule CC(=O)CC:\n\n<cot>\n<thinking>\nThe target molecule CC(=O)CC contains a ketone functional group. Possible retrosynthetic disconnections to consider are:\n1) Disconnection of the C-C bond adjacent to the ketone, which could arise from an aldol condensation reaction.\n2) Disconnection of the C-C bond on the other side of the ketone, which could come from a Grignard addition to a carboxylic acid derivative like an ester.\n3) Reduction of the ketone to an alcohol, which could then be derived from an oxidation of the corresponding secondary alcohol.\n</thinking>\n\n<thinking>\nFor the aldol disconnection, the precursors would be acetone (CC(=O)C) and acetaldehyde (CC=O). The reaction would proceed via enolate formation of the acetone, followed by nucleophilic addition to the acetaldehyde. A subsequent dehydration step would give the α,β-unsaturated ketone product.\n</thinking>\n\n<thinking>\nFor the Grignard addition, 

### 400 test: call_LLM()

#### wrong molecule

In [80]:
molecule_F = "CC(=O)CkcncnC"

In [81]:
status_code, res_text = call_LLM(molecule=molecule_F)

Calling claude-3-opus-20240229 with molecule: CC(=O)CkcncnC


[21:56:23] SMILES Parse Error: syntax error while parsing: CC(=O)CkcncnC
[21:56:23] SMILES Parse Error: Failed parsing SMILES 'CC(=O)CkcncnC' for input: 'CC(=O)CkcncnC'


ValueError: Invalid SMILES string provided.

### 501 test: split_cot_json()

#### valid test

In [82]:
status_code, res_text = call_LLM(molecule="CC(=O)CC")

In [83]:
res_text

'Here is the single-step retrosynthesis analysis for the molecule CC(=O)CC:\n\n<cot>\n<thinking>\nThe target molecule CC(=O)CC contains a ketone functional group. Possible retrosynthetic disconnections to consider are:\n1) Disconnection of the C-C bond adjacent to the ketone, which could arise from an aldol condensation reaction.\n2) Disconnection of the C-C bond on the other side of the ketone, which could come from a Grignard addition to a carboxylic acid derivative like an ester.\n3) Reduction of the ketone to an alcohol, which could then be derived from an oxidation of the corresponding secondary alcohol.\n</thinking>\n\n<thinking>\nFor the aldol disconnection, the precursors would be acetone (CC(=O)C) and acetaldehyde (CC=O). The reaction would proceed via enolate formation of the acetone, followed by nucleophilic addition to the acetaldehyde. A subsequent dehydration step would give the α,β-unsaturated ketone product.\n</thinking>\n\n<thinking>\nFor the Grignard addition, the pre

In [None]:
res_text_valid = 'Here is the single-step retrosynthesis analysis for the molecule CC(=O)CC:\n\n<cot>\n<thinking>\nThe target molecule CC(=O)CC contains a ketone functional group. Possible retrosynthetic disconnections to consider are:\n1) Disconnection of the C-C bond adjacent to the ketone, which could arise from an aldol condensation reaction.\n2) Disconnection of the C-C bond on the other side of the ketone, which could come from a Grignard addition to a carboxylic acid derivative like an ester.\n3) Reduction of the ketone to an alcohol, which could then be derived from an oxidation of the corresponding secondary alcohol.\n</thinking>\n\n<thinking>\nFor the aldol disconnection, the precursors would be acetone (CC(=O)C) and acetaldehyde (CC=O). The reaction would proceed via enolate formation of the acetone, followed by nucleophilic addition to the acetaldehyde. A subsequent dehydration step would give the α,β-unsaturated ketone product.\n</thinking>\n\n<thinking>\nFor the Grignard addition, the precursors would be propanoyl chloride (CCC(=O)Cl) and methylmagnesium bromide (CMgBr). The Grignard reagent would add to the carbonyl, followed by an acidic workup to give the final ketone product.  \n</thinking>\n\n<thinking>\nFor the alcohol reduction, the precursor would be butan-2-ol (CC(O)CC). Oxidation, potentially using a chromium reagent like pyridinium chlorochromate (PCC) or a Swern oxidation, would convert the secondary alcohol to the ketone.\n</thinking>\n\n</cot>\n\n<json>\n{\n  "data": [\n    ["CC(=O)C", "CC=O"],\n    ["CCC(=O)Cl", "CMgBr"],\n    ["CC(O)CC"]\n  ],\n  "explanation": [\n    "Aldol condensation of acetone and acetaldehyde, proceeding via enolate formation, nucleophilic addition, and dehydration",\n    "Grignard addition of methylmagnesium bromide to propanoyl chloride, followed by acidic workup",\n    "Oxidation of butan-2-ol, e.g. using PCC or Swern conditions"\n  ],\n  "confidence_scores": [\n    0.9,\n    0.7,\n    0.8\n  ]\n}\n</json>'

In [85]:
status_code, thinking_steps, json_content = split_cot_json(res_text_valid)

In [86]:
status_code, thinking_steps, json_content

(200,
 ['>\nThe target molecule CC(=O)CC contains a ketone functional group. Possible retrosynthetic disconnections to consider are:\n1) Disconnection of the C-C bond adjacent to the ketone, which could arise from an aldol condensation reaction.\n2) Disconnection of the C-C bond on the other side of the ketone, which could come from a Grignard addition to a carboxylic acid derivative like an ester.\n3) Reduction of the ketone to an alcohol, which could then be derived from an oxidation of the corresponding secondary alcohol.\n',
  '>\nFor the aldol disconnection, the precursors would be acetone (CC(=O)C) and acetaldehyde (CC=O). The reaction would proceed via enolate formation of the acetone, followed by nucleophilic addition to the acetaldehyde. A subsequent dehydration step would give the α,β-unsaturated ketone product.\n',
  '>\nFor the Grignard addition, the precursors would be propanoyl chloride (CCC(=O)Cl) and methylmagnesium bromide (CMgBr). The Grignard reagent would add to the

#### 501

In [90]:
blank_res_text_ = ''

In [91]:
status_code, thinking_steps, json_content = split_cot_json(blank_res_text_)

In [92]:
status_code, thinking_steps, json_content

(501, [], '')

In [99]:
res_text_invalid = 'Here is the single-step retrosynthesis analysis for the molecule CC(=O)CC:\ntarget molecule CC(=O)CC contains a ketone functional group. Possible retrosynthetic disconnections to consider are:\n1) Disconnection of the C-C bond adjacent to the ketone, which could arise from an aldol condensation reaction.\n2) Disconnection of the C-C bond on the other side of the ketone, which could come from a Grignard addition to a carboxylic acid derivative like an ester.\n3) Reduction ofr the aldol disconnection, the precursors would be acetone (CC(=O)C) and acetaldehyde (CC=O). The reaction would proceed via enolate formation of the acetone, followed by nucleophilic addition to the acethe Grignard addition, the precursors would be propanoyl chloride (CCC(=O)Cl) and methylmagnesium bromide (CMgBr). The Grignard reagent would add to the carbonyl, followed by an acidic workup to give the final ketone product.  \CC(O)CC"]\n  ],\n  "explanation": [\n    "Aldol condensation of acetone and acetaldehyde, proceeding via enolate formation, nucleophilic addition, and dehydration",\n    "Grignard addition of methylmagnesium bromide to propanoyl chloride, followed by acidic workup",\n    "Oxidation of butan-2-ol, e.g. using PCC or Swern conditions"\n  ],\n  "confidence_scores": [\n    0.9,\n    0.7,\n    0.8\n  ]\n}\n</json>'

In [100]:
status_code, thinking_steps, json_content = split_cot_json(res_text_invalid)

In [101]:
status_code, thinking_steps, json_content

(501, [], '')

In [104]:
get_error_log(status_code)

Error Code: 502,
 Description: {'description': 'split_json_openAI() failed.'}


### 502 test: split_json_openAI()

In [113]:
status_code, res_OpenAI = call_LLM(molecule="CC(=O)CC", LLM="gpt-4o")

In [114]:
status_code, res_OpenAI

(200,
 '```json\n{\n  "data": [\n    ["CC(=O)C", "C"],\n    ["CC(=O)CCl", "C"],\n    ["CC(=O)COH", "H2"]\n  ],\n  "explanation": [\n    "Aldol condensation: The target molecule can be formed by the aldol condensation of acetone (CC(=O)C) with formaldehyde (C) under basic conditions.",\n    "Friedel-Crafts acylation: The target molecule can be synthesized by the Friedel-Crafts acylation of acetyl chloride (CC(=O)CCl) with methane (C) using AlCl3 as a catalyst.",\n    "Reduction of a ketone: The target molecule can be obtained by the reduction of 3-hydroxybutan-2-one (CC(=O)COH) using hydrogen gas (H2) and a metal catalyst such as Pd/C."\n  ],\n  "confidence_scores": [\n    0.8,\n    0.7,\n    0.6\n  ]\n}\n```')

In [120]:
print(res_OpenAI)

```json
{
  "data": [
    ["CC(=O)C", "C"],
    ["CC(=O)CCl", "C"],
    ["CC(=O)COH", "H2"]
  ],
  "explanation": [
    "Aldol condensation: The target molecule can be formed by the aldol condensation of acetone (CC(=O)C) with formaldehyde (C) under basic conditions.",
    "Friedel-Crafts acylation: The target molecule can be synthesized by the Friedel-Crafts acylation of acetyl chloride (CC(=O)CCl) with methane (C) using AlCl3 as a catalyst.",
    "Reduction of a ketone: The target molecule can be obtained by the reduction of 3-hydroxybutan-2-one (CC(=O)COH) using hydrogen gas (H2) and a metal catalyst such as Pd/C."
  ],
  "confidence_scores": [
    0.8,
    0.7,
    0.6
  ]
}
```


In [115]:
status_code, json_content = split_json_openAI(res_OpenAI)

In [116]:
status_code, json_content

(200,
 'n\n{\n  "data": [\n    ["CC(=O)C", "C"],\n    ["CC(=O)CCl", "C"],\n    ["CC(=O)COH", "H2"]\n  ],\n  "explanation": [\n    "Aldol condensation: The target molecule can be formed by the aldol condensation of acetone (CC(=O)C) with formaldehyde (C) under basic conditions.",\n    "Friedel-Crafts acylation: The target molecule can be synthesized by the Friedel-Crafts acylation of acetyl chloride (CC(=O)CCl) with methane (C) using AlCl3 as a catalyst.",\n    "Reduction of a ketone: The target molecule can be obtained by the reduction of 3-hydroxybutan-2-one (CC(=O)COH) using hydrogen gas (H2) and a metal catalyst such as Pd/C."\n  ],\n  "confidence_scores": [\n    0.8,\n    0.7,\n    0.6\n  ]\n}\n``')

### 503 test: split_json_deepseek()

In [112]:
status_code, res_deepseek = call_LLM(molecule="CC(=O)CC", LLM = "azure_ai/DeepSeek-R1")

Calling azure_ai/DeepSeek-R1 with molecule: CC(=O)CC
Advanced Prompt: False
Received response from LLM: 

```json
{
  "data": [
    ["CC(O)CC"],
    ["CC#CC"],
    ["CC(Cl)(Cl)CC"]
  ],
  "explanation": [
    "Oxidation of secondary alcohol (2-butanol) using an oxidizing agent like PCC or CrO₃ to form the ketone.",
    "Acid-catalyzed hydration of internal alkyne (2-butyne) with HgSO₄ and H₂SO₄ to yield the ketone via Markovnikov addition.",
    "Hydrolysis of geminal dihalide (2,2-dichlorobutane) under basic conditions (e.g., NaOH) to eliminate HCl and form the ketone."
  ],
  "confidence_scores": [
    0.95,
    0.85,
    0.7
  ]
}
```


In [117]:
print(res_deepseek)



```json
{
  "data": [
    ["CC(O)CC"],
    ["CC#CC"],
    ["CC(Cl)(Cl)CC"]
  ],
  "explanation": [
    "Oxidation of secondary alcohol (2-butanol) using an oxidizing agent like PCC or CrO₃ to form the ketone.",
    "Acid-catalyzed hydration of internal alkyne (2-butyne) with HgSO₄ and H₂SO₄ to yield the ketone via Markovnikov addition.",
    "Hydrolysis of geminal dihalide (2,2-dichlorobutane) under basic conditions (e.g., NaOH) to eliminate HCl and form the ketone."
  ],
  "confidence_scores": [
    0.95,
    0.85,
    0.7
  ]
}
```


In [None]:
status_code, res_deepseek

In [118]:
status_code, thinking_step, json_content = split_json_deepseek(res_deepseek)

In [119]:
status_code

200

#### 503

In [None]:
invalid_res_text_deepseek = """```json
{
  "data": [
    ["CC(O)CC"],
    ["CC#CC"],
    ["CC(Cl)(Cl)CC"]
  ],
  "explanation": [
    "Oxidation of secondary alcohol (2-butanol) using an oxidizing agent like PCC or CrO₃ to form the ketone.",
    "Acid-catalyzed hydration of internal alkyne (2-butyne) with HgSO₄ and H₂SO₄ to yield the ketone via Markovnikov addition.",
    "Hydrolysis of geminal dihalide (2,2-dichlorobutane) under basic conditions (e.g., NaOH) to eliminate HCl and form the ketone."
  ],
  "confidence_scores": [
    0.95,
    0.85,
    0.7
  ]
}
```"""


### 504 test: validate_split_json()

### 505 test: split_json_master()