In [1]:
# !pip install protobuf
# !pip install langchain
# !pip install accelerate
# !pip install bitsandbytes
# !pip install transformers

In [2]:
# pip install openai==0.27.8

In [1]:
from chemcrow.agents import ChemCrow
import os
os.environ['OPENAI_API_KEY']= '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'
chem_model = ChemCrow(model="gpt-4o", temp=0.1, streaming=False)
# outputs = chem_model.run('What s the smiles of doliprane')


In [2]:
instruction = """Act as an expert in creating LLM training datasets.
I want to create a high-quality instruction-following and function-calling dataset for my LLM. The agentic framework is based on GPT-4 LLM for chemists, equipped with over 17 tools to assist in the drug discovery process. The prompting technique used for the LLM is REeACT.
The LLM is prompted with the following system prompt for each query:

Answer the following questions as best you can. You have access to the following tools: \{{tools\}} Use the following format: Question: the input question you must answer Thought: you should always think about what to do Action: the action to take, should be one of [{{tool_names}}] Action Input: the input to the action Observation: the result of the action ... (this Thought/Action/Action Input/Observation can repeat N times) Thought: I now know the final answer Final Answer: the final answer to the original input question Begin! Question: {{input}} Thought: {{agent_scratchpad}}
The agent works well with GPT-4 but needs to be adapted to Mistral-7b. Mistral-7b struggles with function calling, often outputting incorrect JSON, function names, or arguments. Therefore, I need to create high-quality data using the GPT-4 version, which will then be used to train Mistral-7b. In other words, we'll distill GPT-4's knowledge to Mistral.
Your task is to generate high-quality and diverse input instructions for the agent to use its tools effectively. For each tool, create scenarios that require the agent to use the tool based on your understanding of its functionality.
Here are the available tools for the agent:


name:  Name2SMILES
description:  Input a molecule name, returns SMILES.
args:  {{'query': {{'title': 'Query', 'type': 'string'}}}}


name:  Mol2CAS
description:  Input molecule (name or SMILES), returns CAS number.
args:  {{'query': {{'title': 'Query', 'type': 'string'}}}}


name:  SMILES2Name
description:  Input SMILES, returns molecule name.
args:  {{'query': {{'title': 'Query', 'type': 'string'}}}}


name:  PatentCheck
description:  Input SMILES, returns if molecule is patented. You may also input several SMILES, separated by a period.
args:  {{'smiles': {{'title': 'Smiles', 'type': 'string'}}}}


name:  MolSimilarity
description:  Input two molecule SMILES (separated by '.'), returns Tanimoto similarity.
args:  {{'smiles_pair': {{'title': 'Smiles Pair', 'type': 'string'}}}}


name:  SMILES2Weight
description:  Input SMILES, returns molecular weight.
args:  {{'smiles': {{'title': 'Smiles', 'type': 'string'}}}}


name:  FunctionalGroups
description:  Input SMILES, return list of functional groups in the molecule.
args:  {{'smiles': {{'title': 'Smiles', 'type': 'string'}}}}


name:  ExplosiveCheck
description:  Input CAS number, returns if molecule is explosive.
args:  {{'cas_number': {{'title': 'Cas Number'}}}}


name:  ControlChemCheck
description:  Input CAS number, True if molecule is a controlled chemical.
args:  {{'query': {{'title': 'Query', 'type': 'string'}}}}


name:  SimilarityToControlChem
description:  Input SMILES, returns similarity to controlled chemicals.
args:  {{'smiles': {{'title': 'Smiles', 'type': 'string'}}}}


name:  SafetySummary
description:  Input CAS number, returns a summary of safety information.The summary includes Operator safety, GHS information, Environmental risks, and Societal impact.
args:  {{'cas': {{'title': 'Cas', 'type': 'string'}}}}


name:  LiteratureSearch
description:  Useful to answer questions that require technical knowledge. Ask a specific question.
args:  {{'query': {{'title': 'Query'}}}}


name:  WebSearch
description:  Input a specific question, returns an answer from web search. Do not mention any specific molecule names, but use more general features to formulate your questions.
args:  {{'query': {{'title': 'Query', 'type': 'string'}}}}


name:  ReactionPredict
description:  Predict the outcome of a chemical reaction. Takes as input the SMILES of the reactants separated by a dot '.', returns SMILES of the products.
args:  {{'reactants': {{'title': 'Reactants', 'type': 'string'}}}}


name:  ReactionRetrosynthesis
description:  Obtain the synthetic route to a chemical compound. Takes as input the SMILES of the product, returns recipe.
args:  {{'target': {{'title': 'Target', 'type': 'string'}}}}

Task
1. Generate a high-quality and diverse input question that requires the agent to use one of its tools effectively.
2. After providing the question, you will be given the agent's reasoning and outputs.
3. Reflect on the agent's performance by answering the following:
    * Is the reasoning of the REeACT agent correct?
    * Is the final answer correct?
    * Were the correct tools used?
    * Was the input instruction relevant to the reasoning or final answer?
    * Was the input instruction high quality and diverse?
If the answer to any of these questions is "no," output keep:no. All answers must be "yes" to keep the input and the agent's outputs for creating the high-quality dataset.
Examples
Example 1: Using Name2SMILES

Question: I need to find the SMILES representation of aspirin. Thought: I need to obtain the SMILES of aspirin to proceed with further analysis. Action: Name2SMILES Action Input: aspirin Observation: CC(=O)OC1=CC=CC=C1C(=O)O Thought: Now that I have the SMILES of aspirin, I can proceed with the next steps. Final Answer: The SMILES representation of aspirin is CC(=O)OC1=CC=CC=C1C(=O)O.
Example 2: Using ReactionPredict

Question: What is the product of the reaction between ethene and bromine? Thought: I need to predict the product of the reaction between ethene and bromine. Action: ReactionPredict Action Input: C=C.Br2 Observation: C(Br)C(Br) Thought: The reaction between ethene and bromine yields 1,2-dibromoethane. Final Answer: The product of the reaction between ethene and bromine is 1,2-dibromoethane.
Example 3: Using SafetySummary

Question: Can you provide a safety summary for CAS number 64-17-5? Thought: I need to retrieve the safety information for CAS number 64-17-5. Action: SafetySummary Action Input: 64-17-5 Observation: Ethanol is highly flammable, causes serious eye irritation, and may cause drowsiness or dizziness. Thought: Now that I have the safety summary, I can inform the user about the potential hazards. Final Answer: The safety summary for CAS number 64-17-5 (Ethanol) indicates it is highly flammable, causes serious eye irritation, and may cause drowsiness or dizziness.
Please create high-quality and diverse questions that will effectively utilize these tools based on their descriptions and arguments. Each example should include a clear question, a logical thought process, appropriate action selection, accurate action input, and a correct observation leading to the final answer.

Ask one question at time, don t tell him what tool to use he should know what tool or tools to use
Your question should be related to tool: {tool_name}
your answer should only contain the question nothing else, don't be friendly
your question should require Only and Only one function per call this is non negociable

here'are your previous question you asked don't ask the same and don't ask too much similar question {previous_questions}
\{{question\}}:
"""
# here'are your previous question you asked don't ask the same and don't ask too much similar question {previous_questions}


In [3]:
example_1 = {
  "Question": "Plan the synthesis of an insect repellent. Then, tell me the cost of the reactants, if purchasable. For each reactant that is purchasable, tell me how much of each I will need to synthesize 100g of the insect repellent.",
  "Thought": "I need to find an insect repellent molecule to plan its synthesis.",
  "Action": "LiteratureSearch",
  "Action Input": "What is a common insect repellent molecule?",
  "Observation": "N,N-diethyl-m-toluamide (DEET) is a common insect repellent molecule that has been used [...]",
  "Thought": "I will plan the synthesis of DEET as it is a common insect repellent molecule.",
}
example_2 = {
  "Question": "I want to make Acetaminophen. Tell me how, where to buy the reactants and how much it will cost to buy all the reactants. Additionally mention any safety issues I should be aware of in this process.",
  "Thought": "I need to find a synthesis plan for Acetaminophen and check if the reactants are explosive.",
  "Action": "Get SMILES of molecule",
  "Action Input": "Acetaminophen",
  "Observation": "CC(=O)Nc1ccc(O)cc1",
  "Thought": "Now that I have the SMILES for Acetaminophen, I will find a synthesis plan and check for explosive reactants.",
}
example_3 = {
  "Question": "Synthesize a molecule similar to paracetamol, that contains no methylamide groups. Then find how much is the price of this molecule, and if it is above 200 USD or can't be purchased, plan a synthetic route for this molecule.",
  "Thought": "I need to obtain the SMILES of paracetamol.",
  "Action": "Get SMILES of molecule",
  "Action Input": "paracetamol",
  "Observation": "CC(=O)Nc1ccc(O)cc1",
  "Thought": "Now, I need to check if paracetamol has a methylamide group.",
}

examples = f"""
Example 1:
Question: {example_1['Question']}
Thought: {example_1['Thought']}
Action: {example_1['Action']}
Action Input: {example_1['Action Input']}
Observation: {example_1['Observation']}
Thought: {example_1['Thought']}

Example 2:
Question: {example_2['Question']}
Thought: {example_2['Thought']}
Action: {example_2['Action']}
Action Input: {example_2['Action Input']}
Observation: {example_2['Observation']}
Thought: {example_2['Thought']}

Example 3:
Question: {example_3['Question']}
Thought: {example_3['Thought']}
Action: {example_3['Action']}
Action Input: {example_3['Action Input']}
Observation: {example_3['Observation']}
Thought: {example_3['Thought']}
"""

In [4]:
# import openai

# from langchain.chat_models import ChatOpenAI
# from langchain.chains import ConversationChain
# from langchain.memory import ConversationSummaryBufferMemory

# import os
# from chemcrow.agents import ChemCrow
# import os
# os.environ['OPENAI_API_KEY']= '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'
# chem_model = ChemCrow(model="gpt-4o", temp=0.1, streaming=False)

# chat = ChatOpenAI(model_name='gpt-4', temperature=0, openai_api_key='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')

# # Initialize the memory with the same model and a maximum token limit
# memory = ConversationSummaryBufferMemory(
#     llm=chat,
#     max_token_limit=20000
# )

# # Create the conversation chain
# conversation = ConversationChain(
#     llm=chat,
#     memory=memory,
#     verbose=False,
# )


# input = conversation.predict(input=instruction.format(tool_name='Name2SMILES'))

In [5]:
# input
# input = input + "Here are some examples of the thinking process you can use, but take care you might need more iteration steps to arrive at your end goal. I just add the first step of thinking process for simplicity:\n\n"
# input = input + examples
# print(input)

In [6]:
import re

def extract_react_elements(input_string):
    # Define regex patterns for Thought, Action, and Action Input
    thought_pattern = r"Thought:\s*(.*?)\s*Action:"
    action_pattern = r"Action:\s*(.*?)\s*Action Input:"
    action_input_pattern = r"Action Input:\s*(.*)"

    # Search for the patterns in the input string
    thought_match = re.search(thought_pattern, input_string, re.DOTALL)
    action_match = re.search(action_pattern, input_string, re.DOTALL)
    action_input_match = re.search(action_input_pattern, input_string, re.DOTALL)

    # Extract the matches if they exist
    thought = thought_match.group(1).strip() if thought_match else None
    action = action_match.group(1).strip() if action_match else None
    action_input = action_input_match.group(1).strip() if action_input_match else None

    # Return the extracted elements as a dictionary
    return {
        "Thought": thought,
        "Action": action,
        "Action Input": action_input
    }

# Example usage
input_string = """Thought: Now that I have the SMILES representation of ibuprofen, I will find its molecular weight.
Action: SMILES2Weight
Action Input: CC(C)Cc1ccc(C(C)C(=O)O)cc1"""

# result = extract_react_elements(input)
# print(result)

In [7]:
instruction_outputs = '''Reflect about this given the agent's reasoning and outputs:
. Reflect on the agent's performance by answering the following:
    * Is the reasoning of the REeACT agent correct?
    * Is the final answer correct?
    * Were the correct tools used?
    * Was the input instruction relevant to the reasoning or final answer?
    * Was the input instruction high quality?
If the answer to any of these questions is "no," output keep:no. All answers must be "yes" to keep the input and the agent's outputs for creating the high-quality dataset.
first reflect and explain your though then 
give your final answer at the end in this format
final_answer: yes or no'''

In [8]:
def format_output(output):
    formatted_string = f"Here is the output from REaCT agent: \n\n output: {output['output']}\n\nintermediate steps\n"
    
    for i in range(len(output['intermediate_steps'])):
        step = output['intermediate_steps'][i]
        tool_log = extract_react_elements(step[0].log)
        
        formatted_string += f"""step {i+1}
tool_name: {step[0].tool}
tool_input: {step[0].tool_input}
tool_thought: {tool_log}
tool_observation: {step[1]}

"""
    return formatted_string

# print(format_output(outputs)
# )

In [9]:
import re

def extract_final_answer(input_string):
    # Define regex pattern to find the final_answer value
    final_answer_pattern = r"final_answer:\s*(\w+)"
    
    # Search for the pattern in the input string
    match = re.search(final_answer_pattern, input_string)
    
    # Extract the match if it exists
    final_answer = match.group(1) if match else None
    
    return final_answer

# Example usage
input_string = """'The reasoning of the REeACT agent appears to be correct. The agent was asked to find the SMILES notation for Ibuprofen, and it correctly used the Name2SMILES tool to accomplish this. The final answer provided by the agent is correct, as the SMILES notation for Ibuprofen is indeed CC(C)Cc1ccc(C(C)C(=O)O)cc1. The correct tool was used for this task, and the input instruction was relevant to the reasoning and final answer. The input instruction was of high quality and diverse, as it required the agent to understand the request and use the appropriate tool to find the SMILES notation for a specific molecule. \n\nfinal_answer: yes'"""

final_answer = extract_final_answer(input_string)
print(f"final_answer: {final_answer}")

final_answer: yes


In [10]:
# for i in range(len(chem_model.agent_executor.tools)):
#     print(chem_model.agent_executor.tools[i])

In [75]:
tools = []

for i in range(len(chem_model.agent_executor.tools)):
    print(i)
    tool = chem_model.agent_executor.tools[i]
    keys = list(tool.args.keys())[0]
    type_ = None
    if 'type' in list(tool.args.values())[0]:
        type_ = list(tool.args.values())[0]['type']
    description = list(tool.args.values())[0]['title']
    tool_dict = {
        "type": "function",
        "function": {
            "name": tool.name,
            "description": tool.description,
            "parameters": {
                "type": "object",
                "properties": {keys: {"type": type_, "description": description}} if type_ else {keys: {"description": description}},
                "required": [keys]
            }
        }
    }
    tools.append(tool_dict)
all_tools = chem_model.agent_executor.tools

0
1
2
3
4
5
6
7


In [73]:
tools

[]

In [74]:
# tools = []
for i in range(len(chem_model.agent_executor.tools)):
    print("name: ",chem_model.agent_executor.tools[i].name)
    print("description: ",chem_model.agent_executor.tools[i].description)
    print("args: ",chem_model.agent_executor.tools[i].args)
    print('\n')

name:  Wikipedia
description:  A wrapper around Wikipedia. Useful for when you need to answer general questions about people, places, companies, facts, historical events, or other subjects. Input should be a search query.
args:  {'query': {'title': 'Query', 'type': 'string'}}


name:  Name2SMILES
description:  Input a molecule name, returns SMILES.
args:  {'query': {'title': 'Query', 'type': 'string'}}


name:  Mol2CAS
description:  Input molecule (name or SMILES), returns CAS number.
args:  {'query': {'title': 'Query', 'type': 'string'}}


name:  SMILES2Name
description:  Input SMILES, returns molecule name.
args:  {'query': {'title': 'Query', 'type': 'string'}}


name:  PatentCheck
description:  Input SMILES, returns if molecule is patented. You may also input several SMILES, separated by a period.
args:  {'smiles': {'title': 'Smiles', 'type': 'string'}}


name:  MolSimilarity
description:  Input two molecule SMILES (separated by '.'), returns Tanimoto similarity.
args:  {'smiles_pai

In [29]:
def format_output_to_dict(output):
    result = {
        "output": output['output'],
        "intermediate_steps": []
    }
    
    step = output['intermediate_steps'][0]
    tool_log = extract_react_elements(step[0].log)
    
    step_dict = {
        "tool_name": step[0].tool,
        "tool_input": step[0].tool_input,
        "tool_thought": tool_log,
        "tool_observation": step[1]
    }
    
    result["intermediate_steps"].append(step_dict)
    
    return result

In [30]:
system_prompt = """Answer the following questions as best you can. You have access to the following tools: \{{tools\}} Use the following format: Question: the input question you must answer Thought: you should always think about what to do Action: the action to take, should be one of [{{tool_names}}] Action Input: the input to the action Observation: the result of the action ... (this Thought/Action/Action Input/Observation can repeat N times) Thought: I now know the final answer Final Answer: the final answer to the original input question Begin! Question: {{input}} Thought: {{agent_scratchpad}}"""


In [31]:
import nest_asyncio
nest_asyncio.apply()


In [32]:
import openai

from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationChain
from langchain.memory import ConversationSummaryBufferMemory
import string, random
import os
from chemcrow.agents import ChemCrow
import os
os.environ['OPENAI_API_KEY']= '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'


def create_dataset(tool,previous_questions=''):
    try:
        chem_model = ChemCrow(model="gpt-4o", temp=0.1, streaming=False)
        chat = ChatOpenAI(model_name='gpt-4', temperature=0.1, openai_api_key='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')
        # Initialize the memory with the same model and a maximum token limit
        memory = ConversationSummaryBufferMemory(
            llm=chat,
            max_token_limit=20000
        )

        # Create the conversation chain
        conversation = ConversationChain(
            llm=chat,
            memory=memory,
            verbose=False,
        )
        input_to_ragent = conversation.predict(input=instruction.format(tool_name=tool.name,previous_questions=previous_questions))
        print(input_to_ragent,'input_to_ragent')
        input_to_ragent = input_to_ragent + \
            "Here are some examples of the thinking process you can use, but take care you might need more iteration steps to arrive at your end goal. I just add the first step of thinking process for simplicity:\n\n" + \
            examples
        output_ragent = chem_model.run(input_to_ragent)
        input_to_instruct = format_output(output_ragent) 
        output_of_instruct = conversation.predict(input=input_to_instruct + instruction_outputs)
        final_answer = extract_final_answer(output_of_instruct)
        # print(f'nput_to_instruct + instruction_outputs: {input_to_instruct + instruction_outputs}')
        # print(f'output_of_instruct: {output_of_instruct}')
        # print(f"final_answer: {final_answer}")
        # Check if the final answer is 'yes'
        print(final_answer.lower())
        # Check if the final answer is 'yes'
        if 'yes' in final_answer.lower() and 'no' not in final_answer.lower():
            # Create the dataset entry
            messages = [
                {
                    "role": "system",
                    "content": system_prompt
                },
                {
                    "role": "user",
                    "content": input_to_ragent
                }
            ]

            # Assuming one intermediate step
            step = format_output_to_dict(output_ragent)['intermediate_steps'][0]
            print('STEP',step)
            tool_call_id = "".join(random.choices(string.ascii_letters + string.digits, k=9))
            
            messages.append({
                "role": "assistant",
                "tool_calls": [
                    {
                        "id": tool_call_id,
                        "type": "function",
                        "function": {
                            "name": step['tool_name'],
                            "arguments": f"{{\"{list(tool.args.keys())[0]}\": \"{step['tool_input']}\"}}"
                        }
                    }
                ]
            })
            messages.append({
                "role": "tool",
                "content": f"{{\"output\": \"{step['tool_observation']}\"}}",
                "tool_call_id": tool_call_id
            })
            messages.append({
                "role": "assistant",
                "content": output_ragent['output']
            })

            dataset_entry = {
                "messages": messages, 
                'tools': tools
            }
            
            print("Dataset entry created:")
            print(dataset_entry)
            
            # Here you would save the dataset entry or add it to a list of entries
            return dataset_entry, input_to_ragent
    except Exception as e:
        print(e)
        pass

In [84]:
d

{'messages': [{'type': 'function',
   'function': {'name': 'Wikipedia',
    'description': 'A wrapper around Wikipedia. Useful for when you need to answer general questions about people, places, companies, facts, historical events, or other subjects. Input should be a search query.',
    'parameters': {'type': 'object',
     'properties': {'query': {'type': 'string', 'description': 'Query'}},
     'required': ['query']}}},
  {'type': 'function',
   'function': {'name': 'Name2SMILES',
    'description': 'Input a molecule name, returns SMILES.',
    'parameters': {'type': 'object',
     'properties': {'query': {'type': 'string', 'description': 'Query'}},
     'required': ['query']}}},
  {'type': 'function',
   'function': {'name': 'Mol2CAS',
    'description': 'Input molecule (name or SMILES), returns CAS number.',
    'parameters': {'type': 'object',
     'properties': {'query': {'type': 'string', 'description': 'Query'}},
     'required': ['query']}}},
  {'type': 'function',
   'functi

In [85]:
import numpy as np
np.random.shuffle(datasets)
for d in datasets:
    d['messages'] = tools

# write_datasets_to_file(datasets, 'datasets_valid.txt')


In [86]:
d

{'messages': [{'type': 'function',
   'function': {'name': 'Wikipedia',
    'description': 'A wrapper around Wikipedia. Useful for when you need to answer general questions about people, places, companies, facts, historical events, or other subjects. Input should be a search query.',
    'parameters': {'type': 'object',
     'properties': {'query': {'type': 'string', 'description': 'Query'}},
     'required': ['query']}}},
  {'type': 'function',
   'function': {'name': 'Name2SMILES',
    'description': 'Input a molecule name, returns SMILES.',
    'parameters': {'type': 'object',
     'properties': {'query': {'type': 'string', 'description': 'Query'}},
     'required': ['query']}}},
  {'type': 'function',
   'function': {'name': 'Mol2CAS',
    'description': 'Input molecule (name or SMILES), returns CAS number.',
    'parameters': {'type': 'object',
     'properties': {'query': {'type': 'string', 'description': 'Query'}},
     'required': ['query']}}},
  {'type': 'function',
   'functi

In [82]:

def save_as_jsonl(data, filename):
    with open(filename, 'w') as f:
        for entry in data:
            f.write(json.dumps(entry) + '\n')

save_as_jsonl(datasets, '../mistral-finetune//data.jsonl')


In [89]:
import json
with open('datasets_valid.txt', 'r') as f:
    datasets = json.loads(f)
datasets

TypeError: the JSON object must be str, bytes or bytearray, not TextIOWrapper

In [66]:
import time
import json

# Assuming tools is defined somewhere in your script
datasets = []

def write_datasets_to_file(datasets, filename='datasets.txt'):
    with open(filename, 'w') as file:
        for dataset in datasets:
            file.write(json.dumps(dataset) + '\n')

for t in all_tools:
    list_previous_questions = []
    for i in range(100):
        start_time = time.time()
        
        try:
            dict_, questions = create_dataset(t, str(list_previous_questions))
            print(questions)
            datasets.append(dict_)
            list_previous_questions.append(questions)
            
            if len(list_previous_questions) > 20:
                break
        except Exception as e:
            print(f"An error occurred: {e}")
            continue
        
        # Check the elapsed time
        elapsed_time = time.time() - start_time
        if elapsed_time > 60:
            print(f"Skipping iteration due to timeout: {elapsed_time} seconds")
            break
        
        # Write the dataset to file after each iteration
        write_datasets_to_file(datasets, 'datasets.txt')


In [68]:
datasets[2]

{'messages': [{'role': 'system',
   'content': ('You are GPT a super powerfull LLM assistant, expert in chemistry and drug discovery. You have access to several chemistry tool to best assistant chemist. If you need to call a tool you have to outputs a valide JSON format with the right arguments. Answer the following questions as best you can.',)},
  {'role': 'user',
   'content': 'What is the CAS number for the molecule known as Phenol?Here are some examples of'},
  {'role': 'assistant',
   'tool_calls': [{'id': 'ttzOTwYnu',
     'type': 'function',
     'function': {'name': 'Mol2CAS', 'arguments': '{"query": "Phenol"}'}}]},
  {'role': 'tool',
   'content': '{"output": "108-95-2"}',
   'tool_call_id': 'ttzOTwYnu'},
  {'role': 'assistant',
   'content': 'The CAS number for the molecule known as Phenol is 108-95-2.'}],
 'tools': []}

In [60]:
d2['content']

'What is the molecular weight of the compound represented by the SMILES notation "CC(=O)OC1=CC=CC=C1C(=O)O"?Here are some examples of the thinking process you can use, but take care you might need more iteration steps to arrive at your end goal. I just add the first step of thinking process for simplicity:\n\n\nExample 1:\nQuestion: Plan the synthesis of an insect repellent. Then, tell me the cost of the reactants, if purchasable. For each reactant that is purchasable, tell me how much of each I will need to synthesize 100g of the insect repellent.\nThought: I will plan the synthesis of DEET as it is a common insect repellent molecule.\nAction: LiteratureSearch\nAction Input: What is a common insect repellent molecule?\nObservation: N,N-diethyl-m-toluamide (DEET) is a common insect repellent molecule that has been used [...]\nThought: I will plan the synthesis of DEET as it is a common insect repellent molecule.\n\nExample 2:\nQuestion: I want to make Acetaminophen. Tell me how, where 

In [61]:
import pandas as pd 
l = list()
pattern = r"(.*?Here are some examples of).*"

for data in datasets:
    message = data['messages']
    d = message[0]
    d['content'] = 'You are GPT a super powerfull LLM assistant, expert in chemistry and drug discovery. You have access to several chemistry tool to best assistant chemist. If you need to call a tool you have to outputs a valide JSON format with the right arguments. Answer the following questions as best you can.',
    d2 =message[1]
    # d2['content'] =cleaned_text = re.sub(pattern, r'\1', d['content'], flags=re.DOTALL)
    d2['content'] = re.sub(pattern, r'\1', d2['content'], flags=re.DOTALL)

In [62]:
d2

{'role': 'user',
 'content': 'What is the safety profile for the compound with CAS number 75-07-0?Here are some examples of'}

In [54]:
# Define the regex pattern
pattern = r"(.*?Here are some examples of).*"

# Perform the regex substitution
cleaned_text = re.sub(pattern, r'\1', text, flags=re.DOTALL)

[{'messages': [{'role': 'system',
    'content': ('You are GPT a super powerfull LLM assistant, expert in chemistry and drug discovery. You have access to several chemistry tool to best assistant chemist. If you need to call a tool you have to outputs a valide JSON format with the right arguments. Answer the following questions as best you can.',)},
   {'role': 'user',
    'content': 'What is the molecular weight of the compound represented by the SMILES notation "CC(=O)OC1=CC=CC=C1C(=O)O"?Here are some examples of the thinking process you can use, but take care you might need more iteration steps to arrive at your end goal. I just add the first step of thinking process for simplicity:\n\n\nExample 1:\nQuestion: Plan the synthesis of an insect repellent. Then, tell me the cost of the reactants, if purchasable. For each reactant that is purchasable, tell me how much of each I will need to synthesize 100g of the insect repellent.\nThought: I will plan the synthesis of DEET as it is a com

In [47]:
from IPython import get_ipython

# Get the current interactive shell
ipython = get_ipython()

# Retrieve the history of executed cells
history = ipython.history_manager

import os

# Create or open a log file to write the outputs
log_file_path = 'jupyter_output.log'
with open(log_file_path, 'w') as log_file:
    for session_id, line_number, cell_content in history.get_range():
        output = ipython.history_manager.output_hist.get(line_number)
        if output is not None:
            log_file.write(f'Cell [{line_number}]:\n{output}\n\n')

# Confirm the outputs have been written
print(f'Outputs have been logged to {log_file_path}')


Outputs have been logged to jupyter_output.log


[WikipediaQueryRun(name='Wikipedia', description='A wrapper around Wikipedia. Useful for when you need to answer general questions about people, places, companies, facts, historical events, or other subjects. Input should be a search query.', args_schema=None, return_direct=False, verbose=False, callbacks=None, callback_manager=None, tags=None, metadata=None, handle_tool_error=False, api_wrapper=WikipediaAPIWrapper(wiki_client=<module 'wikipedia' from '/home/amt42/miniconda/envs/mistral/lib/python3.10/site-packages/wikipedia-1.4.0-py3.10.egg/wikipedia/__init__.py'>, top_k_results=3, lang='en', load_all_available_meta=False, doc_content_chars_max=4000)),
 Query2SMILES(name='Name2SMILES', description='Input a molecule name, returns SMILES.', args_schema=None, return_direct=False, verbose=False, callbacks=None, callback_manager=None, tags=None, metadata=None, handle_tool_error=False, url='https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{}/{}', chemspace_api_key=None, ControlChemCh

In [90]:
#  "role": "assistant",
#       "tool_calls": [
#         {
#           "id": "TX92Jm8Zi",
#           "type": "function",
#           "function": {
#             "name": "Name2SMILES",
#             "arguments": "{\"{list(tool.args.keys())[0]}\": \"Doliprane\"}"
#           }
#         }
#       ]
#     },
#     {
#       "role": "tool",
#       "content": "{\{list(tool.args.keys())[0]}": output_formatted['output'][]}",
#       "tool_call_id": "TX92Jm8Zi"
#     },
#     {
#       "role": "assistant",
#       "content": output_formatted['output']
#     },

In [100]:
outputs

{'input': 'What s the smiles of doliprane',
 'output': 'The SMILES of Doliprane is CC(=O)Nc1ccc(O)cc1.',
 'intermediate_steps': [(AgentAction(tool='Name2SMILES', tool_input='Doliprane', log='Action: Name2SMILES\nAction Input: Doliprane'),
   'CC(=O)Nc1ccc(O)cc1')]}

In [18]:
def format_output_to_dict(output):
    result = {
        "output": output['output'],
        "intermediate_steps": []
    }
    
    step = output['intermediate_steps'][0]
    tool_log = extract_react_elements(step[0].log)
    
    step_dict = {
        "tool_name": step[0].tool,
        "tool_input": step[0].tool_input,
        "tool_thought": tool_log,
        "tool_observation": step[1]
    }
    
    result["intermediate_steps"].append(step_dict)
    
    return result
output_formatted = format_output_to_dict(outputs)
output_formatted['intermediate_steps'][0]

NameError: name 'outputs' is not defined

In [84]:
output_formatted

{'output': 'The SMILES of Doliprane is CC(=O)Nc1ccc(O)cc1.',
 'intermediate_steps': [{'tool_name': 'Name2SMILES',
   'tool_input': 'Doliprane',
   'tool_thought': {'Thought': None,
    'Action': 'Name2SMILES',
    'Action Input': 'Doliprane'},
   'tool_observation': 'CC(=O)Nc1ccc(O)cc1'}]}

In [81]:
{list(tool.args.keys())[0]

'query'

In [None]:
{
  "messages": [
    {
      "role": "system",
      "content": system_prompt
    },
    {
      "role": "user",
      "content": input_to_ragent
    },
    {
      "role": "assistant",
      "tool_calls": [
        {
          "id": "TX92Jm8Zi",
          "type": "function",
          "function": {
            "name": "Name2SMILES",
            "arguments": "{\"{list(tool.args.keys())[0]}\": \"Doliprane\"}"
          }
        }
      ]
    },
    {
      "role": "tool",
      "content": "{\{list(tool.args.keys())[0]}": output_formatted['output'][]}",
      "tool_call_id": "TX92Jm8Zi"
    },
    {
      "role": "assistant",
      "content": output_formatted['output']
    },

        }
      ]
    }
  ]}


In [None]:
def format_output_to_dict(output):
    formatted_string = f"Here is the output from REaCT agent: \n\n output: {output['output']}\n\nintermediate steps\n"
    
    for i in range(len(output['intermediate_steps'])):
        step = output['intermediate_steps'][i]
        tool_log = extract_react_elements(step[0].log)
        
        formatted_string += f"""step {i+1}
tool_name: {step[0].tool}
tool_input: {step[0].tool_input}
tool_thought: {tool_log}
tool_observation: {step[1]}

"""
    return formatted_string

In [57]:
print(dict_['messages'][7])
print(dict_['messages'][8])
print(dict_['messages'][9])

{'role': 'assistant', 'content': 'None'}


IndexError: list index out of range