In [335]:
import os
import re
from typing import Dict, List, Tuple

import gget
import langchain
import openai
from langchain.agents import AgentType, initialize_agent
from langchain.chains import LLMMathChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.schema import FunctionMessage, HumanMessage
from langchain.text_splitter import CharacterTextSplitter
from langchain.tools import BaseTool, StructuredTool, Tool, format_tool_to_openai_function, tool
from langchain.vectorstores import FAISS

In [336]:
# `Field` is only imported in a later cell, so import it here as well; without
# this the lookup raises NameError when the notebook is run top to bottom on a
# fresh kernel.
from pydantic import Field

Field.__doc__

'\n    Used to provide extra information about a field, either for the model schema or complex validation. Some arguments\n    apply only to number fields (``int``, ``float``, ``Decimal``) and some apply only to ``str``.\n\n    :param default: since this is replacing the field’s default, its first argument is used\n      to set the default, use ellipsis (``...``) to indicate the field is required\n    :param default_factory: callable that will be called when a default value is needed for this field\n      If both `default` and `default_factory` are set, an error is raised.\n    :param alias: the public name of the field\n    :param title: can be any string, used in the schema\n    :param description: can be any string, used in the schema\n    :param exclude: exclude this field while dumping.\n      Takes same values as the ``include`` and ``exclude`` arguments on the ``.dict`` method.\n    :param include: include this field while dumping.\n      Takes same values as the ``include`` and

In [337]:
# Read the API key from the environment instead of committing it to the notebook.
# A missing variable fails loudly here (KeyError) rather than later inside an API call.
# NOTE(review): the key that used to be hardcoded in this cell has been exposed
# and should be revoked.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [338]:
# Identifier of the OpenAI chat model used in this notebook.
GPT_MODEL = "gpt-3.5-turbo-0613"
# System prompt telling the assistant to answer "I don't know" instead of
# inventing an answer.
# NOTE(review): no cell shown here passes SYSTEM_PROMPT to the agent — confirm
# whether it is meant to be wired in.
SYSTEM_PROMPT = """
    You are a helpful AI assistant. You answer the user's queries.
    NEVER make up an answer.
    If you don't know the answer,
    just respond with "I don't know".
"""

In [339]:
from langchain.tools import BaseTool
from typing import Optional, Type
from langchain.agents import initialize_agent, Tool
from langchain.agents import AgentType
from langchain.chat_models import ChatOpenAI
from pydantic import BaseModel, Field


In [340]:
# Function that is called by the tools to find guide RNA target sites.

# 21 unambiguous bases followed by "GG", i.e. 20 bases plus an NGG-style motif
# (presumably the PAM — confirm against the intended nuclease).
# The zero-width lookahead makes findall report overlapping sites as well.
_TARGET_SITE_PATTERN = re.compile(r"(?=([ACGT]{21}GG))")

# Complement table for the four unambiguous DNA bases. Any other character is
# left unchanged, which is sufficient because the pattern only matches A/C/G/T.
_DNA_COMPLEMENT = str.maketrans("ACGT", "TGCA")


def guideRNA_finder(sequence: str) -> Tuple[List[str], List[str]]:
    """
    Find every candidate guide RNA target site on both strands of a DNA sequence.

    The input is normalised first: whitespace (e.g. FASTA line breaks) is
    removed and bases are upper-cased, so lowercase or line-wrapped sequences
    are searched instead of silently producing no matches.

    :param sequence: DNA sequence to scan (the sequence itself, not a gene ID).
    :return: ``(forward_sites, reverse_sites)``. Each entry is a 23-character
        string (21 bases followed by ``GG``), in order of position. The first
        list comes from the given strand, the second from its reverse
        complement. Overlapping sites are all reported.
    """
    forward_strand = "".join(sequence.split()).upper()

    # Reverse complement: complement every base, then reverse the string.
    reverse_strand = forward_strand.translate(_DNA_COMPLEMENT)[::-1]

    forward_guideRNA = _TARGET_SITE_PATTERN.findall(forward_strand)
    reverse_guideRNA = _TARGET_SITE_PATTERN.findall(reverse_strand)

    return forward_guideRNA, reverse_guideRNA

In [341]:
# THESE ARE THE ARGUMENTS THAT THE AGENT WILL TAKE IN FROM A HUMAN MESSAGE.
# The pydantic class defines what the model should give back; it is passed to
# the gene-ID tool as its argument schema (`args_schema`).
class CheckSequenceFromGeneID(BaseModel):
    """
    Input for guide RNA finder
    """
    # The only argument the model must supply: an Ensembl gene ID string.
    # NOTE(review): the docstring and Field description are presumably surfaced
    # to the model through the generated schema, so treat them as runtime text.
    gene_id: str = Field(description="An ensembl gene id to find guide RNA sequences against")


In [342]:
# THIS IS A DEFINITION OF THE TOOL THAT USES INPUTS FROM THE MODEL
# (GENE ID) TO RUN A CUSTOM USER DEFINED FUNCTION
class GuideRNAFinderToolFromGeneID(BaseTool):
    """Agent tool: fetch a gene's sequence by Ensembl ID and find guide RNA sites in it."""

    # Name and description are what the LLM sees when choosing a tool, so the
    # description states the input this tool expects (a gene ID, not a sequence).
    name: str = 'get_guide_RNA_sequences_for_gene'
    description: str = 'Useful for when you need to find guide RNA sequences for a gene, given its Ensembl gene ID'

    # Schema of the arguments the model has to provide when calling this tool.
    args_schema: Optional[Type[BaseModel]] = CheckSequenceFromGeneID

    def _run(self, gene_id: str) -> Tuple[List[str], List[str]]:
        """
        Fetch the sequence for ``gene_id`` and return its guide RNA target sites.

        :param gene_id: Ensembl gene ID supplied by the model.
        :return: ``(forward_sites, reverse_sites)`` as produced by ``guideRNA_finder``.
        """
        print('Running GuideRNAFinder against a gene')

        # assumes gget.seq returns a FASTA-style list with the header at index 0
        # and the nucleotide sequence at index 1 — TODO confirm
        sequence = gget.seq(gene_id)[1]

        # Scan the fetched sequence; returning the raw sequence would not give
        # the agent the guide RNAs this tool is named for.
        return guideRNA_finder(sequence)

    async def _arun(self, gene_id: str) -> str:
        """Async entry point; accepts the same argument as ``_run`` but is not implemented."""
        raise NotImplementedError('This tool does not support async run yet')


In [343]:
# THIS IS WHAT GOES INTO THE MODEL (TOOL).
# The pydantic class defines what the model should give back; it is passed to
# the sequence tool as its argument schema (`args_schema`).
class CheckSequenceFromSequence(BaseModel):
    """
    Input for guide RNA finder
    """
    # The only argument the model must supply: the DNA sequence as a string.
    # NOTE(review): the docstring and Field description are presumably surfaced
    # to the model through the generated schema, so treat them as runtime text.
    sequence: str = Field(description="A DNA sequence to find guide RNA sequences against")

In [344]:
class GuideRNAFinderToolFromSequence(BaseTool):
    """Agent tool: find guide RNA sites in a DNA sequence given directly by the user."""

    # Name and description are what the LLM sees when choosing a tool, so the
    # description states the input this tool expects (a sequence, not a gene ID).
    name: str = 'get_guide_RNA_sequences_for_sequence'
    description: str = 'Useful for when you need to find guide RNA sequences across a raw DNA sequence'

    # Schema of the arguments the model has to provide when calling this tool.
    args_schema: Optional[Type[BaseModel]] = CheckSequenceFromSequence

    def _run(self, sequence: str) -> Tuple[List[str], List[str]]:
        """
        Return the guide RNA target sites found in ``sequence``.

        :param sequence: DNA sequence supplied by the model.
        :return: ``(forward_sites, reverse_sites)`` as produced by ``guideRNA_finder``.
        """
        print('Running GuideRNAFinder against a sequence')

        return guideRNA_finder(sequence)

    async def _arun(self, sequence: str) -> str:
        """Async entry point; accepts the same argument as ``_run`` but is not implemented."""
        raise NotImplementedError('This tool does not support async run yet')

In [349]:
# One instance of each custom tool; this list is what the agent can choose from.
tools = [
    GuideRNAFinderToolFromGeneID(),
    GuideRNAFinderToolFromSequence(),
]

In [None]:
# Use the shared GPT_MODEL constant so the model name is defined in one place.
# temperature=0 is kept from the original call.
llm = ChatOpenAI(temperature=0, model=GPT_MODEL)

In [None]:
# Build the agent from the tool list and chat model, using the
# OpenAI-functions agent type with verbose output enabled.
open_ai_agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.OPENAI_FUNCTIONS,
    verbose=True,
)

In [None]:
# Ask the agent about a gene by name (stray apostrophe removed from the prompt).
# NOTE(review): the gene tool's schema asks for an Ensembl gene ID, so the model
# has to supply the ID for "EGFR" itself — verify the ID it passes.
open_ai_agent.run("What are the guide RNA sequences for the EGFR gene?")

In [None]:
# Ask the agent about a literal DNA sequence embedded in the prompt.
open_ai_agent.run("What are the guide RNA sequences for this DNA sequence 'AGACGTCCGGGCAGCCCCCGGCGCAGCGCGGCCGCAGCAGCCTCCGCCCCCCGCACGGTAGACGTCCGGGCAGCCCCCGGCGCAGCGCGGCCGCAGCAGCCTCCGCCCCCCGCACGGT'?")

In [None]:
# Manual walk-through of function calling, without the agent.
# Convert each tool into an OpenAI function spec so the model can request it.
functions = [format_tool_to_openai_function(t) for t in tools]

# Use the chat model defined above (`llm`); `model` was never defined in this
# notebook, so the original call only worked with leftover kernel state.
ai_message = llm.predict_messages([HumanMessage(content='What are the guide RNA sequences for the KRAS gene')], functions=functions)

In [None]:
# Display the raw `arguments` entry of the model's function_call payload
# (it is decoded with json.loads in the following cell).
ai_message.additional_kwargs['function_call'].get('arguments')

In [None]:
# Decode the JSON-encoded arguments the model supplied for its function call.
import json

_args = json.loads(
    ai_message.additional_kwargs['function_call'].get('arguments')
)
_args

In [None]:
# Run the tool the model actually asked for (looked up by the function name in
# its reply) rather than assuming it is always the first entry in `tools`.
_requested_name = ai_message.additional_kwargs['function_call']['name']
_tools_by_name = {t.name: t for t in tools}
tool_result = _tools_by_name[_requested_name].run(_args)

In [None]:
# Once the arguments have been decoded and the tool has been run, wrap the
# result in a FunctionMessage so it can be sent back to the model.
# name    = the function the model requested (taken from its function_call)
# content = what the tool gave back; message content must be a string
FunctionMessage(
    name=ai_message.additional_kwargs['function_call']['name'],
    content=str(tool_result),
)