In [32]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)


def create_retriever(text_file_path, chunk_size = 300, \
                    chunk_overlap = 0, separator = "\n", \
                    reset=True, model_name="all-MiniLM-L6-v2"):
    # Load the document, split it into chunks, embed each chunk and load it into the vector store.
    raw_documents = TextLoader(text_file_path).load()
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap, separator=separator)
    documents = text_splitter.split_documents(raw_documents)
    embedding_function = SentenceTransformerEmbeddings(model_name=model_name)
    # reset the vector store
    if reset:
        db = Chroma()
        all_ids = db.get()
        for id in all_ids['ids']:
            db.delete(id)
    # # load the document into Chroma
    db = Chroma.from_documents(documents, embedding_function)
    retriever = db.as_retriever()
    return retriever

txt_path = "/mnt/HDD1/duclv/DEMO/src/app/uploads/Earthquakes conversation_uncompressed.txt"

retriever = create_retriever(txt_path)
retriever.invoke("who is speaker_00",  threshold=0.5)

Created a chunk of size 447, which is longer than the specified 300
Created a chunk of size 658, which is longer than the specified 300
Created a chunk of size 469, which is longer than the specified 300
Created a chunk of size 799, which is longer than the specified 300
Created a chunk of size 1429, which is longer than the specified 300
Created a chunk of size 615, which is longer than the specified 300


[Document(page_content='SPEAKER_00: And this is Joe Weiss. And we just wanted to let you know that this material is copyrighted in the year 2008 by Learn Real English LLC. www.learnrealenglish.com', metadata={'source': '/mnt/HDD1/duclv/DEMO/src/app/uploads/Earthquakes conversation_uncompressed.txt'}),
 Document(page_content="SPEAKER_00: Hey, check this out. You know what Eric asked me when I got into work this morning?\nSPEAKER_01: I have no idea. What did he ask you?\nSPEAKER_00: He asked me if I felt the earthquake last night.\nSPEAKER_01: Earthquake? You've got to be kidding. I didn't feel an earthquake.", metadata={'source': '/mnt/HDD1/duclv/DEMO/src/app/uploads/Earthquakes conversation_uncompressed.txt'}),
 Document(page_content='SPEAKER_00: Yeah, without falling.\nSPEAKER_01: Yeah, without falling. And I had no idea what was going on for a few minutes afterwards. And then I realized, oh, that must have been an earthquake.', metadata={'source': '/mnt/HDD1/duclv/DEMO/src/app/upload

In [6]:
from operator import itemgetter
from typing import List
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser

from langchain_core.runnables import (
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)
from model import llm

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You're a helpful AI assistant. Given a user question and the conversation, answer the user question.\
            If none of the articles answer the question, just say you don't know.\n\nHere are conversation:{context}",
        ),
        ("human", "{question}"),
    ]
)
prompt.pretty_print()
def format_docs(docs: List[Document]) -> str:
    """Convert Documents to a single string.:"""
    formatted = [
        f"Article Title: {doc.metadata['source']}\nArticle Snippet: {doc.page_content}"
        for doc in docs
    ]
    return "\n\n" + "\n\n".join(formatted)


format = itemgetter("docs") | RunnableLambda(format_docs)
# subchain for generating an answer once we've done retrieval
answer = prompt | llm | StrOutputParser()
# complete chain that calls wiki -> formats docs to string -> runs answer subchain -> returns just the answer and retrieved docs.
chain = (
    RunnableParallel(question=RunnablePassthrough(), docs=retriever)
    .assign(context=format)
    .assign(answer=answer)
    .pick(["answer", "docs"])
)

2024-07-09 17:13:53.192 
  command:

    streamlit run /home/duclv/.conda/envs/mm/lib/python3.11/site-packages/ipykernel_launcher.py [ARGUMENTS]



You're a helpful AI assistant. Given a user question and the conversation, answer the user question.            If none of the articles answer the question, just say you don't know.

Here are conversation:[33;1m[1;3m{context}[0m


[33;1m[1;3m{question}[0m


In [27]:
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.output_parsers.openai_tools import JsonOutputKeyToolsParser
from langchain_core.tools import tool
from langchain_experimental.llms.ollama_functions import OllamaFunctions


class Citation(BaseModel):
    source_id: int = Field(
        ...,
        description="The integer ID of a SPECIFIC source which justifies the answer.",
    )
    quote: str = Field(
        ...,
        description="The VERBATIM quote from the specified source that justifies the answer.",
    )


class quoted_answer(BaseModel):
    """Answer the user question based only on the given sources, and cite the sources used."""

    answer: str = Field(
        ...,
        description="The answer to the user question, which is based on the given sources, paraphrased as needed.",
    )
    citations: List[Citation] = Field(
        ..., description="Citations from the given sources and should keep the relevant content in that source, DO NOT paraphrase the source."
    )
    
output_parser_2 = JsonOutputKeyToolsParser(
    key_name="quoted_answer"
)
def format_docs_with_id(docs: List[Document]) -> str:
    formatted = [
        f"Source ID: {i},{doc.page_content}"
        for i, doc in enumerate(docs)
    ]

    return "\n\n" + "\n\n".join(formatted)

# llm_with_tool_2 = llm.bind_tools(
#     tools = [quoted_answer])
# format_2 = itemgetter("docs") | RunnableLambda(format_docs_with_id)
# print("format_2", format_2)
# answer_2 = prompt | llm_with_tool_2 | output_parser_2
# chain_2 = (
#     RunnableParallel(question=RunnablePassthrough(), docs=retriever)
#     .assign(context=format_2)
#     .assign(quoted_answer=answer_2)
#     .pick(["quoted_answer", "docs"])
# )

In [28]:
from langchain_core.prompts.prompt import PromptTemplate
def classify_user_need(input_text):
    prompt_template = """System: Classify the user need based on the chat conversation below,
    Whether the user need is "Question" or an "Action".
    RETURN your answer in 1 word:
    "{text}"
    
    Answer :"""
    prompt = PromptTemplate.from_template(prompt_template)
    classify_chain = prompt | llm | StrOutputParser()
    return classify_chain.invoke(input_text)
classify_user_need("how people in the conversation")

'Question'

In [29]:
from langchain_core.prompts import PromptTemplate
import langchain
# langchain.debug = False
llm = OllamaFunctions(model="llama3", temperature=0.3, \
                      top_p=0.5, mirostat = 2,\
                    mirostat_tau =5,)
structured_llm = llm.with_structured_output(quoted_answer)
# Prompt template
prompt = PromptTemplate.from_template(
    """SYSTEM: Base on this context: {context}
    If none of the speech answer the question, just say you don't know.
    DO NOT repeat the dialogue in the answer.\n\n
    \n\n
    Human: {question}
    AI: """
)
# The code snippet you provided is setting up a prompt template for a conversational AI system. The template includes a context section and a question section. The context is filled with relevant information retrieved based on the user's question. The AI assistant is expected to answer the user's question based on this context.
# prompt = ChatPromptTemplate.from_messages(
    
#         (
#             "system",
#             "You're a helpful AI assistant. Given a user question and the conversation, answer the user question.\
#             If none of the articles answer the question, just say you don't know.\n\nHere are conversation:{context}",
#         ),
#         ("human", "{question}, indicate the source in the conversation"),
    
# )

# from model import llm
from langchain_community.chat_models import ChatOllama
# llm = ChatOllama(model="llama3")
chain = prompt | structured_llm 

In [34]:
question = "the attendees of the conversation"  
relevant_contents = retriever.invoke(question, threshold=0)
print(retriever.aget_relevant_documents(relevant_contents))
context = format_docs_with_id(relevant_contents)
print(context)
with open(txt_path, 'r') as f:
    conversation = f.read()

answer =  chain.invoke(input = {'context' : conversation, 'question':question})
answer

<coroutine object BaseRetriever.aget_relevant_documents at 0x7fb9bf067c40>


Source ID: 0,SPEAKER_00: Hey, check this out. You know what Eric asked me when I got into work this morning?
SPEAKER_01: I have no idea. What did he ask you?
SPEAKER_00: He asked me if I felt the earthquake last night.
SPEAKER_01: Earthquake? You've got to be kidding. I didn't feel an earthquake.

Source ID: 1,SPEAKER_00: And this is Joe Weiss. And we just wanted to let you know that this material is copyrighted in the year 2008 by Learn Real English LLC. www.learnrealenglish.com

Source ID: 2,SPEAKER_00: Yeah, without falling.
SPEAKER_01: Yeah, without falling. And I had no idea what was going on for a few minutes afterwards. And then I realized, oh, that must have been an earthquake.

Source ID: 3,SPEAKER_01: Well, I guess that's not so farfetched considering we live on a major fault line here.
SPEAKER_00: Yeah, but actually, I think this earthquake was on a different fault line. Because, you know, Eric live

  print(retriever.aget_relevant_documents(relevant_contents))


quoted_answer(answer="It seems that Eric asked if I felt an earthquake last night. He said it woke him up at 4:42 am, and since he lives closer to the epicenter than we do, it's possible that it wasn't felt here. I've experienced earthquakes before, but each one has been a unique experience. When I'm at home, I often feel like there's a big train or truck going by, and the noise is loud for a second before it stops.", citations=[Citation(source_id=0, quote='')])

In [26]:
system = """You're a helpful AI assistant. Given a user question and some conversation snippets, \
answer the user question and provide citations. If none of the articles answer the question, just say you don't know.

Remember, you must return both an answer and citations. A citation consists of a VERBATIM quote that \
justifies the answer and the ID of the quote article. Return a citation for every quote across all articles \
that justify the answer. Use the following format for your final output:

<cited_answer>
    <answer> your answer [id]</answer>
    <citations>
        <citation> <source_id> </source_id> <quote> </quote> </citation>
        <citation> <source_id> </source_id> <quote> </quote> </citation>
    </citations>
</cited_answer>

Here are the some part of meeting conversation:{context}"""
prompt_3 = ChatPromptTemplate.from_messages(
    [("system", system), ("human", "{question}")]
)

from langchain_core.output_parsers import XMLOutputParser

from model import llm
def format_docs_xml(docs: List[Document]) -> str:
    formatted = []
    for i, doc in enumerate(docs):
        doc_str = f"""\
    <source id=\"{i}\">
        <article_snippet>{doc.page_content}</article_snippet>
    </source>"""
        formatted.append(doc_str)
    return "\n\n<sources>" + "\n".join(formatted) + "</sources>"


format_3 = itemgetter("docs") | RunnableLambda(format_docs_xml)
answer_3 = prompt_3 | llm | XMLOutputParser() | itemgetter("cited_answer")
chain_3 = (
    RunnableParallel(question=RunnablePassthrough(), docs=retriever)
    .assign(context=format_3)
    .assign(cited_answer=answer_3)
    .pick(["cited_answer", "docs"])
)
chain_3.invoke("when did eric woke up?")

{'cited_answer': '\n    Eric woke up at 4:42 in the morning [id]\n    ',
 'docs': [Document(page_content="SPEAKER_00: I know, that's what I said. He told me it actually woke him up last night. You know, yeah, it was like it went, it happened at like, uh, four 42 in the morning. So we must've been sleeping, but I mean, it's possible that, you know, even if we had been awake, we might not have felt it because you know, maybe it, uh, wasn't felt, you know, this far North. But, uh, I mean, I thought he was pulling my leg when he first talked about it.", metadata={'source': '/mnt/HDD1/duclv/DEMO/src/app/uploads/Earthquakes conversation_uncompressed.txt'}),
  Document(page_content="SPEAKER_01: Well, I guess that's not so farfetched considering we live on a major fault line here.\nSPEAKER_00: Yeah, but actually, I think this earthquake was on a different fault line. Because, you know, Eric lives just south of San Jose.\nSPEAKER_01: Yeah.", metadata={'source': '/mnt/HDD1/duclv/DEMO/src/app/upl

In [12]:
!pip install defusedxml

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [13]:
from langchain_anthropic import ChatAnthropicMessages

anthropic = ChatAnthropicMessages(model_name="claude-instant-1.2")
system = """You're a helpful AI assistant. Given a user question and some Wikipedia article snippets, \
answer the user question and provide citations. If none of the articles answer the question, just say you don't know.

Remember, you must return both an answer and citations. A citation consists of a VERBATIM quote that \
justifies the answer and the ID of the quote article. Return a citation for every quote across all articles \
that justify the answer. Use the following format for your final output:

<cited_answer>
    <answer></answer>
    <citations>
        <citation><source_id></source_id><quote></quote></citation>
        <citation><source_id></source_id><quote></quote></citation>
        ...
    </citations>
</cited_answer>

Here are the Wikipedia articles:{context}"""
prompt_3 = ChatPromptTemplate.from_messages(
    [("system", system), ("human", "{question}")]
)

  warn_deprecated(


In [14]:
from langchain_core.output_parsers import XMLOutputParser


def format_docs_xml(docs: List[Document]) -> str:
    formatted = []
    for i, doc in enumerate(docs):
        doc_str = f"""\
    <source id=\"{i}\">
        <title>{doc.metadata['source']}</title>
        <article_snippet>{doc.page_content}</article_snippet>
    </source>"""
        formatted.append(doc_str)
    return "\n\n<sources>" + "\n".join(formatted) + "</sources>"


format_3 = itemgetter("docs") | RunnableLambda(format_docs_xml)
answer_3 = prompt_3 | anthropic | XMLOutputParser() | itemgetter("cited_answer")
chain_3 = (
    RunnableParallel(question=RunnablePassthrough(), docs=retriever)
    .assign(context=format_3)
    .assign(cited_answer=answer_3)
    .pick(["cited_answer", "docs"])
)

In [15]:
chain_3.invoke("who is speaker_00")

TypeError: "Could not resolve authentication method. Expected either api_key or auth_token to be set. Or for one of the `X-Api-Key` or `Authorization` headers to be explicitly omitted"