In [16]:
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_chroma import Chroma

# Load the document, split it into chunks, embed each chunk and load it into the vector store.
raw_documents = TextLoader('/mnt/HDD1/duclv/ACM-AMM/database/uploaded_docs/Earthquakes Conversation.txt').load()
# text_splitter = CharacterTextSplitter(chunk_size=300, chunk_overlap=100, separator="\n")
text_splitter = CharacterTextSplitter(chunk_size=300, chunk_overlap=100, separator="\n")
documents = text_splitter.split_documents(raw_documents)
with open('/mnt/HDD1/duclv/open_ai_key.txt', 'r') as f:
    openai_api_key = f.read().strip()
db = Chroma.from_documents(documents, OpenAIEmbeddings(openai_api_key=openai_api_key))
retriever = db.as_retriever()
retriever.invoke("who is speaker_00", top_k=5, threshold=0.5)

[Document(page_content='from 342.84 to 344.861\tSPEAKER_01:It was really, it was a very surreal feel.\nfrom 347.903 to 349.144\tSPEAKER_01:Hi, this is Kristen Dodds.\nfrom 349.564 to 350.645\tSPEAKER_00:And this is Joe Weiss.', metadata={'source': '/mnt/HDD1/duclv/ACM-AMM/database/uploaded_docs/Earthquakes Conversation.txt'}),
 Document(page_content='from 342.84 to 344.861\tSPEAKER_01:It was really, it was a very surreal feel.\nfrom 347.903 to 349.144\tSPEAKER_01:Hi, this is Kristen Dodds.\nfrom 349.564 to 350.645\tSPEAKER_00:And this is Joe Weiss.', metadata={'source': '/mnt/HDD1/duclv/ACM-AMM/database/uploaded_docs/Earthquakes Conversation.txt'}),
 Document(page_content='from 342.84 to 344.861\tSPEAKER_01:It was really, it was a very surreal feel.\nfrom 347.903 to 349.144\tSPEAKER_01:Hi, this is Kristen Dodds.\nfrom 349.564 to 350.645\tSPEAKER_00:And this is Joe Weiss.', metadata={'source': '/mnt/HDD1/duclv/ACM-AMM/database/uploaded_docs/Earthquakes Conversation.txt'}),
 Document(pag

In [17]:
query = "who is speaker_00"
docs = db.similarity_search(query, k=1)
for doc in docs:
    print(doc.page_content)

from 342.84 to 344.861	SPEAKER_01:It was really, it was a very surreal feel.
from 347.903 to 349.144	SPEAKER_01:Hi, this is Kristen Dodds.
from 349.564 to 350.645	SPEAKER_00:And this is Joe Weiss.


In [18]:
from operator import itemgetter
from typing import List
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser

from langchain_core.runnables import (
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)
from model import llm

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You're a helpful AI assistant. Given a user question and the conversation, answer the user question.\
            If none of the articles answer the question, just say you don't know.\n\nHere are conversation:{context}",
        ),
        ("human", "{question}"),
    ]
)
prompt.pretty_print()
def format_docs(docs: List[Document]) -> str:
    """Convert Documents to a single string.:"""
    formatted = [
        f"Article Title: {doc.metadata['source']}\nArticle Snippet: {doc.page_content}"
        for doc in docs
    ]
    return "\n\n" + "\n\n".join(formatted)


format = itemgetter("docs") | RunnableLambda(format_docs)
# subchain for generating an answer once we've done retrieval
answer = prompt | llm | StrOutputParser()
# complete chain that calls wiki -> formats docs to string -> runs answer subchain -> returns just the answer and retrieved docs.
chain = (
    RunnableParallel(question=RunnablePassthrough(), docs=retriever)
    .assign(context=format)
    .assign(answer=answer)
    .pick(["answer", "docs"])
)


You're a helpful AI assistant. Given a user question and the conversation, answer the user question.            If none of the articles answer the question, just say you don't know.

Here are conversation:[33;1m[1;3m{context}[0m


[33;1m[1;3m{question}[0m


In [19]:
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.output_parsers.openai_tools import JsonOutputKeyToolsParser
from langchain_core.tools import tool
from langchain_experimental.llms.ollama_functions import OllamaFunctions


class Citation(BaseModel):
    source_id: int = Field(
        ...,
        description="The integer ID of a SPECIFIC source which justifies the answer.",
    )
    quote: str = Field(
        ...,
        description="The VERBATIM quote from the specified source that justifies the answer.",
    )


class quoted_answer(BaseModel):
    """Answer the user question based only on the given sources, and cite the sources used."""

    answer: str = Field(
        ...,
        description="The answer to the user question, which is based on the given sources, cite only 1 source for 1 information.",
    )
    citations: str = Field(
        ..., description="Citations from the given sources, DO NOT paraphrase the source."
    )
    
output_parser_2 = JsonOutputKeyToolsParser(
    key_name="quoted_answer"
)
def format_docs_with_id(docs: List[Document]) -> str:
    formatted = [
        f"Source ID: {i},\n{doc.page_content}"
        for i, doc in enumerate(docs)
    ]

    return "\n\n" + "\n\n".join(formatted)

# llm_with_tool_2 = llm.bind_tools(
#     tools = [quoted_answer])
# format_2 = itemgetter("docs") | RunnableLambda(format_docs_with_id)
# print("format_2", format_2)
# answer_2 = prompt | llm_with_tool_2 | output_parser_2
# chain_2 = (
#     RunnableParallel(question=RunnablePassthrough(), docs=retriever)
#     .assign(context=format_2)
#     .assign(quoted_answer=answer_2)
#     .pick(["quoted_answer", "docs"])
# )

In [38]:
from langchain_core.prompts import PromptTemplate
import langchain
# langchain.debug = False
llm = OllamaFunctions(model="llama3", temperature=0, top_p=0.5)
structured_llm = llm.with_structured_output(quoted_answer)
# Prompt template
prompt = PromptTemplate.from_template(
    """SYSTEM: Base on this context: {context}
    If none of the articles answer the question, just say you don't know.\n\n
    -------------------------------
    answer the question below:

Human: {question}
AI: """
)
# The code snippet you provided is setting up a prompt template for a conversational AI system. The template includes a context section and a question section. The context is filled with relevant information retrieved based on the user's question. The AI assistant is expected to answer the user's question based on this context.
# prompt = ChatPromptTemplate.from_messages(
    
#         (
#             "system",
#             "You're a helpful AI assistant. Given a user question and the conversation, answer the user question.\
#             If none of the articles answer the question, just say you don't know.\n\nHere are conversation:{context}",
#         ),
#         ("human", "{question}, indicate the source in the conversation"),
    
# )

# from model import llm
from langchain_community.chat_models import ChatOllama
# llm = ChatOllama(model="llama3")
chain = prompt | structured_llm 

In [39]:
relevant_contents

[Document(page_content='from 342.84 to 344.861\tSPEAKER_01:It was really, it was a very surreal feel.\nfrom 347.903 to 349.144\tSPEAKER_01:Hi, this is Kristen Dodds.\nfrom 349.564 to 350.645\tSPEAKER_00:And this is Joe Weiss.', metadata={'source': '/mnt/HDD1/duclv/ACM-AMM/database/uploaded_docs/Earthquakes Conversation.txt'}),
 Document(page_content='from 342.84 to 344.861\tSPEAKER_01:It was really, it was a very surreal feel.\nfrom 347.903 to 349.144\tSPEAKER_01:Hi, this is Kristen Dodds.\nfrom 349.564 to 350.645\tSPEAKER_00:And this is Joe Weiss.', metadata={'source': '/mnt/HDD1/duclv/ACM-AMM/database/uploaded_docs/Earthquakes Conversation.txt'}),
 Document(page_content='from 342.84 to 344.861\tSPEAKER_01:It was really, it was a very surreal feel.\nfrom 347.903 to 349.144\tSPEAKER_01:Hi, this is Kristen Dodds.\nfrom 349.564 to 350.645\tSPEAKER_00:And this is Joe Weiss.', metadata={'source': '/mnt/HDD1/duclv/ACM-AMM/database/uploaded_docs/Earthquakes Conversation.txt'}),
 Document(pag

In [43]:
question = "who is SPEAKER_00?"  
relevant_contents = retriever.invoke(question, top_k=5, threshold=0.5)[:2]
relevant_contents+=relevant_contents
context = format_docs_with_id(relevant_contents)
print(context)
answer =  chain.invoke(input = {'context' : context, 'question':question})
answer



Source ID: 0,
from 3.294 to 4.095	SPEAKER_00: Hey, check this out.
from 4.475 to 6.597	SPEAKER_00:You know what Eric asked me when I got into work this morning?
from 6.617 to 8.339	SPEAKER_01:I have no idea.

Source ID: 1,
from 3.294 to 4.095	SPEAKER_00: Hey, check this out.
from 4.475 to 6.597	SPEAKER_00:You know what Eric asked me when I got into work this morning?
from 6.617 to 8.339	SPEAKER_01:I have no idea.

Source ID: 2,
from 3.294 to 4.095	SPEAKER_00: Hey, check this out.
from 4.475 to 6.597	SPEAKER_00:You know what Eric asked me when I got into work this morning?
from 6.617 to 8.339	SPEAKER_01:I have no idea.

Source ID: 3,
from 3.294 to 4.095	SPEAKER_00: Hey, check this out.
from 4.475 to 6.597	SPEAKER_00:You know what Eric asked me when I got into work this morning?
from 6.617 to 8.339	SPEAKER_01:I have no idea.


quoted_answer(answer='SPEAKER_00 is a speaker in the conversation.', citations='Source ID: 0, from 3.294 to 4.095; Source ID: 1, from 3.294 to 4.095; Source ID: 2, from 3.294 to 4.095; Source ID: 3, from 3.294 to 4.095')

In [14]:
system = """You're a helpful AI assistant. Given a user question and some conversation snippets, \
answer the user question and provide citations. If none of the articles answer the question, just say you don't know.

Remember, you must return both an answer and citations. A citation consists of a VERBATIM quote that \
justifies the answer and the ID of the quote article. Return a citation for every quote across all articles \
that justify the answer. Use the following format for your final output:

<cited_answer>
    <answer> your answer [id]</answer>
    <citations>
        <citation> <source_id> </source_id> <quote> </quote> </citation>
        <citation> <source_id> </source_id> <quote> </quote> </citation>
    </citations>
</cited_answer>

Here are the some part of meeting conversation:{context}"""
prompt_3 = ChatPromptTemplate.from_messages(
    [("system", system), ("human", "{question}")]
)

from langchain_core.output_parsers import XMLOutputParser

from model import llm
def format_docs_xml(docs: List[Document]) -> str:
    formatted = []
    for i, doc in enumerate(docs):
        doc_str = f"""\
    <source id=\"{i}\">
        <article_snippet>{doc.page_content}</article_snippet>
    </source>"""
        formatted.append(doc_str)
    return "\n\n<sources>" + "\n".join(formatted) + "</sources>"


format_3 = itemgetter("docs") | RunnableLambda(format_docs_xml)
answer_3 = prompt_3 | llm | XMLOutputParser() | itemgetter("cited_answer")
chain_3 = (
    RunnableParallel(question=RunnablePassthrough(), docs=retriever)
    .assign(context=format_3)
    .assign(cited_answer=answer_3)
    .pick(["cited_answer", "docs"])
)
chain_3.invoke("when did eric woke up?")

OutputParserException: Failed to parse XML format from completion <cited_answer>
    Eric woke up at 4:42 in the morning. [2]
    <citations>
        <citation> [2] "You know, yeah, it was like it went, it happened at like, uh, four 42 in the morning." </citation>
    </citations>
</cited_answer>

Note: The answer is based on the conversation snippets provided, specifically from sources 2 and 3.. Got: junk after document element: line 8, column 0

In [None]:
!pip install defusedxml

