### Libraries

In [2]:
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter

from langchain.document_loaders import PyPDFLoader


# OpenAIEmbeddings() was deprecated in LangChain 0.1.0 and will be removed in 0.2.0, which is why langchain_openai is used instead.
import langchain_openai

from langchain.chains import ConversationalRetrievalChain
from langchain.chains.qa_with_sources import load_qa_with_sources_chain

from langchain.chains.chat_vector_db.prompts import CONDENSE_QUESTION_PROMPT

from langchain.chains.llm import LLMChain

In [5]:
import os
# Uncomment to move the working directory up one level before running the rest of the notebook.
# os.chdir("..")
# Display the current working directory; the relative "docs/..." path used when loading the PDF is resolved against it.
os.getcwd()

'd:\\Projects\\chatbot\\code'

In [6]:
from utilities.customprompt import PROMPT

### API info

In [7]:
# Read the OpenAI key from the environment; the value is None when the variable is not set.
api_key = os.environ.get("OPENAI_API_KEY")

### Embedding

In [8]:
# Embedding client handed to Chroma when the vector store is built.
embeddings = langchain_openai.OpenAIEmbeddings(
    openai_api_key=api_key,
)

### Upload documents

In [50]:
# Load the PDF from the docs/ folder (path is relative to the working directory).
loader = PyPDFLoader(
    file_path="docs/Naval Ravikant - The Almanack.pdf",
)
documents = loader.load()

In [51]:
# Split the loaded documents into chunks using chunk_size=500 and chunk_overlap=100.
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=100)
texts = text_splitter.split_documents(documents)

In [52]:
# Index the split chunks (`texts`) rather than the unsplit `documents`; otherwise the
# output of the text splitter is never used and retrieval returns the loader's full documents.
vector_store = Chroma.from_documents(texts, embeddings)

### LLM

In [53]:
# text-davinci-003 has been shut down by OpenAI (2024-01-04); gpt-3.5-turbo-instruct is the
# documented replacement for the completions API, so the notebook runs again on a fresh kernel.
# temperature=0 keeps answers as repeatable as possible; max_tokens caps the answer length.
llm = langchain_openai.OpenAI(
    openai_api_key=api_key,
    model_name="gpt-3.5-turbo-instruct",
    temperature=0,
    max_tokens=300,
)

### Semantic answer function

In [55]:
# Start from an empty conversation history.
chat_history = list()

In [58]:
# LLM chain driven by CONDENSE_QUESTION_PROMPT; passed below as the retrieval chain's question generator.
question_generator = LLMChain(
    llm=llm,
    prompt=CONDENSE_QUESTION_PROMPT,
    verbose=False,
)

# "stuff" QA-with-sources chain that uses the project's custom PROMPT.
doc_chain = load_qa_with_sources_chain(
    llm,
    chain_type="stuff",
    verbose=False,
    prompt=PROMPT,
)

# Wire the retriever, the question generator and the document chain together,
# asking for the retrieved documents to be included in the output.
chain = ConversationalRetrievalChain(
    combine_docs_chain=doc_chain,
    question_generator=question_generator,
    retriever=vector_store.as_retriever(),
    return_source_documents=True,
)

# Try the chain with a simple greeting and the current chat history.
question = "ciao"
result = chain.invoke(dict(question=question, chat_history=chat_history))

In [57]:
# Show the full chain output (question, chat history, answer and source documents).
result

{'question': 'ciao',
 'chat_history': [],
 'answer': ' Hi there! How can I help you?',
 'source_documents': [Document(page_content='162 · THE ALMANACK OF NAVAL RAVIKANTronment. It leads to allergies and an untrained immune system. \nThis is known as the hygiene hypothesis. We’re evolved to live \nin much smaller tribes and to have more family around us. I \npartially grew up in India, and in India, everybody is in your \nbusiness. There’ s a cousin, an aunt, an uncle who is in your face, \nwhich makes it hard to be depressed, because you are never \nalone. (I’m not referring to people with chemical depression. \nI’m talking more about the existential angst and malaise teen -\nagers seem to go through.) But on the other hand, you have no \nprivacy, so you can’t be free. There are trade-offs.\nWe’re not meant to check our phone every five minutes. The \nconstant mood swings of getting a “like” then an angry com -\nment makes us into anxious creatures. We evolved for scarcity \nbut live i

In [32]:
# Show only the generated answer text.
result["answer"]

' Hi there! How can I help you?'

In [69]:
# Extract the distinct sources in retrieval order. dict.fromkeys de-duplicates while keeping
# first-seen order; a set has arbitrary iteration order, which made the numbering below
# vary from run to run.
unique_sources = list(
    dict.fromkeys(doc.metadata["source"] for doc in result["source_documents"])
)

# Number the sources starting from 1
sources = [f"{idx}. {source}" for idx, source in enumerate(unique_sources, start=1)]
sources

['1. docs/conda-cheatsheet.pdf']

### Reusable semantic answer function

In [38]:
def get_semantic_answer_lang_chain(prompt, chat_history):
    """Answer a question against the indexed documents and list the sources used.

    Builds a ``ConversationalRetrievalChain`` from the notebook-level ``llm``,
    ``vector_store``, ``CONDENSE_QUESTION_PROMPT`` and ``PROMPT`` objects, invokes
    it once, and formats the retrieved documents for display.

    Args:
        prompt: The user's question; passed to the chain under the ``"question"`` key.
        chat_history: Previous conversation turns; passed to the chain unchanged
            under the ``"chat_history"`` key. This function itself does not append
            the new turn to it.

    Returns:
        A 4-tuple ``(prompt, answer, sources, source_chunks)`` where

        * ``prompt`` is the input question, returned as-is;
        * ``answer`` is the chain's ``"answer"`` value;
        * ``sources`` is a list of ``"<n>. <source>"`` strings, one per distinct
          ``metadata["source"]`` of the retrieved documents, numbered from 1 in
          retrieval order;
        * ``source_chunks`` is a newline-joined string containing the
          ``page_content`` of every retrieved document, each prefixed with its
          0-based index.

    Raises:
        KeyError: If a retrieved document has no ``"source"`` entry in its metadata.
    """
    # LLM chain driven by CONDENSE_QUESTION_PROMPT, used as the question generator.
    question_generator = LLMChain(llm=llm, prompt=CONDENSE_QUESTION_PROMPT, verbose=False)

    # "stuff" QA-with-sources chain that uses the project's custom PROMPT.
    doc_chain = load_qa_with_sources_chain(llm, chain_type="stuff", verbose=False, prompt=PROMPT)

    chain = ConversationalRetrievalChain(
        retriever=vector_store.as_retriever(),
        question_generator=question_generator,
        combine_docs_chain=doc_chain,
        return_source_documents=True,
    )

    result = chain.invoke({"question": prompt, "chat_history": chat_history})
    retrieved_docs = result["source_documents"]

    # De-duplicate sources while keeping retrieval order. A set has arbitrary
    # iteration order, which made the numbering differ between runs.
    unique_sources = dict.fromkeys(doc.metadata["source"] for doc in retrieved_docs)

    # Enumerate and format the sources, starting from 1.
    sources = [f"{idx}. {source}" for idx, source in enumerate(unique_sources, start=1)]

    # The "Chunck_number_" label is kept byte-for-byte in case callers parse it.
    source_chunks = "\n".join(
        f"Chunck_number_{i}:{doc.page_content}" for i, doc in enumerate(retrieved_docs)
    )

    return prompt, result["answer"], sources, source_chunks

In [59]:
# Call the helper with a greeting and an empty history.
# NOTE: this rebinds `result` to the answer string that the function returns as its second value.
user_input, result, sources, source_chunks = get_semantic_answer_lang_chain("ciao", [])

In [45]:
# Show the answer string returned by get_semantic_answer_lang_chain.
result

' Hi there! How can I help you?'