In [1]:
import os
import ast
from dotenv import load_dotenv
import arxiv

from langchain_openai import ChatOpenAI
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace

# Populate the process environment from a local .env file (python-dotenv),
# then read the two credentials this notebook refers to.
load_dotenv()

# Both names are None when the corresponding variable is not set.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
HUGGINGFACEHUB_API_TOKEN = os.environ.get('HUGGINGFACEHUB_API_TOKEN')

In [2]:
from langchain_core.documents import Document
from langchain_chroma import Chroma

from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_openai import ChatOpenAI

In [3]:
# Chat model shared by every cell below.
# The model name and temperature are pinned so that re-running the notebook
# gives comparable answers instead of depending on library defaults.
# NOTE(review): "gpt-3.5-turbo" is believed to be what a bare ChatOpenAI()
# resolves to -- confirm against the installed langchain-openai version.
chat_model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

In [4]:
# Location of the thesis PDF, relative to the notebook's working directory.
file_path = (
    "../../eth_courses/thesis/gokberk_thesis_report_04_06_23.pdf"
)
loader = PyPDFLoader(file_path)
# Use load() rather than load_and_split(): load() yields one Document per PDF
# page, whereas load_and_split() pre-chunks the text with a default splitter.
# The dedicated splitting cell that follows would then split a second time and
# its start_index metadata would be relative to those intermediate chunks
# instead of the page.
pages = loader.load()

# Display the first page as a quick sanity check.
pages[0]

Document(page_content='Deciphering the U.S. Diplomatic\nDocuments with NLP and Graph Data\nScience\nMaster’s Thesis\nGökberk Özsoy\ngoezsoy@ethz.ch\nSwiss Data Science Center\nETH Zürich\nSupervisors:\nDr. Luis Salamanca\nProf. Dr. Fernando Perez-Cruz\nJune 4, 2023', metadata={'source': '../../eth_courses/thesis/gokberk_thesis_report_04_06_23.pdf', 'page': 0})

In [None]:
# Cut the loaded documents into overlapping chunks for embedding:
# size limit 1000, overlap 200, with each chunk's start index recorded.
splitter_options = {
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "add_start_index": True,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
all_splits = text_splitter.split_documents(pages)

# Number of chunks produced.
len(all_splits)

In [None]:
# Embed every chunk and index it in a Chroma vector store.
# No model name and no persist directory are passed, so the embedding model
# and the storage location are whatever the libraries default to.
embedding_model = HuggingFaceEmbeddings()
vectorstore = Chroma.from_documents(
    documents=all_splits,
    embedding=embedding_model,
)

In [None]:
# Retrieval smoke test: display the chunks the store returns for a one-word query.
vectorstore.similarity_search(query="turkey")

In [None]:
# Join the retrieved chunks into one context string, separated by blank lines.
turkey_hits = vectorstore.similarity_search("turkey")
context = "\n\n".join(hit.page_content for hit in turkey_hits)

In [None]:
from langchain import hub
from langchain_core.runnables import RunnablePassthrough

# Fetch the shared RAG prompt from the LangChain Hub.
rag_prompt_ref = "rlm/rag-prompt"
prompt = hub.pull(rag_prompt_ref)

def format_docs(docs):
    """Concatenate the text of retrieved documents into a single string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values joined by a blank line, or ``""`` when
        ``docs`` is empty.
    """
    texts = [document.page_content for document in docs]
    return "\n\n".join(texts)

# Single-turn RAG pipeline: the incoming question is used both to retrieve
# context (retriever -> format_docs) and, unchanged, as the "question" input
# of the prompt; the filled prompt is then passed to the chat model.
retriever = vectorstore.as_retriever()
rag_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}
chain = rag_inputs | prompt | chat_model

In [None]:
# Inspect which chunks are retrieved for a full-sentence question.
kissinger_hits = vectorstore.similarity_search(
    "what role kissinger has based on the context provided?"
)
context = format_docs(kissinger_hits)
print(context)

### Conversational RAG

In [None]:
prompt = """System: You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know.
Use three sentences maximum and keep the answer concise.

Context: {context}

Human: {input}
AI:
"""

contextualize_q_system_prompt = """Given a chat history and the latest user question which might reference context in the chat history, 
formulate a standalone question which can be understood without the chat history. 
Do NOT answer the question, just reformulate it if needed and otherwise return it as is.
If chat history is empty (''), do NOT change anything in the question.

Chat history:
{history}
Question:
{input}
Reformulated Question:
"""

history = ''

def ask(question):

    global history

    rephrased_q = chat_model.invoke(contextualize_q_system_prompt.format(input=question,history=history)).content
    print(rephrased_q)

    context = "\n\n".join([doc.page_content for doc in vectorstore.similarity_search(rephrased_q)])

    prompt_after_formatting = prompt.format(input=rephrased_q,
                                            context=context)
    print(prompt_after_formatting)
    output = chat_model.invoke(prompt_after_formatting)

    history += f"Human: {question}\nAI: {output.content}\n"

    return output.content

In [None]:
# First turn: the history is still empty at this point.
ask(question="who is kissinger?")

In [None]:
# Follow-up that only makes sense with the previous turn ("he"), so it
# exercises the question-rewriting step.
ask(question="what other roles does he has?")

In [None]:
# Self-contained question on a different topic from the earlier turns.
ask(question="what was england's aim in the ww1?")

In [None]:
# Start of a new thread about redaction.
ask(question="what is redaction?")

In [None]:
# Follow-up whose "it" has to be resolved from the chat history.
ask(question="when do we observe a sharp drop in it?")

In [None]:
# Show the transcript accumulated by the ask() calls above.
print(history)