In [None]:
import os
from os import path

from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.document_loaders import DirectoryLoader, UnstructuredPDFLoader
from langchain.prompts import ChatPromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.pgvector import PGVector
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_ollama import OllamaLLM

In [3]:
# Notebook-wide configuration: model identifiers, data location, and the
# names of the two vector-store collections.
EMBEDDING_MODEL = "nomic-ai/nomic-embed-text-v1"
LLM_MODEL = "nemotron-mini"
# Database credentials should not live in the notebook. Set the
# PGVECTOR_CONNECTION_STRING environment variable to override; the fallback
# keeps the original local-development value so existing setups still work.
CONNECTION_STRING = os.environ.get(
    "PGVECTOR_CONNECTION_STRING",
    "postgresql://postgres:password@localhost:5432/data-battle",
)
# Relative path: resolved against the kernel's working directory.
DATA_DIR = "../../../data/raw"
EPAC_COLLECTION_NAME = "epac"
EQE_COLLECTION_NAME = "eqe"

In [None]:
# Embedding model shared by every vector-store collection in this notebook.
# NOTE(review): trust_remote_code=True is passed through model_kwargs;
# presumably this lets code from the model repository run locally, so
# confirm the model source is trusted.
embeddings = HuggingFaceEmbeddings(
    model_kwargs={"trust_remote_code": True},
    model_name=EMBEDDING_MODEL,
)

In [None]:
# One splitter instance, reused for all three document sets below.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=1000,
)

In [None]:
# Load the files under DATA_DIR/epac (recursively, via UnstructuredPDFLoader)
# and cut them into chunks with the shared text splitter.
epac_loader = DirectoryLoader(
    path.join(DATA_DIR, "epac"),
    recursive=True,
    use_multithreading=True,
    loader_cls=UnstructuredPDFLoader,
)
epac_documents = text_splitter.split_documents(epac_loader.load())

In [None]:
# Load the files under DATA_DIR/eqe (recursively, via UnstructuredPDFLoader)
# and cut them into chunks with the shared text splitter.
eqe_loader = DirectoryLoader(
    path.join(DATA_DIR, "eqe"),
    recursive=True,
    use_multithreading=True,
    loader_cls=UnstructuredPDFLoader,
)
eqe_documents = text_splitter.split_documents(eqe_loader.load())

In [None]:
# Load the files under DATA_DIR/legal_pubs (recursively, via
# UnstructuredPDFLoader) and cut them into chunks with the shared splitter.
# These chunks are added to both collections built further down.
legal_pub_loader = DirectoryLoader(
    path.join(DATA_DIR, "legal_pubs"),
    recursive=True,
    use_multithreading=True,
    loader_cls=UnstructuredPDFLoader,
)
legal_pub_documents = text_splitter.split_documents(legal_pub_loader.load())

In [None]:
# EPAC chunks followed by the legal-publication chunks, in that order.
epac_and_legal_pub_documents = [*epac_documents, *legal_pub_documents]

# Write the combined set into the collection named by EPAC_COLLECTION_NAME.
# The returned store is not bound to a name; as the cell's last expression
# it is only echoed as the cell output.
PGVector.from_documents(
    documents=epac_and_legal_pub_documents,
    embedding=embeddings,
    connection_string=CONNECTION_STRING,
    collection_name=EPAC_COLLECTION_NAME,
    use_jsonb=True,
)

In [None]:
# EQE chunks followed by the legal-publication chunks, in that order.
eqe_and_legal_pub_documents = [*eqe_documents, *legal_pub_documents]

# Write the combined set into the collection named by EQE_COLLECTION_NAME.
# `database` is the store the retriever is later created from.
database = PGVector.from_documents(
    documents=eqe_and_legal_pub_documents,
    embedding=embeddings,
    connection_string=CONNECTION_STRING,
    collection_name=EQE_COLLECTION_NAME,
    use_jsonb=True,
)

In [None]:
# Language model used both for question condensing and for answering.
llm = OllamaLLM(
    model=LLM_MODEL,
)

In [None]:
# System message for the answering step: the instruction sentences are joined
# with single spaces, then a blank line separates them from the `{context}`
# template variable.
system_prompt = "\n\n".join(
    [
        " ".join(
            [
                "You are an assistant for question-answering tasks.",
                "Use the following pieces of retrieved context to answer the question.",
                "Always answer with explanations.",
                "Always cite articles and rules if relevant.",
                "If you don't know the answer, say that you don't know.",
            ]
        ),
        "{context}",
    ]
)

In [None]:
# Answering prompt: system instructions, then the prior conversation
# (filled from the `chat_history` input), then the current user input.
qa_prompt = ChatPromptTemplate.from_messages(
    messages=[
        ("system", system_prompt),
        ("placeholder", "{chat_history}"),
        ("human", "{input}"),
    ],
)

In [None]:

# System message for the question-rewriting step, assembled from its clauses
# with single spaces between them.
condense_question_system_template = " ".join(
    [
        "Given a chat history and the latest user question",
        "which might reference context in the chat history,",
        "formulate a standalone question which can be understood",
        "without the chat history. Do NOT answer the question,",
        "just reformulate it if needed and otherwise return it as is.",
    ]
)

# Rewriting prompt: system instructions, then the prior conversation
# (filled from the `chat_history` input), then the current user input.
condense_question_prompt = ChatPromptTemplate.from_messages(
    messages=[
        ("system", condense_question_system_template),
        ("placeholder", "{chat_history}"),
        ("human", "{input}"),
    ],
)

In [None]:
# Retriever over the store held in `database`, wrapped so that the LLM and
# the condense-question prompt are applied before retrieval.
retriever = database.as_retriever()
history_aware_retriever = create_history_aware_retriever(
    llm=llm,
    retriever=retriever,
    prompt=condense_question_prompt,
)

In [None]:
# Answering chain: passes documents to the LLM through `qa_prompt`.
qa_chain = create_stuff_documents_chain(
    llm=llm,
    prompt=qa_prompt,
)

In [None]:
# Full pipeline: history-aware retrieval feeding the answering chain.
convo_qa_chain = create_retrieval_chain(
    retriever=history_aware_retriever,
    combine_docs_chain=qa_chain,
)

In [None]:
# Conversation memory; starts empty and is passed to the chain on each call.
chat_history = list()

In [None]:
# Sample question sent through the conversational retrieval chain.
query = """
Francesca has filed a European patent application EP-F before the EPO. Francesca did
not develop the invention which is the subject of EP-F. In drafting EP-F Francesca used
information in Andrew’s laboratory notebook, without Andrew’s consent. EP-F was
published in December 2017 and is still pending.
Can Andrew seek a stay of proceedings if he provides evidence that he has
instituted proceedings against Francesca seeking a decision that Andrew is entitled
to the grant of the European patent based on EP-F ? Why ?.
"""
result = convo_qa_chain.invoke({"input": query, "chat_history": chat_history})

print(result["answer"])

# Record the turn as two role-tagged messages. Both prompts fill their
# "{chat_history}" placeholder from this list, where a 2-tuple is read as
# (role, content). The previous `(query, answer)` tuple therefore used the
# whole question as the role name, which is rejected as an unknown message
# type on the next call made with a non-empty history.
chat_history.extend(
    [
        ("human", query),
        ("ai", result["answer"]),
    ]
)