In [29]:
import os
import pinecone
from dotenv import load_dotenv

from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore


# Load variables from a .env file (python-dotenv) so the environment
# lookups below can resolve them.
load_dotenv()

# Chat model; the Azure deployment name is read from DEPLOYMENT_NAME_LLM.
llm = AzureChatOpenAI(
    model_version="0301",
    openai_api_version="2023-06-01-preview",
    azure_deployment=os.environ.get("DEPLOYMENT_NAME_LLM"),
)

# Embedding model; the Azure deployment name is read from
# DEPLOYMENT_NAME_EMBEDDING.
embedding = AzureOpenAIEmbeddings(
    openai_api_version="2023-05-15",
    azure_deployment=os.environ.get("DEPLOYMENT_NAME_EMBEDDING"),
)

# Pinecone client built from the PINECONE_API_KEY environment variable.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
pinecone_client = pinecone.Pinecone(api_key=pinecone_api_key)


In [30]:
import pymupdf
from docx import Document

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed to ``pymupdf.open``.

    Returns:
        A single string made of each page's ``get_text()`` output, joined
        with no separator. An empty document yields ``""``.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through; the original never closed it.
    with pymupdf.open(pdf_path) as doc:
        # join() avoids rebuilding the growing string on every page.
        return "".join(page.get_text() for page in doc)

def extract_text_from_docx(doc_path):
    """Return the text of a .docx file, one paragraph per line.

    Args:
        doc_path: Path of the .docx file, passed to python-docx's ``Document``.

    Returns:
        The ``text`` of every entry in the document's ``paragraphs``, joined
        with ``"\\n"``.
    """
    # Imported locally under an alias on purpose: a later cell of this
    # notebook rebinds the global name ``Document`` to LangChain's document
    # class, so relying on the global would call the wrong class whenever
    # this function runs after that cell.
    from docx import Document as DocxDocument

    return "\n".join(
        paragraph.text for paragraph in DocxDocument(doc_path).paragraphs
    )

def load_documents_from_directory(directory_path):
    """Extract the text of every .pdf and .docx file directly inside a directory.

    Args:
        directory_path: Directory to scan (not recursive).

    Returns:
        A list with one text string per supported file, ordered by file
        name. Entries with any other extension are ignored.
    """
    documents = []
    # os.listdir returns entries in arbitrary order; sorting keeps the corpus
    # (and everything derived from it) in the same order on every run.
    for filename in sorted(os.listdir(directory_path)):
        file_path = os.path.join(directory_path, filename)
        # Match the extension case-insensitively so files such as
        # "REPORT.PDF" are not silently skipped.
        lowered = filename.lower()
        if lowered.endswith(".pdf"):
            documents.append(extract_text_from_pdf(file_path))
        elif lowered.endswith(".docx"):
            documents.append(extract_text_from_docx(file_path))
    return documents

# Folder, relative to the working directory, that holds the source files.
directory_path = "Documents/Corpus"
# One extracted text string per .pdf / .docx file found in that folder.
corpus = load_documents_from_directory(directory_path)

In [31]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents.base import Document


text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
)

# Split each source text exactly once. The previous version split the text
# into chunks and then ran the splitter again over those chunks, so the
# "start_index" metadata requested by add_start_index=True was computed
# relative to each chunk instead of the source text it came from.
doc_splits = text_splitter.create_documents(corpus)

print(f"Total number of document splits: {len(doc_splits)}")

Total number of document splits: 348


In [32]:
index_name = "corpus"

# Bound to `vector_store` rather than `pinecone`: the old name overwrote the
# `pinecone` module imported in the first cell, so any later use of
# `pinecone.Pinecone(...)` without re-running that cell would fail.
# NOTE(review): from_documents embeds and uploads `doc_splits` every time
# this cell runs; re-running probably adds duplicate vectors to the index —
# confirm, and consider loading the existing index instead.
vector_store = PineconeVectorStore.from_documents(
    documents=doc_splits,
    embedding=embedding,
    index_name=index_name,
)

retriever = vector_store.as_retriever()

In [33]:
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# System instruction for the question-rewriting step: turn the latest user
# message into a standalone question without answering it.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

# Message layout: system instruction, then the prior turns, then the new
# user input.
contextualize_q_prompt = ChatPromptTemplate.from_messages([
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
])

# Retriever built from the chat model, the base retriever and the
# rewriting prompt above.
history_aware_retriever = create_history_aware_retriever(
    llm=llm,
    retriever=retriever,
    prompt=contextualize_q_prompt,
)

In [34]:
# System instruction for the answering step; "{context}" is the template
# slot that receives the retrieved passages.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

# Message layout: system instruction, then the prior turns, then the new
# user input.
qa_prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
])

In [35]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# Answering chain built from the chat model and the QA prompt.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=qa_prompt)

# Full RAG chain: history-aware retrieval feeding the answering chain.
rag_chain = create_retrieval_chain(
    retriever=history_aware_retriever,
    combine_docs_chain=question_answer_chain,
)

In [36]:
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

# In-memory map from session id to that session's message history.
store = {}

def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the history stored for ``session_id``, creating an empty one on first use."""
    try:
        return store[session_id]
    except KeyError:
        # First request for this session: register a fresh history.
        history = ChatMessageHistory()
        store[session_id] = history
        return history

# Wrap the RAG chain so each call reads and updates the per-session history
# returned by get_session_history.
conversational_rag_chain = RunnableWithMessageHistory(
    runnable=rag_chain,
    get_session_history=get_session_history,
    input_messages_key="input",
    output_messages_key="answer",
    history_messages_key="chat_history",
)

In [37]:
# First question of session "Le S".
response_1 = conversational_rag_chain.invoke(
    input={"input": "Que raconte l'hirondelle et les petits oiseaux ?"},
    config={"configurable": {"session_id": "Le S"}},
)

print(response_1["answer"])

L'hirondelle prévient les petits oiseaux des dangers que représente la culture du chanvre, expliquant qu'ils seront pris au piège et attrapés par les hommes s'ils ne font rien. Les oiseaux ne la croient pas et se moquent d'elle. Plus tard, ils se rendent compte que l'hirondelle avait raison et décident de suivre ses conseils pour éviter les pièges des hommes.


In [38]:
# Second question, asked in the same session ("Le S") as the first one.
response_2 = conversational_rag_chain.invoke(
    input={"input": "Que raconte Jupiter et le passager ?"},
    config={"configurable": {"session_id": "Le S"}},
)

print(response_2["answer"])

Dans cette fable, le passager d'un navire en détresse fait un vœu à Jupiter en promettant de lui offrir 100 bœufs s'il est sauvé. Une fois sain et sauf, il brûle quelques os en guise de sacrifice et dit à Jupiter qu'il n'a plus rien à lui devoir. Jupiter feint de rire, mais envoie un songe au passager pour lui dire qu'un trésor l'attend à un certain endroit. Le passager court alors chercher le trésor, mais se fait voler et n'ayant plus qu'un écu, il leur promet cent talents d'or en échange du trésor.
