In [1]:
import os
from dotenv import load_dotenv, find_dotenv

from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.vectorstores import Chroma, FAISS
from langchain_pinecone import PineconeVectorStore
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_openai import AzureChatOpenAI
from langchain_experimental.text_splitter import SemanticChunker
from sentence_transformers import SentenceTransformer
from langchain.document_loaders.pdf import PyMuPDFLoader
from langchain.retrievers import MultiQueryRetriever
from typing import Dict, Any, List
import pinecone
from pinecone import ServerlessSpec

# Load environment variables from .env file
dotenv_path = find_dotenv()
if not dotenv_path:
    raise FileNotFoundError("Could not find .env file")
load_dotenv(dotenv_path)

# Pinecone settings
pinecone_api_key = os.getenv('PINECONE_API_KEY')
index_name = "vladch"

# Azure GPT-4 model parameters
azure_api_key = os.getenv('AZURE_API_KEY')
azure_api_version = os.getenv('AZURE_API_VERSION')
azure_deployment = os.getenv('AZURE_DEPLOYMENT')
azure_endpoint = os.getenv('AZURE_ENDPOINT')

# Check the environment before loading the embedding model, so that a
# misconfigured .env fails immediately, and report exactly which variables
# are absent (unset or empty) instead of a generic message.
_required_env = {
    'PINECONE_API_KEY': pinecone_api_key,
    'AZURE_API_KEY': azure_api_key,
    'AZURE_API_VERSION': azure_api_version,
    'AZURE_DEPLOYMENT': azure_deployment,
    'AZURE_ENDPOINT': azure_endpoint,
}
_missing_env = [name for name, value in _required_env.items() if not value]
if _missing_env:
    raise ValueError(
        "One or more environment variables are missing: " + ", ".join(_missing_env)
    )

# Initialize the SentenceTransformer model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Define an embedding function
# Vector stores check `isinstance(embedding, Embeddings)`; a plain class makes
# FAISS log "`embedding_function` is expected to be an Embeddings object" and
# fall back to calling the object like a function. Deriving from the LangChain
# base class removes that deprecated path. The guarded import only keeps this
# class usable as a duck-typed embedder where langchain_core is not installed.
try:
    from langchain_core.embeddings import Embeddings as _EmbeddingsBase
except ImportError:
    _EmbeddingsBase = object


class SentenceTransformerEmbeddings(_EmbeddingsBase):
    """Adapter exposing a SentenceTransformer-style model as LangChain embeddings.

    Args:
        model: Object with an ``encode(inputs, show_progress_bar=...)`` method
            whose result has a ``tolist()`` method.
        show_progress_bar: Forwarded to every ``encode`` call. Defaults to
            ``True`` (the previous hard-coded value); pass ``False`` to stop
            each query from printing a "Batches" progress line.
    """

    def __init__(self, model, show_progress_bar=True):
        self.model = model
        self.show_progress_bar = show_progress_bar

    def _encode(self, inputs):
        """Encode a string or list of strings and return plain Python lists."""
        return self.model.encode(inputs, show_progress_bar=self.show_progress_bar).tolist()

    def embed_documents(self, documents):
        """Return one embedding per entry of ``documents``."""
        return self._encode(documents)

    def embed_query(self, query):
        """Return the embedding of a single query string."""
        return self._encode(query)

    def __call__(self, text):
        """Kept so code that still calls the object as a function keeps working."""
        return self._encode(text)

embedding_function = SentenceTransformerEmbeddings(model)

# Paths to the directories containing the PDF files
directory_path_1 = r"E:\RepoFisiereMulte\FisiereChroma"
directory_path_2 = r"E:\RepoFisiereMulte\FisiereFAISS"
directory_path_3 = r"E:\RepoFisiereMulte\FisierePinecone"


def load_pdf_chunks(directory_path, embeddings):
    """Load every PDF in ``directory_path`` and split it into semantic chunks.

    Args:
        directory_path: Directory that is scanned (non-recursively) for PDFs.
        embeddings: Embedding object handed to ``SemanticChunker``.

    Returns:
        A ``(chunks, metadatas)`` pair of equal-length lists. ``chunks`` holds
        the split documents; ``metadatas[i]`` is the very same dict object as
        ``chunks[i].metadata``, extended with ``'chunk_id'`` (position of the
        chunk within its file) and ``'file'`` (the PDF's file name).
    """
    # Built once per directory rather than once per file; the configuration
    # is identical for every PDF.
    text_splitter = SemanticChunker(embeddings, breakpoint_threshold_type="standard_deviation")
    chunks = []
    metadatas = []
    # sorted(): os.listdir gives no ordering guarantee, so sort to make the
    # chunk order reproducible between runs.
    for filename in sorted(os.listdir(directory_path)):
        # Case-insensitive so that "REPORT.PDF" is not silently skipped.
        if not filename.lower().endswith(".pdf"):
            continue
        file_path = os.path.join(directory_path, filename)
        docs = PyMuPDFLoader(file_path=file_path).load()
        for i, split in enumerate(text_splitter.split_documents(docs)):
            split.metadata['chunk_id'] = i
            split.metadata['file'] = filename
            chunks.append(split)
            metadatas.append(split.metadata)
    return chunks, metadatas


# Load, chunk, and collect the contents of the PDF documents of each directory
all_texts_1, all_metadatas_1 = load_pdf_chunks(directory_path_1, embedding_function)
all_texts_2, all_metadatas_2 = load_pdf_chunks(directory_path_2, embedding_function)
all_texts_3, all_metadatas_3 = load_pdf_chunks(directory_path_3, embedding_function)

# Azure GPT-4 chat model shared by every retriever and chain below
llm = AzureChatOpenAI(
    azure_endpoint=azure_endpoint,
    api_key=azure_api_key,
    azure_deployment=azure_deployment,
    openai_api_version=azure_api_version,
)

# One vector store per document collection: Chroma, FAISS, and Pinecone
chroma_store = Chroma.from_texts(
    texts=[doc.page_content for doc in all_texts_1],
    embedding=embedding_function,
    metadatas=all_metadatas_1,
)
faiss_store = FAISS.from_texts(
    texts=[doc.page_content for doc in all_texts_2],
    embedding=embedding_function,
    metadatas=all_metadatas_2,
)
pinecone_store = PineconeVectorStore.from_documents(
    all_texts_3,
    index_name=index_name,
    embedding=embedding_function,
)

# Prompt that rewrites a follow-up question into a standalone one
contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

# Prompt that answers the question from the retrieved context
qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Use three sentences maximum and keep the answer concise.\

{context}"""
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)


def _assemble_rag_pipeline(vector_store):
    """Build the retrieval stack for one store.

    Returns the multi-query retriever, the history-aware retriever wrapping
    it, and the retrieval chain that feeds ``question_answer_chain``.
    """
    multi_query = MultiQueryRetriever.from_llm(
        retriever=vector_store.as_retriever(), llm=llm
    )
    history_aware = create_history_aware_retriever(
        llm, multi_query, contextualize_q_prompt
    )
    return multi_query, history_aware, create_retrieval_chain(history_aware, question_answer_chain)


(
    retriever_from_llm_chroma,
    history_aware_retriever_chroma,
    rag_chain_chroma,
) = _assemble_rag_pipeline(chroma_store)
(
    retriever_from_llm_faiss,
    history_aware_retriever_faiss,
    rag_chain_faiss,
) = _assemble_rag_pipeline(faiss_store)
(
    retriever_from_llm_pinecone,
    history_aware_retriever_pinecone,
    rag_chain_pinecone,
) = _assemble_rag_pipeline(pinecone_store)

# In-memory registry of chat histories, keyed by session id
store = {}

def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the chat history stored for ``session_id``.

    A new, empty ``ChatMessageHistory`` is created and registered in ``store``
    the first time a session id is seen.
    """
    try:
        return store[session_id]
    except KeyError:
        history = ChatMessageHistory()
        store[session_id] = history
        return history

# Keyword arguments shared by all three history-aware wrappers: which keys of
# the chain's input/output carry the question, the history, and the answer.
_history_wiring = {
    "input_messages_key": "input",
    "history_messages_key": "chat_history",
    "output_messages_key": "answer",
}

conversational_rag_chain_chroma = RunnableWithMessageHistory(
    rag_chain_chroma, get_session_history, **_history_wiring
)

conversational_rag_chain_faiss = RunnableWithMessageHistory(
    rag_chain_faiss, get_session_history, **_history_wiring
)

conversational_rag_chain_pinecone = RunnableWithMessageHistory(
    rag_chain_pinecone, get_session_history, **_history_wiring
)

def get_answer_and_sources(input_question, session_id="abc123"):
    """Ask the Chroma, FAISS, and Pinecone chains in turn; return the first usable answer.

    Args:
        input_question: The user's question.
        session_id: Key of the chat history shared by the three chains.

    Returns:
        ``(answer, context, sources)`` where ``context`` is the list of
        retrieved documents behind the answer and ``sources`` is a set of
        ``(file, page)`` tuples taken from those documents' metadata. When no
        store yields a usable answer the result is
        ``("I don't know the answer.", [], set())``.
    """
    def is_invalid_answer(answer):
        # Heuristic: phrases the model uses when the context does not help.
        return "I don't" in answer or "I'm sorry" in answer

    def sources_of(context):
        # Read file/page straight from the retrieved documents. Matching their
        # text against every indexed chunk, as was done before, reports
        # unrelated files whenever a retrieved chunk is empty or very short
        # (an empty string is a substring of every document).
        found = set()
        for doc in context:
            page = doc.metadata.get('page', 'Unknown')
            # Some stores hand numeric metadata back as floats (e.g. 3.0);
            # show whole-number pages as ints.
            if isinstance(page, float) and page.is_integer():
                page = int(page)
            found.add((doc.metadata.get('file', 'Unknown'), page))
        return found

    chains = (
        conversational_rag_chain_chroma,
        conversational_rag_chain_faiss,
        conversational_rag_chain_pinecone,
    )
    history = get_session_history(session_id)

    for chain in chains:
        checkpoint = len(history.messages)
        response = chain.invoke(
            {"input": input_question},
            config={"configurable": {"session_id": session_id}},
        )
        if not is_invalid_answer(response["answer"]):
            context = response["context"]
            return response["answer"], context, sources_of(context)
        # All chains share one session history. Drop the rejected exchange so
        # the next store (and later questions) do not see the question
        # repeated alongside an "I don't know" reply.
        del history.messages[checkpoint:]

    # If none of the stores have an answer
    return "I don't know the answer.", [], set()

# Function to format the context
def format_context(context: List[Any]) -> str:
    """Render retrieved documents as text, one "File ..., Page ...: content" entry each.

    Entries are separated by a blank line. A document without a ``'page'``
    metadata entry is shown with page ``Unknown``; ``'file'`` is required.
    """
    entries = []
    for doc in context:
        entries.append(
            f"File: {doc.metadata['file']}, Page {doc.metadata.get('page', 'Unknown')}: {doc.page_content}"
        )
    return "\n\n".join(entries)




Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

In [2]:
question = "What are the primary tasks performed by broadcast technicians during radio and television broadcasts?"
answer, context, sources = get_answer_and_sources(question)

# Format the output in a readable way. format_context is the notebook's shared
# helper; unlike an inline doc.metadata['page'] lookup it does not raise
# KeyError for a chunk that has no page number.
formatted_context = format_context(context)

output = f"""
Answer: {answer}

Context used for answer:
{formatted_context}
"""

print(output)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Answer: Broadcast technicians perform tasks such as controlling audio equipment to regulate volume and sound quality, monitoring the strength, clarity, and reliability of incoming and outgoing signals and adjusting equipment as necessary, and regulating the fidelity, brightness, and contrast of video transmissions using video console control panels. They also ensure that programs are ready for transmission, select sources for programming, report equipment problems, make emergency repairs when possible, record sound onto tape or film, and align antennae with receiving dishes for clear signal transmission.

Context used for answer:
File: 1_chroma.pdf, Page 0: Broadcast Technicians, page 1 of 3
A CareerZone Occupational Brief for:
Broadcast Technicians
An occupation in Engineering and Technologies
New York State Department of Labor
David A. Paterson, Governor
Job Description
Set up, operate, and maintain the electronic equipment used to transmit radio and television programs. Control
aud

In [3]:
question = "What are the specific authorities and responsibilities of the Inspector General within the Department of Justice?"
answer, context, sources = get_answer_and_sources(question)

# Format the output in a readable way. format_context is the notebook's shared
# helper; unlike an inline doc.metadata['page'] lookup it does not raise
# KeyError for a chunk that has no page number.
formatted_context = format_context(context)

output = f"""
Answer: {answer}

Context used for answer:
{formatted_context}
"""

print(output)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Answer: The Inspector General within the Department of Justice (DOJ) is authorized to conduct investigations and issue reports related to criminal wrongdoing and administrative misconduct of DOJ employees, as well as the administration of programs and operations. They can investigate complaints or information concerning illegal activities, mismanagement, gross waste of funds, abuse of authority, or substantial and specific dangers to public health and safety. The Inspector General has access to all necessary DOJ records and materials except in certain sensitive cases, can request information from any federal, state, or local agency, issue subpoenas for evidence, and employ necessary staff. Special Agents of the Office of the Inspector General are authorized to detect and assist in the prosecution of crimes, serve legal writs, arrest without warrant for certain offenses, seek and execute search and arrest warrants, carry firearms, and perform other law enforcement functions. The Inspec

In [4]:
question = "What were the forecasts for U.S. crude oil production in the fourth quarter of 2016?"
answer, context, sources = get_answer_and_sources(question)

# Format the output in a readable way. format_context is the notebook's shared
# helper; unlike an inline doc.metadata['page'] lookup it does not raise
# KeyError for a chunk that has no page number.
formatted_context = format_context(context)

output = f"""
Answer: {answer}

Context used for answer:
{formatted_context}
"""

print(output)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Answer: The forecasts for U.S. crude oil production in the fourth quarter of 2016, according to the retrieved context, were as follows:

- Q4 2016 Current Forecast: 8.75 million barrels per day
- Q4 2016 Previous Forecast: 8.68 million barrels per day

The percent change between the previous forecast and the current forecast for Q4 2016 was an increase of 0.7%.

Context used for answer:
File: 6_FAISS.pdf, Page 0: Current Forecast: December 6, 2016; Previous Forecast: November 8, 2016
Q1
Q2
Q3
Q4
Q1
Q2
Q3
Q4
Q1
Q2
Q3
Q4
2014
2015
2016
2017
2014-2015
2015-2016
2016-2017
U.S. Energy Supply
   U.S. Crude Oil Production (million barrels per day)
      Current
9.49
9.47
9.41
9.30
9.17
8.85
8.67
8.75
8.75
8.74
8.70
8.94
8.76
9.42
8.86
8.78
7.4%
-5.9%
-0.9%
      Previous
9.49
9.47
9.41
9.30
9.17
8.85
8.68
8.68
8.68
8.71
8.67
8.87
8.76
9.42
8.84
8.73
7.4%
-6.1%
-1.3%
         Percent Change
0.0%
0.0%
0.0%
0.0%
0.0%
0.0%
-0.1%
0.7%
0.7%
0.4%
0.3%
0.8%
0.0%
0.0%
0.2%
0.6%
   U.S. Dry Natural Ga

In [5]:
question = "How did the number of Chapter 11 filings in Connecticut change between 1994 and 2004?"
answer, context, sources = get_answer_and_sources(question)

# Format the output in a readable way. format_context is the notebook's shared
# helper; unlike an inline doc.metadata['page'] lookup it does not raise
# KeyError for a chunk that has no page number.
formatted_context = format_context(context)

output = f"""
Answer: {answer}

Context used for answer:
{formatted_context}
"""

print(output)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Answer: The retrieved context indicates a decrease in the number of Chapter 11 filings in Connecticut between 1994 and 2004. In 1994, there were 216 filings, and by 2004, the number had dropped to 84 filings.

Context used for answer:
File: 7_FAISS.pdf, Page 0: 0
50
100
150
200
250
CHAPTER 11 FILINGS
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
CALENDAR YEAR
216
178
166
134
97
90
79
88
125
103
84
CHAPTER 11 FILINGS IN CONNECTICUT
CALENDAR YEARS   1994 --  2004


File: 9_FAISS.pdf, Page 0: UNITED STATES BANKRUPTCY COURT FOR THE DISTRICT OF ALASKA
Monday                  
Historic Courtroom
April 18, 2016            
605 West Fourth Avenue
Page 1                                                  Anchorage, Alaska
CALENDAR OF BANKRUPTCY JUDGE GARY SPRAKER
------------------------------------------------------------------------
  TIME      
  CASE/ADVERSARY NUMBER, NAME, and CHAPTER
                       TYPE OF PROCEEDING and COUNSEL
1:45 p.m. Case No. A14-00065-GS, In re DAVID

In [6]:
question = "What are the key provisions and political challenges faced by the MORE Act introduced by Rep. Ken Calvert?"
answer, context, sources = get_answer_and_sources(question)

# Format the output in a readable way. format_context is the notebook's shared
# helper; unlike an inline doc.metadata['page'] lookup it does not raise
# KeyError for a chunk that has no page number.
formatted_context = format_context(context)

output = f"""
Answer: {answer}

Context used for answer:
{formatted_context}
"""

print(output)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Answer: The MORE Act introduced by Representative Ken Calvert, which stands for Maximize Offshore Resource Exploration, is focused on offshore drilling, not marijuana legalization. The key provisions of the MORE Act introduced by Calvert include revoking the congressional moratorium on offshore drilling, allowing drilling starting 25 miles from the coastline, and enabling states to receive a significant percentage of royalties from offshore drilling leases, up to 75% or 90% if they agree to drilling within 25 miles of their coastline.

The political challenges faced by this MORE Act relate to environmental concerns, opposition from Democrats and certain coastal communities who worry about the potential risks of offshore drilling to marine ecosystems and tourism, and debates over the energy policy and priorities of the United States. At the time of the bill's introduction, no Democrats had signed on to it, which indicates partisan challenges. Additionally, Representative Calvert's bill