###  Import Libraries

In [1]:
import PyPDF2
from langchain_community.embeddings import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ChatMessageHistory, ConversationBufferMemory
from langchain_groq import ChatGroq
from dotenv import load_dotenv
import os
import asyncio


###  Load Environment Variables

In [2]:
# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# Read the key from the environment only. A placeholder fallback would let the
# notebook continue with an unusable key and fail later at the first API call,
# and it invites pasting a real secret into the notebook source.
groq_api_key = os.getenv('GROQ_API_KEY')
if not groq_api_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set. Define it in a .env file or export it in the "
        "environment before running this notebook."
    )


### Initialize GROQ Chat

In [3]:
# Settings for the shared Groq chat client.
# Model options noted by the author: mixtral-8x7b-32768, llama3-70b-8192, gemma-7b-it, llama3-8b-8192
_groq_settings = {
    "groq_api_key": groq_api_key,
    "model_name": "llama3-8b-8192",
    "temperature": 0.2,
}

# Build the client once; the conversation chain defined later reads this global.
llm_groq = ChatGroq(**_groq_settings)


### Function to Handle File Uploads and Text Extraction

In [4]:
def handle_file_uploads(file_paths, chunk_size=1200, chunk_overlap=50):
    """Extract the text of each PDF and split it into chunks with source metadata.

    Args:
        file_paths: Iterable of paths to PDF files; each is opened in binary mode.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to ``RecursiveCharacterTextSplitter``.

    Returns:
        A ``(texts, metadatas)`` tuple of parallel lists. ``texts`` holds the
        chunks of every file in input order; ``metadatas[k]`` is
        ``{"source": "<chunk index within its file>-<file base name>"}`` for
        ``texts[k]``.
    """
    texts = []
    metadatas = []

    # The splitter does not depend on the file, so build it once for all files.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    for file_path in file_paths:
        print(file_path)  # Print the file path for debugging

        # Pages are read while the file is still open; the handle is released
        # before the text is split.
        with open(file_path, 'rb') as file:
            pdf = PyPDF2.PdfReader(file)
            # `or ""` keeps a page that yields no text (None/empty) from
            # breaking the concatenation; join avoids repeated string copies.
            pdf_text = "".join(page.extract_text() or "" for page in pdf.pages)

        # Split the text into chunks
        file_texts = text_splitter.split_text(pdf_text)
        texts.extend(file_texts)

        # One metadata entry per chunk, numbered within this file
        source_name = os.path.basename(file_path)
        metadatas.extend({"source": f"{i}-{source_name}"} for i in range(len(file_texts)))

    return texts, metadatas


### Create Chroma Vector Store

In [5]:
def create_chroma_vector_store(texts, metadatas):
    """Embed the text chunks and load them into a Chroma vector store.

    Args:
        texts: List of text chunks to index.
        metadatas: List of metadata dicts, passed to Chroma alongside ``texts``.

    Returns:
        The store returned by ``Chroma.from_texts``, built with
        ``OllamaEmbeddings`` using the "nomic-embed-text" model.
    """
    embedder = OllamaEmbeddings(model="nomic-embed-text")
    return Chroma.from_texts(texts, embedder, metadatas=metadatas)


### Initialize Conversation Chain

In [6]:
def initialize_conversation_chain(docsearch):
    """Build a conversational retrieval chain over the given vector store.

    Args:
        docsearch: Vector store; its ``as_retriever()`` supplies the retriever.

    Returns:
        The chain produced by ``ConversationalRetrievalChain.from_llm`` using
        the module-level ``llm_groq`` client, the "stuff" chain type, a fresh
        buffer memory, and ``return_source_documents=True``.
    """
    retriever = docsearch.as_retriever()

    # Fresh, empty history for every chain; the memory is configured with the
    # "chat_history" memory key and the "answer" output key.
    conversation_memory = ConversationBufferMemory(
        chat_memory=ChatMessageHistory(),
        memory_key="chat_history",
        output_key="answer",
        return_messages=True,
    )

    return ConversationalRetrievalChain.from_llm(
        llm=llm_groq,
        retriever=retriever,
        chain_type="stuff",
        memory=conversation_memory,
        return_source_documents=True,
    )


### Upload Files and Process Them

In [8]:
# Location of the input PDFs (replace with your own directory and file names)
dir_path = '/home/phdcs2/Hard_Disk/Projects/chatgpt/Multi-PDF-llama3Chat/file/'
file1, file2 = (dir_path + pdf_name for pdf_name in ('dhUfQAsllb.pdf', 'z1GMOTZvU1.pdf'))
file_paths = [file1, file2]

# Step 1: extract and chunk the PDF text
texts, metadatas = handle_file_uploads(file_paths)

# Step 2: embed the chunks into a Chroma vector store
docsearch = create_chroma_vector_store(texts, metadatas)

# Step 3: wire the store into a conversational retrieval chain
chain = initialize_conversation_chain(docsearch)


/home/phdcs2/Hard_Disk/Projects/chatgpt/Multi-PDF-llama3Chat/file/dhUfQAsllb.pdf
/home/phdcs2/Hard_Disk/Projects/chatgpt/Multi-PDF-llama3Chat/file/z1GMOTZvU1.pdf


### Handle User Queries

In [9]:
def handle_query(query, chain):
    """Run a query through the chain and return the answer with its source texts.

    Args:
        query: The user's question, passed to the chain unchanged.
        chain: Object exposing ``invoke(query)``, or a plain callable, that
            returns a mapping with an ``"answer"`` key and optionally a
            ``"source_documents"`` list of objects with ``page_content``.

    Returns:
        A ``(answer, text_elements)`` tuple, where ``text_elements`` is the list
        of ``page_content`` strings of the returned source documents (empty
        when the chain returns none).
    """
    # Prefer `invoke`: calling a LangChain chain directly is the deprecated
    # form. Plain callables without `invoke` are still called as before.
    run = getattr(chain, "invoke", chain)
    res = run(query)
    answer = res["answer"]

    # A missing or empty "source_documents" entry yields an empty list instead
    # of raising KeyError.
    source_documents = res.get("source_documents") or []
    text_elements = [source_doc.page_content for source_doc in source_documents]

    # Return answer and sources
    return answer, text_elements


### Simulate User Query

In [11]:
# Question for the chain
query = "What is the total bond amount enchased by TELUGU DESAM PARTY on 12th April 2019??"

# `sources` receives the page contents of the retrieved source documents
answer, sources = handle_query(query, chain)

print(f"Answer: {answer}")


Answer: I don't have enough information to answer that question. The given data only includes transactions from 15th April 2019 onwards, and there is no transaction on 12th April 2019.


In [12]:
# Question for the chain
query = "What is the total bond amount purchased by CHOUDHARY GARMENTS on 12th April 2019?"

# Run the question through the retrieval chain
answer, sources = handle_query(query, chain)

print(f"Answer: {answer}")

Answer: I apologize, but there is no record of CHOUDHARY GARMENTS in the provided data. Therefore, I cannot determine the total bond amount purchased by CHOUDHARY GARMENTS on 12th April 2019 as it does not exist in the data.


In [13]:

# Question for the chain
query = "What is the total number of bonds purchased by ACROPOLIS MAINTENANCE SERVICES PRIVATE LIMITED on 12th April 2019?"

# Run the question through the retrieval chain
answer, sources = handle_query(query, chain)

print(f"Answer: {answer}")

Answer: I apologize, but there is no record of ACROPOLIS MAINTENANCE SERVICES PRIVATE LIMITED purchasing bonds on 12th April 2019 or any other date in the provided data. The data only shows transactions for other companies and individuals, but not for ACROPOLIS MAINTENANCE SERVICES PRIVATE LIMITED.


In [14]:

# Question for the chain
query = "What is the total amount received by AAM AADMI PARTY from DR. MANDEEP SHARMA in the year 2022?"

# Run the question through the retrieval chain
answer, sources = handle_query(query, chain)

print(f"Answer: {answer}")

Answer: I can help you with that!

After reviewing the provided data, I found the following transactions related to AAM AADMI PARTY and DR. MANDEEP SHARMA:

1. 11915-11919: 6 transactions with a total amount of 6 x 1,00,00,000 = 6,00,00,000
2. 11920-11927: 8 transactions with a total amount of 8 x 10,00,000 = 80,00,000
3. 11579-11591: 13 transactions with a total amount of 13 x 10,00,000 = 1,30,00,000
4. 11559-11562: 4 transactions with a total amount of 4 x 1,00,00,000 = 4,00,00,000
5. 13164: 1 transaction with a total amount of 1,00,00,000

Adding up these amounts, the total amount received by AAM AADMI PARTY from DR. MANDEEP SHARMA in the year 2022 is:

6,00,00,000 + 80,00,000 + 1,30,00,000 + 4,00,00,000 + 1,00,00,000 = 12,50,00,000

So, the total amount received by AAM AADMI PARTY from DR. MANDEEP SHARMA in the year 2022 is 12,50,00,000.


In [16]:

# Example user query that supplies its own context ahead of the question.
# Plain triple-quoted string: there are no placeholders, so no f-prefix is needed.
query = """ Using the below information answer the given question.
Information:
Total bound encased by party list:
AAP : 500K
BJP : 1000K
TELUGU DESAM PARTY : 50K

Question:
What is the total bond amount enchased by TELUGU DESAM PARTY on 12th April 2019"""

# Handle the query
answer, sources = handle_query(query, chain)

print("Answer:", answer)

Answer: To find the total bond amount enchased by TELUGU DESAM PARTY, I'll go through the given data and add up the bond amounts.

Here are the bond amounts enchased by TELUGU DESAM PARTY:

* 14622-14630: 1,00,00,000 (10,00,000 x 10)
* 761-764: 1,00,00,000 (10,00,000 x 10)
* 19687-19694: 1,00,00,000 (10,00,000 x 10)
* 5562-5567: 1,00,00,000 (10,00,000 x 10)

Adding up these amounts, the total bond amount enchased by TELUGU DESAM PARTY is:

10,00,000 x 40 = 40,00,00,000

So, the total bond amount enchased by TELUGU DESAM PARTY is 40,00,00,000.
