###  Import Libraries

In [1]:
import PyPDF2
from langchain_community.embeddings import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ChatMessageHistory, ConversationBufferMemory
from langchain_groq import ChatGroq
from dotenv import load_dotenv
import os
import asyncio


###  Load Environment Variables

In [2]:
# Loading environment variables from .env file
load_dotenv() 
groq_api_key = os.getenv('GROQ_API_KEY', 'your-default-api-key')  # Replace 'your-default-api-key' with the actual key if not using .env file


### Initialize GROQ Chat

In [3]:
# Initializing GROQ chat with provided API key, model name, and settings
llm_groq = ChatGroq(
    groq_api_key=groq_api_key,
    model_name="llama3-8b-8192",  # Options: mixtral-8x7b-32768, llama3-70b-8192, gemma-7b-it, llama3-8b-8192
    temperature=0.2
)


### Function to Handle File Uploads and Text Extraction

In [4]:
def handle_file_uploads(file_paths):
    texts = []
    metadatas = []

    for file_path in file_paths:
        print(file_path)  # Print the file path for debugging

        # Read the PDF file
        with open(file_path, 'rb') as file:
            pdf = PyPDF2.PdfReader(file)
            pdf_text = ""
            for page in pdf.pages:
                pdf_text += page.extract_text()
                
            # Split the text into chunks
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=50)
            file_texts = text_splitter.split_text(pdf_text)
            texts.extend(file_texts)

            # Create metadata for each chunk
            file_metadatas = [{"source": f"{i}-{os.path.basename(file_path)}"} for i in range(len(file_texts))]
            metadatas.extend(file_metadatas)

    return texts, metadatas


### Create Chroma Vector Store

In [5]:
def create_chroma_vector_store(texts, metadatas):
    embeddings = OllamaEmbeddings(model="nomic-embed-text")
    docsearch = Chroma.from_texts(texts, embeddings, metadatas=metadatas)
    return docsearch


### Initialize Conversation Chain

In [6]:
def initialize_conversation_chain(docsearch):
    message_history = ChatMessageHistory()
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        output_key="answer",
        chat_memory=message_history,
        return_messages=True,
    )

    chain = ConversationalRetrievalChain.from_llm(
        llm=llm_groq,
        chain_type="stuff",
        retriever=docsearch.as_retriever(),
        memory=memory,
        return_source_documents=True,
    )
    return chain


### Upload Files and Process Them

In [8]:
# Manually specify file paths for the PDFs to be uploaded
dir_path='/home/phdcs2/Hard_Disk/Projects/chatgpt/Multi-PDF-llama3Chat/file/'
file1=dir_path + 'dhUfQAsllb.pdf'
file2=dir_path + 'z1GMOTZvU1.pdf'
file_paths = [file1, file2]  # Replace with your file paths

# Process uploaded files
texts, metadatas = handle_file_uploads(file_paths)

# Create Chroma vector store
docsearch = create_chroma_vector_store(texts, metadatas)

# Initialize conversation chain
chain = initialize_conversation_chain(docsearch)


/home/phdcs2/Hard_Disk/Projects/chatgpt/Multi-PDF-llama3Chat/file/dhUfQAsllb.pdf
/home/phdcs2/Hard_Disk/Projects/chatgpt/Multi-PDF-llama3Chat/file/z1GMOTZvU1.pdf


### Handle User Queries

In [9]:
def handle_query(query, chain):
    # Call the chain with the user's query
    res = chain(query)
    answer = res["answer"]
    source_documents = res["source_documents"] 

    text_elements = []  # Initialize list to store text elements
    
    # Process source documents if available
    if source_documents:
        for source_idx, source_doc in enumerate(source_documents):
            source_name = f"source_{source_idx}"
            # Create the text element referenced in the message
            text_elements.append(source_doc.page_content)
    
    # Return answer and sources
    return answer, text_elements


### Simulate User Query

In [11]:
# Example user query
query = "What is the total bond amount enchased by TELUGU DESAM PARTY on 12th April 2019??"

# Handle the query
answer, sources = handle_query(query, chain)

print("Answer:", answer)
# print("Sources:", sources)


Answer: I don't have enough information to answer that question. The given data only includes transactions from 15th April 2019 onwards, and there is no transaction on 12th April 2019.


In [12]:
# Example user query
query = "What is the total bond amount purchased by CHOUDHARY GARMENTS on 12th April 2019?"

# Handle the query
answer, sources = handle_query(query, chain)

print("Answer:", answer)

Answer: I apologize, but there is no record of CHOUDHARY GARMENTS in the provided data. Therefore, I cannot determine the total bond amount purchased by CHOUDHARY GARMENTS on 12th April 2019 as it does not exist in the data.


In [13]:

# Example user query
query = "What is the total number of bonds purchased by ACROPOLIS MAINTENANCE SERVICES PRIVATE LIMITED on 12th April 2019?"

# Handle the query
answer, sources = handle_query(query, chain)

print("Answer:", answer)

Answer: I apologize, but there is no record of ACROPOLIS MAINTENANCE SERVICES PRIVATE LIMITED purchasing bonds on 12th April 2019 or any other date in the provided data. The data only shows transactions for other companies and individuals, but not for ACROPOLIS MAINTENANCE SERVICES PRIVATE LIMITED.


In [14]:

# Example user query
query = "What is the total amount received by AAM AADMI PARTY from DR. MANDEEP SHARMA in the year 2022?"

# Handle the query
answer, sources = handle_query(query, chain)

print("Answer:", answer)

Answer: I can help you with that!

After reviewing the provided data, I found the following transactions related to AAM AADMI PARTY and DR. MANDEEP SHARMA:

1. 11915-11919: 6 transactions with a total amount of 6 x 1,00,00,000 = 6,00,00,000
2. 11920-11927: 8 transactions with a total amount of 8 x 10,00,000 = 80,00,000
3. 11579-11591: 13 transactions with a total amount of 13 x 10,00,000 = 1,30,00,000
4. 11559-11562: 4 transactions with a total amount of 4 x 1,00,00,000 = 4,00,00,000
5. 13164: 1 transaction with a total amount of 1,00,00,000

Adding up these amounts, the total amount received by AAM AADMI PARTY from DR. MANDEEP SHARMA in the year 2022 is:

6,00,00,000 + 80,00,000 + 1,30,00,000 + 4,00,00,000 + 1,00,00,000 = 12,50,00,000

So, the total amount received by AAM AADMI PARTY from DR. MANDEEP SHARMA in the year 2022 is 12,50,00,000.


In [74]:
import pdfplumber

def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        full_text = ""
        for page in pdf.pages:
            full_text += page.extract_text() + "\n"
    return full_text

def generate_few_shot_prompt(pdf_text):
    prompt = f"""
I have a PDF document containing details about the encashments of bonds by political parties. I need to extract specific information from this PDF. Here are some examples of the information I need:

### Example 1:
PDF Text:
"Date of Encashment: 2023-06-15
Name of the Political Party: Party A
Account Number: 123456789
Bond Number: ABC123
Denominations: 1000
Pay Branch Code: 001
Pay Teller: John Doe"

Extracted Information:
- Date of Encashment: 2023-06-15
- Name of the Political Party: Party A
- Account Number: 123456789
- Bond Number: ABC123
- Denominations: 1000


### Example 2:
PDF Text:
"Date of Encashment: 2023-07-01
Name of the Political Party: Party B
Account Number: 987654321
Bond Number: XYZ456
Denominations: 5000

Extracted Information:
- Date of Encashment: 2023-07-01
- Name of the Political Party: Party B
- Account Number: 987654321
- Bond Number: XYZ456
- Denominations: 5000


### Task:
Given the text from the PDF, extract the details in the same format as the examples above.

PDF Text:
"{pdf_text}"

Extracted Information:
- Date of Encashment:
- Name of the Political Party:
- Account Number:
- Bond Number:
- Denominations:
Question:
What is the total bond amount enchased by TELUGU DESAM PARTY on April 2019?
"""
    return prompt
# Example usage
pdf_path = '/home/phdcs2/Hard_Disk/Projects/chatgpt/Multi-PDF-llama3Chat/file/dhUfQAsllb.pdf'
pdf_text = extract_text_from_pdf(pdf_path)
few_shot_prompt = generate_few_shot_prompt(pdf_text)
# print(few_shot_prompt)

In [None]:
import pdfplumber

def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        full_text = ""
        for page in pdf.pages:
            full_text += page.extract_text() + "\n"
    return full_text

In [76]:
def generate_few_shot_prompt(pdf_text):
    prompt = f"""
I have a PDF document containing details about the encashments of bonds by political parties. Answer the Question using examples provided:

### Example 1:
PDF Text:
"Date of Encashment: 2023-06-15
Name of the Political Party: Party A
Account Number: 123456789
Bond Number: ABC123
Denominations: 1000
Pay Branch Code: 001
Pay Teller: John Doe"


### Example 2:
PDF Text:
"Date of Encashment: 2023-07-01
Name of the Political Party: Party B
Account Number: 987654321
Bond Number: XYZ456
Denominations: 5000

Question:
What is the total bond amount enchased by TELUGU DESAM PARTY on April 2019?
"""
    return prompt
# Example usage
pdf_path = '/home/phdcs2/Hard_Disk/Projects/chatgpt/Multi-PDF-llama3Chat/file/dhUfQAsllb.pdf'
# pdf_text = extract_text_from_pdf(pdf_path)
few_shot_prompt = generate_few_shot_prompt(pdf_text)
# print(few_shot_prompt)

In [77]:
query = f""" "{few_shot_prompt}"
Question:
What is the total bond amount enchased by TELUGU DESAM PARTY on April 2019?"""

# Handle the query`
answer, sources = handle_query(query, chain)

print("Answer:", answer)

Answer: Based on the provided data, the total bond amount enchased by TELUGU DESAM PARTY on April 2019 is:

750: 1,00,00,000
751: 10,00,000
752: 10,00,000
753: 10,00,000
754: 10,00,000
755: 10,00,000
756: 10,00,000
757: 10,00,000
758: 10,00,000
759: 10,00,000
760: 10,00,000

Total bond amount enchased by TELUGU DESAM PARTY on April 2019 is: 1,00,00,000 + 10,00,000 x 10 = 1,10,00,000


In [80]:
query = f""" "{few_shot_prompt}"
Question:
What is the total bond amount enchased by TELUGU DESAM PARTY on April 2019?"""

# Handle the query`
answer, sources = handle_query(query, chain)

print("Answer:", answer)

BadRequestError: Error code: 400 - {'error': {'message': 'Please reduce the length of the messages or completion.', 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}

In [None]:
# Ignore Issue Branch Code, Pay Teller, Issue Teller, Account Number of Political Party.
# Example user query
query = f"""Using the below information answer the given question.
Information:
Total bonds encased by party list:
AAP : 500K
BJP : 1000K
TELUGU DESAM PARTY : 50K
Question:
What is the total bond amount enchased by TELUGU DESAM PARTY on April 2019?"""

# Handle the query
answer, sources = handle_query(query, chain)

print("Answer:", answer)

In [68]:
query='What is the total bond amount enchased by TELUGU DESAM PARTY on April 2019?'

In [67]:
def FewShotPrompt(test_entry, exemplars):
    messages = [{"role": "system", "content": "Read the math problem and write the answer."}]
    for entry in exemplars:
        messages.append({"role": "user", "content": "[question] " + entry['question']})
        messages.append({"role": "assistant", "content": "[answer] " + entry['numerical_answer']})
    messages.append({"role": "user", "content": "[question] " + test_entry['question']})

    # messages = [{"role": "system", "content": "MATH PROBLEM INSTRUCTIONS"}]
    # for entry in exemplars:
    #     messages.append({"role": "user", "content": "QUESTION HERE"})
    #     messages.append({"role": "assistant", "content": "ANSWER HERE"})
    # messages.append({"role": "user", "content": "QUESTION HERE"})

    return messages

In [70]:
message=FewShotPrompt(query, few_shot_prompt)

TypeError: string indices must be integers

In [None]:
results = []
for entry in tqdm(test_dataset):
    messages = few_shot_prompt(entry, exemplars)
    ret = run_groq_model(messages, "llama3-8b-8192", max_tokens=16, temperature=0.0)
    results.append(ret)

In [42]:
# Ignore Issue Branch Code, Pay Teller, Issue Teller, Account Number of Political Party.
# Example user query
query = f"""Using the below information answer the given question.
Information:
Total bonds encased by party list:
AAP : 500K
BJP : 1000K
TELUGU DESAM PARTY : 50K
Question:
What is the total bond amount enchased by TELUGU DESAM PARTY on April 2019?"""

# Handle the query
answer, sources = handle_query(query, chain)

print("Answer:", answer)

Answer: Based on the provided data, the total bond amount enchased by TELUGU DESAM PARTY on April 2019 is:

* 6666 (from record 750)
* 12957 (from record 751)
* 12965 (from record 752)
* 12848 (from record 753)
* 12967 (from record 754)
* 12955 (from record 755)
* 12850 (from record 756)
* 12959 (from record 757)
* 12963 (from record 758)
* 12961 (from record 759)
* 12969 (from record 760)

The total bond amount is:
6666 + 12957 + 12965 + 12848 + 12967 + 12955 + 12850 + 12959 + 12963 + 12961 + 12969 = 1,00,00,000

So, the total bond amount enchased by TELUGU DESAM PARTY on April 2019 is 1,00,00,000.


In [49]:
# Ignore Issue Branch Code, Pay Teller, Issue Teller, Account Number of Political Party.
# Example user query
query = f""" Using the below Information answer the given Question .
Information:
Denominations by party list enchased on a specific date.
Total bound encased by party list:
SHIVSENA : select denominations from file 
BHARATIYA JANATA PARTY : select denominations from file
ALL INDIA TRINAMOOL CONGRESS : select denominations from file
Question:
What is the total bond amount enchased by TELUGU DESAM PARTY between 17 to 20 April 2019"""

# Handle the query
answer, sources = handle_query(query, chain)
print("Answer:", answer)

Answer: I can help you with that!

According to the given data, the TELUGU DESAM PARTY has the following bond amounts enchased between 17 to 20 April 2019:

* 14622: 1,00,00,000
* 14623: 1,00,00,000
* 14624: 1,00,00,000
* 14625: 1,00,00,000
* 14626: 1,00,00,000
* 14627: 1,00,00,000
* 14628: 1,00,00,000
* 14629: 1,00,00,000
* 14630: 1,00,00,000
* 14631: 1,00,00,000
* 14632: 1,00,00,000
* 14633: 1,00,00,000
* 14634: 1,00,00,000
* 14635: 1,00,00,000

The total bond amount enchased by TELUGU DESAM PARTY between 17 to 20 April 2019 is:

1,00,00,000 + 1,00,00,000 + ... + 1,00,00,000 = 14,00,00,000

So, the total bond amount enchased by TELUGU DESAM PARTY between 17 to 20 April 2019 is 14,00,00,000.


In [62]:
query = f""" Using the below Information answer the given Question .
Information:
on 12/Apr/2019 Bhartiya Janta Party enchased total money : Sum of all Denominations on the 12/Apr/2019 for Bhartiya Janta PARTY 
on 16/Dec/2022 YSR CONGRESS PARTY enchased total money : Sum of all Denominations on the 12/Dec/2022 for YSR CONGRESS PARTY
Question:
What is the total bond amount enchased by TELUGU DESAM PARTY on 12th April 2019??"""

# Handle the query
answer, sources = handle_query(query, chain)
print("Answer:", answer)

Answer: I don't have enough information to answer that question. The provided data only includes transactions from 15th April 2019 onwards, and there is no record of a transaction on 12th April 2019.


In [63]:
query = f""" Using the below Information answer the given Question .
Information:
Q1: total bond amount enchased by ALL INDIA ANNA DRAVIDA MUNNETRA KAZHAGAM on 12th April 2019?

R1: make sum of all Denominations on the given date for ALL INDIA ANNA DRAVIDA MUNNETRA KAZHAGAM

30*10,00,000 + 3 * 1,00,00,000 = 6,00,00,000

Q2: total bond amount enchased by BHARATIYA JANATA PARTY on 12th April 2019?

R2: make sum of all Denominations on the given date for BHARATIYA JANATA PARTY

402 * denominations

Q3: total bond amount enchased by BHARAT RASHTRA SAMITHI on 12th April 2019?

R3: make sum of all Denominations on the given date for BHARAT RASHTRA SAMITHI

26 * Denominations

Question:
What is the total bond amount enchased by TELUGU DESAM PARTY on 12th April 2019??"""

# Handle the query
answer, sources = handle_query(query, chain)
print("Answer:", answer)

Answer: I don't have enough information to answer that question. The provided data only includes transactions from 15th April 2019 onwards, and there is no record of a transaction on 12th April 2019.


### Get The final Answer

In [18]:
def get_final_answer(entry):
    entry['numerical_answer'] = entry['answer'].split('\n')[-1].split(' ', 1)[-1].strip()

    return entry

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


### Compute Accuracy

In [19]:
def compute_accuracy(golds, preds):
    cnt = 0
    for gg, pp in zip(golds, preds):
        gg = float(gg.replace(',', ''))
        try:
            pp = float(pp.replace(',', ''))
        except:
            pp = None

        if gg == pp:
            cnt += 1
    return cnt / len(golds)

## Few Shot Prompts

In [None]:
exemplars={'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
 'answer': 'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72',
 'numerical_answer': '72',
 'code': 'Here is the step-by-step Python code for the given problem.\n```# Clips sold in April\nclips_april = 48\n\n# Clips sold in May (half of April)\nclips_may = clips_april / 2\n\n# Total clips sold\ntotal_clips = clips_april + clips_may\nprint("FINAL ANSWER:", total_clips)\n```\nThe final answer is "total_clip" '}

In [None]:
def few_shot_prompt(test_entry, exemplars):
    messages = [{"role": "system", "content": "MATH PROBLEM INSTRUCTIONS"}]
    for entry in exemplars:
        messages.append({"role": "user", "content": "QUESTION HERE"})
        messages.append({"role": "assistant", "content": "ANSWER HERE"})
    messages.append({"role": "user", "content": "QUESTION HERE"})

    return messages

In [None]:
def few_shot_prompt(test_entry, exemplars):
    messages = [{"role": "system", "content": "Read the Question and write the answer."}]
    for entry in exemplars:
        messages.append({"role": "user", "content": "[question] " + entry['question']})
        messages.append({"role": "assistant", "content": "[answer] " + entry['numerical_answer']})
    messages.append({"role": "user", "content": "[question] " + test_entry['question']})

    # messages = [{"role": "system", "content": "MATH PROBLEM INSTRUCTIONS"}]
    # for entry in exemplars:
    #     messages.append({"role": "user", "content": "QUESTION HERE"})
    #     messages.append({"role": "assistant", "content": "ANSWER HERE"})
    # messages.append({"role": "user", "content": "QUESTION HERE"})

    return messages