In [3]:
import pdfplumber
import pandas as pd
from llama_index.core import VectorStoreIndex, Document
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load PDF and extract text and tables
pdf_path = "7df4dbdc-eb62-4d53-bc27-d334bfcb2335.pdf"
text = ""
tables = []

with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        text += page.extract_text()
        page_tables = page.extract_tables()
        for table in page_tables:
            df = pd.DataFrame(table[1:], columns=table[0])
            tables.append(df)

# Inspect DataFrames
for i, df in enumerate(tables):
    print(f"DataFrame {i} columns:\n{df.columns}\n")
    print(f"DataFrame {i} head:\n{df.head()}\n")

# Handle duplicate columns
for i, df in enumerate(tables):
    if df.columns.duplicated().any():
        print(f"DataFrame {i} has duplicate columns: {df.columns[df.columns.duplicated()].unique()}")
        # Rename duplicate columns
        df.columns = [f"{col}_{j}" if duplicated else col
                      for j, (col, duplicated) in enumerate(zip(df.columns, df.columns.duplicated()))]
        tables[i] = df

# Align columns
all_columns = set()
for df in tables:
    all_columns.update(df.columns)

for i, df in enumerate(tables):
    tables[i] = df.reindex(columns=all_columns)

# Remove duplicates if necessary
tables = [df.drop_duplicates() for df in tables]

# Concatenate DataFrames
try:
    combined_table = pd.concat(tables, ignore_index=True)
    print(combined_table.head())
except Exception as e:
    print(f"Error during concatenation: {e}")

# Split text into chunks
max_chunk_size = 2000  # Smaller chunks for batch processing
chunks = [text[i:i + max_chunk_size] for i in range(0, len(text), max_chunk_size)]

# Load quantized model and tokenizer
model_name = "facebook/bart-base"  # Smaller model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Summarization function
def summarize_text(text):
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(inputs['input_ids'], max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Summarize each chunk
batch_size = 4
summaries = []
for i in range(0, len(chunks), batch_size):
    batch_chunks = chunks[i:i + batch_size]
    batch_summaries = [summarize_text(chunk) for chunk in batch_chunks]
    summaries.extend(batch_summaries)

# Combine summaries
final_summary = " ".join(summaries)
print(final_summary)

# Convert tables to string format
def tables_to_string(tables):
    table_strings = []
    for table in tables:
        table_strings.append(table.to_string(index=False))
    return "\n\n".join(table_strings)

# Convert combined table to string
table_string = tables_to_string(tables)

# Create documents from text chunks and table string
documents = [Document(text=chunk) for chunk in chunks] + [Document(text=table_string)]

# Initialize VectorStoreIndex
index = VectorStoreIndex.from_documents(documents)

# Function to perform QnA
def ask_question(question):
    response = index.query(question)
    return response

# Example question
question = "What is the net income for NVIDIA for the three months ended October 29, 2023?"
answer = ask_question(question)
print(answer)

DataFrame 0 columns:
Index(['Revenue', '$', '18,120', '', '$', '5,931', '', '$', '38,819', '', '$',
       '20,923'],
      dtype='object')

DataFrame 0 head:
                             Revenue       $ 18,120        $ 5,931         $  \
0                    Cost of revenue   4,720   None    2,754  None    11,309   
1                       Gross profit  13,400   None    3,177  None    27,510   
2                 Operating expenses           None           None             
3           Research and development   2,294   None    1,945  None     6,210   
4  Sales, general and administrative     689   None      631  None     1,942   

  38,819         $ 20,923  
0   None     9,400   None  
1   None    11,523   None  
2   None             None  
3   None     5,387   None  
4   None     1,815   None  

DataFrame 1 columns:
Index(['Net income', '$', '9,243', '', '$', '680', '', '$', '17,475', '', '$',
       '2,954'],
      dtype='object')

DataFrame 1 head:
                                 

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [2]:
import pdfplumber
import pandas as pd
from llama_index.core import VectorStoreIndex, Document
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
from multiprocessing import Pool, cpu_count

# Helper function to process a single PDF page
def process_pdf_page(args):
    pdf_path, page_number = args
    with pdfplumber.open(pdf_path) as pdf:
        page = pdf.pages[page_number]
        text = page.extract_text()
        tables = page.extract_tables()
    return text, tables

# Load PDF and extract text and tables using multiprocessing
pdf_path = "7df4dbdc-eb62-4d53-bc27-d334bfcb2335.pdf"
text = ""
tables = []

with pdfplumber.open(pdf_path) as pdf:
    page_numbers = list(range(len(pdf.pages)))
    with Pool(cpu_count()) as p:
        results = p.map(process_pdf_page, [(pdf_path, page_number) for page_number in page_numbers])

for page_text, page_tables in results:
    text += page_text
    for table in page_tables:
        df = pd.DataFrame(table[1:], columns=table[0])
        tables.append(df)

# Handle duplicate columns and align DataFrames
all_columns = set()
for i, df in enumerate(tables):
    if df.columns.duplicated().any():
        df.columns = [f"{col}_{j}" if duplicated else col
                      for j, (col, duplicated) in enumerate(zip(df.columns, df.columns.duplicated()))]
    all_columns.update(df.columns)

for i, df in enumerate(tables):
    tables[i] = df.reindex(columns=all_columns).drop_duplicates()

# Concatenate DataFrames
try:
    combined_table = pd.concat(tables, ignore_index=True)
    print(combined_table.head())
except Exception as e:
    print(f"Error during concatenation: {e}")

# Split text into chunks
max_chunk_size = 2000
chunks = [text[i:i + max_chunk_size] for i in range(0, len(text), max_chunk_size)]

# Load model and tokenizer with GPU support
model_name = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")

# Summarization function with GPU support
def summarize_text(text):
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    summary_ids = model.generate(inputs['input_ids'], max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Summarize each chunk in batches
batch_size = 4
summaries = []
for i in range(0, len(chunks), batch_size):
    batch_chunks = chunks[i:i + batch_size]
    batch_summaries = [summarize_text(chunk) for chunk in batch_chunks]
    summaries.extend(batch_summaries)

# Combine summaries
final_summary = " ".join(summaries)
print(final_summary)

# Convert tables to string format
def tables_to_string(tables):
    table_strings = []
    for table in tables:
        table_strings.append(table.to_string(index=False))
    return "\n\n".join(table_strings)

# Convert combined table to string
table_string = tables_to_string(tables)

# Create documents from text chunks and table string
documents = [Document(text=chunk) for chunk in chunks] + [Document(text=table_string)]

# Initialize VectorStoreIndex
index = VectorStoreIndex.from_documents(documents)

# Function to perform QnA
def ask_question(question):
    response = index.query(question)
    return response

# Example question
question = "What is the net income for NVIDIA for the three months ended October 29, 2023?"
answer = ask_question(question)
print(answer)


    11,971 10,829 Cash flows from operating activities:    %  \
0      NaN    NaN                                   NaN  NaN   
1      NaN    NaN                                   NaN  NaN   
2      NaN    NaN                                   NaN  NaN   
3      NaN    NaN                                   NaN  NaN   
4      NaN    NaN                                   NaN  NaN   

  Three Months Ended October 29, 2023 18,112 1,206 15,640 12,629  ...  \
0                                 NaN    NaN   NaN    NaN    NaN  ...   
1                                 NaN    NaN   NaN    NaN    NaN  ...   
2                                 NaN    NaN   NaN    NaN    NaN  ...   
3                                 NaN    NaN   NaN    NaN    NaN  ...   
4                                 NaN    NaN   NaN    NaN    NaN  ...   

                             Revenue  234 Supplemental cash flows information  \
0                    Cost of revenue  NaN                                 NaN   
1             

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}