In [6]:
!pip install openai langchain llama-index pdfminer.six pandas

Collecting langchain
  Downloading langchain-0.2.10-py3-none-any.whl.metadata (6.9 kB)
Collecting llama-index
  Downloading llama_index-0.10.56-py3-none-any.whl.metadata (11 kB)
Collecting pdfminer.six
  Downloading pdfminer.six-20240706-py3-none-any.whl.metadata (4.1 kB)
Collecting pandas
  Downloading pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain)
  Downloading SQLAlchemy-2.0.31-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain)
  Downloading aiohttp-3.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.5 kB)
Collecting async-timeout<5.0.0,>=4.0.0 (from langchain)
  Downloading async_timeout-4.0.3-py3-none-any.whl.metadata (4.2 kB)
Collecting langchain-core<0.3.0,>=0.2.22 (from langchain)
  Downloading langchain_core-0.2.22-py3-none-any.whl.metadata (6.0 kB)
Collecting langchain-text-splitters<0.3.0

In [1]:
from pdfminer.high_level import extract_text

# PDF to analyse; the path is relative to the notebook's working directory.
pdf_path = "7df4dbdc-eb62-4d53-bc27-d334bfcb2335.pdf"

# Extract the text of the whole PDF into one string.
text = extract_text(pdf_path)
# Preview the first 500 characters as a quick sanity check of the extraction.
print(text[:500]) 

UNITED STATESSECURITIES AND EXCHANGE COMMISSIONWashington, D.C. 20549FORM 10-Q☒QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934For the quarterly period ended October 29, 2023OR☐TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934Commission file number: 0-23985NVIDIA CORPORATION(Exact name of registrant as specified in its charter)Delaware94-3177549(State or other jurisdiction of(I.R.S. Employerincorporation or organizatio


In [2]:
# Size of the extracted text in characters (shown as the cell output).
len(text)

171314

In [3]:
import nltk
# Download the Punkt sentence-tokenizer data used by sent_tokenize below.
# NOTE(review): newer NLTK releases reportedly load 'punkt_tab' instead of
# 'punkt' — confirm against the installed NLTK version.
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

def chunk_text(text, chunk_size=4000):
  """Group the sentences of ``text`` into chunks of roughly ``chunk_size`` words.

  Sentences come from ``sent_tokenize`` and are never split. A chunk is closed
  as soon as adding the next sentence would push it past ``chunk_size`` words,
  so a single sentence longer than ``chunk_size`` ends up as a chunk of its own.

  Args:
    text: The string to split.
    chunk_size: Soft upper bound on whitespace-separated words per chunk.
      This counts words, not model tokens.

  Returns:
    A list of chunk strings, each the space-joined sentences of one group.
    Returns ``[]`` when the tokenizer yields no sentences.
  """
  chunks = []
  current_chunk = []
  current_length = 0

  for sentence in sent_tokenize(text):
    sentence_length = len(sentence.split())
    # Only flush a non-empty buffer: flushing unconditionally emitted an
    # empty-string chunk whenever the very first sentence exceeded chunk_size.
    if current_chunk and current_length + sentence_length > chunk_size:
      chunks.append(' '.join(current_chunk))
      current_chunk = []
      current_length = 0
    current_chunk.append(sentence)
    current_length += sentence_length

  # Emit whatever is left in the buffer after the last sentence.
  if current_chunk:
    chunks.append(' '.join(current_chunk))

  return chunks

# Chunk the full document with the default chunk_size and inspect the result.
chunks = chunk_text(text)
print(f"Number of chunks: {len(chunks)}")
# Only the first 500 characters of the first chunk are printed as a preview.
print(f"First chunk: {chunks[0][:500]}")

Number of chunks: 7
First chunk: UNITED STATESSECURITIES AND EXCHANGE COMMISSIONWashington, D.C. 20549FORM 10-Q☒QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934For the quarterly period ended October 29, 2023OR☐TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934Commission file number: 0-23985NVIDIA CORPORATION(Exact name of registrant as specified in its charter)Delaware94-3177549(State or other jurisdiction of(I.R.S. Employerincorporation or organizatio


[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
from transformers import pipeline

# Build the Hugging Face summarization pipeline on the t5-base checkpoint
# (picked by the author for its handling of longer text chunks).
# A load failure is reported and leaves `summarizer` as None, which
# summarize_chunk checks before using it.
summarizer = None
try:
  summarizer = pipeline("summarization", model="t5-base")
except Exception as load_error:
  print(f"Error loading summarizer: {load_error}")

def summarize_chunk(chunk):
  """Summarize one text chunk with the notebook-level ``summarizer`` pipeline.

  Args:
    chunk: Text to summarize.

  Returns:
    The ``summary_text`` of the first pipeline result. If the pipeline was
    never loaded, or the call raises, a fixed placeholder sentence is returned
    instead (the error is printed in the latter case).
  """
  # Nothing to call if the pipeline failed to load.
  if summarizer is None:
    return "Summarization pipeline failed to load."

  # Deterministic generation (no sampling), bounded summary length.
  generation_options = {"max_length": 256, "min_length": 25, "do_sample": False}
  try:
    outputs = summarizer(chunk, **generation_options)
    return outputs[0]['summary_text']
  except Exception as error:
    # Best effort: report the failing chunk and keep the notebook running.
    print(f"Error summarizing chunk '{chunk[:50]}...': {error}")
    return "Error during summarization."

def chunk_text(text, chunk_size=2000):
  """Group the sentences of ``text`` into chunks of roughly ``chunk_size`` words.

  This redefinition shadows the earlier ``chunk_text`` in the notebook; it
  differs only in the smaller default ``chunk_size``.

  Sentences come from ``sent_tokenize`` and are never split. A chunk is closed
  as soon as adding the next sentence would push it past ``chunk_size`` words,
  so a single sentence longer than ``chunk_size`` ends up as a chunk of its own.

  Args:
    text: The string to split.
    chunk_size: Soft upper bound on whitespace-separated words per chunk.
      This counts words, not model tokens.

  Returns:
    A list of chunk strings, each the space-joined sentences of one group.
    Returns ``[]`` when the tokenizer yields no sentences.
  """
  chunks = []
  current_chunk = []
  current_length = 0

  for sentence in sent_tokenize(text):
    sentence_length = len(sentence.split())
    # Only flush a non-empty buffer: flushing unconditionally emitted an
    # empty-string chunk whenever the very first sentence exceeded chunk_size.
    if current_chunk and current_length + sentence_length > chunk_size:
      chunks.append(' '.join(current_chunk))
      current_chunk = []
      current_length = 0
    current_chunk.append(sentence)
    current_length += sentence_length

  # Emit whatever is left in the buffer after the last sentence.
  if current_chunk:
    chunks.append(' '.join(current_chunk))

  return chunks

# Re-chunk the document with the chunk_text defined just above (default size).
chunks = chunk_text(text)
print(f"Number of chunks: {len(chunks)}")

# Summarize every chunk in order, then stitch the summaries together with
# newlines and preview the first 1000 characters.
summaries = list(map(summarize_chunk, chunks))
full_summary = "\n".join(summaries)
print(full_summary[:1000])


Number of chunks: 13
the number of shares of common stock, $0.001 par value, outstanding as of November 17, 2023, was 2.47 billion . investors should monitor these accounts and the blog, in addition to following our investor relations website, press releases, SEC filings and public conference calls and webcasts.
nviidia's balance sheet was reclassified to conform to the current fiscal year presentation . a total of $9.03 billion of aggregate unearned stock-based compensation expense was recognized for the third quarter and first nine months of fiscal year 2023 . the irs audit of our federal income tax returns for fiscal years 2018 and 2019 was resolved .
the effective tax rate for the first nine months of fiscal year 2024 was lower than the federal statutory rate of 21% . net realized gains and losses were not significant for all periods presented . two customers each accounted for 11% of our accounts receivable balance as of October 29, 2023 .
During the first nine months of fiscal ye

In [5]:
from langchain.schema import Document
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline

# Wrap each chunk summary in a LangChain Document (one Document per summary,
# no metadata attached).
documents = [Document(page_content=summary) for summary in summaries]

# Embedding model used to vectorise the documents (and, presumably, the
# queries issued through the vector store's retriever).
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Build the FAISS vector store from the summary documents.
# Note: the index holds the summaries, not the original chunk text.
vectorstore = FAISS.from_documents(documents, embeddings)

  warn_deprecated(


In [6]:
# Checkpoint used for question answering; tokenizer and model are loaded from
# the same name so they stay in sync.
qa_model_name = "deepset/roberta-base-squad2"
qa_tokenizer = AutoTokenizer.from_pretrained(qa_model_name)
qa_model = AutoModelForQuestionAnswering.from_pretrained(qa_model_name)

# Question-answering pipeline built from the model/tokenizer pair above;
# answer_question calls it with `question=` and `context=` keywords.
qa_pipeline = pipeline("question-answering", model=qa_model, tokenizer=qa_tokenizer)

In [7]:
def retrieve_documents(question):
    """Fetch the documents the vector store's retriever returns for ``question``.

    Args:
        question: Query string handed to the retriever.

    Returns:
        Whatever the retriever's ``invoke`` call returns for the query;
        callers in this notebook read ``page_content`` from each item.
    """
    # Retriever with default settings over the notebook-level `vectorstore`.
    retriever = vectorstore.as_retriever()

    # `invoke` is the supported entry point; the previously used
    # `get_relevant_documents` is deprecated and emits a deprecation warning.
    return retriever.invoke(question)

def answer_question(question):
    """Answer ``question`` from the documents retrieved for it.

    Args:
        question: The question to answer.

    Returns:
        The ``'answer'`` entry of the QA pipeline result, computed over the
        space-joined ``page_content`` of the retrieved documents.
    """
    # Text of every retrieved document, in retrieval order.
    passages = [doc.page_content for doc in retrieve_documents(question)]

    # Run the QA pipeline over all passages as one context string.
    prediction = qa_pipeline(question=question, context=" ".join(passages))
    return prediction['answer']

# Example question
question = "What is the financial performance of NVIDIA for the quarter ended October 29, 2023?"
answer = answer_question(question)
print(f'Answer: {answer}')

  warn_deprecated(


Answer: $18.28 billion in cash, cash equivalents, and marketable securities


In [10]:
import pandas as pd

def preprocess_tables(text):
    """Placeholder for table extraction; currently finds no tables.

    Args:
        text: Extracted document text. Not inspected yet.

    Returns:
        A new, empty list on every call. The author's stated intent is for it
        to hold one DataFrame per table.
    """
    # TODO: append each table DataFrame to the returned list
    return []

tables = preprocess_tables(text)

# Render each table DataFrame as plain text so it can be summarized like prose
table_chunks = [df.to_string() for df in tables]

# Combined corpus: prose chunks first, table chunks after
all_chunks = chunks + table_chunks

# `summaries` was computed from `chunks` in the same order, and summarization
# runs with do_sample=False, so the prose summaries are reused and only the
# table chunks go through the (slow) summarizer. If the two lists are out of
# step, fall back to summarizing everything.
if len(summaries) == len(chunks):
    summaries1 = summaries + [summarize_chunk(chunk) for chunk in table_chunks]
else:
    summaries1 = [summarize_chunk(chunk) for chunk in all_chunks]
full_summary = "\n".join(summaries1)

In [11]:
from langchain.schema import Document
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline

# Wrap each summary from `summaries1` (prose + table chunks) in a LangChain
# Document. This cell repeats the index/QA setup of the earlier cells and
# rebinds the same notebook-level names.
documents = [Document(page_content=summary) for summary in summaries1]

# Embedding model used to vectorise the documents (same checkpoint as before).
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Rebuild the FAISS vector store; this replaces the previous `vectorstore`.
vectorstore = FAISS.from_documents(documents, embeddings)

# Checkpoint used for question answering; tokenizer and model are loaded from
# the same name so they stay in sync.
qa_model_name = "deepset/roberta-base-squad2"
qa_tokenizer = AutoTokenizer.from_pretrained(qa_model_name)
qa_model = AutoModelForQuestionAnswering.from_pretrained(qa_model_name)

# Question-answering pipeline built from the model/tokenizer pair above.
qa_pipeline = pipeline("question-answering", model=qa_model, tokenizer=qa_tokenizer)

def retrieve_documents(question):
    """Fetch the documents the vector store's retriever returns for ``question``.

    Args:
        question: Query string handed to the retriever.

    Returns:
        Whatever the retriever's ``invoke`` call returns for the query;
        callers in this notebook read ``page_content`` from each item.
    """
    # Retriever with default settings over the notebook-level `vectorstore`.
    retriever = vectorstore.as_retriever()

    # `invoke` is the supported entry point; the previously used
    # `get_relevant_documents` is deprecated.
    return retriever.invoke(question)

def answer_question(question):
    """Answer ``question`` from the documents retrieved for it.

    Args:
        question: The question to answer.

    Returns:
        The ``'answer'`` entry of the QA pipeline result, computed over the
        space-joined ``page_content`` of the retrieved documents.
    """
    # Text of every retrieved document, in retrieval order.
    passages = [doc.page_content for doc in retrieve_documents(question)]

    # Run the QA pipeline over all passages as one context string.
    prediction = qa_pipeline(question=question, context=" ".join(passages))
    return prediction['answer']

# Second example: ask a broader question against the index rebuilt from
# `summaries1` and print the answer string.
question = "What are the key financial metrics for NVIDIA in 2023?"
answer = answer_question(question)
print(answer)

cash, cash equivalents, and marketable securities
