In [1]:
# Cell 1: Install required packages
!pip install pypdf chromadb anthropic langchain unstructured

Collecting pypdf
  Downloading pypdf-5.1.0-py3-none-any.whl.metadata (7.2 kB)
Collecting chromadb
  Downloading chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting anthropic
  Downloading anthropic-0.43.1-py3-none-any.whl.metadata (23 kB)
Collecting unstructured
  Downloading unstructured-0.16.14-py3-none-any.whl.metadata (24 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.8.3-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting onnxruntime>=1.14.1 (from c

In [2]:
# Cell 2: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [43]:
# Cell 3: Import libraries
import os
from pypdf import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import chromadb
from chromadb.utils import embedding_functions
import anthropic
from typing import List, Tuple, Dict
import re
from urllib.parse import urlparse
import time
from anthropic.types import TextBlock


In [4]:
# If you want to see debug info set this flag to True
IS_DEBUGGING = False

In [5]:
# Cell 4: Set up paths and configurations
PDF_DIR = "/content/drive/My Drive/Colab Notebooks/MTG/RAG_Project/articles"
# Replace with your actual API key
ANTHROPIC_API_KEY = "YOUR_KEY_HERE"

In [6]:
class PDFProcessor:
    def __init__(self, pdf_directory: str):
        self.pdf_directory = pdf_directory
        # Regular expression for visible URLs
        self.url_pattern = r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+'

    def extract_text_with_metadata(self) -> List[Dict]:
        documents = []

        for filename in os.listdir(self.pdf_directory):
            if filename.endswith('.pdf'):
                filepath = os.path.join(self.pdf_directory, filename)
                try:
                    pdf = PdfReader(filepath)
                    print(f"Processing {filename}...")

                    for page_num, page in enumerate(pdf.pages, 1):
                        text = page.extract_text()
                        if text.strip():
                            # Get visible URLs from text
                            visible_urls = re.findall(self.url_pattern, text)

                            # Get hyperlink URLs from annotations
                            hyperlink_urls = []
                            if '/Annots' in page:
                                for annot in page['/Annots']:
                                    annotation = annot.get_object()
                                    if annotation['/Subtype'] == '/Link':
                                        if '/A' in annotation and '/URI' in annotation['/A']:
                                            hyperlink_urls.append(annotation['/A']['/URI'])

                            # Combine and deduplicate URLs
                            all_urls = list(set(visible_urls + hyperlink_urls))

                            documents.append({
                                'text': text,
                                'metadata': {
                                    'document': filename,
                                    'page': page_num,
                                    'urls': all_urls
                                }
                            })

                            if all_urls and IS_DEBUGGING:
                                print(f"Found {len(all_urls)} URLs in {filename}, page {page_num}")
                                print("URLs found:", all_urls)

                except Exception as e:
                    print(f"Error processing {filename}: {str(e)}")

        return documents

In [7]:
class TextChunker:
    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False
        )

    def split_documents(self, documents: List[Dict]) -> List[Dict]:
        """Split documents into chunks while preserving metadata"""
        chunked_documents = []

        for doc in documents:
            chunks = self.text_splitter.create_documents(
                texts=[doc['text']],
                metadatas=[doc['metadata']]
            )

            for chunk in chunks:
                chunked_documents.append({
                    'text': chunk.page_content,
                    'metadata': chunk.metadata
                })

        return chunked_documents

In [8]:
class VectorStore:
    def __init__(self, collection_name: str = "cars_collection"):
        self.client = chromadb.Client()
        self.collection = self.client.create_collection(
            name=collection_name,
            embedding_function=embedding_functions.DefaultEmbeddingFunction()
        )

    def add_documents(self, documents: List[Dict]):
        """Add documents to the vector store"""
        texts = [doc['text'] for doc in documents]
        ids = [str(i) for i in range(len(documents))]

        # Convert metadata to format compatible with ChromaDB
        metadatas = []
        for doc in documents:
            metadata = doc['metadata'].copy()
            # Convert URLs list to string if it exists
            if 'urls' in metadata:
                metadata['urls'] = ','.join(metadata['urls'])  # Join URLs with comma
            metadatas.append(metadata)

        self.collection.add(
            documents=texts,
            ids=ids,
            metadatas=metadatas
        )

    def query(self, question: str, n_results: int = 5) -> List[Dict]:
        """Query the vector store"""
        results = self.collection.query(
            query_texts=[question],
            n_results=n_results
        )

        retrieved_docs = []
        for i in range(len(results['documents'][0])):
            metadata = results['metadatas'][0][i]
            # Convert URLs string back to list if it exists
            if 'urls' in metadata and metadata['urls']:
                metadata['urls'] = metadata['urls'].split(',')

            retrieved_docs.append({
                'text': results['documents'][0][i],
                'metadata': metadata
            })

        return retrieved_docs

In [9]:
class ClaudeQuerier:
    def __init__(self, api_key: str):
        self.client = anthropic.Anthropic(api_key=api_key)
        #self.client = ""

    def query_with_context(self, question: str, context_docs: List[Dict]) -> str:
        # Format context
        formatted_context = "\n\n".join([
            f"Document: {doc['metadata']['document']}, Page: {doc['metadata']['page']}\n" +
            f"URLs: {', '.join(doc['metadata'].get('urls', []))}\n" +
            f"Text: {doc['text']}"
            for doc in context_docs
        ])

        # Create the prompt
        prompt = f"""Here are some relevant passages about cars:

{formatted_context}

Based only on the information provided above, please answer this question: {question}

Use a language of a car enthusiast but formal no slang or any other language.

Format your response exactly like this:
<Answer>
[Your detailed answer here]
</Answer>
<Sources>
[List each document name and page number used to create the answer]
</Sources>
If the text has links please enable them as a a href estyle in your response to be able to click them"""

        if IS_DEBUGGING:
          print(prompt)

        # Get response from Claude
        response = self.client.messages.create(
            model="claude-3-sonnet-20240229",
            max_tokens=1024,
            messages=[{"role": "user", "content": prompt}]
        )

        return response.content

In [10]:
# Verify the directory exists and show PDF files
pdf_files = [f for f in os.listdir(PDF_DIR) if f.endswith('.pdf')]
print(f"Found {len(pdf_files)} PDF files:")
for pdf in pdf_files:
    print(f"- {pdf}")

Found 4 PDF files:
- MotorTrend Announces Winners of the 2025 Software-Defined Vehicle Innovator Awards.pdf
- 2025 Ford Mustang GTD Spirit of America Celebrates Aero and Speed.pdf
- There's a New Self-Driving Electric Minibus In the Works.pdf
- 2025 Chevrolet Corvette ZR1 Claims BONKERS 0–60 Time!.pdf


In [11]:
# Cell 6: Initialize components and process documents
pdf_processor = PDFProcessor(PDF_DIR)
text_chunker = TextChunker()
vector_store = VectorStore()
claude_querier = ClaudeQuerier(ANTHROPIC_API_KEY)

In [18]:
# Process PDFs and will return a list which size will be the total number of pages extracted
print("Extracting text from PDFs...")
documents = pdf_processor.extract_text_with_metadata()
print(f"Extracted text from {len(documents)} pages")

Extracting text from PDFs...
Processing MotorTrend Announces Winners of the 2025 Software-Defined Vehicle Innovator Awards.pdf...
Processing 2025 Ford Mustang GTD Spirit of America Celebrates Aero and Speed.pdf...
Processing There's a New Self-Driving Electric Minibus In the Works.pdf...
Processing 2025 Chevrolet Corvette ZR1 Claims BONKERS 0–60 Time!.pdf...
Extracted text from 30 pages


In [17]:
documents

[{'text': 'MotorTrend Announces Winners of the 2025 Software-Deﬁned VehicleInnovator Awards\nThese 17 leaders, pioneers, and experts are driving the auto industry toward a future with smarter and more sophisticated cars.\nEric Tingwall-Writer;Ryan Lugo-Illustrator|Jan 07, 2025\nWhile EVs and autonomous cars earn all the headlines, software-defined vehicles are quietly revolutionizing how we movearound the world with relatively li\x00le fanfare. Cars with fewer but more powerful computers running cu\x00ing-edge software arechanging every interaction we have with our vehicles—from unlocking and starting them to steering and braking them.\nSee All 4 Photos\nAsk MOTORTREND BETA\n1/9/25, 10:45 PM MotorTrend Announces Winners of the 2025 Software-Defined Vehicle Innovator Awards\nhttps://www.motortrend.com/news/2025-software-defined-vehicle-innovator-award-winners/ 1/8',
  'metadata': {'document': 'MotorTrend Announces Winners of the 2025 Software-Defined Vehicle Innovator Awards.pdf',
   'p

In [22]:
# Chunk documents and returns a list of ech of the pieces of those documents
print("\nChunking documents...")
chunked_docs = text_chunker.split_documents(documents)
print(f"Created {len(chunked_docs)} chunks")


Chunking documents...
Created 45 chunks


In [21]:
chunked_docs

[{'text': 'MotorTrend Announces Winners of the 2025 Software-Deﬁned VehicleInnovator Awards\nThese 17 leaders, pioneers, and experts are driving the auto industry toward a future with smarter and more sophisticated cars.\nEric Tingwall-Writer;Ryan Lugo-Illustrator|Jan 07, 2025\nWhile EVs and autonomous cars earn all the headlines, software-defined vehicles are quietly revolutionizing how we movearound the world with relatively li\x00le fanfare. Cars with fewer but more powerful computers running cu\x00ing-edge software arechanging every interaction we have with our vehicles—from unlocking and starting them to steering and braking them.\nSee All 4 Photos\nAsk MOTORTREND BETA\n1/9/25, 10:45 PM MotorTrend Announces Winners of the 2025 Software-Defined Vehicle Innovator Awards\nhttps://www.motortrend.com/news/2025-software-defined-vehicle-innovator-award-winners/ 1/8',
  'metadata': {'document': 'MotorTrend Announces Winners of the 2025 Software-Defined Vehicle Innovator Awards.pdf',
   'p

In [25]:
# Store the chunks of the documents in vector database
print("\nStoring documents in vector database...")
vector_store.add_documents(chunked_docs)
print("Documents stored successfully")


Storing documents in vector database...


/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:00<00:00, 104MiB/s] 


Documents stored successfully


In [26]:
# Cell 7: Function to ask questions
def ask_question(question: str, n_results: int = 5):
    if IS_DEBUGGING:
        print(f"Question: {question}")
        print(f"Retrieving {n_results} most relevant document chunks...")

    # Get relevant contexts based on the question
    relevant_docs = vector_store.query(question, n_results=n_results)

    # Print the sources being used
    if IS_DEBUGGING:
      print("\nUsing information from:")
      for doc in relevant_docs:
          print(f"<document>- {doc['metadata']['document']}</document>")
          print(f"<page> {doc['metadata']['page']}</page>")
          # Print text content
          print("<Text>")
          print(doc['text'])
          if 'urls' in doc['metadata'] and doc['metadata']['urls']:
              # Find all URLs in the text and create a mapping of text:url
              text_url_mapping = {}
              for url in doc['metadata']['urls']:
                  # You might want to add additional text matching logic here
                  # For now, just add the URLs at the end of the text
                  text_url_mapping[url] = url

              print("\nRelated URLs in this text:")
              for text, url in text_url_mapping.items():
                  print(f"- {url}")

    # Query Claude with context
    answer = claude_querier.query_with_context(question, relevant_docs)

    return answer

In [38]:
def parse_textblock(block: TextBlock) -> Dict[str, str]:
    result = {
        'answer': '',
        'sources': ''
    }

    # Access the text content directly from the TextBlock
    text = block.text

    # Extract Answer section
    answer_match = re.search(r'<Answer>\s*(.*?)\s*</Answer>', text, re.DOTALL)
    if answer_match:
        result['answer'] = answer_match.group(1).strip()

    # Extract Sources section
    sources_match = re.search(r'<Sources>\s*(.*?)\s*</Sources>', text, re.DOTALL)
    if sources_match:
        result['sources'] = sources_match.group(1).strip()

    return result


In [44]:
# Cell 8: Example usage - you can run this cell multiple times with different questions
question = "why is recognized Anders Bell?"  # Replace with your question
answer = ask_question(question)
parsed_content = parse_textblock(answer[0])
the_answer = parsed_content['answer']
the_source = parsed_content['sources']
print(the_answer)
print(the_source)

Anders Bell is recognized for his exceptional leadership in the development of core computing technology that underpins the EX90, Volvo's first vehicle designed from inception as a truly software-defined vehicle. As the Chief Engineering & Technology Officer at Volvo Cars, Bell has spearheaded a transformative shift in the automotive industry by pioneering the broad adoption and application of software solutions within Volvo's production vehicles. His visionary work on the EX90 showcases how software can be seamlessly integrated into a vehicle's architecture from the initial design phase, setting a new standard for the software-defined future of the automotive realm.
- MotorTrend Announces Winners of the 2025 Software-Defined Vehicle Innovator Awards.pdf, Page 4


In [45]:
question = "What is May Mobility?"  # Replace with your question
answer = ask_question(question)
parsed_content = parse_textblock(answer[0])
the_answer = parsed_content['answer']
the_source = parsed_content['sources']
print(the_answer)
print(the_source)

May Mobility is an autonomous vehicle technology company based in Ann Arbor, Michigan. They are expanding their fleet of self-driving vehicles by partnering with Italy's Tecnobus to introduce a new autonomous electric minibus platform capable of seating up to 30 passengers. This minibus will be designed for urban transit, airports, corporate campuses, and planned communities where its top speed of 45 mph is suitable.

The new electric minibus will feature wheelchair accessibility, allowing it to be part of May Mobility's mobility-as-a-service fleet. It will have swappable batteries to minimize downtime and is expected to be ready for customer use in the latter half of 2026 after homologation for use in the U.S., Canada, and Europe.

May Mobility already operates a fleet of autonomous Toyota Sienna minivans for ride-hailing services. They have partnerships with major companies like Toyota, NTT, and Lyft. The company is currently testing its self-driving vehicles without a driver on publ

In [46]:
start_time = time.time()
question = "What cars can go from 0 to 60 mph in less than 5 seconds?"  # Replace with your question
answer = ask_question(question)
parsed_content = parse_textblock(answer[0])
the_answer = parsed_content['answer']
the_source = parsed_content['sources']
print(the_answer)
print(the_source)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time:.8f} seconds")

According to the passages, the following cars can go from 0 to 60 mph in less than 5 seconds:

- 2025 Chevrolet Corvette ZR1 ZTK: 2.3 seconds (claimed)
- 2025 Chevrolet Corvette ZR1 (standard): 2.6 seconds (claimed)
- 2017 Tesla Model S P100D Ludicrous+: 2.3 seconds 
- 2022 Ferrari SF90 Spider: 2.3 seconds
- 2023 Ferrari 296 GTB Assetto Fiorano: 2.3 seconds
- 2021 Porsche 911 Turbo S: 2.3 seconds
- 2021 Tesla Model S Plaid: 2.1 seconds
- 2021 Ferrari SF90 Stradale Assetto Fiorano: 2.1 seconds  
- 2024 Lucid Air Sapphire: 2.2 seconds
- 2021 Porsche 911 Turbo S Lightweight: 2.2 seconds
- 2022 Tesla Model S Plaid: 2.2 seconds

The passages highlight that if the claimed times for the 2025 Corvette ZR1 models are substantiated, the ZR1 ZTK would tie for the third-quickest car MotorTrend has ever tested to 60 mph, and would be the second-quickest non-hybrid, non-electric car.
- 2025 Chevrolet Corvette ZR1 Claims BONKERS 0–60 Time!.pdf, Page 1
- 2025 Chevrolet Corvette ZR1 Claims BONKERS 0–60