# Playground for custom RAG system
____
## Ingestion

In [1]:
import PyPDF2
import numpy as np
import os

from dotenv import load_dotenv
load_dotenv()

True

In [2]:
# Chunking parameters; chunk_text uses them as word counts, not characters.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

# The API key comes from the process environment (load_dotenv() has already
# been called, so values from a local .env file are visible here too).
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")
if MISTRAL_API_KEY is None or MISTRAL_API_KEY == "":
    # Fail fast: nothing below can work without credentials.
    raise ValueError("MISTRAL_API_KEY not found in environment variables")

In [3]:
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF as one string.

    Args:
        pdf_file: The PDF to read (this notebook passes a string path); it is
            forwarded unchanged to ``PyPDF2.PdfReader``.

    Returns:
        The text of all pages concatenated in page order, with no separator
        inserted between pages.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # Join once instead of growing a string with += per page.
    # `or ""` is defensive: a page that yields no text object is treated as
    # empty rather than raising TypeError during concatenation
    # (NOTE(review): confirm whether the installed PyPDF2 can return None).
    return "".join(page.extract_text() or "" for page in reader.pages)


def chunk_text(text, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    """Split text into overlapping chunks (simple word-based chunking).

    Args:
        text: Text to split. Words are whatever ``str.split()`` yields, so
            any run of whitespace is a separator.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared by two consecutive chunks. Must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings, each with its words joined by single spaces.
        The list is empty when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is
            outside ``[0, chunk_size)``.
    """
    # Validate up front: a zero step makes range() raise an opaque error, and
    # a negative step would silently produce no chunks at all.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []

    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        # Once a window reaches the last word, any further window would only
        # repeat words already contained in this chunk, so stop here.
        if start + chunk_size >= len(words):
            break

    return chunks

In [4]:
# Sample document used by the cells below.
pdf_file = "data/test/Custom_RAG_Test_Document.pdf"

# Reject anything whose name does not end in the .pdf extension.
if pdf_file[-4:] != '.pdf':
    raise ValueError("File must be a PDF")

In [49]:
# Extract the raw text of the sample PDF and print it for a visual check.
text = extract_text_from_pdf(pdf_file)
print(text)

Custom RAG Test Document
1. Company Overview
Acme AI is a fictional enterprise software company specializing in AI agent deployment platforms
for mid-sized businesses. The company was founded in 2021 and is headquartered in New York.
2. Financial Summary (2025)
Revenue: $18 million. Gross Margin: 72%. Net Income: -$2.4 million. Customer Growth Rate: 38%
year-over-year.
3. Competitive Landscape
Primary competitors include StackAI, Credal, and Glean. Acme AI differentiates itself through rapid
deployment cycles and strong enterprise security compliance.
4. Macro Environment Context
In 2025, tightening credit conditions and elevated interest rates impacted enterprise software
spending. However, AI infrastructure investments continued due to productivity gains and
automation trends.
5. Risk Factors
Key risks include increased competition, dependency on venture funding, and potential slowdown in
SaaS budgets if macroeconomic conditions deteriorate.



In [50]:
# Chunk the extracted text using the default chunk size and overlap.
chunk_texts = chunk_text(text)
print(f"Created {len(chunk_texts)} chunks")

Created 1 chunks


### Creating embeddings (Mistral)

In [5]:
from mistralai.client import MistralClient
from app.config import MISTRAL_API_KEY, CHUNK_SIZE, CHUNK_OVERLAP

In [6]:
# One shared Mistral client, created with the configured API key.
client = MistralClient(api_key=MISTRAL_API_KEY)

# In-memory stores (module-level): ingest_pdf appends the chunk texts to
# `chunks` and the matching vectors to `embeddings`.
chunks, embeddings = [], []

In [7]:
def embed_chunks(text_chunks):
    """Get embeddings for a batch of text chunks from the Mistral API.

    Args:
        text_chunks: List of strings to embed; the whole list is sent in a
            single ``mistral-embed`` request via the module-level ``client``.

    Returns:
        A list holding the ``embedding`` of each item in the response data,
        in response order. An empty input returns an empty list without
        calling the API.
    """
    # Nothing to embed (e.g. a PDF with no extractable text): skip the
    # request instead of sending an empty batch to the service.
    if not text_chunks:
        return []

    response = client.embeddings(
        model="mistral-embed",
        input=text_chunks,
    )

    return [item.embedding for item in response.data]

# Testing the full ingestion pipeline
def ingest_pdf(pdf_file):
    """Run extraction, chunking and embedding for one PDF and store the result.

    Progress is printed after each stage. The new chunk texts are appended to
    the module-level ``chunks`` list and their embeddings to the module-level
    ``embeddings`` list.

    Args:
        pdf_file: The PDF to ingest, handed to ``extract_text_from_pdf``.

    Returns:
        The number of chunks produced from this PDF.
    """
    # Stage 1: raw text.
    raw_text = extract_text_from_pdf(pdf_file)
    print(f"Extracted text length: {len(raw_text)} characters")

    # Stage 2: word-based chunks (default size and overlap).
    pieces = chunk_text(raw_text)
    print(f"Created {len(pieces)} chunks")

    # Stage 3: one embedding request for the whole batch.
    vectors = embed_chunks(pieces)
    print(f"Generated embeddings for {len(vectors)} chunks")

    # Stage 4: persist in the in-memory stores only after embedding succeeded.
    chunks.extend(pieces)
    embeddings.extend(vectors)

    return len(pieces)

In [60]:
# Store the result under its own name: assigning it to `embed_chunks` would
# replace the function with a list and break every later call to it
# (including the one made inside ingest_pdf).
chunk_embeddings = embed_chunks(chunk_texts)
print(f"Generated embeddings for {len(chunk_embeddings)} chunks")

Generated embeddings for 1 chunks


In [8]:
# Run the whole pipeline (extract, chunk, embed, store) on the sample PDF.
num_chunks = ingest_pdf(pdf_file)
print(f"Ingested {num_chunks} chunks from PDF")

Extracted text length: 958 characters
Created 1 chunks
Generated embeddings for 1 chunks
Ingested 1 chunks from PDF
