# Playground for custom RAG system
____
## Ingestion

In [27]:
import PyPDF2
import numpy as np
import os

from dotenv import load_dotenv
load_dotenv()

True

In [43]:
CHUNK_SIZE = 80
CHUNK_OVERLAP = 30
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")

if not MISTRAL_API_KEY:
    raise ValueError("MISTRAL_API_KEY not found in environment variables")

In [44]:
def extract_text_from_pdf(pdf_file):
    """Extract text from PDF file"""
    reader = PyPDF2.PdfReader(pdf_file)
    text = ""
    for page in reader.pages:
        text += page.extract_text()
    
    return text


def chunk_text(text, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    """Split text into overlapping chunks (simple word-based chunking)"""
    words = text.split()
    chunks = []
    
    for i in range(0, len(words), chunk_size - overlap):
        chunk = ' '.join(words[i:i + chunk_size])
        chunks.append(chunk)
    
    return chunks

In [45]:
pdf_file = "data/test/Custom_RAG_Test_Document.pdf"
if not pdf_file.endswith('.pdf'):
    raise ValueError("File must be a PDF")

In [46]:
text = extract_text_from_pdf(pdf_file)
print(text)

Custom RAG Test Document
1. Company Overview
Acme AI is a fictional enterprise software company specializing in AI agent deployment platforms
for mid-sized businesses. The company was founded in 2021 and is headquartered in New York.
2. Financial Summary (2025)
Revenue: $18 million. Gross Margin: 72%. Net Income: -$2.4 million. Customer Growth Rate: 38%
year-over-year.
3. Competitive Landscape
Primary competitors include StackAI, Credal, and Glean. Acme AI differentiates itself through rapid
deployment cycles and strong enterprise security compliance.
4. Macro Environment Context
In 2025, tightening credit conditions and elevated interest rates impacted enterprise software
spending. However, AI infrastructure investments continued due to productivity gains and
automation trends.
5. Risk Factors
Key risks include increased competition, dependency on venture funding, and potential slowdown in
SaaS budgets if macroeconomic conditions deteriorate.



In [47]:
chunk_texts = chunk_text(text)
print(f"Created {len(chunk_texts)} chunks")

Created 3 chunks


### Creating embeddings (Mistral)

In [48]:
from mistralai.client import MistralClient
from app.config import MISTRAL_API_KEY, CHUNK_SIZE, CHUNK_OVERLAP

In [49]:
client = MistralClient(api_key=MISTRAL_API_KEY)

# Global storage (in-memory)
chunks = []
embeddings = []

In [50]:
def embed_chunks(text_chunks):
    """Get embeddings from Mistral API"""
    response = client.embeddings(
            model="mistral-embed",
            input=text_chunks
        )
    data = response.data

    return [item.embedding for item in response.data]

# Testing full ingetion pipeline
def ingest_pdf(pdf_file):
    """Main ingestion pipeline"""
        
    # Extract text
    text = extract_text_from_pdf(pdf_file)
    print(f"Extracted text length: {len(text)} characters")
    
    # Chunk text
    new_text_chunks = chunk_text(text)
    print(f"Created {len(new_text_chunks)} chunks")
    
    # Get embeddings
    new_embeddings = embed_chunks(new_text_chunks)
    print(f"Generated embeddings for {len(new_embeddings)} chunks")
    
    # Store
    chunks.extend(new_text_chunks)
    embeddings.extend(new_embeddings)
    
    return len(new_text_chunks)

In [51]:
# embed_chunks = embed_chunks(chunk_texts)
# print(f"Generated embeddings for {len(embed_chunks)} chunks")

In [52]:
num_chunks = ingest_pdf(pdf_file)
print(f"Ingested {num_chunks} chunks from PDF")

Extracted text length: 958 characters
Created 3 chunks
Generated embeddings for 3 chunks
Ingested 3 chunks from PDF


In [53]:
print(f"Len of embeddings: {len(embeddings)}")

Len of embeddings: 3


____
### Query processing
____

In [54]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from app.config import MISTRAL_API_KEY, SIMILARITY_THRESHOLD

In [67]:
SIMILARITY_THRESHOLD = 0.2

In [56]:
def semantic_search(query, top_k=5):
    """Semantic search using embeddings"""
    if not embeddings:
        return []
    
    # Get query embedding
    query_embedding = client.embeddings(
        model="mistral-embed",
        input=[query]
    ).data[0].embedding
    
    # Calculate cosine similarity
    query_emb_array = np.array(query_embedding).reshape(1, -1)
    chunk_emb_array = np.array(embeddings)
    
    similarities = cosine_similarity(query_emb_array, chunk_emb_array)[0]
    
    # Get top k indices
    top_indices = np.argsort(similarities)[-top_k:][::-1]
    
    results = [
        {
            "chunk": chunks[i],
            "score": float(similarities[i]),
            "index": int(i)
        }
        for i in top_indices
    ]
    
    return results


def keyword_search(query, top_k=5):
    """Keyword search using TF-IDF"""
    if not chunks:
        return []
    
    # Create TF-IDF vectorizer
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(chunks + [query])
    
    # Get query vector (last one)
    query_vector = tfidf_matrix[-1:]
    chunk_vectors = tfidf_matrix[:-1]
    
    # Calculate similarity
    similarities = cosine_similarity(query_vector, chunk_vectors)[0]
    
    # Get top k indices
    top_indices = np.argsort(similarities)[-top_k:][::-1]
    
    results = [
        {
            "chunk": chunks[i],
            "score": float(similarities[i]),
            "index": int(i)
        }
        for i in top_indices
    ]
    
    return results


def hybrid_search(query, semantic_weight=0.25, keyword_weight=0.3, top_k=5):
    """Combine semantic and keyword search"""
    semantic_results = semantic_search(query, top_k=top_k * 2)
    keyword_results = keyword_search(query, top_k=top_k * 2)
    
    # Combine scores
    combined_scores = {}
    
    for result in semantic_results:
        idx = result["index"]
        combined_scores[idx] = semantic_weight * result["score"]
    
    for result in keyword_results:
        idx = result["index"]
        if idx in combined_scores:
            combined_scores[idx] += keyword_weight * result["score"]
        else:
            combined_scores[idx] = keyword_weight * result["score"]
    
    # Sort by combined score
    sorted_indices = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
    
    # Get top k unique chunks
    final_results = [
        {
            "chunk": chunks[idx],
            "score": score,
            "index": idx
        }
        for idx, score in sorted_indices[:top_k]
    ]
    
    return final_results

In [57]:
query = "What are Acme AI' revenues?"

semantic_search_results = semantic_search(query)
print(f"Semantic Search Results: {semantic_search_results}")

keyword_search_results = keyword_search(query)
print(f"Keyword Search Results: {keyword_search_results}")

Semantic Search Results: [{'chunk': 'Custom RAG Test Document 1. Company Overview Acme AI is a fictional enterprise software company specializing in AI agent deployment platforms for mid-sized businesses. The company was founded in 2021 and is headquartered in New York. 2. Financial Summary (2025) Revenue: $18 million. Gross Margin: 72%. Net Income: -$2.4 million. Customer Growth Rate: 38% year-over-year. 3. Competitive Landscape Primary competitors include StackAI, Credal, and Glean. Acme AI differentiates itself through rapid deployment cycles and strong enterprise security compliance. 4. Macro', 'score': 0.8227605339500232, 'index': 0}, {'chunk': 'Customer Growth Rate: 38% year-over-year. 3. Competitive Landscape Primary competitors include StackAI, Credal, and Glean. Acme AI differentiates itself through rapid deployment cycles and strong enterprise security compliance. 4. Macro Environment Context In 2025, tightening credit conditions and elevated interest rates impacted enterpris

In [58]:
hybrid_search_results = hybrid_search(query)
print(f"Hybrid Search Results: {hybrid_search_results}")

Hybrid Search Results: [{'chunk': 'Custom RAG Test Document 1. Company Overview Acme AI is a fictional enterprise software company specializing in AI agent deployment platforms for mid-sized businesses. The company was founded in 2021 and is headquartered in New York. 2. Financial Summary (2025) Revenue: $18 million. Gross Margin: 72%. Net Income: -$2.4 million. Customer Growth Rate: 38% year-over-year. 3. Competitive Landscape Primary competitors include StackAI, Credal, and Glean. Acme AI differentiates itself through rapid deployment cycles and strong enterprise security compliance. 4. Macro', 'score': 0.24049978890093307, 'index': 0}, {'chunk': 'Customer Growth Rate: 38% year-over-year. 3. Competitive Landscape Primary competitors include StackAI, Credal, and Glean. Acme AI differentiates itself through rapid deployment cycles and strong enterprise security compliance. 4. Macro Environment Context In 2025, tightening credit conditions and elevated interest rates impacted enterprise

____
### Retieval 
____

In [None]:
# import numpy as np
# from sklearn.metrics.pairwise import cosine_similarity

In [59]:
def detect_intent(query):
    """Simple intent detection - check if query needs search"""
    greetings = ["hello", "hi", "hey", "thanks", "thank you", "bye"]
    query_lower = query.lower().strip()
    
    # Don't search for greetings
    if any(greeting in query_lower for greeting in greetings):
        return False
    
    # Search for actual questions
    return len(query_lower.split()) > 2


def transform_query(query):
    """Simple query transformation"""
    # Basic cleaning
    query = query.strip().lower()
    
    # Expand common abbreviations (optional)
    expansions = {
        "q&a": "question and answer",
        "info": "information",
    }
    
    for abbr, full in expansions.items():
        query = query.replace(abbr, full)
    
    return query

In [60]:
def generate_answer(query, retrieved_chunks):
    """Generate answer using Mistral LLM"""

    if retrieved_chunks:
        for i, chunk in enumerate(retrieved_chunks[:3]):
            print(f"Chunk {i+1} score: {chunk['score']:.4f}")
    else:
        print("No chunks retrieved!")
    
    # Check similarity threshold
    if not retrieved_chunks or retrieved_chunks[0]["score"] < SIMILARITY_THRESHOLD:
        print(f"Top score {retrieved_chunks[0]['score'] if retrieved_chunks else 0:.2f} < threshold {SIMILARITY_THRESHOLD}")
        
        return {
            "answer": "Insufficient evidence in the knowledge base to answer this question confidently.",
            "chunks": retrieved_chunks[:3] if retrieved_chunks else [],  # Return chunks anyway for debugging
            "reasoning": f"Top chunk similarity ({retrieved_chunks[0]['score'] if retrieved_chunks else 0:.2f}) is below threshold ({SIMILARITY_THRESHOLD})"
        }
    
    # Build context from top chunks
    context = "\n\n".join([
        f"[Chunk {i+1}]: {chunk['chunk']}"
        for i, chunk in enumerate(retrieved_chunks[:3])
    ])
    
    # Build prompt
    prompt = f"""Based on the following document excerpts, answer the user's question.

        Context:
        {context}

        Question: {query}

        Instructions:
        - Provide a clear, concise answer based ONLY on the context provided
        - Cite which chunk(s) support your answer (e.g., "According to Chunk 1...")
        - If the context doesn't contain enough information, say so
        - Do not make up information not present in the context

        Answer:
    """
    
    # Call Mistral API
    response = client.chat(
        model="mistral-small-latest",
        messages=[{"role": "user", "content": prompt}]
    )
    
    answer = response.choices[0].message.content
    
    return {
        "answer": answer,
        "chunks": retrieved_chunks[:3],
        "threshold_passed": True
    }

In [68]:
query = "What are the revenues of Acme AI?"

In [69]:
# Intent detection
needs_search = detect_intent(query)
if not needs_search:
    print("No search needed, returning default response.")

# Transform query
processed_query = transform_query(query)

# Hybrid search
retrieved_chunks = hybrid_search(processed_query)

# Generate answer
result = generate_answer(query, retrieved_chunks)

Chunk 1 score: 0.2243
Chunk 2 score: 0.1977
Chunk 3 score: 0.1367


In [70]:
result

{'answer': "According to Chunk 1, Acme AI's revenue in 2025 is $18 million.",
 'chunks': [{'chunk': 'Custom RAG Test Document 1. Company Overview Acme AI is a fictional enterprise software company specializing in AI agent deployment platforms for mid-sized businesses. The company was founded in 2021 and is headquartered in New York. 2. Financial Summary (2025) Revenue: $18 million. Gross Margin: 72%. Net Income: -$2.4 million. Customer Growth Rate: 38% year-over-year. 3. Competitive Landscape Primary competitors include StackAI, Credal, and Glean. Acme AI differentiates itself through rapid deployment cycles and strong enterprise security compliance. 4. Macro',
   'score': 0.2242576753076676,
   'index': 0},
  {'chunk': 'Customer Growth Rate: 38% year-over-year. 3. Competitive Landscape Primary competitors include StackAI, Credal, and Glean. Acme AI differentiates itself through rapid deployment cycles and strong enterprise security compliance. 4. Macro Environment Context In 2025, tig