### 1. Import Dependencies

In [1]:
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_openai import ChatOpenAI
from langchain_groq import ChatGroq
from dotenv import load_dotenv
import json
import os


In [2]:
load_dotenv()

# Configure Tesseract from .env. TESSERACT_PATH is read as the path of the
# tesseract executable; when it is unset, nothing below is changed.
import pytesseract

tess_path = os.getenv("TESSERACT_PATH")
if tess_path:
    # 1. Set path for pytesseract library
    pytesseract.pytesseract.tesseract_cmd = tess_path

    # 2. Add Tesseract directory to system PATH for unstructured's subprocess calls
    tess_dir = os.path.dirname(tess_path)
    # PATH may be absent in stripped-down environments; treat that as empty
    # instead of raising KeyError.
    current_path = os.environ.get("PATH", "")
    # Compare against whole PATH entries. A plain substring test would treat
    # "C:\Tools\Tesseract-OCR" as present when only "C:\Tools\Tesseract-OCR-old"
    # is on PATH.
    if tess_dir and tess_dir not in current_path.split(os.pathsep):
        os.environ["PATH"] = (
            tess_dir + os.pathsep + current_path if current_path else tess_dir
        )
    print(f"Tesseract path injected: {tess_dir}")


Tesseract path injected: C:\Program Files\Tesseract-OCR


### 2. Data Ingestion

In [6]:
# Location of the source PDF, relative to the notebook's working directory.
file_path = "../data/report.pdf"


In [7]:
# Configure the PDF loader for the report at `file_path`.
loader = UnstructuredPDFLoader(
    file_path,
    strategy="hi_res",  # identifies tables and page layout
    infer_table_structure=True, # extracts the data held inside tables
)


In [8]:
# Run the loader and keep the resulting list of documents.
docs = loader.load()




The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


In [9]:
# Sanity check on the first document: total length plus its first and last 200 characters.
print(
    f"Characters: {len(docs[0].page_content)}",
    f"Start: {docs[0].page_content[:200]}",
    f"End: {docs[0].page_content[-200:]}",
    sep="\n",
)


Characters: 626032
Start: On Our Way  2024 ANNUAL REPORT 

On Our Way

Uber’s Mission We reimagine the way the world moves for the better

Uber’s Mission

We reimagine the way the world moves for the better We are Uber. The go
End:  94158

PricewaterhouseCoopers LLP

Independent Public Registered

Accounting Firm

PricewaterhouseCoopers LLP

f

2021 Proxy statement

Uber

1725 3rd Street San Francisco, California 94158

uber.com


### 3. Text Chunking

In [10]:
# Character-based splitter: lengths are measured with `len`, i.e. in characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,  # target maximum length of a chunk
    chunk_overlap = 150,  # length shared between neighbouring chunks
    length_function = len,
    separators=["\n\n", "\n", " ", ""]  # coarsest to finest: paragraph, line, word, character
)


In [11]:
# Split the loaded documents into chunks with the splitter configured above.
chunks = text_splitter.split_documents(docs)

#### 3.1 Saving & Loading chunks

In [12]:
chunks_json_path = '../artifacts/data_factory/chunks.json'

# Reduce every chunk to plain text plus metadata so the json module can write it.
serializable_chunks = [
    {
        "page_content": doc.page_content,
        "metadata": doc.metadata
    }
    for doc in chunks
]

# open() does not create missing parent directories, so make sure the
# artifacts folder exists before writing (first run on a fresh checkout).
os.makedirs(os.path.dirname(chunks_json_path), exist_ok=True)

with open(chunks_json_path, "w", encoding="utf-8") as f:
    json.dump(serializable_chunks, f, indent=4)

print("Chunks Saved Sucessfully !")

Chunks Saved Sucessfully !


In [13]:
# Prefer the chunks cached on disk; otherwise re-split the loaded documents.
# Both branches leave `chunks` as a list of {"page_content", "metadata"} dicts,
# because the generation loop further down reads chunk['page_content'].
if os.path.exists(chunks_json_path):
    with open(chunks_json_path, 'r', encoding='utf-8') as f:
        chunks = json.load(f)
    print("✅ Chunks loaded from JSON file.")
else:
    # Convert the split documents to the same dict shape the JSON cache holds,
    # so downstream code does not depend on which branch ran.
    chunks = [
        {"page_content": doc.page_content, "metadata": doc.metadata}
        for doc in text_splitter.split_documents(docs)
    ]
    print("✅ Documents split into new chunks.")

✅ Chunks loaded from JSON file.


### 4. Question & Answer generator

In [14]:
# One chat-model client per provider, each pinned to a specific model name.
# NOTE(review): no API keys are passed here; presumably each client reads its key
# from the environment populated by load_dotenv() — confirm the .env defines them.
llm_gemini = ChatGoogleGenerativeAI(model="gemini-2.5-flash")
llm_openai = ChatOpenAI(model="gpt-4o-mini")
llm_groq = ChatGroq(model="llama-3.1-8b-instant")


#### 4.1 Prompts, chains & generation loop

In [15]:
# Prompt for the question-generation step. The system message fixes the rules
# (10 numbered questions, answerable from the context only); the user message
# carries the `{context}` placeholder that is filled in when the chain is invoked.
question_gen_prompt = ChatPromptTemplate.from_messages([
    ("system", """You are a Lead AI Architect. Your task is to generate 10 high-quality questions based ONLY on the provided context.

    RULES:
    1. Every question MUST be answerable using ONLY the provided text.
    2. Do NOT use outside knowledge (like general financial stats for Uber).
    3. If the context is narrative, ask about goals, tone, or mission.
    4. If the context has data, ask about specific numbers and percentages.
    5. No intro/outro. Just 10 numbered questions."""),
    ("user", "Context: {context}")
])


In [16]:
# Prompt for the answer-generation step. It takes two variables, `{context}` and
# `{questions}`, and asks for "Q:" / "A:" formatted pairs; the parsing cell later
# in the notebook splits the model output on those same markers.
answer_gen_prompt = ChatPromptTemplate.from_messages([
    ("system", """You are a Senior Financial Analyst. 
    Your task is to answer the provided questions using ONLY the given context.
    
    You must return the output as a list of Question-Answer pairs.
    Format each pair exactly like this:
    Q: [The original question]
    A: [The concise answer based on context]

    Rules:
    1. Do not use outside knowledge.
    2. No citations or tags.
    3. If the answer is not in the context, write "Information not available".
    4. Provide exactly 10 pairs."""),
    ("user", "Context: {context}\n\nQuestions: {questions}")
])


In [17]:
# Compose prompt -> model -> string-output pipelines, one per role/provider.
output_parser = StrOutputParser()
chain_openAI = question_gen_prompt | llm_openai | output_parser  # question generation
chain_gemini = answer_gen_prompt | llm_gemini | output_parser  # answer generation via Gemini
chain_groq = answer_gen_prompt | llm_groq | output_parser  # answer generation via Groq


In [None]:
output_file = "../artifacts/data_factory/uber_dataset.json"
batch_size = 50        # number of chunks processed per run of this cell
checkpoint_every = 10  # write the dataset to disk every N processed chunks


def _save_dataset(records, path):
    """Write `records` to `path` as indented JSON, creating the parent folder if needed."""
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=4)


# Resume support: the dataset on disk holds one record per processed chunk, in
# order, so its length is the index of the next chunk to process.
if os.path.exists(output_file):
    with open(output_file, "r", encoding="utf-8") as f:
        dataset = json.load(f)
else:
    dataset = []


start_chunk = len(dataset)
# Clamp to the number of chunks so the reported range matches what is processed.
end_chunk = min(start_chunk + batch_size, len(chunks))

print(f" From {start_chunk} to {end_chunk} processing ...")

for current_idx in range(start_chunk, end_chunk):
    chunk = chunks[current_idx]
    try:
        # Send only the chunk text to the models. Passing the whole dict would
        # put its repr (metadata included) in the prompt, and the prompt would
        # no longer match the "context" stored in the dataset.
        context = chunk['page_content']
        questions_text = chain_openAI.invoke({"context": context})
        qa_output = chain_groq.invoke({"context": context, "questions": questions_text})

        dataset.append({
            "chunk_id": current_idx,
            "context": context,
            "qa_pairs": qa_output
        })

        if (current_idx + 1) % checkpoint_every == 0:
            _save_dataset(dataset, output_file)
            print(f"Saved checkpoint at chunk {current_idx + 1}")

    except Exception as e:
        # Deliberate best-effort: stop at the first failure so the dataset stays
        # contiguous and the resume index above remains valid on the next run.
        print(f"Error on chunk {current_idx}: {e}")
        break


# Final flush covers the chunks processed since the last checkpoint.
_save_dataset(dataset, output_file)

 From 560 to 610 processing ...
Saved checkpoint at chunk 570


### 5. Loading Q & A pairs and Splitting

In [2]:
import json
import re
import random

output_file = "../artifacts/data_factory/uber_dataset.json"
shuffle_seed = 42  # fixed so the shuffle, and the split derived from it, is reproducible

# "Q:" / "A:" count as markers only when they are not the tail of a longer
# token such as "FAQ:", "10-Q:", "USA:" or "SG&A:".
_question_marker = re.compile(r'\n?\s?(?<![\w&-])Q:')
_answer_marker = re.compile(r'(?<![\w&-])A:')


def _parse_qa_pairs(qa_text):
    """Split one raw model response into a list of (question, answer) tuples.

    Segments without an answer marker (e.g. an introductory line) and pairs
    whose question or answer is empty after stripping are dropped.
    """
    parsed = []
    for segment in _question_marker.split(qa_text):
        # Split on the first answer marker only; later "A:" text stays in the answer.
        halves = _answer_marker.split(segment, maxsplit=1)
        if len(halves) != 2:
            continue
        question, answer = (half.strip() for half in halves)
        if question and answer:
            parsed.append((question, answer))
    return parsed


# Load raw data
with open(output_file, "r", encoding="utf-8") as f:
    raw_data = json.load(f)

# One flat record per question/answer pair, each carrying its source context.
expanded_dataset = [
    {"context": item["context"], "question": question, "answer": answer}
    for item in raw_data
    for question, answer in _parse_qa_pairs(item["qa_pairs"])
]

# Shuffle the final expanded dataset with a dedicated, seeded generator so the
# result is the same on every run and the global random state is left untouched.
# NOTE(review): shuffling at pair level means pairs from the same chunk can land
# on both sides of the later 80/20 split — confirm that overlap is acceptable
# for the test set, or split by chunk instead.
random.Random(shuffle_seed).shuffle(expanded_dataset)

In [29]:
# Partition the shuffled pairs: the first 80% for training, the remainder for testing.
split_point = int(len(expanded_dataset) * 0.8)
train_set, test_set = expanded_dataset[:split_point], expanded_dataset[split_point:]


def save_jsonl(data, filename):
    """Write `data` to `filename` in JSON Lines format (one JSON document per line).

    Args:
        data: Iterable of JSON-serialisable entries.
        filename: Destination path. Missing parent directories are created;
            an existing file is overwritten.
    """
    import os  # local import: this cell may run in a kernel where `os` was not imported

    # open() does not create parent folders, so create them up front.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(entry) + '\n' for entry in data)

# Persist both partitions as JSON Lines files.
for subset, destination in (
    (train_set, "../artifacts/train/train.jsonl"),
    (test_set, "../artifacts/test/golden_test_set.jsonl"),
):
    save_jsonl(subset, destination)

In [None]:

# with open("readable_train.json", "w", encoding="utf-8") as f:
#     json.dump(train_set, f, indent=4, ensure_ascii=False)