In [6]:
from typing import List
import requests
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation, util
from PyPDF2 import PdfReader
from langchain_google_genai.llms import GoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain_core.rate_limiters import InMemoryRateLimiter
from langchain_core.output_parsers import CommaSeparatedListOutputParser
from dotenv import load_dotenv
from tqdm.auto import tqdm
from loguru import logger
import os

load_dotenv()


# Set up Google Gemini API Key
# os.environ["GOOGLE_API_KEY"] = "your_gemini_api_key"

# Setup logging
logger.add("finetuning_log.log", rotation="10MB", level="DEBUG")

4

In [5]:
# 1. Download NVIDIA's latest 10-K report
def download_10k(url: str, filename: str) -> None:
    response = requests.get(url, stream=True)
    total_size = int(response.headers.get("content-length", 0))

    with open(filename, "wb") as f, tqdm(
        desc="Downloading 10-K report",
        total=total_size,
        unit="B",
        unit_scale=True,
        unit_divisor=1024,
    ) as bar:
        for chunk in response.iter_content(chunk_size=1024):
            f.write(chunk)
            bar.update(len(chunk))

    logger.success(f"Downloaded 10-K report: {filename}")

url = "https://d18rn0p25nwr6d.cloudfront.net/CIK-0001045810/1cbe8fe7-e08a-46e3-8dcc-b429fc06c1a4.pdf"
filename = "nvidia_10k.pdf"

logger.info("Downloading NVIDIA 10-K report...")
download_10k(url, filename)


[32m2025-02-12 19:52:30.061[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m150[0m - [1mStarting NVIDIA 10-K processing pipeline...[0m
[32m2025-02-12 19:52:30.063[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m152[0m - [1mDownloading NVIDIA 10-K report...[0m
Downloading 10-K report: 100%|██████████| 1.02M/1.02M [00:00<00:00, 7.02MB/s]
[32m2025-02-12 19:52:30.431[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mdownload_10k[0m:[36m23[0m - [32m[1mDownloaded 10-K report: nvidia_10k.pdf[0m
[32m2025-02-12 19:52:30.433[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m155[0m - [1mExtracting text from PDF...[0m
Extracting text from PDF: 100%|██████████| 96/96 [00:03<00:00, 29.64page/s]
[32m2025-02-12 19:52:33.681[0m | [1mINFO    [0m | [36m__main__[0m:[36mextract_text_from_pdf[0m:[36m33[0m - [1mExtracted text from PDF (368809 characters).[0m
[32m2025-02-12 19:52:33.682[0m | [1mINFO    [0m | [36m__main_

In [None]:
# 2. Extract text from the 10-K PDF
def extract_text_from_pdf(filename: str) -> str:
    reader = PdfReader(filename)
    text = ""

    for page in tqdm(reader.pages, desc="Extracting text from PDF", unit="page"):
        text += page.extract_text() + "\n\n"  # Keep paragraph separation

    logger.info(f"Extracted text from PDF ({len(text)} characters).")
    return text

logger.info("Extracting text from PDF...")
text = extract_text_from_pdf(filename)

In [None]:
# 3. Paragraph-Based Chunking
def paragraph_chunking(text: str, min_words: int = 50, max_words: int = 300) -> List[str]:
    """Process paragraphs with progress bar"""
    paragraphs = text.split("\n\n")
    chunks = []
    current_chunk = ""

    for para in tqdm(paragraphs, desc="Processing paragraphs", unit="chunk"):
        words = para.split()
        if len(words) < min_words:
            continue

        if len(current_chunk.split()) + len(words) <= max_words:
            current_chunk += " " + para
        else:
            chunks.append(current_chunk.strip())
            current_chunk = para

    if current_chunk:
        chunks.append(current_chunk.strip())

    logger.info(f"Segmented text into {len(chunks)} paragraph-based sections.")
    return chunks

logger.info("Segmenting report into paragraphs...")
paragraphs = paragraph_chunking(text)

logger.info(f"Generated {len(paragraphs)} paragraph-based sections.")

In [None]:
# 4. Generate Questions Using Gemini via LangChain
def generate_questions_gemini(section_text: str) -> List[str]:
    rate_limiter = InMemoryRateLimiter(requests_per_second=0.25)
    llm = GoogleGenerativeAI(
        model="gemini-2.0-flash",
        api_key=os.getenv("GOOGLE_API_KEY"),
        rate_limiter=rate_limiter
    )
    
    prompt_template = PromptTemplate(
        template="Generate five financial questions based on this financial report section:\n\n{context}",
        input_variables=["context"]
    )

    chain = prompt_template | llm | CommaSeparatedListOutputParser()
    try:
        response = chain.invoke({"context": section_text})  # Limit text length
        return response  # Returns list of generated questions
    except Exception as e:
        logger.error(f"Error generating questions: {e}")
        return []

In [None]:
# 5. Create Q&A Pairs
def generate_question_context_pairs(paragraphs: List[str]) -> List[str]:
    pairs: List[str] = []
    for para in tqdm(paragraphs, desc="Generating Q&A pairs", unit="pair"):
        questions = generate_questions_gemini(para)
        for question in questions:
            pairs.append((question, para))

    logger.info(f"Generated {len(pairs)} Q&A pairs for fine-tuning.")
    return pairs

logger.info("Generating question-context pairs using Gemini...")
pairs = generate_question_context_pairs(paragraphs)
logger.info(f"Generated {len(pairs)} Q&A pairs for fine-tuning.")

In [12]:
# 6. Evaluation: Compare Pre and Post Fine-Tuning Performance
def evaluate_model(model, questions, contexts):
    """
    Evaluates the model by computing cosine similarity between questions and their corresponding contexts.

    Args:
        model: The sentence embedding model.
        questions (List[str]): List of financial questions.
        contexts (List[str]): List of corresponding report contexts.

    Returns:
        float: Average cosine similarity score.
    """
    scores = []

    for question, context in tqdm(zip(questions, contexts), total=len(questions), desc="Evaluating Model", unit="pair"):
        question_embedding = model.encode(question, convert_to_tensor=True)
        context_embedding = model.encode(context, convert_to_tensor=True)
        similarity_score = util.pytorch_cos_sim(question_embedding, context_embedding).item()
        scores.append(similarity_score)

    avg_score = np.mean(scores)
    logger.info(f"Evaluation Complete - Average Cosine Similarity: {avg_score:.4f}")
    
    return avg_score

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Pre-training evaluation
pre_score = evaluate_model(model, [q.texts[0] for q in pairs], [q.texts[1] for q in pairs])
logger.info(f"Pre-Fine-Tuning Cosine Similarity: {pre_score:.4f}")

Evaluating Model: 100%|██████████| 1667/1667 [01:19<00:00, 20.85pair/s]
[32m2025-02-12 20:19:36.190[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate_model[0m:[36m23[0m - [1mEvaluation Complete - Average Cosine Similarity: 0.3449[0m


In [None]:
# 6. Fine-Tune Using Sentence-Transformer's `fit`
def fine_tune_model(model, pairs, epochs=3, batch_size=16, learning_rate=2e-5):
    """
    Fine-tunes the Sentence-Transformer model with MNR loss.
    """
    train_dataloader = DataLoader(pairs, batch_size=batch_size, shuffle=True)
    train_loss = losses.MultipleNegativesRankingLoss(model)

    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=epochs,
        warmup_steps=1000,
        optimizer_params={"lr": learning_rate},
        show_progress_bar=True,
    )

    logger.success("Fine-tuning complete!")
    
# Load the fine-tuned model
model = SentenceTransformer("finetuned_model")

# Post-training evaluation
post_score = evaluate_model(model, [q for q, _ in pairs], [c for _, c in pairs])
logger.info(f"Post-Fine-Tuning Cosine Similarity: {post_score:.4f}")

Evaluating Model: 100%|██████████| 1667/1667 [01:19<00:00, 20.88pair/s]
[32m2025-02-12 21:28:57.778[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate_model[0m:[36m23[0m - [1mEvaluation Complete - Average Cosine Similarity: 0.3449[0m
[32m2025-02-12 21:28:57.781[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mPost-Fine-Tuning Cosine Similarity: 0.3449[0m


In [None]:
# Post-training evaluation
post_score = evaluate_model(model, [q.texts[0] for q in pairs], [q.texts[1] for q in pairs])
logger.info(f"Post-Fine-Tuning Cosine Similarity: {post_score:.4f}")

logger.success("Pipeline execution complete!")