In [16]:
from typing import List
import requests
import numpy as np
import pickle
from torch.utils.data import DataLoader
from sentence_transformers import (
    SentenceTransformer,
    InputExample,
    losses,
    evaluation,
    util,
)
from datasets import Dataset
from PyPDF2 import PdfReader
from langchain_google_genai.llms import GoogleGenerativeAI
from langchain_ollama.llms import OllamaLLM
from langchain.prompts import PromptTemplate
from langchain_core.rate_limiters import InMemoryRateLimiter
from langchain_core.output_parsers import CommaSeparatedListOutputParser
from tqdm.auto import tqdm
from loguru import logger
import os

from dotenv import load_dotenv

load_dotenv()

import warnings

warnings.filterwarnings("ignore")


# Set up Google Gemini API Key | Replace with your own key
# os.environ["GOOGLE_API_KEY"] = "your_gemini_api_key" 

# Setup logging
logger.add("finetuning_log.log", rotation="10MB", level="DEBUG")

2

In [2]:
# 1. Download NVIDIA's latest 10-K report
def download_10k(url: str, filename: str) -> None:
    response = requests.get(url, stream=True)
    total_size = int(response.headers.get("content-length", 0))

    with open(filename, "wb") as f, tqdm(
        desc="Downloading 10-K report",
        total=total_size,
        unit="B",
        unit_scale=True,
        unit_divisor=1024,
    ) as bar:
        for chunk in response.iter_content(chunk_size=1024):
            f.write(chunk)
            bar.update(len(chunk))

    logger.success(f"Downloaded 10-K report: {filename}")

url = "https://d18rn0p25nwr6d.cloudfront.net/CIK-0001045810/1cbe8fe7-e08a-46e3-8dcc-b429fc06c1a4.pdf"
filename = "nvidia_10k.pdf"

logger.info("Downloading NVIDIA 10-K report...")
download_10k(url, filename)


[32m2025-02-15 10:27:13.827[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m22[0m - [1mDownloading NVIDIA 10-K report...[0m
Downloading 10-K report: 100%|██████████| 1.02M/1.02M [00:00<00:00, 4.79MB/s]
[32m2025-02-15 10:27:14.481[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mdownload_10k[0m:[36m17[0m - [32m[1mDownloaded 10-K report: nvidia_10k.pdf[0m


In [3]:
# 2. Extract text from the 10-K PDF
def extract_text_from_pdf(filename: str) -> str:
    reader = PdfReader(filename)
    text = ""

    for page in tqdm(reader.pages, desc="Extracting text from PDF", unit="page"):
        text += page.extract_text() + "\n\n"  # Keep paragraph separation

    logger.info(f"Extracted text from PDF ({len(text)} characters).")
    return text

logger.info("Extracting text from PDF...")
text = extract_text_from_pdf(filename)

[32m2025-02-15 10:27:14.490[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mExtracting text from PDF...[0m
Extracting text from PDF: 100%|██████████| 96/96 [00:03<00:00, 25.00page/s]
[32m2025-02-15 10:27:18.340[0m | [1mINFO    [0m | [36m__main__[0m:[36mextract_text_from_pdf[0m:[36m9[0m - [1mExtracted text from PDF (368809 characters).[0m


In [4]:
# 3. Paragraph-Based Chunking
def paragraph_chunking(text: str, min_words: int = 50, max_words: int = 300) -> List[str]:
    """Process paragraphs with progress bar"""
    paragraphs = text.split("\n\n")
    chunks = []
    current_chunk = ""

    for para in tqdm(paragraphs, desc="Processing paragraphs", unit="chunk"):
        words = para.split()
        if len(words) < min_words:
            continue

        if len(current_chunk.split()) + len(words) <= max_words:
            current_chunk += " " + para
        else:
            chunks.append(current_chunk.strip())
            current_chunk = para

    if current_chunk:
        chunks.append(current_chunk.strip())

    logger.info(f"Segmented text into {len(chunks)} paragraph-based sections.")
    return chunks

logger.info("Segmenting report into paragraphs...")
paragraphs = paragraph_chunking(text)

logger.info(f"Generated {len(paragraphs)} paragraph-based sections.")

[32m2025-02-15 10:27:18.349[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m25[0m - [1mSegmenting report into paragraphs...[0m
Processing paragraphs: 100%|██████████| 97/97 [00:00<00:00, 11805.68chunk/s]
[32m2025-02-15 10:27:18.364[0m | [1mINFO    [0m | [36m__main__[0m:[36mparagraph_chunking[0m:[36m22[0m - [1mSegmented text into 96 paragraph-based sections.[0m
[32m2025-02-15 10:27:18.365[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m28[0m - [1mGenerated 96 paragraph-based sections.[0m


In [19]:
# 4. Generate Questions Using Gemini via LangChain
def generate_questions_gemini(section_text: str) -> List[str]:
    rate_limiter = InMemoryRateLimiter(requests_per_second=0.1)
    # llm = GoogleGenerativeAI(
    #     model="gemini-2.0-flash",
    #     api_key=os.getenv("GOOGLE_API_KEY"),
    #     rate_limiter=rate_limiter
    # )
    llm = OllamaLLM(
        model="deepseek-r1:latest",
        host="http://localhost:11434",
    )
    
    prompt_template = PromptTemplate(
        template="Generate one question based on this financial report section:\n\n{context}",
        input_variables=["context"]
    )

    chain = prompt_template | llm
    try:
        response = chain.invoke({"context": section_text})  # Limit text length
        return response  # Returns list of generated questions
    except Exception as e:
        logger.error(f"Error generating questions: {e}")
        return []

In [20]:
# 5. Create Q&A Pairs
def generate_question_context_pairs(paragraphs: List[str]) -> List[InputExample]:
    pairs: List[InputExample] = []
    for para in tqdm(paragraphs, desc="Generating Q&A pairs", unit="pair"):
        questions = generate_questions_gemini(para)
        for question in questions:
            pairs.append(InputExample(texts=[question, para]))

    logger.info(f"Generated {len(pairs)} Q&A pairs for fine-tuning.")
    return pairs

logger.info("Generating question-context pairs using Gemini...")
pairs = generate_question_context_pairs(paragraphs)
logger.info(f"Generated {len(pairs)} Q&A pairs for fine-tuning.")

[32m2025-02-15 10:39:17.012[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mGenerating question-context pairs using Gemini...[0m
Generating Q&A pairs:   0%|          | 0/96 [00:00<?, ?pair/s][32m2025-02-15 10:39:17.096[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36mgenerate_questions_gemini[0m:[36m24[0m - [31m[1mError generating questions: model 'deepseek-r1:latest' not found (status code: 404)[0m
[32m2025-02-15 10:39:17.179[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36mgenerate_questions_gemini[0m:[36m24[0m - [31m[1mError generating questions: model 'deepseek-r1:latest' not found (status code: 404)[0m
Generating Q&A pairs:   2%|▏         | 2/96 [00:00<00:07, 12.04pair/s][32m2025-02-15 10:39:17.261[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36mgenerate_questions_gemini[0m:[36m24[0m - [31m[1mError generating questions: model 'deepseek-r1:latest' not found (status code: 404)[0m
[32m2025-02-15 10:39:17.344[0m | [

KeyboardInterrupt: 

In [7]:
# Save the Q&A pairs to a pickle file
with open("qa_pairs.pkl", "wb") as f:
    pickle.dump(pairs, f)

In [11]:
# Load Q&A Pairs
with open("qa_pairs.pkl", "rb") as f:
    pairs = pickle.load(f)

In [None]:
# 6. Evaluation: Compare Pre and Post Fine-Tuning Performance
def evaluate_model(model, questions, contexts):
    """
    Evaluates the model by computing cosine similarity between questions and their corresponding contexts.

    Args:
        model: The sentence embedding model.
        questions (List[str]): List of financial questions.
        contexts (List[str]): List of corresponding report contexts.

    Returns:
        float: Average cosine similarity score.
    """
    scores = []

    for question, context in tqdm(zip(questions, contexts), total=len(questions), desc="Evaluating Model", unit="pair"):
        question_embedding = model.encode(question, convert_to_tensor=True)
        context_embedding = model.encode(context, convert_to_tensor=True)
        similarity_score = util.pytorch_cos_sim(question_embedding, context_embedding).item()
        scores.append(similarity_score)

    avg_score = np.mean(scores)
    logger.info(f"Evaluation Complete - Average Cosine Similarity: {avg_score:.4f}")
    
    return avg_score

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Pre-training evaluation
pre_score = evaluate_model(model, [q.texts[0] for q in pairs], [q.texts[1] for q in pairs])
logger.info(f"Pre-Fine-Tuning Cosine Similarity: {pre_score:.4f}")

In [None]:
import torch
from torch.utils.data import DataLoader, SubsetRandomSampler
import numpy as np

# 6. Fine-Tune Using Sentence-Transformer's `fit` method
def fine_tune_model(model: SentenceTransformer, pairs, sample_ratio=0.1, epochs=50, batch_size=16, learning_rate=2e-5):
    """
    Fine-tunes using DataLoader with SubsetRandomSampler for efficient sampling.
    """
    # Create indices and sample subset
    # indices = np.arange(len(pairs))
    # sample_size = int(len(pairs) * sample_ratio)
    # sampler = SubsetRandomSampler(np.random.choice(indices, sample_size, replace=False))
    
    train_dataloader = DataLoader(
        pairs, 
        batch_size=batch_size,
        # sampler=sampler,  # Use sampler instead of shuffle
    )

    train_loss = losses.MultipleNegativesRankingLoss(model)

    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=epochs,
        optimizer_params={
            "lr": learning_rate,
            "eps": 1e-6,
        },
        show_progress_bar=True,
    )

    logger.success("Fine-tuning complete!")
    
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# model = SentenceTransformer("finetuned_model")

# Fine-tune the model
fine_tune_model(model, pairs)

                                                                     

Step,Training Loss


In [30]:
# Save the fine-tuned model
model.save("finetuned_model", safe_serialization=True)

In [5]:
# Post-training evaluation
model = SentenceTransformer("finetuned_model")
post_score = evaluate_model(model, [q.texts[0] for q in pairs], [q.texts[1] for q in pairs])
logger.info(f"Post-Fine-Tuning Cosine Similarity: {post_score:.4f}")

Evaluating Model: 100%|██████████| 1657/1657 [06:07<00:00,  4.50pair/s]
[32m2025-02-15 10:12:22.178[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate_model[0m:[36m23[0m - [1mEvaluation Complete - Average Cosine Similarity: 0.2494[0m
[32m2025-02-15 10:12:22.181[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mPost-Fine-Tuning Cosine Similarity: 0.2494[0m
