In [17]:
# Install all required packages
!pip install transformers torch accelerate bitsandbytes peft trl sentence-transformers langchain chromadb rank_bm25 PyMuPDF pandas datasets -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [18]:
import fitz  # PyMuPDF
import re
import os

# Create a directory for data
os.makedirs('data', exist_ok=True)

def extract_text_from_pdf(pdf_path):
    """Extracts raw text from a PDF file."""
    print(f"Extracting text from {pdf_path}...")
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

def clean_text(text):
    """Cleans the text by removing noise and normalizing whitespace."""
    print("Cleaning text...")
    text = re.sub(r'Page \d+ of \d+', '', text) # Remove page numbers
    text = re.sub(r'\s*\n\s*', '\n', text) # Normalize whitespace
    text = re.sub(r'Microsoft Corporation\s+Form 10-K', '', text, flags=re.IGNORECASE) # Remove common headers
    return text.strip()

# --- Main Execution ---
# IMPORTANT: Make sure you have uploaded your PDFs
pdf_path_2023 = 'MSFT_2023_10K.pdf'
pdf_path_2022 = 'MSFT_2022_10K.pdf'

msft_2023_raw = extract_text_from_pdf(pdf_path_2023)
msft_2022_raw = extract_text_from_pdf(pdf_path_2022)

msft_2023_clean = clean_text(msft_2023_raw)
msft_2022_clean = clean_text(msft_2022_raw)

full_text = "--- 2023 Report ---\n" + msft_2023_clean + "\n\n--- 2022 Report ---\n" + msft_2022_clean

processed_text_path = 'data/processed_financials.txt'
with open(processed_text_path, 'w', encoding='utf-8') as f:
    f.write(full_text)

print(f"Data preprocessing complete. Cleaned text saved to '{processed_text_path}'")

Extracting text from MSFT_2023_10K.pdf...
Extracting text from MSFT_2022_10K.pdf...
Cleaning text...
Cleaning text...
Data preprocessing complete. Cleaned text saved to 'data/processed_financials.txt'


In [19]:
import pandas as pd

# Create a dummy CSV for demonstration if you haven't uploaded one
try:
    qa_df = pd.read_csv('qa_dataset.csv')
    print("Q&A Dataset loaded successfully!")
except FileNotFoundError:
    print("Creating a dummy 'qa_dataset.csv'. Please upload your own for real results.")
    dummy_data = {
        'question': ["What was Microsoft's total revenue in fiscal year 2023?", "How much did Microsoft spend on research and development in 2023?"],
        'answer': ["In fiscal year 2023, Microsoft's total revenue was $211.9 billion.", "Microsoft spent $27.2 billion on research and development in 2023."]
    }
    qa_df = pd.DataFrame(dummy_data)
    qa_df.to_csv('qa_dataset.csv', index=False)

print(qa_df.head())

Q&A Dataset loaded successfully!
                                            question  \
0  What was Microsoft's total revenue for the fis...   
1        What was the total cost of revenue in 2023?   
2   What was the gross margin for Microsoft in 2023?   
3  How much did Microsoft spend on Research and D...   
4  What was the total for Sales and marketing exp...   

                                              answer  
0  Microsoft's total revenue for the fiscal year ...  
1  The total cost of revenue in 2023 was $65.9 bi...  
2  Microsoft's gross margin in 2023 was $146.0 bi...  
3  In 2023, Microsoft spent $27.2 billion on Rese...  
4  Sales and marketing expenses totaled $22.7 bil...  


In [20]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import chromadb
from rank_bm25 import BM25Okapi
import numpy as np

with open('data/processed_financials.txt', 'r', encoding='utf-8') as f:
    full_text = f.read()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=50)
chunks = text_splitter.split_text(full_text)
print(f"Created {len(chunks)} text chunks.")

embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Build Dense Vector Store (ChromaDB)
client = chromadb.Client()
collection = client.get_or_create_collection(name="financials_rag_small")
chunk_embeddings = embedding_model.encode(chunks, show_progress_bar=True)
collection.add(
    ids=[str(i) for i in range(len(chunks))],
    embeddings=chunk_embeddings.tolist(),
    documents=chunks
)
print("Dense vector store created.")

# Build Sparse Index (BM25)
tokenized_chunks = [chunk.lower().split() for chunk in chunks]
bm25 = BM25Okapi(tokenized_chunks)
print("Sparse index (BM25) created.")

Created 2174 text chunks.


Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


Batches:   0%|          | 0/68 [00:00<?, ?it/s]

Insert of existing embedding ID: 0
Insert of existing embedding ID: 1
Insert of existing embedding ID: 2
Insert of existing embedding ID: 3
Insert of existing embedding ID: 4
Insert of existing embedding ID: 5
Insert of existing embedding ID: 6
Insert of existing embedding ID: 7
Insert of existing embedding ID: 8
Insert of existing embedding ID: 9
Insert of existing embedding ID: 10
Insert of existing embedding ID: 11
Insert of existing embedding ID: 12
Insert of existing embedding ID: 13
Insert of existing embedding ID: 14
Insert of existing embedding ID: 15
Insert of existing embedding ID: 16
Insert of existing embedding ID: 17
Insert of existing embedding ID: 18
Insert of existing embedding ID: 19
Insert of existing embedding ID: 20
Insert of existing embedding ID: 21
Insert of existing embedding ID: 22
Insert of existing embedding ID: 23
Insert of existing embedding ID: 24
Insert of existing embedding ID: 25
Insert of existing embedding ID: 26
Insert of existing embedding ID: 27
In

Dense vector store created.
Sparse index (BM25) created.


In [21]:
def hybrid_retrieval(query, top_k=5):
    """Performs hybrid retrieval using BM25 and vector search, fused with RRF."""
    processed_query = query.lower()

    # Dense Retrieval
    query_embedding = embedding_model.encode(processed_query).tolist()
    dense_results = collection.query(query_embeddings=[query_embedding], n_results=top_k)
    dense_docs = dense_results['documents'][0]

    # Sparse Retrieval
    tokenized_query = processed_query.split()
    bm25_scores = bm25.get_scores(tokenized_query)
    top_n_indices = np.argsort(bm25_scores)[::-1][:top_k]
    sparse_docs = [chunks[i] for i in top_n_indices]

    # RRF Fusion
    fused_scores = {}
    k = 60
    for i, doc in enumerate(dense_docs):
        fused_scores[doc] = fused_scores.get(doc, 0) + 1 / (k + i + 1)
    for i, doc in enumerate(sparse_docs):
        fused_scores[doc] = fused_scores.get(doc, 0) + 1 / (k + i + 1)

    reranked_results = sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)
    return [doc for doc, score in reranked_results][:top_k]

# Test the retriever
test_query = "What was the revenue in 2023?"
retrieved_context = hybrid_retrieval(test_query)
print(f"--- Retrieved Context for '{test_query}' ---")
# Fix: Move slicing outside the f-string
context_snippet = '\n\n'.join(retrieved_context)[:1000]
print(f"{context_snippet}...")

--- Retrieved Context for 'What was the revenue in 2023?' ---
in future periods, was $229 billion as of June 30, 2023, of which $224 billion is related to the commercial portion of revenue. We expect to recognize
approximately 45% of this revenue over the next 12 months and the remainder thereafter.
NOTE 14 — LEASES

determining revenue recognition for these customer agreements was extensive and required a high degree of auditor judgment.
How the Critical Audit Matter Was Addressed in the Audit
Our principal audit procedures related to the Company's revenue recognition for these customer agreements included the following:
•

10-K) for further discussion.
The following table outlines the expected future recognition of unearned revenue as of June 30, 2023:
(In millions)
Three Months Ending
September 30, 2023
$
19,673
December 31, 2023
15,600
March 31, 2024
10,801
June 30, 2024
4,827
Thereafter
2,912
Total
$
53,813

recognition for these customer agreements was extensive and required a hi

In [22]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import time

# Load the small model and tokenizer
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Add a pad token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32, # Use standard precision
    device_map="auto",
)
print("Base model DistilGPT-2 loaded successfully.")

Base model DistilGPT-2 loaded successfully.


In [32]:
def generate_answer(query, context, model_to_use):
    """Generates an answer using a model, context, and query."""
    # Simpler prompt for a base model
    prompt = f"""
    Context:
    {context}

    Question:
    {query}

    Answer:
    """
    inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True).to("cpu") # Change to "cpu"

    start_time = time.time()
    outputs = model_to_use.generate(
        **inputs,
        max_new_tokens=100,
        temperature=0.1,
        pad_token_id=tokenizer.pad_token_id,
        top_k=10
    )
    end_time = time.time()

    # Decode only the newly generated tokens
    answer = tokenizer.decode(outputs[0][inputs['input_ids'].shape[-1]:], skip_special_tokens=True).strip()
    response_time = end_time - start_time
    return answer, response_time

def rag_output_guardrail(answer, context):
    """Flags answers that are non-committal or don't seem to use the context."""
    low_confidence_phrases = ["not mentioned", "not available", "i cannot answer"]
    is_confident = not any(phrase in answer.lower() for phrase in low_confidence_phrases)
 
    # Simple check: Does the answer contain keywords from the context?
    # This is a heuristic and can be improved.
    context_keywords = set(word.lower() for word in re.findall(r'\b\w+\b', context) if len(word) > 2)
    answer_keywords = set(word.lower() for word in re.findall(r'\b\w+\b', answer) if len(word) > 2)
 
    # Calculate the percentage of answer keywords present in the context keywords
    overlap = len(answer_keywords.intersection(context_keywords))
    answer_length = len(answer_keywords)
    keyword_overlap_ratio = overlap / answer_length if answer_length > 0 else 0
 
    # Combine the checks for a confidence score (heuristic)
    # Assign higher confidence if the answer seems to use the context and is not a low-confidence phrase
    if is_confident and keyword_overlap_ratio > 0.1: # Threshold can be adjusted
        confidence_score = 0.7 + (keyword_overlap_ratio * 0.3) # Scale keyword overlap into confidence
        confidence_score = min(confidence_score, 1.0) # Cap confidence at 1.0
    else:
        confidence_score = 0.3 + (keyword_overlap_ratio * 0.1) # Lower base confidence if not confident or low overlap
 
    return answer, confidence_score
# --- Full RAG Pipeline Test ---
def answer_query_rag(query):
    retrieved_chunks = hybrid_retrieval(query)
    context = "\n\n".join(retrieved_chunks)
    answer, response_time = generate_answer(query, context, base_model)
    final_answer, confidence = rag_output_guardrail(answer, context)
    return final_answer, confidence, response_time

# Example run
test_query = "What was Microsoft's revenue in fiscal year 2023?"
answer, conf, r_time = answer_query_rag(test_query)
print(f"RAG Answer: {answer}")
print(f"Confidence: {conf}")
print(f"Response Time: {r_time:.2f}s")



RAG Answer: 
Confidence: 0.3
Response Time: 13.72s


In [33]:
from datasets import Dataset

def create_raft_dataset(qa_df, chunks):
    raft_data = []
    print("Creating RAFT dataset...")
    for index, row in qa_df.iterrows():
        question, answer = row['question'], row['answer']
        context = "\n\n".join(hybrid_retrieval(question, top_k=1))
        if context:
            # Format for fine-tuning with the simpler prompt
            text = f"""
            Context:
            {context}

            Question:
            {question}

            Answer:
            {answer}
            """
            raft_data.append({"text": text})
    return Dataset.from_list(raft_data)

raft_dataset = create_raft_dataset(qa_df, chunks)
print("RAFT dataset created successfully.")
print(raft_dataset[0]['text'])

Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given


Creating RAFT dataset...


Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given


RAFT dataset created successfully.

            Context:
            Highlights from fiscal year 2023 compared with fiscal year 2022 included:
•Microsoft Cloud revenue increased 22% to $111.6 billion.
•Office Commercial products and cloud services revenue increased 10% driven by Office 365 Commercial growth of 13%.
•Office Consumer products and cloud services revenue increased 2% and Microsoft 365 Consumer subscribers increased to 67.0 million.

            Question:
            What was Microsoft's total revenue for the fiscal year ended June 30, 2023?

            Answer:
            Microsoft's total revenue for the fiscal year ended June 30, 2023, was $211.9 billion.
            


In [25]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [29]:
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer
from transformers import TrainingArguments, AutoTokenizer

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token

# LoRA config
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["c_attn"],
    bias="none",
    task_type="CAUSAL_LM",
)

ft_model = get_peft_model(base_model, lora_config)
ft_model.print_trainable_parameters()

training_args = TrainingArguments(
    output_dir="./ft_model_distilgpt2",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=5e-5,
    num_train_epochs=10,
    logging_steps=5,
    fp16=False,
    save_strategy="epoch",
    report_to="none",
)


trainer = SFTTrainer(
    model=ft_model,
    train_dataset=raft_dataset,
    args=training_args,
    dataset_text_field="text", 
    tokenizer=tokenizer,
)

print("--- Starting Fine-Tuning ---")
trainer.train()
print("--- Fine-Tuning Complete ---")

trainer.save_model("./ft_model_distilgpt2_adapters")


trainable params: 147,456 || all params: 82,060,032 || trainable%: 0.17969283755580304


Map:   0%|          | 0/101 [00:00<?, ? examples/s]

--- Starting Fine-Tuning ---


  0%|          | 0/130 [00:00<?, ?it/s]

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 2.6452, 'learning_rate': 4.8076923076923084e-05, 'epoch': 0.38}
{'loss': 2.569, 'learning_rate': 4.615384615384616e-05, 'epoch': 0.77}
{'loss': 2.6736, 'learning_rate': 4.423076923076923e-05, 'epoch': 1.15}
{'loss': 2.6102, 'learning_rate': 4.230769230769231e-05, 'epoch': 1.54}
{'loss': 2.5334, 'learning_rate': 4.038461538461539e-05, 'epoch': 1.92}
{'loss': 2.584, 'learning_rate': 3.846153846153846e-05, 'epoch': 2.31}
{'loss': 2.5516, 'learning_rate': 3.653846153846154e-05, 'epoch': 2.69}
{'loss': 2.5637, 'learning_rate': 3.461538461538462e-05, 'epoch': 3.08}
{'loss': 2.5717, 'learning_rate': 3.269230769230769e-05, 'epoch': 3.46}
{'loss': 2.6008, 'learning_rate': 3.0769230769230774e-05, 'epoch': 3.85}
{'loss': 2.4268, 'learning_rate': 2.8846153846153845e-05, 'epoch': 4.23}
{'loss': 2.5148, 'learning_rate': 2.6923076923076923e-05, 'epoch': 4.62}
{'loss': 2.5385, 'learning_rate': 2.5e-05, 'epoch': 5.0}
{'loss': 2.4987, 'learning_rate': 2.307692307692308e-05, 'epoch': 5.38}
{'los

In [30]:
from peft import PeftModel

print("Loading fine-tuned model...")
# Reload the base model to avoid memory issues
base_model_for_ft = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
ft_model_merged = PeftModel.from_pretrained(base_model_for_ft, "./ft_model_distilgpt2_adapters")
ft_model_merged = ft_model_merged.merge_and_unload()
print("Fine-tuned model loaded and merged.")

def answer_query_ft(query):
    context = "\n\n".join(hybrid_retrieval(query, top_k=1))
    answer, response_time = generate_answer(query, context, ft_model_merged)
    final_answer, confidence = rag_output_guardrail(answer)
    return final_answer, confidence, response_time

# --- Run your evaluation questions ---
evaluation_questions = [
    "What was Microsoft's revenue in 2023?",
    "What are the primary strategic risks related to AI development?",
    "What is the capital of France?",
    "How much cash was used in financing activities in 2023?",
    "Compare the net income of 2023 and 2022.",
]

results = []
print("\n--- Starting Evaluation ---")
for q in evaluation_questions:
    print(f"Processing question: {q}")
    rag_ans, rag_conf, rag_time = answer_query_rag(q)
    results.append(["RAG (DistilGPT2)", q, rag_ans, rag_conf, rag_time])

    ft_ans, ft_conf, ft_time = answer_query_ft(q)
    results.append(["Fine-Tune (DistilGPT2)", q, ft_ans, ft_conf, ft_time])

results_df = pd.DataFrame(results, columns=["Method", "Question", "Answer", "Confidence", "Time (s)"])
print("\n--- Evaluation Results ---")
print(results_df)

results_df.to_csv("evaluation_results_small_model.csv", index=False)

Loading fine-tuned model...




Fine-tuned model loaded and merged.

--- Starting Evaluation ---
Processing question: What was Microsoft's revenue in 2023?




Processing question: What are the primary strategic risks related to AI development?
Processing question: What is the capital of France?
Processing question: How much cash was used in financing activities in 2023?
Processing question: Compare the net income of 2023 and 2022.

--- Evaluation Results ---
                   Method                                           Question  \
0        RAG (DistilGPT2)              What was Microsoft's revenue in 2023?   
1  Fine-Tune (DistilGPT2)              What was Microsoft's revenue in 2023?   
2        RAG (DistilGPT2)  What are the primary strategic risks related t...   
3  Fine-Tune (DistilGPT2)  What are the primary strategic risks related t...   
4        RAG (DistilGPT2)                     What is the capital of France?   
5  Fine-Tune (DistilGPT2)                     What is the capital of France?   
6        RAG (DistilGPT2)  How much cash was used in financing activities...   
7  Fine-Tune (DistilGPT2)  How much cash was used in fin

In [31]:
# Save models for Streamlit app
import os
import pickle
import json

print("🔄 Saving models for Streamlit app...")

# Create directories
os.makedirs('models', exist_ok=True)
os.makedirs('models/rag', exist_ok=True)
os.makedirs('models/fine_tuned', exist_ok=True)

# 1. Save RAG components
print("💾 Saving RAG components...")

# Save embedding model
embedding_model.save('models/rag/embedding_model')
print("✅ Embedding model saved")

# Save text chunks
with open('models/rag/chunks.pkl', 'wb') as f:
    pickle.dump(chunks, f)
print("✅ Text chunks saved")

# Save BM25 index
with open('models/rag/bm25_index.pkl', 'wb') as f:
    pickle.dump(bm25, f)
print("✅ BM25 index saved")

# Save base model
base_model_path = 'models/rag/base_model'
os.makedirs(base_model_path, exist_ok=True)
base_model.save_pretrained(base_model_path)
tokenizer.save_pretrained(base_model_path)
print("✅ Base model saved")

# 2. Save Fine-tuned model
print("💾 Saving fine-tuned model...")

if 'ft_model_merged' in locals():
    ft_model_merged.save_pretrained('models/fine_tuned/merged_model')
    tokenizer.save_pretrained('models/fine_tuned/merged_model')
    print("✅ Fine-tuned model saved")
else:
    print("⚠️  Fine-tuned model not available")

# 3. Create model configuration
model_config = {
    "rag": {
        "embedding_model": "all-MiniLM-L6-v2",
        "base_model": "distilgpt2",
        "chunks_file": "chunks.pkl",
        "bm25_file": "bm25_index.pkl",
        "embedding_model_path": "embedding_model",
        "base_model_path": "base_model"
    },
    "fine_tuned": {
        "model_path": "merged_model",
        "base_model": "distilgpt2",
        "training_method": "LoRA"
    },
    "data": {
        "processed_text": "data/processed_financials.txt",
        "qa_dataset": "qa_dataset.csv"
    }
}

with open('models/model_config.json', 'w') as f:
    json.dump(model_config, f, indent=2)
print("✅ Model configuration saved")

print("\n🎉 All models saved successfully for Streamlit app!")
print("📁 Check the 'models/' directory for all saved components.")
print("🚀 Your Streamlit app is now ready to use these models!")

🔄 Saving models for Streamlit app...
💾 Saving RAG components...
✅ Embedding model saved
✅ Text chunks saved
✅ BM25 index saved
✅ Base model saved
💾 Saving fine-tuned model...
✅ Fine-tuned model saved
✅ Model configuration saved

🎉 All models saved successfully for Streamlit app!
📁 Check the 'models/' directory for all saved components.
🚀 Your Streamlit app is now ready to use these models!
