In [2]:
#load Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# List files in the 'CAI' folder
import os

folder_path = '/content/drive/My Drive/CAI'

# Check if the folder exists
if os.path.exists(folder_path):
  # List files in the folder
  files = os.listdir(folder_path)
  print("Files in the 'CAI' folder:")
  for file in files:
    print(file)
else:
  print(f"Folder not found at {folder_path}")

Files in the 'CAI' folder:
streamlit_app.py
sandbox.txt
MM-Annual-Report-2022-23_Cash_Flow_Statement.txt
MM-Annual-Report-2022-23.txt
MM-Annual-Report-2022-23_Other.txt
README.md
mahindra_qa_pairs.txt
mahindra_qa_pairs.json
MM-Annual-Report-2022-23_cleaned.txt
MM-Annual-Report-2023-24_cleaned.txt
MM-Annual-Report-2022-23_Balance_Sheet.txt
testing_evaluation.py
Comparative_Financial_QA_Report.pdf
MM-Annual-Report-2023-24_Cash_Flow_Statement.txt
Comparative_Financial_QA_System.ipynb
MM-Annual-Report-2023-24.txt
create_qa_pairs.py
Comparative_Financial_QA_Report.md
clean_and_segment.py
MM-Annual-Report-2023-24_Balance_Sheet.txt
MM-Annual-Report-2022-23.pdf
MM-Annual-Report-2023-24.pdf
fine_tuning_script.py
rag_system.py
MM-Annual-Report-2023-24_Other.txt
financial_qa_system


# Task Fine-tune a gpt model using Retrieval-Augmented Fine-Tuning (RAFT) 

## Load the qa pairs data

In [4]:
import json
import os

file_path = '/content/drive/My Drive/CAI/mahindra_qa_pairs.json'

# Check if the file exists
if os.path.exists(file_path):
    with open(file_path, 'r') as f:
        qa_pairs = json.load(f)
    print(f"Successfully loaded {len(qa_pairs)} QA pairs from {file_path}")
else:
    print(f"File not found at {file_path}")
    qa_pairs = None


Successfully loaded 51 QA pairs from /content/drive/My Drive/CAI/mahindra_qa_pairs.json


## Prepare the data for raft

In [5]:
import glob

def find_relevant_contexts(question, text_files_path):
    """Finds relevant contexts for a question from multiple text files."""
    relevant_contexts = []
    for filepath in glob.glob(os.path.join(text_files_path, '*.txt')):
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()
            # Simple heuristic: check if keywords from the question are in the text
            # A more sophisticated approach would involve vector embeddings or keyword extraction
            keywords = question.lower().split()
            if any(keyword in content.lower() for keyword in keywords if len(keyword) > 2):
                # For simplicity, we'll use the whole file content as context if relevant
                # In a real scenario, you'd extract specific relevant snippets
                relevant_contexts.append(content)
    return relevant_contexts

formatted_data = []
text_files_directory = '/content/drive/My Drive/CAI'

if qa_pairs:
    for item in qa_pairs:
        question = item.get('question')
        answer = item.get('answer')

        if question and answer:
            contexts = find_relevant_contexts(question, text_files_directory)
            formatted_data.append({
                'question': question,
                'answer': answer,
                'context': contexts
            })

print(f"Formatted {len(formatted_data)} training examples for RAFT.")
# Display the first few formatted examples
if formatted_data:
    for i, example in enumerate(formatted_data[:5]):
        print(f"\nExample {i+1}:")
        print(f"Question: {example['question']}")
        print(f"Answer: {example['answer']}")
        print(f"Number of Contexts: {len(example['context'])}")
        

Formatted 51 training examples for RAFT.

Example 1:
Question: What was Mahindra & Mahindra's total income from operations in 2023-24?
Answer: Mahindra & Mahindra's total income from operations in 2023-24 was ₹103,158 crores.
Number of Contexts: 11

Example 2:
Question: What was the PAT (Profit After Tax) for M&M standalone in 2023-24?
Answer: The PAT for M&M standalone in 2023-24 was ₹8,172 crores, representing a 64% increase compared to F23.
Number of Contexts: 12

Example 3:
Question: What was M&M's automotive volume in 2023-24?
Answer: M&M's automotive volume in 2023-24 was 5,88,062 units, representing a 18.1% increase in total automotive volume.
Number of Contexts: 10

Example 4:
Question: What was the tractor volume for Mahindra in 2023-24?
Answer: The tractor volume for Mahindra in 2023-24 was 3,37,818 units (includes domestic sales and exports; includes Mahindra, Swaraj & Trakstar Brands).
Number of Contexts: 12

Example 5:
Question: What is Mahindra's market share in SUVs?
Ans

In [9]:
!pip install faiss-cpu sentence-transformers



In [10]:
import faiss
from sentence_transformers import SentenceTransformer
import numpy as np
import pickle
import glob
import os

# 2. Load the content of all relevant text files
document_texts = []
document_filenames = []
text_files_directory = '/content/drive/My Drive/CAI'

for filepath in glob.glob(os.path.join(text_files_directory, '*.txt')):
    try:
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()
            document_texts.append(content)
            document_filenames.append(os.path.basename(filepath))
    except Exception as e:
        print(f"Error reading file {filepath}: {e}")

print(f"Loaded {len(document_texts)} text documents.")

# 3. Initialize a sentence transformer model
# Using a general-purpose model that works well for various tasks
model = SentenceTransformer('all-MiniLM-L6-v2')

# 4. Generate embeddings for all loaded documents
print("Generating document embeddings...")
document_embeddings = model.encode(document_texts, show_progress_bar=True)
print("Embedding generation complete.")

# 5. Create an index using FAISS
dimension = document_embeddings.shape[1] # Dimension of the embeddings
index = faiss.IndexFlatL2(dimension) # Using L2 distance for similarity search
index.add(np.array(document_embeddings).astype('float32')) # Add embeddings to the index

print(f"Created FAISS index with {index.ntotal} documents.")

# 6. Save the index and the list of documents to disk
index_save_path = '/content/drive/My Drive/CAI/faiss_index.bin'
documents_save_path = '/content/drive/My Drive/CAI/document_data.pkl'

try:
    faiss.write_index(index, index_save_path)
    with open(documents_save_path, 'wb') as f:
        pickle.dump({'texts': document_texts, 'filenames': document_filenames}, f)
    print(f"FAISS index saved to {index_save_path}")
    print(f"Document data saved to {documents_save_path}")
except Exception as e:
    print(f"Error saving index or documents: {e}")

Loaded 12 text documents.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generating document embeddings...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embedding generation complete.
Created FAISS index with 12 documents.
FAISS index saved to /content/drive/My Drive/CAI/faiss_index.bin
Document data saved to /content/drive/My Drive/CAI/document_data.pkl


## Fine-tune the gpt model with raft

In [11]:
!pip install transformers torch datasets



In [12]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import Dataset
import pickle
import faiss
import numpy as np
import os

# Load formatted_data (assuming it's in the environment from the previous step)
if 'formatted_data' not in locals() or not formatted_data:
    print("Formatted data not found. Please run the data preparation step first.")
    # Set formatted_data to an empty list or handle the error as appropriate
    formatted_data = []
else:
    print(f"Using loaded formatted data with {len(formatted_data)} examples.")


# Load the saved FAISS index and document data
index_save_path = '/content/drive/My Drive/CAI/faiss_index.bin'
documents_save_path = '/content/drive/My Drive/CAI/document_data.pkl'

try:
    index = faiss.read_index(index_save_path)
    with open(documents_save_path, 'rb') as f:
        document_data = pickle.load(f)
    document_texts = document_data['texts']
    document_filenames = document_data['filenames']
    print(f"Successfully loaded FAISS index and document data with {len(document_texts)} documents.")
except Exception as e:
    print(f"Error loading FAISS index or document data: {e}")
    index = None
    document_texts = []

# Initialize the base GPT model and tokenizer
model_name = "gpt2" # Using GPT-2 for demonstration
try:
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)
    # Add a padding token if the model doesn't have one
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
        model.resize_token_embeddings(len(tokenizer))

    print(f"Successfully loaded model and tokenizer: {model_name}")
except Exception as e:
    print(f"Error loading model or tokenizer: {e}")
    tokenizer = None
    model = None


# Prepare data for training using a custom dataset
if formatted_data and index and document_texts and tokenizer and model:
    def preprocess_function(examples):
        # This function will process each example in the dataset
        # Retrieve contexts using the FAISS index
        questions = examples['question']
        answers = examples['answer']
        retrieved_contexts = []

        # Assuming we retrieve top k contexts for each question
        k = 3 # Number of contexts to retrieve
        retriever_model = SentenceTransformer('all-MiniLM-L6-v2') # Re-initialize retriever model if not global or passed

        question_embeddings = retriever_model.encode(questions).astype('float32')
        distances, indices = index.search(question_embeddings, k)

        for doc_indices in indices:
            contexts = [document_texts[i] for i in doc_indices]
            retrieved_contexts.append(" ".join(contexts)) # Concatenate retrieved contexts

        # Format the input for the GPT model (RAFT style)
        # Example format: "Question: [Q] Context: [C] Answer: [A]"
        inputs = [f"Question: {q} Context: {c} Answer: {a}" for q, c, a in zip(questions, retrieved_contexts, answers)]

        # Tokenize the inputs and outputs
        model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length", return_tensors="pt")

        # For language modeling, the labels are the input tokens shifted
        model_inputs["labels"] = model_inputs["input_ids"].clone()

        return model_inputs

    # Create a Hugging Face Dataset
    # We need to structure formatted_data as a dictionary for the Dataset.from_dict
    dataset_dict = {'question': [ex['question'] for ex in formatted_data],
                    'answer': [ex['answer'] for ex in formatted_data],
                    'context': [ex['context'] for ex in formatted_data]} # Keep original contexts for reference if needed

    dataset = Dataset.from_dict(dataset_dict)

    # Apply the preprocessing function to the dataset
    # We need to remove the original context column before passing to the model
    tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=['question', 'answer', 'context'])

    print("Tokenized dataset created.")
    print(tokenized_dataset)
else:
    print("Skipping dataset preparation due to missing data or model.")


Using loaded formatted data with 51 examples.
Successfully loaded FAISS index and document data with 12 documents.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Successfully loaded model and tokenizer: gpt2


Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Tokenized dataset created.
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 51
})


In [15]:
# Configure training parameters
training_args = TrainingArguments(
    output_dir="/content/drive/My Drive/CAI/raft_finetuned_gpt2",  # Output directory for checkpoints and results
    num_train_epochs=50,               # Number of training epochs
    per_device_train_batch_size=2,    # Batch size per device during training
    save_steps=100,                   # Save checkpoint every 100 steps
    save_total_limit=2,               # Limit the total number of saved checkpoints
    logging_dir="/content/drive/My Drive/CAI/raft_finetuned_gpt2/logs", # Directory for storing logs
    logging_steps=10,                 # Log every 10 steps
    eval_strategy="no",         # No evaluation during training for simplicity
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

# Train the model
print("Starting model training...")
trainer.train()
print("Training complete.")

# Save the final model
final_model_path = "/content/drive/My Drive/CAI/raft_finetuned_gpt2/final_model"
trainer.save_model(final_model_path)
print(f"Final model saved to {final_model_path}")

  trainer = Trainer(


Starting model training...


Step,Training Loss
10,0.78
20,0.6639
30,0.4102
40,0.5313
50,0.179
60,0.4006
70,0.2285
80,0.2108
90,0.2512
100,0.1112


Training complete.
Final model saved to /content/drive/My Drive/CAI/raft_finetuned_gpt2/final_model
