# Main Script

## Setup

In [None]:
!pip install langchain
!pip install langchain-community
!pip install faiss-gpu
!pip install peft
!pip install transformers
!pip install ragas
!pip install llama_index
!pip install langchain-aws
!pip install bitsandbytes
!pip install trl



In [4]:
import boto3
import yaml
import boto3
import src
from src.document_processor.loader import DocumentLoader
from src.document_processor.chunker import DocumentChunker
from src.document_processor.cleaner import TextCleaner
from src.embeddings.embedding_manager import EmbeddingManager
from src.question_generation.generator import EnhancedQuestionGenerator
import logging
import pandas as pd
import torch
import ragas
from ragas.llms import LangchainLLMWrapper
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer
from ragas.testset import TestsetGenerator
from langchain_openai import ChatOpenAI
from llama_index.core import SimpleDirectoryReader
import gc
import pandas as pd
from datasets import Dataset
import torch
import gc
import pandas as pd
from transformers import pipeline
# Import necessary modules
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import Faithfulness, FactualCorrectness, SemanticSimilarity
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_aws import ChatBedrockConverse
from langchain_aws import BedrockEmbeddings
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns





# Initialize a session using the default profile or environment credentials.
# NOTE(review): `session` is not referenced again in the cells visible here; the
# Bedrock client is created with `boto3.client(...)` directly. Confirm it is still needed.
session = boto3.Session()

Matplotlib is building the font cache; this may take a moment.


In [None]:
# Read the project configuration (YAML) that drives every component below.
with open('/home/ec2-user/SageMaker/Fine_Tune_LLMs/Big/config/config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# Configure logging from the `logging` section of the config.
logging_cfg = config['logging']
logging.basicConfig(level=getattr(logging, logging_cfg['level']), format=logging_cfg['format'])
logger = logging.getLogger(__name__)

# Bedrock runtime client shared by the embedding manager and the question generator.
bedrock_client = boto3.client('bedrock-runtime')

# Document-processing components: loader, chunker, cleaner.
loader = DocumentLoader()

chunking_cfg = config['document_processing']
chunker = DocumentChunker(
    chunk_size=chunking_cfg['chunk_size'],
    chunk_overlap=chunking_cfg['chunk_overlap'],
)

cleaner = TextCleaner()

# Embedding manager backed by the Bedrock client.
embedding_manager = EmbeddingManager(bedrock_client, model_id=config['embedding']['model_id'])

# Question generator; it reuses the same client and the embedding manager.
question_cfg = config['question_generation']
generator = EnhancedQuestionGenerator(
    llm_client=bedrock_client,
    model_id=question_cfg['model_id'],
    embedding_manager=embedding_manager,
    max_tokens=question_cfg['max_tokens'],
    temperature=question_cfg['temperature'],
)

In [None]:
# Load the source PDF through the project loader.
source_pdf = "/home/ec2-user/SageMaker/Fine_Tune_LLMs/Big/data/NM_changed.pdf"
documents = loader.load_document(source_pdf)
logger.info("Documents loaded")

# Clean each document's text in place, collecting the (same) objects in order.
cleaned_documents = []
for document in documents:
    cleaned_text = cleaner.clean_text(document.page_content)
    document.page_content = cleaned_text
    cleaned_documents.append(document)
logger.info("Documents cleaned")

# Split the cleaned documents into chunks.
chunks = chunker.chunk_documents(cleaned_documents)
chunk_count = len(chunks)
logger.info(f"Documents chunked into {chunk_count} chunks")

# Build embeddings for every chunk.
embedding_manager.create_embeddings(chunks)
logger.info("Embeddings created")

In [None]:
# Save the embeddings that were created in the previous cell.
# (Re-creating them here is disabled; the line is kept for reference.)
#embedding_manager.create_embeddings(documents)
# NOTE(review): the two arguments look like an index location and a metadata file —
# confirm against EmbeddingManager.save_embeddings.
embedding_manager.save_embeddings("faiss_index", "metadata.txt")

# Optional: load previously saved embeddings instead of rebuilding them (disabled).
#embedding_manager.load_embeddings("faiss_index")

# Optional: wipe the existing database (disabled).
#embedding_manager.wipe_embeddings("faiss_index", "metadata.txt")

In [None]:
# Ask the generator for question/answer pairs built from the document chunks.
questions = generator.generate_questions_from_docs(chunks, num_questions=100)
logger.info(f"{len(questions)} question-answer pairs generated")

# Flatten each generated pair into one record per row.
qa_data = []
for pair in questions:
    qa_data.append({
        "Question": pair.question,
        "Answer": pair.answer,
        "Context": pair.context,
    })

# Tabular view of the generated pairs.
qa_df = pd.DataFrame(qa_data)

In [None]:
# Preview the first rows of the generated question/answer/context table.
qa_df.head()

In [None]:
# Persist the generated pairs (without the index column) so later cells can reload them from disk.
qa_df.to_csv("qa_data.csv", index=False)

# Fine-tuning

## Ministral-3B-instruct

I'm going to test this model on very specific questions from the Nuclear Medicine manual and see how well it responds first.

In [None]:
# Memory management class
class MemoryTracker:
    """Record CUDA memory usage at named checkpoints and release cached memory.

    Attributes are documented inline; sizes are reported in MiB (bytes / 1024**2).
    """

    def __init__(self):
        # One dict per `log_memory` call, in call order.
        self.records = []

    def log_memory(self, checkpoint):
        """Snapshot current CUDA memory usage and remember it.

        Args:
            checkpoint: Label stored with the snapshot.

        Returns:
            dict with keys 'checkpoint', 'allocated' and 'reserved'; the two
            memory figures come from `torch.cuda.memory_allocated()` and
            `torch.cuda.memory_reserved()`, converted to MiB. The same dict is
            appended to `self.records`.
        """
        bytes_per_mib = 1024**2
        memory_stats = {
            'checkpoint': checkpoint,
            'allocated': torch.cuda.memory_allocated() / bytes_per_mib,
            'reserved': torch.cuda.memory_reserved() / bytes_per_mib
        }
        self.records.append(memory_stats)
        return memory_stats

    def clear_memory(self):
        """Free unreachable Python objects, then release cached CUDA memory.

        The garbage collector runs first: blocks owned by objects that are only
        freed by the collector can then be returned by `empty_cache()` in the
        same call instead of staying cached.
        """
        gc.collect()
        torch.cuda.empty_cache()

memory_tracker = MemoryTracker()

In [14]:
# Run the garbage collector first so that memory held by unreachable objects
# is freed before the CUDA cache is emptied.
unreachable_objects = gc.collect()
# Release cached CUDA memory that is no longer occupied.
torch.cuda.empty_cache()
# Display the number of objects the collector found (same value the cell showed before).
unreachable_objects

19

In [15]:
def print_gpu_memory():
    """Print the CUDA memory currently allocated and reserved, in MB (bytes / 1024**2)."""
    bytes_per_mb = 1024**2
    allocated_mb = torch.cuda.memory_allocated() / bytes_per_mb
    reserved_mb = torch.cuda.memory_reserved() / bytes_per_mb
    print(f"Allocated memory: {allocated_mb:.2f} MB")
    print(f"Cached memory: {reserved_mb:.2f} MB")

print_gpu_memory()

Allocated memory: 0.00 MB
Cached memory: 0.00 MB


## Generating answers based on the questions generated by Claude in the previous section

In [None]:
# Build a text-generation pipeline for the Ministral model on the CUDA device.
# NOTE(review): the next cell creates `pipe` again with the same model (device = 0),
# so running both loads the model twice — confirm whether this cell is still needed.
pipe = pipeline("text-generation", model="ministral/Ministral-3b-instruct", device = 'cuda')

In [16]:
def _extract_assistant_reply(response):
    """Pull the assistant's text out of one chat-style pipeline result.

    Args:
        response: Object returned by `pipe(messages, ...)`. The code expects
            `response[0]['generated_text']` to be an iterable of
            `{'role': ..., 'content': ...}` dicts.

    Returns:
        The content of the first message whose role is 'assistant';
        "No assistant response found" when no such message exists;
        "Error: Unexpected response format" when the structure cannot be
        navigated (missing key, empty result, or non-dict entries).
    """
    try:
        generated_messages = response[0]['generated_text']
        return next(
            (msg['content'] for msg in generated_messages if msg['role'] == 'assistant'),
            "No assistant response found"
        )
    except (KeyError, IndexError, TypeError) as e:
        # Report and keep going: one malformed response must not abort the whole
        # run and discard the answers generated so far.
        print(f"{type(e).__name__}: {e} in response {response}")
        return "Error: Unexpected response format"


# Load the generated questions from the CSV written earlier in the notebook.
qa_df = pd.read_csv("qa_data.csv")
questions = qa_df['Question'].tolist()  # 'Question' is the column written by the generation step

# Initialize the Ministral model pipeline on GPU 0
pipe = pipeline("text-generation", model="ministral/Ministral-3b-instruct", device = 0)

# Ask the model every question as a single-turn chat and collect its answers in order.
responses = []
for question in questions:
    messages = [{"role": "user", "content": question}]
    response = pipe(messages, max_new_tokens=600, do_sample=True, temperature = 0.4, top_p = 0.7)
    responses.append(_extract_assistant_reply(response))

# Combine questions and responses
result_df = pd.DataFrame({'question': questions, 'mistral_response': responses})

# Save the results to a new CSV
result_df.to_csv("qa_with_mistral_responses.csv", index=False)

config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/2.96G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/2.97G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/698M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/510 [00:00<?, ?B/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [17]:
# Reload the generated question/answer/context table from disk.
qa_df = pd.read_csv("qa_data.csv")

In [18]:
# Preview the first rows of the reloaded table.
qa_df.head()

Unnamed: 0,Question,Answer,Context
0,What is the fundamental concept and process in...,Nuclear medicine imaging involves administerin...,1 chapter What Is Nuclear Medicine?A. FUNDAMEN...
1,What are the current applications and global u...,"As of 2006, there were roughly 100 different d...",1 chapter What Is Nuclear Medicine?A. FUNDAMEN...
2,What are the two broad classes of nuclear medi...,The two broad classes of nuclear medicine imag...,photons are emitted. The energy of these gamma...
3,"What are the key components of a gamma camera,...",The key components of a gamma camera are a col...,photons are emitted. The energy of these gamma...
4,What are the two broad classes of nuclear medi...,The two broad classes of nuclear medicine imag...,particular angle. This results in an image wit...


In [22]:
!pip install langchain-aws

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting langchain-aws
  Downloading langchain_aws-0.2.7-py3-none-any.whl.metadata (3.2 kB)
Downloading langchain_aws-0.2.7-py3-none-any.whl (87 kB)
Installing collected packages: langchain-aws
Successfully installed langchain-aws-0.2.7


In [7]:
# Reload both evaluation inputs from disk: the reference Q/A/context table and the
# Ministral responses saved by the generation cell.
qa_df = pd.read_csv("qa_data.csv")
result_df = pd.read_csv("qa_with_mistral_responses.csv")

## Metrics: 

- `Faithfulness` measures the factual consistency of the generated answer against the given context. It is calculated from the answer and the retrieved context. The higher, the better.
- `FactualCorrectness` compares the generated response with the reference and evaluates its factual accuracy.
- `Semantic Similarity` assesses the semantic resemblance between the generated answer and the ground truth. It is computed from the ground truth and the answer, with values in the range 0 to 1. A higher score signifies a better alignment between the generated answer and the ground truth.

In [5]:
# Import necessary modules
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import Faithfulness, FactualCorrectness, SemanticSimilarity
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_aws import ChatBedrockConverse
from langchain_aws import BedrockEmbeddings

# AWS Bedrock configuration for the evaluator (judge) models.
# Stored under its own name so it does not overwrite the `config` dict that the
# setup cell loads from config.yaml.
evaluator_config = {
    "llm": "anthropic.claude-3-sonnet-20240229-v1:0",
    "embeddings": "amazon.titan-embed-text-v1",
    "temperature": 0.4,
}

# Initialize Bedrock evaluator models
# LLM used by the LLM-based metrics.
evaluator_llm = LangchainLLMWrapper(ChatBedrockConverse(
    model=evaluator_config["llm"],
    temperature=evaluator_config["temperature"],
))

# Embedding model used by the embedding-based metric.
evaluator_embeddings = LangchainEmbeddingsWrapper(BedrockEmbeddings(
    model_id=evaluator_config["embeddings"],
))

In [8]:
# Evaluate only the first 8 question/answer pairs.
eval_rows = slice(0, 8)

# Each entry of `retrieved_contexts` must be a list; wrap bare strings.
mistral_contexts = []
for context in qa_df['Context'].tolist()[eval_rows]:
    mistral_contexts.append([context] if isinstance(context, str) else context)

# Ministral's answers are scored against the reference answers and contexts in qa_df.
dataset_dict_1 = {
    "question": qa_df['Question'].tolist()[eval_rows],
    "answer": result_df['mistral_response'].tolist()[eval_rows],
    "ground_truth": qa_df['Answer'].tolist()[eval_rows],
    "retrieved_contexts": mistral_contexts,
}
ragas_dataset_1 = Dataset.from_dict(dataset_dict_1)

# Two LLM-judged metrics plus one embedding-based metric.
metrics = [
    FactualCorrectness(llm=evaluator_llm),
    Faithfulness(llm=evaluator_llm),
    SemanticSimilarity(embeddings=evaluator_embeddings),
]

results_mistral = evaluate(dataset=ragas_dataset_1, metrics=metrics)

# Display results
print("Evaluation Results:", results_mistral)

Evaluating:   0%|          | 0/24 [00:00<?, ?it/s]

Exception raised in Job[4]: AttributeError('StringIO' object has no attribute 'statements')


Evaluation Results: {'factual_correctness': 0.0312, 'faithfulness': 0.1342, 'semantic_similarity': 0.4264}


In [None]:
# The object returned by `evaluate` is not a DataFrame; convert it first to
# preview the per-sample scores.
results_mistral.to_pandas().head()

In [10]:
# Sanity-check run: the reference answers are scored against themselves,
# using the first 8 rows of qa_df.
baseline_rows = qa_df.head(8)
baseline_answers = baseline_rows['Answer'].tolist()

dataset_dict_2 = {
    "question": baseline_rows['Question'].tolist(),
    "answer": baseline_answers,
    "ground_truth": list(baseline_answers),
    # Each entry must be a list of contexts; wrap bare strings.
    "retrieved_contexts": [
        context if not isinstance(context, str) else [context]
        for context in baseline_rows['Context'].tolist()
    ],
}
ragas_dataset_2 = Dataset.from_dict(dataset_dict_2)

# Same three metrics as the Ministral evaluation.
factual_metric = FactualCorrectness(llm=evaluator_llm)
faithfulness_metric = Faithfulness(llm=evaluator_llm)
similarity_metric = SemanticSimilarity(embeddings=evaluator_embeddings)
metrics = [factual_metric, faithfulness_metric, similarity_metric]

results_claude = evaluate(dataset=ragas_dataset_2, metrics=metrics)

# Display results
print("Evaluation Results:", results_claude)

Evaluating:   0%|          | 0/24 [00:00<?, ?it/s]

Evaluation Results: {'factual_correctness': 1.0000, 'faithfulness': 0.9531, 'semantic_similarity': 1.0000}


In [None]:
# The object returned by `evaluate` is not a DataFrame; convert it first to
# preview the per-sample scores.
results_claude.to_pandas().head()

In [None]:
def analysis(mistral_df, claude_df):
    """Plot, per metric, the score distributions of the two evaluated systems.

    Args:
        mistral_df: DataFrame of per-sample scores for the Ministral answers,
            one column per metric.
        claude_df: DataFrame with the same columns for the Claude (reference) answers.

    One subplot is drawn for each column of `mistral_df`. Missing scores are
    dropped before the density estimate; a legend entry appears only for series
    that were actually drawn.
    """
    sns.set_style("whitegrid")
    columns = list(mistral_df.columns)
    # squeeze=False keeps `axs` two-dimensional even when there is a single metric.
    fig, axs = plt.subplots(1, len(columns), figsize=(4 * len(columns), 5), squeeze=False)

    for ax, col in zip(axs[0], columns):
        for label, frame in (("mistral", mistral_df), ("claude", claude_df)):
            # Label each series explicitly so the legend cannot be mismatched.
            sns.kdeplot(x=frame[col].dropna().to_numpy(), label=label, fill=True, ax=ax)
        ax.set_title(f'{col} scores distribution')
        ax.set_xlabel(col)
        handles, _ = ax.get_legend_handles_labels()
        if handles:
            ax.legend()

    plt.tight_layout()
    plt.show()

# Call the analysis function on the per-sample scores of both runs.
# The evaluation results are converted to DataFrames before selecting the metric columns.
score_columns = ['factual_correctness', 'faithfulness', 'semantic_similarity']
analysis(
    results_mistral.to_pandas()[score_columns],
    results_claude.to_pandas()[score_columns],
)


Obviously, comparing Claude's answers against themselves yields near-perfect scores; this run is only a sanity check that the evaluation is working correctly. Next, we will fine-tune the Mistral model and check again!

## Fine tune using PEFT - LORA

In [6]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [15]:
from datasets import load_dataset

# Load the fine-tuning dataset by name; no split is requested, so all splits are returned together.
dataset_name = "duartemoura/generathor_fine_tune"
dataset = load_dataset(dataset_name)
# Display the loaded object (splits, features, row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['Question', 'Answer', 'Context'],
        num_rows: 100
    })
})

In [16]:
def format_dataset(example):
    """Turn one Question/Answer record into a two-turn chat.

    Args:
        example: Mapping with at least the keys "Question" and "Answer".

    Returns:
        dict with a single key "messages": a user message holding the question
        followed by an assistant message holding the answer.
    """
    turns = (("user", "Question"), ("assistant", "Answer"))
    messages = [{"role": role, "content": example[field]} for role, field in turns]
    return {"messages": messages}

# Add a chat-style "messages" column to every record (original columns are kept here).
# NOTE(review): the next cell reassigns `formatted_dataset` with the original columns
# removed, so this result is overwritten — confirm whether this call is still needed.
formatted_dataset = dataset.map(format_dataset)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [29]:
# Re-map the raw dataset, keeping only the generated "messages" column.
formatted_dataset = dataset.map(
    format_dataset,
    remove_columns=["Question", "Answer", "Context"]  # Drop the original fields
)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [30]:
# Inspect the first formatted training example.
print(formatted_dataset["train"][0])


{'messages': [{'content': 'What is the fundamental concept and process involved in nuclear medicine imaging?', 'role': 'user'}, {'content': 'Nuclear medicine imaging involves administering trace amounts of compounds labeled with radioactive isotopes (radionuclides) into the body. These radiolabeled compounds, called radiotracers or radiopharmaceuticals, emit gamma rays or high-energy photons when the radionuclide decays. External, position-sensitive gamma-ray detectors can detect these emissions and generate three-dimensional representations of the radiotracer distribution within the body, providing diagnostic information about various disease states.', 'role': 'assistant'}]}


In [31]:

# Hyperparameters for the QLoRA fine-tuning run.
# NOTE(review): the training cell sets fp16=True and tokenizes with max_length=512
# regardless of `fp16` / `max_seq_length` below, and it does not pass
# `lr_scheduler_type` or `gradient_checkpointing` — confirm which values are intended.

# Model names: base checkpoint to fine-tune and name for the fine-tuned model
model_name = "ministral/Ministral-3b-instruct"
new_model = "ministral-ft"

################################################################################
# LoRA parameters
################################################################################
# LoRA attention dimension (rank)
# lora_r = 64
lora_r = 4
# Alpha parameter for LoRA scaling
lora_alpha = 16
# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################
# Activate 4-bit precision base model loading
use_4bit = True
# Compute dtype for 4-bit base models (name of a torch dtype attribute)
bnb_4bit_compute_dtype = "float16"
# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"
# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################
# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"
# Number of training epochs
num_train_epochs = 1
# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False
# Batch size per GPU for training
per_device_train_batch_size = 4
# Batch size per GPU for evaluation
per_device_eval_batch_size = 4
# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1
# Enable gradient checkpointing
gradient_checkpointing = True
# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3
# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4
# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001
# Optimizer to use
optim = "paged_adamw_32bit"
# Learning rate schedule (constant a bit better than cosine)
lr_scheduler_type = "constant"
# Number of training steps (overrides num_train_epochs)
max_steps = -1
# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03
# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True
# Save checkpoint every X update steps
save_steps = 25
# Log every X update steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################
# Maximum sequence length to use
max_seq_length = 128 # None
# Pack multiple short examples in the same input sequence to increase efficiency
packing = False # True
# Device placement: "auto" is used; the alternative below loads the entire model on GPU 0
# device_map = {"": 0}
device_map="auto"

In [None]:
# QLoRA fine-tuning: load the 4-bit base model, attach a LoRA adapter, tokenize the
# chat-formatted dataset and train with SFTTrainer.
# Values that are identical to the hyperparameter cell are read from its variables
# so the two cells cannot drift apart.

# Resolve the compute dtype from its name (e.g. "float16" -> torch.float16).
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Load tokenizer; reuse the EOS token for padding.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Quantization config for bitsandbytes
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,  # Use 4-bit quantization
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Load the quantized base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
)

# LoRA configuration
lora_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",  # LoRA task type
)

def tokenize_messages(example):
    """Flatten one chat example to text and tokenize it.

    Each message becomes a line of the form "<role>: <content>"; the lines are
    concatenated and tokenized with padding/truncation to a fixed length.

    Args:
        example: Record with a "messages" list of {'role', 'content'} dicts.

    Returns:
        The tokenizer output for the flattened text.
    """
    input_text = "".join(
        f"{message['role']}: {message['content']}\n" for message in example["messages"]
    )
    return tokenizer(
        input_text,
        padding="max_length",
        truncation=True,
        # NOTE(review): the hyperparameter cell sets max_seq_length = 128; 512 is kept
        # here to preserve current behavior — confirm which is intended.
        max_length=512
    )

# Apply tokenization (one example at a time) to every split.
tokenized_dataset = formatted_dataset.map(tokenize_messages, batched=False)

# Training arguments
# NOTE(review): `lr_scheduler_type` and `gradient_checkpointing` from the hyperparameter
# cell are not passed here (unchanged from before) — confirm whether they should be.
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    optim=optim,
    # NOTE(review): the hyperparameter cell sets fp16 = False; True is kept here to
    # preserve current behavior — confirm which is intended.
    fp16=True,  # Use fp16 if supported
    save_steps=save_steps,
    logging_steps=logging_steps,
    group_by_length=group_by_length,
    warmup_ratio=warmup_ratio,
    max_grad_norm=max_grad_norm,
)

# Initialize trainer
trainer = SFTTrainer(
    model=model,
    # `tokenized_dataset` holds all splits; the trainer needs the "train" split itself.
    train_dataset=tokenized_dataset["train"],
    args=training_args,
    tokenizer=tokenizer,
    peft_config=lora_config,
)

# Start training
trainer.train()

# Save the model
trainer.save_model(output_dir)