In [1]:
import sys
import os

# Add the parent directory of the current working directory to sys.path
sys.path.append(os.path.abspath(".."))

In [9]:
print(os.getcwd())

/home/ec2-user/SageMaker/Fine_Tune_LLMs/Big/src


In [21]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m170.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [2]:
import yaml
import boto3
import src
from src.document_processor.loader import DocumentLoader
from src.document_processor.chunker import DocumentChunker
from src.document_processor.cleaner import TextCleaner
from src.embeddings.embedding_manager import EmbeddingManager
from src.question_generation.generator import EnhancedQuestionGenerator
import logging


with open('/home/ec2-user/SageMaker/Fine_Tune_LLMs/Big/config/config.yaml', 'r') as f:
    config = yaml.safe_load(f)

# Setup logging
logging.basicConfig(
    level=getattr(logging, config['logging']['level']),
    format=config['logging']['format']
)

logger = logging.getLogger(__name__)

# Initialize AWS client
bedrock_client = boto3.client('bedrock-runtime')

# Initialize components
loader = DocumentLoader()
chunker = DocumentChunker(
    chunk_size=config['document_processing']['chunk_size'],
    chunk_overlap=config['document_processing']['chunk_overlap']
)
cleaner = TextCleaner()

embedding_manager = EmbeddingManager(
    bedrock_client,
    model_id=config['embedding']['model_id']
)

generator = EnhancedQuestionGenerator(
    llm_client=bedrock_client,
    model_id=config['question_generation']['model_id'],
    embedding_manager=embedding_manager,
    max_tokens=config['question_generation']['max_tokens'],
    temperature=config['question_generation']['temperature']
)

2024-11-08 13:40:45,476 - botocore.credentials - INFO - Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
  self.embeddings = BedrockEmbeddings(


In [4]:
# Process documents
documents = loader.load_document("/home/ec2-user/SageMaker/Fine_Tune_LLMs/Big/data/NM_changed.pdf")
logger.info("Documents loaded")

# Clean and chunk documents
cleaned_documents = []
for doc in documents:
    doc.page_content = cleaner.clean_text(doc.page_content)
    cleaned_documents.append(doc)
logger.info("Documents cleaned")

chunks = chunker.chunk_documents(cleaned_documents)
logger.info(f"Documents chunked into {len(chunks)} chunks")

# Create embeddings
embedding_manager.create_embeddings(chunks)
logger.info("Embeddings created")

2024-11-08 13:42:07,928 - __main__ - INFO - Documents loaded
2024-11-08 13:42:08,082 - __main__ - INFO - Documents cleaned
2024-11-08 13:42:08,473 - __main__ - INFO - Documents chunked into 2095 chunks
2024-11-08 13:44:33,228 - faiss.loader - INFO - Loading faiss with AVX2 support.
2024-11-08 13:44:33,229 - faiss.loader - INFO - Could not load library with AVX2 support due to:
ModuleNotFoundError("No module named 'faiss.swigfaiss_avx2'")
2024-11-08 13:44:33,229 - faiss.loader - INFO - Loading faiss.
2024-11-08 13:44:33,440 - faiss.loader - INFO - Successfully loaded faiss.
2024-11-08 13:44:33,625 - src.embeddings.embedding_manager - INFO - Embeddings created and stored in FAISS
2024-11-08 13:44:33,626 - __main__ - INFO - Embeddings created


In [13]:
# Create embeddings and save them
#embedding_manager.create_embeddings(documents)
embedding_manager.save_embeddings("faiss_index", "metadata.txt")

# Load the saved embeddings
#embedding_manager.load_embeddings("faiss_index")

# Wipe the existing database
#embedding_manager.wipe_embeddings("faiss_index", "metadata.txt")

2024-11-08 13:51:48,104 - src.embeddings.embedding_manager - INFO - Embeddings saved to faiss_index


In [16]:
import pandas as pd
# Generate questions
questions = generator.generate_questions_from_docs(
    chunks,
    num_questions= 100
)
logger.info(f"{len(questions)} question-answer pairs generated")

# Prepare data for DataFrame
qa_data = [{"Question": qa.question, "Answer": qa.answer, "Context": qa.context} for qa in questions]

# Create DataFrame
qa_df = pd.DataFrame(qa_data)

2024-11-08 14:08:12,650 - __main__ - INFO - 100 question-answer pairs generated


In [17]:
# Display the DataFrame
qa_df.head()

Unnamed: 0,Question,Answer,Context
0,What is the fundamental concept and process in...,Nuclear medicine imaging involves administerin...,1 chapter What Is Nuclear Medicine?A. FUNDAMEN...
1,What are the current applications and global u...,"As of 2006, there were roughly 100 different d...",1 chapter What Is Nuclear Medicine?A. FUNDAMEN...
2,What are the two broad classes of nuclear medi...,The two broad classes of nuclear medicine imag...,photons are emitted. The energy of these gamma...
3,"What are the key components of a gamma camera,...",The key components of a gamma camera are a col...,photons are emitted. The energy of these gamma...
4,What are the two broad classes of nuclear medi...,The two broad classes of nuclear medicine imag...,particular angle. This results in an image wit...


In [18]:
# Display the DataFrame
qa_df.tail()

Unnamed: 0,Question,Answer,Context
95,"What is the significance of the notation ""eKxx...","In the notation ""eKxx"", the symbol ""e"" represe...","the symbol x is inclusive, referring to all Au..."
96,What is the relationship between the atomic nu...,"In a neutral atom, the number of orbital elect...","of the atom, Z. As mentioned earlier, this als..."
97,What is the significance of the mass number (A...,The mass number (A) of an atomic nucleus repre...,"of the atom, Z. As mentioned earlier, this als..."
98,What are the two types of forces that nucleons...,Nucleons within the nucleus are subject to two...,14 Physics in Nuclear Medicine4. Forces and En...
99,How does the shell model describe the motion o...,The shell model portrays the nucleons as movin...,14 Physics in Nuclear Medicine4. Forces and En...


In [19]:
qa_df.to_csv("qa_data.csv", index=False)

# Fine-tuning

## Ministral-3B-instruct

I'm going to test this model on really specific questions from the Nuclear Medicine manual and see how well it responds firs

In [None]:
# Required imports
!pip install ragas
!pip install llama_index
import torch
import ragas
from ragas.llms import LangchainLLMWrapper
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer
from ragas.testset import TestsetGenerator
from langchain_openai import ChatOpenAI
from llama_index.core import SimpleDirectoryReader
import gc
import pandas as pd
from datasets import Dataset

# Memory management class
class MemoryTracker:
    def __init__(self):
        self.records = []
    
    def log_memory(self, checkpoint):
        memory_stats = {
            'checkpoint': checkpoint,
            'allocated': torch.cuda.memory_allocated() / 1024**2,
            'reserved': torch.cuda.memory_reserved() / 1024**2
        }
        self.records.append(memory_stats)
        return memory_stats

    def clear_memory(self):
        torch.cuda.empty_cache()
        gc.collect()

memory_tracker = MemoryTracker()

In [11]:
from transformers import pipeline

messages = [
    {"role": "user", "content": "What was the significance of the discovery of technetium-99m (99mTc) in the advancement of nuclear medicine imaging"},
]

# Specify the device parameter to use the first GPU (cuda:0)
pipe = pipeline("text-generation", model="ministral/Ministral-3b-instruct")
response = pipe(messages, max_new_tokens=50)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [12]:
response

[{'generated_text': [{'role': 'user',
    'content': 'What was the significance of the discovery of technetium-99m (99mTc) in the advancement of nuclear medicine imaging'},
   {'role': 'assistant',
    'content': 'The discovery of the first atomic nucleus, the nucleus of the nucleus of the nucleus of the nucleus of the nucleus of the nucleus of the nucleus of the nucleus of the nucleus of the nucleus of the'}]}]