In [1]:
import sys
import os

# Add the parent directory of the current working directory to sys.path
sys.path.append(os.path.abspath(".."))

In [9]:
print(os.getcwd())

/home/ec2-user/SageMaker/Fine_Tune_LLMs/Big/src


In [21]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m170.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [2]:
import yaml
import boto3
import src
from src.document_processor.loader import DocumentLoader
from src.document_processor.chunker import DocumentChunker
from src.document_processor.cleaner import TextCleaner
from src.embeddings.embedding_manager import EmbeddingManager
from src.question_generation.generator import EnhancedQuestionGenerator
import logging


with open('/home/ec2-user/SageMaker/Fine_Tune_LLMs/Big/config/config.yaml', 'r') as f:
    config = yaml.safe_load(f)

# Setup logging
logging.basicConfig(
    level=getattr(logging, config['logging']['level']),
    format=config['logging']['format']
)

logger = logging.getLogger(__name__)

# Initialize AWS client
bedrock_client = boto3.client('bedrock-runtime')

# Initialize components
loader = DocumentLoader()
chunker = DocumentChunker(
    chunk_size=config['document_processing']['chunk_size'],
    chunk_overlap=config['document_processing']['chunk_overlap']
)
cleaner = TextCleaner()

embedding_manager = EmbeddingManager(
    bedrock_client,
    model_id=config['embedding']['model_id']
)

generator = EnhancedQuestionGenerator(
    llm_client=bedrock_client,
    model_id=config['question_generation']['model_id'],
    embedding_manager=embedding_manager,
    max_tokens=config['question_generation']['max_tokens'],
    temperature=config['question_generation']['temperature']
)

2024-11-08 13:40:45,476 - botocore.credentials - INFO - Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
  self.embeddings = BedrockEmbeddings(


In [4]:
# Process documents
documents = loader.load_document("/home/ec2-user/SageMaker/Fine_Tune_LLMs/Big/data/NM_changed.pdf")
logger.info("Documents loaded")

# Clean and chunk documents
cleaned_documents = []
for doc in documents:
    doc.page_content = cleaner.clean_text(doc.page_content)
    cleaned_documents.append(doc)
logger.info("Documents cleaned")

chunks = chunker.chunk_documents(cleaned_documents)
logger.info(f"Documents chunked into {len(chunks)} chunks")

# Create embeddings
embedding_manager.create_embeddings(chunks)
logger.info("Embeddings created")

2024-11-08 13:42:07,928 - __main__ - INFO - Documents loaded
2024-11-08 13:42:08,082 - __main__ - INFO - Documents cleaned
2024-11-08 13:42:08,473 - __main__ - INFO - Documents chunked into 2095 chunks
2024-11-08 13:44:33,228 - faiss.loader - INFO - Loading faiss with AVX2 support.
2024-11-08 13:44:33,229 - faiss.loader - INFO - Could not load library with AVX2 support due to:
ModuleNotFoundError("No module named 'faiss.swigfaiss_avx2'")
2024-11-08 13:44:33,229 - faiss.loader - INFO - Loading faiss.
2024-11-08 13:44:33,440 - faiss.loader - INFO - Successfully loaded faiss.
2024-11-08 13:44:33,625 - src.embeddings.embedding_manager - INFO - Embeddings created and stored in FAISS
2024-11-08 13:44:33,626 - __main__ - INFO - Embeddings created


In [13]:
# Create embeddings and save them
#embedding_manager.create_embeddings(documents)
embedding_manager.save_embeddings("faiss_index", "metadata.txt")

# Load the saved embeddings
#embedding_manager.load_embeddings("faiss_index")

# Wipe the existing database
#embedding_manager.wipe_embeddings("faiss_index", "metadata.txt")

2024-11-08 13:51:48,104 - src.embeddings.embedding_manager - INFO - Embeddings saved to faiss_index


In [14]:
import pandas as pd
# Generate questions
questions = generator.generate_questions_from_docs(
    chunks,
    num_questions= 5
)
logger.info(f"{len(questions)} question-answer pairs generated")

# Prepare data for DataFrame
qa_data = [{"Question": qa.question, "Answer": qa.answer, "Context": qa.context} for qa in questions]

# Create DataFrame
qa_df = pd.DataFrame(qa_data)

2024-11-08 13:57:28,052 - __main__ - INFO - 5 question-answer pairs generated


In [15]:
# Display the DataFrame
qa_df.head()

Unnamed: 0,Question,Answer,Context
0,What is the fundamental concept behind the sci...,The fundamental concept behind nuclear medicin...,1 chapter What Is Nuclear Medicine?A. FUNDAMEN...
1,What is the current state of nuclear medicine ...,"As of 2008, more than 30 million nuclear medic...",1 chapter What Is Nuclear Medicine?A. FUNDAMEN...
2,What are the two broad classes of nuclear medi...,The two broad classes of nuclear medicine imag...,photons are emitted. The energy of these gamma...
3,What is the fundamental difference between sin...,Single photon imaging uses radionuclides that ...,photons are emitted. The energy of these gamma...
4,What are the two broad classes of nuclear medi...,The two broad classes of nuclear medicine imag...,particular angle. This results in an image wit...


In [7]:
# Display the DataFrame
qa_df.tail()

Unnamed: 0,Question,Answer
15,Why is it common practice to complement tomogr...,Clinical decisions often depend not only on de...
16,What are the key requirements for a radiopharm...,The text outlines three main requirements for ...
17,What is the purpose of acquiring multiple imag...,The purpose of acquiring multiple images as a ...
18,What were the key historical developments that...,The origins of nuclear medicine can be traced ...
19,What were the key technological advancements t...,The development of positron emission tomograph...


In [8]:
qa_df.to_csv("qa_data.csv", index=False)

In [15]:
!pip install huggingface_hub datasets



In [1]:
# Required imports
!pip install ragas
!pip install llama_index
import torch
import ragas
from ragas.llms import LangchainLLMWrapper
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer
from ragas.testset import TestsetGenerator
from langchain_openai import ChatOpenAI
from llama_index.core import SimpleDirectoryReader
import gc
import pandas as pd
from datasets import Dataset

# Memory management class
class MemoryTracker:
    def __init__(self):
        self.records = []
    
    def log_memory(self, checkpoint):
        memory_stats = {
            'checkpoint': checkpoint,
            'allocated': torch.cuda.memory_allocated() / 1024**2,
            'reserved': torch.cuda.memory_reserved() / 1024**2
        }
        self.records.append(memory_stats)
        return memory_stats

    def clear_memory(self):
        torch.cuda.empty_cache()
        gc.collect()

memory_tracker = MemoryTracker()



In [2]:
def setup_model(model_name="google/gemma-2b-it"):
    memory_tracker.clear_memory()
    
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        load_in_4bit=True,
        torch_dtype=torch.float16
    )
    
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer

# Load model
model, tokenizer = setup_model()

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
# Test basic questions
def test_model(question):
    inputs = tokenizer(question, return_tensors="pt").to("cuda")
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=128)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example testing
test_questions = [
    "What was the significance of the discovery of technetium-99m (99mTc) in the advancement of nuclear medicine imaging",
]

for question in test_questions:
    print(f"Q: {question}")
    print(f"A: {test_model(question)}\n")

Q: What was the significance of the discovery of technetium-99m (99mTc) in the advancement of nuclear medicine imaging


NameError: name 'tokenizer' is not defined

In [11]:
from transformers import pipeline

messages = [
    {"role": "user", "content": "What was the significance of the discovery of technetium-99m (99mTc) in the advancement of nuclear medicine imaging"},
]

# Specify the device parameter to use the first GPU (cuda:0)
pipe = pipeline("text-generation", model="ministral/Ministral-3b-instruct")
response = pipe(messages, max_new_tokens=50)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [12]:
response

[{'generated_text': [{'role': 'user',
    'content': 'What was the significance of the discovery of technetium-99m (99mTc) in the advancement of nuclear medicine imaging'},
   {'role': 'assistant',
    'content': 'The discovery of the first atomic nucleus, the nucleus of the nucleus of the nucleus of the nucleus of the nucleus of the nucleus of the nucleus of the nucleus of the nucleus of the nucleus of the'}]}]