In [1]:
import sys
import os

# Add the parent directory of the current working directory to sys.path
sys.path.append(os.path.abspath(".."))

In [9]:
print(os.getcwd())

/home/ec2-user/SageMaker/Fine_Tune_LLMs/Big/src


In [21]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m170.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [2]:
import yaml
import boto3
import src
from src.document_processor.loader import DocumentLoader
from src.document_processor.chunker import DocumentChunker
from src.document_processor.cleaner import TextCleaner
from src.embeddings.embedding_manager import EmbeddingManager
from src.question_generation.generator import EnhancedQuestionGenerator
import logging


with open('/home/ec2-user/SageMaker/Fine_Tune_LLMs/Big/config/config.yaml', 'r') as f:
    config = yaml.safe_load(f)

# Setup logging
logging.basicConfig(
    level=getattr(logging, config['logging']['level']),
    format=config['logging']['format']
)

logger = logging.getLogger(__name__)

# Initialize AWS client
bedrock_client = boto3.client('bedrock-runtime')

# Initialize components
loader = DocumentLoader()
chunker = DocumentChunker(
    chunk_size=config['document_processing']['chunk_size'],
    chunk_overlap=config['document_processing']['chunk_overlap']
)
cleaner = TextCleaner()

embedding_manager = EmbeddingManager(
    bedrock_client,
    model_id=config['embedding']['model_id']
)

generator = EnhancedQuestionGenerator(
    llm_client=bedrock_client,
    model_id=config['question_generation']['model_id'],
    embedding_manager=embedding_manager,
    max_tokens=config['question_generation']['max_tokens'],
    temperature=config['question_generation']['temperature']
)

2024-11-08 11:02:15,337 - botocore.credentials - INFO - Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
  self.embeddings = BedrockEmbeddings(


In [10]:
# Process documents
documents = loader.load_document("/home/ec2-user/SageMaker/Fine_Tune_LLMs/Big/Physics_NuclMed.pdf")
logger.info("Documents loaded")

# Clean and chunk documents
cleaned_documents = []
for doc in documents:
    doc.page_content = cleaner.clean_text(doc.page_content)
    cleaned_documents.append(doc)
logger.info("Documents cleaned")

chunks = chunker.chunk_documents(cleaned_documents)
logger.info(f"Documents chunked into {len(chunks)} chunks")

# Create embeddings
embedding_manager.create_embeddings(chunks)
logger.info("Embeddings created")

2024-11-08 11:13:01,778 - __main__ - INFO - Documents loaded
2024-11-08 11:13:01,927 - __main__ - INFO - Documents cleaned
2024-11-08 11:13:02,374 - __main__ - INFO - Documents chunked into 2167 chunks
2024-11-08 11:15:30,794 - src.embeddings.embedding_manager - INFO - Embeddings created and stored in FAISS
2024-11-08 11:15:30,795 - __main__ - INFO - Embeddings created


In [11]:
import pandas as pd
# Generate questions
questions = generator.generate_questions_from_docs(
    chunks,
    num_questions=100
)
logger.info(f"{len(questions)} question-answer pairs generated")

# Prepare data for DataFrame
qa_data = [{"Question": qa.question, "Answer": qa.answer} for qa in questions]

# Create DataFrame
qa_df = pd.DataFrame(qa_data)

2024-11-08 11:21:15,684 - __main__ - INFO - 100 question-answer pairs generated


Unnamed: 0,Question,Answer
0,What is the process for a first-time user to a...,"For a first-time user, the process to access t..."
1,What are the contact details provided for tech...,The contact details provided for technical ass...
2,What is the role of physics in nuclear medicine?,"According to the table of contents, the role o..."
3,What topics are covered in Chapter 2?,Chapter 2 covers Basic Atomic and Nuclear Phys...
4,"Why is the text ""This page intentionally left ...","The text ""This page intentionally left blank"" ..."


In [13]:
# Display the DataFrame
qa_df.head()

Unnamed: 0,Question,Answer
0,What is the process for a first-time user to a...,"For a first-time user, the process to access t..."
1,What are the contact details provided for tech...,The contact details provided for technical ass...
2,What is the role of physics in nuclear medicine?,"According to the table of contents, the role o..."
3,What topics are covered in Chapter 2?,Chapter 2 covers Basic Atomic and Nuclear Phys...
4,"Why is the text ""This page intentionally left ...","The text ""This page intentionally left blank"" ..."


In [14]:
# Display the DataFrame
qa_df.tail()

Unnamed: 0,Question,Answer
95,How many pages are intentionally left blank ac...,"The text mentions ""This page intentionally lef..."
96,What is a radiopharmaceutical or radiotracer?,"A radiopharmaceutical, or more commonly called..."
97,What is the basic process involved in a nuclea...,"In a nuclear medicine study, a radiopharmaceut..."
98,What are the two broad classes of nuclear medi...,The two broad classes of nuclear medicine imag...
99,What is the purpose of radionuclide imaging?,The purpose of radionuclide imaging is to obta...


In [15]:
!pip install huggingface_hub datasets



In [1]:
# Required imports
!pip install ragas
!pip install llama_index
import torch
import ragas
from ragas.llms import LangchainLLMWrapper
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer
from ragas.testset import TestsetGenerator
from langchain_openai import ChatOpenAI
from llama_index.core import SimpleDirectoryReader
import gc
import pandas as pd
from datasets import Dataset

# Memory management class
class MemoryTracker:
    def __init__(self):
        self.records = []
    
    def log_memory(self, checkpoint):
        memory_stats = {
            'checkpoint': checkpoint,
            'allocated': torch.cuda.memory_allocated() / 1024**2,
            'reserved': torch.cuda.memory_reserved() / 1024**2
        }
        self.records.append(memory_stats)
        return memory_stats

    def clear_memory(self):
        torch.cuda.empty_cache()
        gc.collect()

memory_tracker = MemoryTracker()



In [2]:
def setup_model(model_name="google/gemma-2b-it"):
    memory_tracker.clear_memory()
    
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        load_in_4bit=True,
        torch_dtype=torch.float16
    )
    
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer

# Load model
model, tokenizer = setup_model()

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# Test basic questions
def test_model(question):
    inputs = tokenizer(question, return_tensors="pt").to("cuda")
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=128)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example testing
test_questions = [
    "What are the main assumptions and mathematical models used in tracer kinetic modeling??",
]

for question in test_questions:
    print(f"Q: {question}")
    print(f"A: {test_model(question)}\n")

Q: What are the main assumptions and mathematical models used in tracer kinetic modeling??
A: What are the main assumptions and mathematical models used in tracer kinetic modeling??

Tracer kinetic modeling is a powerful tool for studying complex biological systems. It involves the use of a tracer molecule that is specifically labeled and follows the movement of interest in a biological system. The tracer molecule can be used to track the fate and behavior of individual molecules or populations of molecules within the system.

**Main Assumptions:**

* **Tracer molecule is well-mixed:** The tracer molecule is assumed to be well-mixed within the biological system, meaning that it is distributed evenly throughout the system. This assumption is important for accurate modeling, as it ensures that the tracer molecule is exposed to the same conditions as the biological molecules of interest.

