In [1]:
import sys
import os

# Make the parent of the current working directory importable.
# The membership check keeps this cell idempotent: re-running it does not
# keep appending duplicate entries to sys.path.
PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [9]:
# Show the kernel's current working directory; relative paths such as ".."
# are resolved against it.
working_dir = os.getcwd()
print(working_dir)

/home/ec2-user/SageMaker/Fine_Tune_LLMs/Big/src


In [21]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m170.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [2]:
import yaml
import boto3
import src
from src.document_processor.loader import DocumentLoader
from src.document_processor.chunker import DocumentChunker
from src.document_processor.cleaner import TextCleaner
from src.embeddings.embedding_manager import EmbeddingManager
from src.question_generation.generator import EnhancedQuestionGenerator
import logging


# Locate the configuration file relative to the project root (the parent of
# the notebook's working directory) instead of hard-coding a machine-specific
# absolute path.
CONFIG_PATH = os.path.join(os.path.abspath(".."), "config", "config.yaml")

# Load the YAML configuration; the encoding is explicit so the result does
# not depend on the platform's default locale.
with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Setup logging
# The level name from the config is upper-cased before the lookup so that
# values such as "info" resolve to logging.INFO as well.
logging.basicConfig(
    level=getattr(logging, config['logging']['level'].upper()),
    format=config['logging']['format']
)

logger = logging.getLogger(__name__)

# Initialize AWS client
bedrock_client = boto3.client('bedrock-runtime')

# Initialize components
loader = DocumentLoader()
# Chunk size and overlap come from the 'document_processing' config section.
chunker = DocumentChunker(
    chunk_size=config['document_processing']['chunk_size'],
    chunk_overlap=config['document_processing']['chunk_overlap']
)
cleaner = TextCleaner()

# The embedding manager receives the Bedrock client and the embedding model id.
embedding_manager = EmbeddingManager(
    bedrock_client,
    model_id=config['embedding']['model_id']
)

# The question generator shares the same Bedrock client and is handed the
# embedding manager created above; model id, token limit and temperature
# come from the 'question_generation' config section.
generator = EnhancedQuestionGenerator(
    llm_client=bedrock_client,
    model_id=config['question_generation']['model_id'],
    embedding_manager=embedding_manager,
    max_tokens=config['question_generation']['max_tokens'],
    temperature=config['question_generation']['temperature']
)

2024-11-08 11:02:15,337 - botocore.credentials - INFO - Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
  self.embeddings = BedrockEmbeddings(


In [None]:
# Load the source PDF through the document loader.
source_pdf = "/home/ec2-user/SageMaker/Fine_Tune_LLMs/Big/Physics_NuclMed.pdf"
documents = loader.load_document(source_pdf)
logger.info("Documents loaded")

# Clean each document's text in place and collect the documents for chunking.
# Note: the loaded document objects themselves are modified, so
# cleaned_documents holds the same objects that the loader returned.
cleaned_documents = []
for doc in documents:
    cleaned_text = cleaner.clean_text(doc.page_content)
    doc.page_content = cleaned_text
    cleaned_documents.append(doc)
logger.info("Documents cleaned")

# Split the cleaned documents into chunks.
chunks = chunker.chunk_documents(cleaned_documents)
logger.info(f"Documents chunked into {len(chunks)} chunks")

# Hand the chunks to the embedding manager to create embeddings.
embedding_manager.create_embeddings(chunks)
logger.info("Embeddings created")

2024-11-08 11:13:01,778 - __main__ - INFO - Documents loaded
2024-11-08 11:13:01,927 - __main__ - INFO - Documents cleaned
2024-11-08 11:13:02,374 - __main__ - INFO - Documents chunked into 2167 chunks


In [4]:
import pandas as pd
# Generate question-answer pairs from the document chunks.
questions = generator.generate_questions_from_docs(
    chunks,
    num_questions=100
)
logger.info(f"{len(questions)} question-answer pairs generated")

# Prepare data for DataFrame: one record per generated pair.
qa_data = [{"Question": qa.question, "Answer": qa.answer} for qa in questions]

# Create DataFrame. The columns are passed explicitly so that an empty result
# still yields a frame with the expected "Question"/"Answer" columns instead
# of a frame with no columns at all.
qa_df = pd.DataFrame(qa_data, columns=["Question", "Answer"])

# Display the DataFrame
print(qa_df)

2024-11-08 11:02:35,335 - __main__ - INFO - 3 question-answer pairs generated


Question 1: What is the objective of this project?
Answer 1: The objective of this project is to improve an automatic image segmentation system applied to retinal images by testing various preprocessing, segmentation, and post-processing techniques to enhance segmentation accuracy.

Question 2: What performance metric is used to evaluate the effectiveness of the different approaches?
Answer 2: The project evaluates the effectiveness of each approach using the Jaccard Index (IoU) as the performance metric.

Question 3: What was the baseline technique used for retinal image segmentation and its performance (IoU)?
Answer 3: The baseline technique was a threshold-based segmentation method involving preprocessing steps like Gaussian smoothing, illumination compensation, and contrast enhancement, followed by thresholding and small-object removal to isolate retinal structures. Its performance, measured by the Jaccard (IoU) metric, was 0.340 ± 0.036.



In [8]:
# Chunk the cleaned documents and log how many chunks were produced.
chunks = chunker.chunk_documents(cleaned_documents)
logger.info(f"Chunked into {len(chunks)} chunks.")

# Print every chunk's text, numbered from 1, for manual inspection.
for idx, chunk in enumerate(chunks, 1):
    print(f"Chunk {idx}: {chunk.page_content}")


2024-11-08 11:10:29,338 - __main__ - INFO - Chunked into 17 chunks.


Chunk 1: Biomedical Image Segmentation for Retinal Images Universidad Carlos III de Madrid Duarte Pinto Correia de Moura Guillermo Rey Paniagua Masters in Machine Learning for Health October 28, 2024 This project aims to improve an automatic image segmentation system applied to retinal images. Various preprocessing, segmentation, and post-processing techniques are tested to enhance segmentation accuracy. The project evaluates the effectiveness of each approach using the Jaccard Index (IoU) as the performance metric.
Chunk 2: Duarte Pinto Correia de Moura, Guillermo Rey Paniagua Retina Segmentation Project Report 1 Introduction This report explores biomedical image segmentation techniques applied to retinal images with the objective of improving upon a baseline system provided by Professor Fernando. The baseline system utilized a threshold-based segmentation method involving preprocessing steps like Gaussian smoothing, illumination compensation, and contrast enhancement, followed by thr