### Discovery Notebook

In [48]:
# Imports and config
import pandas as pd
from sklearn.model_selection import train_test_split
import zipfile

In [49]:
# Add logging for AWS
from dotenv import load_dotenv
import os
import boto3
import logging

# Root logger: timestamped records at INFO level and above
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Load environment variables, then note that the step ran
load_dotenv()
logging.info("Environment variables loaded")

# Read the bucket name and the AWS key pair from the environment
AWS_SECRET_KEY = os.getenv('AWS_SECRET_ACCESS_KEY')
AWS_ACCESS_KEY = os.getenv('AWS_ACCESS_KEY_ID')
AWS_BUCKET_NAME = os.getenv('AWS_BUCKET_NAME')

# Report the bucket name in full, but for each key only whether it is set
logging.info(f"Bucket name: {AWS_BUCKET_NAME}")
logging.info(f"Access key present: {'Yes' if AWS_ACCESS_KEY else 'No'}")
logging.info(f"Secret key present: {'Yes' if AWS_SECRET_KEY else 'No'}")

# Smoke-test the credentials: build a client and list the buckets
try:
    s3 = boto3.client(
        's3',
        aws_secret_access_key=AWS_SECRET_KEY,
        aws_access_key_id=AWS_ACCESS_KEY,
    )
    response = s3.list_buckets()
    logging.info(f"Connected to S3. Found {len(response['Buckets'])} buckets")
except Exception as e:
    # Any failure is logged instead of raised, so this cell never errors out
    logging.error(f"Failed to connect to S3: {str(e)}")

2025-03-25 15:18:56,442 - INFO - Environment variables loaded
2025-03-25 15:18:56,444 - INFO - Bucket name: dtaphnx
2025-03-25 15:18:56,444 - INFO - Access key present: Yes
2025-03-25 15:18:56,445 - INFO - Secret key present: Yes
2025-03-25 15:18:56,707 - INFO - Connected to S3. Found 2 buckets


In [50]:
# Imports for image processing
import os
from pathlib import Path
import shutil
import tqdm
import boto3
from dotenv import load_dotenv
import s3fs
from PIL import Image
import io

# Load environment variables
load_dotenv()

# Load configuration from .env
AWS_BUCKET_NAME = os.getenv('AWS_BUCKET_NAME')
if not AWS_BUCKET_NAME:
    # Fail fast: without this check the f-string below would interpolate None
    # and the listing would be attempted against the path 's3://None'.
    raise ValueError("AWS_BUCKET_NAME must be set in environment")

# Create a connection to S3
fs = s3fs.S3FileSystem()

# List the entries at the top level of the bucket
directory = f's3://{AWS_BUCKET_NAME}'
filenames = fs.ls(directory)

In [51]:
# Error handling for retrieving images

# Look the bucket name up first; the check below decides whether it is usable
BUCKET_NAME = os.environ.get('AWS_BUCKET_NAME')

try:
    # An unset or empty bucket name is routed to the handler below
    if not BUCKET_NAME:
        raise ValueError("AWS_BUCKET_NAME must be set in environment")

    # Your S3 operations here

except Exception as e:
    # The error is printed, not re-raised, so execution continues past this cell
    print(f"Error accessing S3 bucket: {str(e)}")

In [52]:
# Copy the zipped image archive from the bucket to the local compressed folder
fs.get(
    f'{AWS_BUCKET_NAME}/300.zip',
    './data/images/compressed/300.zip',
)


[None]

In [53]:
# Function to extract the compressed images file
def extract_zip(file_path, output_path):
    """Unpack every member of a zip archive into a directory.

    Args:
        file_path: Location of the zip archive to read.
        output_path: Directory that receives the extracted members.

    Returns:
        None.
    """
    archive = zipfile.ZipFile(file_path, mode='r')
    try:
        archive.extractall(path=output_path)
    finally:
        # Always release the archive handle, even when extraction fails
        archive.close()

In [54]:
# Unpack the downloaded archive into the raw-images directory
extract_zip(
    file_path='./data/images/compressed/300.zip',
    output_path='./data/images/raw',
)

In [55]:

# with fs.open(filenames[0], 'rb') as f:
    #img = Image.open(io.BytesIO(f.read()))

In [56]:
# Load the CSV for image metadata
# data = pd.read_csv('./data/table.csv')

In [57]:
# data.head()

In [58]:
# data.shape