# Fine-Tuning Dataset Preparation

This notebook prepares the multimodal dataset for fine-tuning embedding models such as CLIP.

**Objectives:**
1. Load the curated multimodal data from the Trusted Zone
2. Create image-text positive pairs suitable for contrastive learning
3. Clean the data (remove missing values, duplicates)
4. Split into reproducible train/test sets
5. Copy referenced images to the fine-tuning zone
6. Save datasets to MinIO for downstream fine-tuning

**Data Sources:**
- `trusted-zone/documents/recipes.jsonl` — Recipe metadata with cleaned text
- `recipe_ids_with_images.json` — Mapping from recipe IDs to image keys (not read by the cells below; the mapping is rebuilt from the image filenames in Section 2)
- `trusted-zone/images/` — Normalized images (512×512 JPEG)

**Outputs:**
- `fine-tuning-zone/datasets/train_pairs_positive.csv`
- `fine-tuning-zone/datasets/test_pairs_positive.csv`
- `fine-tuning-zone/images/` — Copies of training/test images

## 1. Setup and Configuration

In [1]:
import os
import io
import json
from pathlib import Path, PurePosixPath
from typing import Dict, List, Optional, Set, Tuple

import pandas as pd
import boto3
from botocore.config import Config
from botocore.exceptions import ClientError
from dotenv import load_dotenv
from sklearn.model_selection import train_test_split

# Load environment variables from project root or notebooks folder
# The .env file is located outside the fine_tuning folder
# NOTE: both paths are derived from the current working directory, so the
# kernel has to be started in fine_tuning/training_data/ for them to resolve.
NOTEBOOK_DIR = Path.cwd()  # fine_tuning/training_data/
PROJECT_ROOT = NOTEBOOK_DIR.parent.parent  # Go up two levels to project root

# Candidate .env locations, checked in order; the first existing one wins.
ENV_PATHS = [
    PROJECT_ROOT / "notebooks" / ".env",
    PROJECT_ROOT / "app" / ".env",
    PROJECT_ROOT / ".env",
]

env_loaded = False
for env_path in ENV_PATHS:
    if env_path.exists():
        load_dotenv(env_path)
        print(f"✓ Loaded .env from: {env_path}")
        env_loaded = True
        break

if not env_loaded:
    print("⚠ No .env file found in expected locations. Trying default load_dotenv()...")
    load_dotenv()

# Configuration
MINIO_USER = os.getenv("MINIO_USER")
MINIO_PASSWORD = os.getenv("MINIO_PASSWORD")
MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT")

# Surface missing settings here instead of as an opaque client error later.
# An unset endpoint is the dangerous one: with endpoint_url=None the client
# constructs its default endpoint rather than talking to MinIO.
_missing_env = [
    name
    for name, value in (
        ("MINIO_USER", MINIO_USER),
        ("MINIO_PASSWORD", MINIO_PASSWORD),
        ("MINIO_ENDPOINT", MINIO_ENDPOINT),
    )
    if not value
]
if _missing_env:
    print(f"⚠ Missing environment variables: {', '.join(_missing_env)} — MinIO access will likely fail")

# Data paths in MinIO
TRUSTED_BUCKET = "trusted-zone"
TRUSTED_DOCS_KEY = "documents/recipes.jsonl"
TRUSTED_IMAGES_PREFIX = "images"

# Fine-tuning zone configuration (for storing training datasets and images in MinIO)
FINE_TUNING_BUCKET = "fine-tuning-zone"
FINE_TUNING_PREFIX = "datasets"
FINE_TUNING_IMAGES_PREFIX = "images"  # Store copies of training images here

# Output paths (local)
OUTPUT_DIR = Path(".")  # Current directory (fine_tuning/training_data/)
TRAIN_OUTPUT_FILE = OUTPUT_DIR / "train_pairs_positive.csv"
TEST_OUTPUT_FILE = OUTPUT_DIR / "test_pairs_positive.csv"

# Split configuration
TEST_SIZE = 0.2  # 80% train, 20% test
RANDOM_SEED = 42  # Fixed seed for reproducibility

print("Configuration:")
print(f"  MinIO Endpoint: {MINIO_ENDPOINT}")
print(f"  Trusted Bucket: {TRUSTED_BUCKET}")
print(f"  Output Directory: {OUTPUT_DIR.absolute()}")
print(f"  Test Size: {TEST_SIZE * 100:.0f}%")
print(f"  Random Seed: {RANDOM_SEED}")

✓ Loaded .env from: /home/didac/Desktop/upc/adsdb/adsdb-multimodal-food-data-management/notebooks/.env
Configuration:
  MinIO Endpoint: http://localhost:9000
  Trusted Bucket: trusted-zone
  Output Directory: /home/didac/Desktop/upc/adsdb/adsdb-multimodal-food-data-management/fine_tuning/training_data
  Test Size: 20%
  Random Seed: 42


In [2]:
# Initialize S3/MinIO client
# Credentials and endpoint are the MINIO_* values read from the environment
# in the configuration cell.
session = boto3.session.Session(
    aws_access_key_id=MINIO_USER,
    aws_secret_access_key=MINIO_PASSWORD,
    region_name="us-east-1"  # NOTE(review): presumably a placeholder region for MinIO — confirm
)
# `s3` is the module-level client that every helper in this notebook uses.
# The client config requests SigV4 signing and path-style addressing.
s3 = session.client(
    "s3",
    endpoint_url=MINIO_ENDPOINT,
    config=Config(signature_version="s3v4", s3={"addressing_style": "path"})
)

def check_bucket_exists(bucket: str) -> bool:
    """
    Report whether `bucket` answers a HEAD request from the shared `s3` client.

    Returns:
        True when `head_bucket` succeeds, False when it raises `ClientError`
        (the error details are discarded).
    """
    try:
        s3.head_bucket(Bucket=bucket)
    except ClientError:
        return False
    return True

# Connectivity smoke test: the trusted bucket must be reachable before loading anything
if not check_bucket_exists(TRUSTED_BUCKET):
    print(f"⚠ Could not access bucket '{TRUSTED_BUCKET}' — check MinIO is running and credentials are correct")
else:
    print(f"✓ Successfully connected to MinIO, bucket '{TRUSTED_BUCKET}' is accessible")

✓ Successfully connected to MinIO, bucket 'trusted-zone' is accessible


## 2. Load Data from Trusted Zone

We load the recipe documents (JSONL) and build the recipe-to-image mapping by listing the image objects in the trusted zone.

In [3]:
def load_recipes_from_s3(bucket: str, key: str) -> List[Dict]:
    """
    Read a JSONL object from S3/MinIO and parse each non-empty line as JSON.

    Args:
        bucket: Bucket holding the JSONL object
        key: Object key of the JSONL file

    Returns:
        The parsed records in file order. When the request raises
        `ClientError`, the error is printed and whatever was parsed so far
        (normally an empty list) is returned.
    """
    loaded: List[Dict] = []
    try:
        response = s3.get_object(Bucket=bucket, Key=key)
        # Blank lines are skipped; every other line must be valid JSON.
        loaded.extend(json.loads(raw) for raw in response["Body"].iter_lines() if raw)
        print(f"✓ Loaded {len(loaded)} recipes from s3://{bucket}/{key}")
    except ClientError as err:
        print(f"✗ Failed to load recipes: {err}")
    return loaded

# Load recipes from trusted zone
# `recipes` is an empty list when the load failed (the loader prints the error).
recipes = load_recipes_from_s3(TRUSTED_BUCKET, TRUSTED_DOCS_KEY)

if recipes:
    # Show sample record structure
    # Only the first record is inspected; at most 15 keys and 80 title
    # characters are shown, and the trailing "..." is printed unconditionally.
    sample = recipes[0]
    print(f"\nSample recipe keys: {list(sample.keys())[:15]}...")
    print(f"Sample ID: {sample.get('id')}")
    print(f"Sample title (raw): {sample.get('title_text_raw', 'N/A')[:80]}...")

✓ Loaded 272 recipes from s3://trusted-zone/documents/recipes.jsonl

Sample recipe keys: ['valid__from_det_ingrs', 'id', 'ingredients__from_det_ingrs', 'ingredients__from_layer1', 'title__from_layer1', 'instructions__from_layer1', 'has_nutrition_data', 'title_text_raw', 'ingredients_text_raw', 'instructions_text_raw', 'title_text_clean', 'ingredients_text_clean', 'instructions_text_clean', 'nutrition_normalized']...
Sample ID: 00003a70b1
Sample title (raw): Crunchy Onion Potato Bake...


In [4]:
def list_trusted_images(bucket: str, prefix: str) -> Dict[str, List[str]]:
    """
    Group the image objects under `prefix` by the recipe ID in their filename.

    A filename is recognised when it ends in `__<recipeId>_<position>.<ext>`
    with an image extension, e.g. "type$src$ts$hash__000018c8a5_0.jpg"
    yields recipe ID "000018c8a5". Objects whose name does not match are
    skipped and not counted.

    Args:
        bucket: Bucket to list
        prefix: Key prefix passed to the listing call

    Returns:
        Mapping from recipe ID to its image keys, each list sorted.
    """
    import re

    recipe_id_re = re.compile(
        r"__([A-Za-z0-9_\-]+)_(\d+)\.(?:jpe?g|png|webp|gif|bmp|tiff)$",
        re.IGNORECASE,
    )

    grouped: Dict[str, List[str]] = {}
    matched = 0

    pages = s3.get_paginator("list_objects_v2").paginate(Bucket=bucket, Prefix=prefix)
    for page in pages:
        for entry in page.get("Contents", []):
            object_key = entry["Key"]
            # Keys ending in "/" are folder placeholders, not images.
            if object_key.endswith("/"):
                continue

            found = recipe_id_re.search(PurePosixPath(object_key).name)
            if found is None:
                continue

            grouped.setdefault(found.group(1), []).append(object_key)
            matched += 1

    # Sorted lists keep the pair order deterministic across runs.
    for keys in grouped.values():
        keys.sort()

    print(f"✓ Found {matched} images for {len(grouped)} unique recipes")
    return grouped

# Build image mapping from trusted zone
# Result: recipe ID -> sorted list of image keys found under the images prefix.
recipe_to_images = list_trusted_images(TRUSTED_BUCKET, TRUSTED_IMAGES_PREFIX)

if recipe_to_images:
    # Show distribution of images per recipe
    # Here `images_per_recipe` is a plain list of per-recipe image counts.
    images_per_recipe = [len(v) for v in recipe_to_images.values()]
    print(f"\nImages per recipe distribution:")
    print(f"  Min: {min(images_per_recipe)}, Max: {max(images_per_recipe)}, Mean: {sum(images_per_recipe)/len(images_per_recipe):.2f}")

✓ Found 462 images for 269 unique recipes

Images per recipe distribution:
  Min: 1, Max: 11, Mean: 1.72


## 3. Build Image-Text Positive Pairs

For each recipe that has both text and images, we create positive pairs:
- **Image**: S3 key (path) to the image in the trusted zone
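- **Text**: Recipe title — this will serve as the caption for contrastive learning. By default the raw title (`title_text_raw`) is used, falling back to the cleaned title (`title_text_clean`) and other title fields when it is empty.

We use the recipe title as it provides a concise, descriptive caption that aligns well with the visual content of food images.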

In [5]:
def build_positive_pairs(
    recipes: List[Dict],
    recipe_to_images: Dict[str, List[str]],
    text_field: str = "title_text_raw",
    fallback_fields: Optional[List[str]] = None
) -> pd.DataFrame:
    """
    Build a DataFrame of positive image-text pairs.

    For each recipe with images, creates one row per image with:
    - recipe_id: Unique identifier
    - image_key: S3 path to the image
    - caption: Text description (title), stripped of surrounding whitespace

    Recipes without a usable caption, and image groups whose recipe ID is
    not present in `recipes`, are skipped.

    Args:
        recipes: List of recipe dictionaries from trusted zone
        recipe_to_images: Mapping from recipe ID to list of image keys
        text_field: Primary field to use for caption
        fallback_fields: Alternative fields tried in order if the primary is
            empty; defaults to ["title_text_clean", "title__from_layer1", "title"]

    Returns:
        DataFrame with columns: recipe_id, image_key, caption. The columns
        are present even when no pair could be built.
    """
    if fallback_fields is None:
        fallback_fields = ["title_text_clean", "title__from_layer1", "title"]

    # Build recipe lookup by ID (records without an ID are ignored)
    recipe_by_id = {r.get("id"): r for r in recipes if r.get("id")}

    pairs = []
    for recipe_id, image_keys in recipe_to_images.items():
        recipe = recipe_by_id.get(recipe_id)
        if not recipe:
            continue

        # First non-blank string among the primary field and its fallbacks.
        # Non-string values (e.g. a JSON null) are skipped instead of crashing
        # on `.strip()`.
        caption = ""
        for field in (text_field, *fallback_fields):
            value = recipe.get(field)
            if isinstance(value, str) and value.strip():
                caption = value.strip()
                break

        if not caption:
            continue

        # Create one pair per image
        pairs.extend(
            {"recipe_id": recipe_id, "image_key": image_key, "caption": caption}
            for image_key in image_keys
        )

    # Explicit columns keep the schema stable when `pairs` is empty; without
    # them the summary line below fails on the missing 'recipe_id' column.
    df = pd.DataFrame(pairs, columns=["recipe_id", "image_key", "caption"])
    print(f"✓ Built {len(df)} image-text pairs from {df['recipe_id'].nunique()} unique recipes")
    return df

# Build positive pairs
# Uses the function defaults, i.e. the caption comes from `title_text_raw`
# with the built-in title fallbacks.
pairs_df = build_positive_pairs(recipes, recipe_to_images)

if not pairs_df.empty:
    print(f"\nDataFrame shape: {pairs_df.shape}")
    print(f"\nSample pairs:")
    # `display` is the notebook's rich renderer; only the first 10 rows are shown.
    display(pairs_df.head(10))

✓ Built 462 image-text pairs from 269 unique recipes

DataFrame shape: (462, 3)

Sample pairs:


Unnamed: 0,recipe_id,image_key,caption
0,004205a8a0,images/image$adsdb-multimodal-food-data-manage...,Banana French Toast
1,004205a8a0,images/image$adsdb-multimodal-food-data-manage...,Banana French Toast
2,004205a8a0,images/image$adsdb-multimodal-food-data-manage...,Banana French Toast
3,004205a8a0,images/image$adsdb-multimodal-food-data-manage...,Banana French Toast
4,004205a8a0,images/image$adsdb-multimodal-food-data-manage...,Banana French Toast
5,0036b28b5f,images/image$adsdb-multimodal-food-data-manage...,Pain Au Riz (Rice Bread)
6,0036b28b5f,images/image$adsdb-multimodal-food-data-manage...,Pain Au Riz (Rice Bread)
7,0021f004a6,images/image$adsdb-multimodal-food-data-manage...,Soured Milk Cake
8,0021f004a6,images/image$adsdb-multimodal-food-data-manage...,Soured Milk Cake
9,0021f004a6,images/image$adsdb-multimodal-food-data-manage...,Soured Milk Cake


## 4. Data Cleaning

Clean the dataset by:
1. Removing rows with missing image keys or captions
2. Removing duplicate pairs
3. Removing very short or very long captions (potential noise)

In [6]:
def clean_pairs_dataframe(
    df: pd.DataFrame,
    min_caption_len: int = 3,
    max_caption_len: int = 200
) -> pd.DataFrame:
    """
    Clean the pairs DataFrame by removing invalid/duplicate entries.

    Steps, each reported on stdout: drop rows with a missing image key or
    caption, drop blank strings, keep captions whose length lies in
    [min_caption_len, max_caption_len], drop duplicate (image_key, caption)
    rows, and reset the index. The input frame is not modified.

    Args:
        df: Input DataFrame with columns (recipe_id, image_key, caption)
        min_caption_len: Minimum caption length in characters (inclusive)
        max_caption_len: Maximum caption length in characters (inclusive)

    Returns:
        Cleaned DataFrame with a fresh 0..n-1 index. An empty input is
        returned as an empty copy without running the filters.
    """
    initial_count = len(df)
    print(f"Initial count: {initial_count} pairs")

    # Nothing to clean: return early so the retention percentage below never
    # divides by zero and a column-less empty frame does not break the filters.
    if initial_count == 0:
        print("\n⚠ Input is empty — nothing to clean")
        return df.reset_index(drop=True)

    # 1. Remove rows with missing values
    df = df.dropna(subset=["image_key", "caption"])
    print(f"After removing missing values: {len(df)} pairs (removed {initial_count - len(df)})")

    # 2. Remove empty / whitespace-only strings
    df = df[df["image_key"].str.strip() != ""]
    df = df[df["caption"].str.strip() != ""]
    print(f"After removing empty strings: {len(df)} pairs")

    # 3. Filter by caption length (both bounds inclusive)
    caption_len = df["caption"].str.len()
    df = df[(caption_len >= min_caption_len) & (caption_len <= max_caption_len)]
    print(f"After caption length filter [{min_caption_len}, {max_caption_len}]: {len(df)} pairs")

    # 4. Remove exact duplicates (same image_key + caption)
    before_dedup = len(df)
    df = df.drop_duplicates(subset=["image_key", "caption"])
    print(f"After removing duplicates: {len(df)} pairs (removed {before_dedup - len(df)} duplicates)")

    # 5. Reset index
    df = df.reset_index(drop=True)

    print(f"\n✓ Final cleaned dataset: {len(df)} pairs ({len(df)/initial_count*100:.1f}% retained)")
    return df

# Clean the dataset
# The banner frames the step-by-step report printed by clean_pairs_dataframe.
print("=" * 60)
print("Data Cleaning Report")
print("=" * 60)
# Default caption length bounds are used; `pairs_df` itself is left untouched.
cleaned_df = clean_pairs_dataframe(pairs_df)

Data Cleaning Report
Initial count: 462 pairs
After removing missing values: 462 pairs (removed 0)
After removing empty strings: 462 pairs
After caption length filter [3, 200]: 462 pairs
After removing duplicates: 462 pairs (removed 0 duplicates)

✓ Final cleaned dataset: 462 pairs (100.0% retained)


In [7]:
# Display cleaned data statistics
# Read-only summary of `cleaned_df`; nothing here modifies the dataset.
print("\n" + "=" * 60)
print("Cleaned Dataset Statistics")
print("=" * 60)

print(f"\nTotal pairs: {len(cleaned_df)}")
print(f"Unique recipes: {cleaned_df['recipe_id'].nunique()}")
print(f"Unique images: {cleaned_df['image_key'].nunique()}")
print(f"Unique captions: {cleaned_df['caption'].nunique()}")

# Caption length statistics
# Lengths are measured in characters of the stored caption string.
caption_lengths = cleaned_df["caption"].str.len()
print(f"\nCaption length statistics:")
print(f"  Min: {caption_lengths.min()} characters")
print(f"  Max: {caption_lengths.max()} characters")
print(f"  Mean: {caption_lengths.mean():.1f} characters")
print(f"  Median: {caption_lengths.median():.1f} characters")

# Images per recipe
# NOTE: this rebinds `images_per_recipe` to a pandas Series of row counts per
# recipe_id, computed from the cleaned pairs.
images_per_recipe = cleaned_df.groupby("recipe_id").size()
print(f"\nImages per recipe:")
print(f"  Min: {images_per_recipe.min()}")
print(f"  Max: {images_per_recipe.max()}")
print(f"  Mean: {images_per_recipe.mean():.2f}")

# Sample captions
# Seeded with RANDOM_SEED so the same five captions are shown on every run;
# captions longer than 80 characters are truncated with "...".
print(f"\nSample captions:")
for i, caption in enumerate(cleaned_df["caption"].sample(5, random_state=RANDOM_SEED).values):
    print(f"  {i+1}. {caption[:80]}{'...' if len(caption) > 80 else ''}")


Cleaned Dataset Statistics

Total pairs: 462
Unique recipes: 269
Unique images: 462
Unique captions: 268

Caption length statistics:
  Min: 9 characters
  Max: 118 characters
  Mean: 28.5 characters
  Median: 26.0 characters

Images per recipe:
  Min: 1
  Max: 11
  Mean: 1.72

Sample captions:
  1. Peg's Chili
  2. Classic Lasagna
  3. Aunt Marie's Peas
  4. Roast Salmon With Spiced Coconut Crumbs
  5. Crunchy Onion Potato Bake


## 5. Train/Test Split

Split the data into training and test sets with a fixed random seed for reproducibility.

**Important:** We split by recipe ID (not by individual pairs) to prevent data leakage — all images from the same recipe go to either train or test, not both.

In [8]:
def split_by_recipe(
    df: pd.DataFrame,
    test_size: float = 0.2,
    random_seed: int = 42
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Partition the pairs into train and test sets at recipe granularity.

    The unique recipe IDs are split first, then every row follows its
    recipe, so no recipe contributes images to both sets.

    Args:
        df: Input DataFrame with 'recipe_id' column
        test_size: Fraction of recipes (not rows) assigned to the test set
        random_seed: Seed passed to the splitter for reproducibility

    Returns:
        Tuple of (train_df, test_df), each with a fresh 0..n-1 index
    """
    unique_ids = df["recipe_id"].unique()

    train_ids, test_ids = train_test_split(
        unique_ids, test_size=test_size, random_state=random_seed
    )

    # Row membership follows the recipe-level assignment.
    in_train = df["recipe_id"].isin(train_ids)
    in_test = df["recipe_id"].isin(test_ids)
    train_df = df.loc[in_train].copy().reset_index(drop=True)
    test_df = df.loc[in_test].copy().reset_index(drop=True)

    # The row-level ratio can differ from `test_size` because recipes carry
    # different numbers of images.
    print(f"Split by recipe ID (seed={random_seed}):")
    print(f"  Train: {len(train_df)} pairs ({len(train_ids)} recipes)")
    print(f"  Test: {len(test_df)} pairs ({len(test_ids)} recipes)")
    print(f"  Actual test ratio: {len(test_df) / len(df) * 100:.1f}%")

    return train_df, test_df

# Perform split
# Recipe-level split using the TEST_SIZE and RANDOM_SEED configuration constants.
train_df, test_df = split_by_recipe(cleaned_df, test_size=TEST_SIZE, random_seed=RANDOM_SEED)

Split by recipe ID (seed=42):
  Train: 365 pairs (215 recipes)
  Test: 97 pairs (54 recipes)
  Actual test ratio: 21.0%


In [9]:
# Sanity check: a recipe ID must never appear in both splits
train_recipes = set(train_df["recipe_id"])
test_recipes = set(test_df["recipe_id"])
overlap = train_recipes & test_recipes

print(f"\nVerification:")
print(f"  Train recipe IDs: {len(train_recipes)}")
print(f"  Test recipe IDs: {len(test_recipes)}")
print(f"  Overlap: {len(overlap)} recipes")

if overlap:
    print("  ⚠ Warning: There is overlap between train and test sets!")
else:
    print("  ✓ No data leakage - train and test sets are properly separated")


Verification:
  Train recipe IDs: 215
  Test recipe IDs: 54
  Overlap: 0 recipes
  ✓ No data leakage - train and test sets are properly separated


## 6. Copy Images to Fine-Tuning Zone

Copy all training and test images from the trusted zone to the fine-tuning zone for tidiness. This keeps all fine-tuning data in one bucket.

In [10]:
def copy_image_to_fine_tuning(
    src_bucket: str,
    src_key: str,
    dst_bucket: str,
    dst_prefix: str
) -> Optional[str]:
    """
    Copy an image from trusted zone to fine-tuning zone.

    The destination key is `<dst_prefix>/<filename>`, where the filename is
    the last path component of `src_key`; any source sub-folders are dropped.

    Args:
        src_bucket: Source bucket (trusted-zone)
        src_key: Source image key
        dst_bucket: Destination bucket (fine-tuning-zone)
        dst_prefix: Destination prefix (images)

    Returns:
        New image key in destination bucket, or None if failed. Failures are
        reported on stdout and never raised, so a batch copy keeps going.
    """
    try:
        # Extract just the filename from the source key
        filename = PurePosixPath(src_key).name
        dst_key = f"{dst_prefix}/{filename}"

        # Server-side copy: the object is not downloaded by this notebook
        s3.copy_object(
            Bucket=dst_bucket,
            Key=dst_key,
            CopySource={"Bucket": src_bucket, "Key": src_key}
        )
        return dst_key
    except Exception as e:
        # Deliberately best-effort, but say why: a bare `return None` made
        # failed copies impossible to diagnose from the summary counts.
        print(f"  ✗ Failed to copy s3://{src_bucket}/{src_key} to s3://{dst_bucket}/{dst_prefix}/: {e}")
        return None


def copy_dataset_images(
    df: pd.DataFrame,
    src_bucket: str,
    dst_bucket: str,
    dst_prefix: str,
    split_name: str = "train"
) -> pd.DataFrame:
    """
    Copy every distinct image referenced by `df` into the fine-tuning zone
    and return a copy of `df` whose image_key column points at the copies.

    Each distinct key is copied once. A key whose copy fails keeps its
    original value in the result.

    Args:
        df: DataFrame with image_key column
        src_bucket: Source bucket
        dst_bucket: Destination bucket
        dst_prefix: Destination prefix
        split_name: Name of the split (for logging)

    Returns:
        New DataFrame with updated image_key values; the input is not modified
    """
    print(f"Copying {split_name} images to s3://{dst_bucket}/{dst_prefix}/...")

    result = df.copy()
    key_mapping = {}
    copied = failed = 0

    for src_key in result["image_key"].unique():
        dst_key = copy_image_to_fine_tuning(src_bucket, src_key, dst_bucket, dst_prefix)
        if dst_key:
            copied += 1
        else:
            failed += 1
        # Fall back to the source key when the copy did not succeed.
        key_mapping[src_key] = dst_key or src_key

    result["image_key"] = result["image_key"].map(key_mapping)

    print(f"  ✓ Copied {copied} images, {failed} failed")
    return result


# Ensure fine-tuning bucket exists
def ensure_bucket_exists_for_copy(bucket: str) -> bool:
    """
    Create bucket if it doesn't exist.

    Stays silent when the bucket is already there, but reports every
    failure on stdout so the caller's generic warning can be diagnosed.

    Args:
        bucket: Name of the bucket to check or create

    Returns:
        True if the bucket exists or was created, False otherwise
    """
    try:
        s3.head_bucket(Bucket=bucket)
        return True
    except ClientError as e:
        error_code = e.response.get("Error", {}).get("Code", "")
        # Only a "not found" answer justifies creating the bucket; anything
        # else (e.g. access denied) is reported and treated as failure.
        if error_code not in ("404", "NoSuchBucket"):
            print(f"✗ Error checking bucket '{bucket}': {e}")
            return False

    try:
        s3.create_bucket(Bucket=bucket)
    except ClientError as create_error:
        print(f"✗ Failed to create bucket '{bucket}': {create_error}")
        return False

    print(f"✓ Created bucket '{bucket}'")
    return True

# Mirror the images of both splits into the fine-tuning bucket
print("=" * 60)
print("Copying Images to Fine-Tuning Zone")
print("=" * 60)

if not ensure_bucket_exists_for_copy(FINE_TUNING_BUCKET):
    print("⚠ Could not access fine-tuning bucket, keeping original image paths")
else:
    # Train split: rebinding train_df so its image_key column points at the copies
    train_df = copy_dataset_images(
        train_df, TRUSTED_BUCKET, FINE_TUNING_BUCKET, FINE_TUNING_IMAGES_PREFIX, split_name="train"
    )

    # Test split: same treatment
    test_df = copy_dataset_images(
        test_df, TRUSTED_BUCKET, FINE_TUNING_BUCKET, FINE_TUNING_IMAGES_PREFIX, split_name="test"
    )

    print(f"\n✓ All images copied to s3://{FINE_TUNING_BUCKET}/{FINE_TUNING_IMAGES_PREFIX}/")

Copying Images to Fine-Tuning Zone
Copying train images to s3://fine-tuning-zone/images/...
  ✓ Copied 365 images, 0 failed
Copying test images to s3://fine-tuning-zone/images/...
  ✓ Copied 97 images, 0 failed

✓ All images copied to s3://fine-tuning-zone/images/


## 7. Save Output Files (Local)

Save the train and test splits as CSV files locally for convenience.

In [11]:
def save_pairs_to_csv(df: pd.DataFrame, filepath: Path) -> None:
    """
    Write the pairs DataFrame to `filepath` as UTF-8 CSV without the index.

    Missing parent directories are created, and a one-line summary with the
    row count and file size is printed.

    Args:
        df: DataFrame to save
        filepath: Output file path
    """
    target_dir = filepath.parent
    target_dir.mkdir(parents=True, exist_ok=True)

    df.to_csv(filepath, index=False, encoding="utf-8")

    # Size is read back from disk, so it reflects the bytes actually written.
    size_on_disk_kb = filepath.stat().st_size / 1024
    print(f"✓ Saved {len(df)} pairs to {filepath} ({size_on_disk_kb:.1f} KB)")

# Save train and test files
# Local copies land next to the notebook (OUTPUT_DIR is the current directory).
print("=" * 60)
print("Saving Output Files")
print("=" * 60)

# Existing files with the same names are overwritten.
save_pairs_to_csv(train_df, TRAIN_OUTPUT_FILE)
save_pairs_to_csv(test_df, TEST_OUTPUT_FILE)

Saving Output Files
✓ Saved 365 pairs to train_pairs_positive.csv (58.5 KB)
✓ Saved 97 pairs to test_pairs_positive.csv (15.3 KB)


## 8. Preview Saved Files

In [12]:
# Preview the saved CSV files
# The files are re-read from disk so the preview shows what was actually
# written, not the in-memory frames.
print("Preview of train_pairs_positive.csv:")
print("-" * 60)
display(pd.read_csv(TRAIN_OUTPUT_FILE).head(10))

print("\nPreview of test_pairs_positive.csv:")
print("-" * 60)
display(pd.read_csv(TEST_OUTPUT_FILE).head(10))

Preview of train_pairs_positive.csv:
------------------------------------------------------------


Unnamed: 0,recipe_id,image_key,caption
0,004205a8a0,images/image$adsdb-multimodal-food-data-manage...,Banana French Toast
1,004205a8a0,images/image$adsdb-multimodal-food-data-manage...,Banana French Toast
2,004205a8a0,images/image$adsdb-multimodal-food-data-manage...,Banana French Toast
3,004205a8a0,images/image$adsdb-multimodal-food-data-manage...,Banana French Toast
4,004205a8a0,images/image$adsdb-multimodal-food-data-manage...,Banana French Toast
5,0036b28b5f,images/image$adsdb-multimodal-food-data-manage...,Pain Au Riz (Rice Bread)
6,0036b28b5f,images/image$adsdb-multimodal-food-data-manage...,Pain Au Riz (Rice Bread)
7,0021f004a6,images/image$adsdb-multimodal-food-data-manage...,Soured Milk Cake
8,0021f004a6,images/image$adsdb-multimodal-food-data-manage...,Soured Milk Cake
9,0021f004a6,images/image$adsdb-multimodal-food-data-manage...,Soured Milk Cake



Preview of test_pairs_positive.csv:
------------------------------------------------------------


Unnamed: 0,recipe_id,image_key,caption
0,004a63989e,images/image$adsdb-multimodal-food-data-manage...,Lighter Spicy Garlic Shrimp
1,002481e577,images/image$adsdb-multimodal-food-data-manage...,Quick & Easy Chicken Parmigiana
2,002481e577,images/image$adsdb-multimodal-food-data-manage...,Quick & Easy Chicken Parmigiana
3,002481e577,images/image$adsdb-multimodal-food-data-manage...,Quick & Easy Chicken Parmigiana
4,0044818076,images/image$adsdb-multimodal-food-data-manage...,Noodles With Spicy Peanut Sauce
5,0044818076,images/image$adsdb-multimodal-food-data-manage...,Noodles With Spicy Peanut Sauce
6,0044818076,images/image$adsdb-multimodal-food-data-manage...,Noodles With Spicy Peanut Sauce
7,0044818076,images/image$adsdb-multimodal-food-data-manage...,Noodles With Spicy Peanut Sauce
8,0044818076,images/image$adsdb-multimodal-food-data-manage...,Noodles With Spicy Peanut Sauce
9,0044818076,images/image$adsdb-multimodal-food-data-manage...,Noodles With Spicy Peanut Sauce


## 9. Upload Datasets to MinIO

Store the training datasets in the `fine-tuning-zone` bucket for centralized access and version control. This keeps all pipeline data organized in MinIO.

In [13]:
def ensure_bucket_exists(bucket: str) -> bool:
    """
    Make sure `bucket` is available, creating it when it is reported missing.

    Every outcome is printed: already present, created, creation failed, or
    the existence check failed for a reason other than "not found".

    Returns:
        True if the bucket exists or was created successfully, False otherwise.
    """
    try:
        s3.head_bucket(Bucket=bucket)
    except ClientError as e:
        error_code = e.response.get("Error", {}).get("Code", "")

        # Anything other than "not found" is a genuine error, not a missing bucket.
        if error_code not in ("404", "NoSuchBucket"):
            print(f"✗ Error checking bucket '{bucket}': {e}")
            return False

        try:
            s3.create_bucket(Bucket=bucket)
        except ClientError as create_error:
            print(f"✗ Failed to create bucket '{bucket}': {create_error}")
            return False

        print(f"✓ Created bucket '{bucket}'")
        return True

    print(f"✓ Bucket '{bucket}' already exists")
    return True


def upload_csv_to_minio(df: pd.DataFrame, bucket: str, key: str) -> bool:
    """
    Serialize `df` to CSV (no index, UTF-8) and store it as one object in MinIO.

    The object carries metadata describing the dataset: row count, column
    names, creation timestamp, and the RANDOM_SEED / TEST_SIZE notebook
    settings.

    Args:
        df: DataFrame to upload
        bucket: Target bucket name
        key: Object key (path) in the bucket

    Returns:
        True if upload succeeded, False if the request raised `ClientError`
        (the error is printed)
    """
    try:
        # With no target given, to_csv returns the CSV text directly.
        payload = df.to_csv(index=False).encode("utf-8")

        object_metadata = {
            "rows": str(len(df)),
            "columns": ",".join(df.columns.tolist()),
            "created_at": pd.Timestamp.now().isoformat(),
            "random_seed": str(RANDOM_SEED),
            "test_size": str(TEST_SIZE),
        }

        s3.put_object(
            Bucket=bucket,
            Key=key,
            Body=payload,
            ContentType="text/csv",
            Metadata=object_metadata,
        )
    except ClientError as e:
        print(f"✗ Failed to upload to s3://{bucket}/{key}: {e}")
        return False

    size_kb = len(payload) / 1024
    print(f"✓ Uploaded {len(df)} rows to s3://{bucket}/{key} ({size_kb:.1f} KB)")
    return True


# Publish both splits to the fine-tuning bucket
print("=" * 60)
print("Uploading Datasets to MinIO")
print("=" * 60)

if not ensure_bucket_exists(FINE_TUNING_BUCKET):
    print("⚠ Could not create/access fine-tuning bucket. Datasets saved locally only.")
else:
    # Object keys for the two CSV files under the datasets prefix
    train_key = f"{FINE_TUNING_PREFIX}/train_pairs_positive.csv"
    test_key = f"{FINE_TUNING_PREFIX}/test_pairs_positive.csv"

    upload_csv_to_minio(train_df, FINE_TUNING_BUCKET, train_key)
    upload_csv_to_minio(test_df, FINE_TUNING_BUCKET, test_key)

    print(f"\n✅ Datasets available at:")
    print(f"   s3://{FINE_TUNING_BUCKET}/{train_key}")
    print(f"   s3://{FINE_TUNING_BUCKET}/{test_key}")

Uploading Datasets to MinIO
✓ Bucket 'fine-tuning-zone' already exists
✓ Uploaded 365 rows to s3://fine-tuning-zone/datasets/train_pairs_positive.csv (58.5 KB)
✓ Uploaded 97 rows to s3://fine-tuning-zone/datasets/test_pairs_positive.csv (15.3 KB)

✅ Datasets available at:
   s3://fine-tuning-zone/datasets/train_pairs_positive.csv
   s3://fine-tuning-zone/datasets/test_pairs_positive.csv
