In [2]:
!git clone https://github.com/ckapelonis02/sam2-fine-tune.git
%cd /kaggle/working/sam2-fine-tune

Cloning into 'sam2-fine-tune'...
remote: Enumerating objects: 355, done.[K
remote: Counting objects: 100% (71/71), done.[K
remote: Compressing objects: 100% (52/52), done.[K
remote: Total 355 (delta 39), reused 49 (delta 19), pack-reused 284 (from 1)[K
Receiving objects: 100% (355/355), 82.99 MiB | 44.61 MiB/s, done.
Resolving deltas: 100% (45/45), done.
/kaggle/working/sam2-fine-tune


In [3]:
%pip install -e .

Obtaining file:///kaggle/working/sam2-fine-tune
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Collecting hydra-core>=1.3.2 (from SAM-2==1.0)
  Using cached hydra_core-1.3.2-py3-none-any.whl.metadata (5.5 kB)
Collecting iopath>=0.1.10 (from SAM-2==1.0)
  Using cached iopath-0.1.10.tar.gz (42 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting portalocker (from iopath>=0.1.10->SAM-2==1.0)
  Using cached portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Using cached hydra_core-1.3.2-py3-none-any.whl (154 kB)
Using cached portalocker-3.1.1-py3-none-any.whl (19 kB)
Building wheels for collected packages: SAM-2, iopath
  Building editable for SAM-2 (pyproject.toml) ... [?25l[?25hdone
  Created wheel for SAM-2: filename=sam_2-1.0-0.editable-cp310-cp310-linux_x86_64.

In [4]:
import kagglehub
path = kagglehub.model_download("metaresearch/segment-anything-2/pyTorch/sam2-hiera-tiny")

In [5]:
import sys
sys.path.append("/kaggle/input/segment-anything-2/pytorch/sam2-hiera-tiny/1/")

In [13]:
import hydra
import numpy as np
import torch
import cv2
import os
import random
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
import matplotlib.pyplot as plt
from tqdm import tqdm
from sam2.build_sam import build_sam2
from sam2.sam2_image_predictor import SAM2ImagePredictor
from sam2.train_helper import *

cleanup()

# Configurations
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:256"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

hydra.core.global_hydra.GlobalHydra.instance().clear()
hydra.initialize_config_module('sam2', version_base='1.2')

# Model Initialization
sam2_model = build_sam2(
    config_file="../sam2_configs/sam2_hiera_t.yaml",
    ckpt_path="/kaggle/input/segment-anything-2/pytorch/sam2-hiera-tiny/1/sam2_hiera_tiny.pt",
    device="cuda",
    apply_postprocessing=False
)
predictor = SAM2ImagePredictor(sam2_model)
predictor.model.sam_mask_decoder.train(True)
predictor.model.sam_prompt_encoder.train(True)

# Optimizer & Scheduler
optimizer = optim.AdamW(predictor.model.parameters(), lr=1e-5, weight_decay=4e-5)
scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=10000, eta_min=1e-7)
scaler = torch.cuda.amp.GradScaler()

# Dataset Configuration
data_size = 2000
file_names = list(range(1, data_size + 1))
random.shuffle(file_names)
train_size = int(0.8 * data_size)
train_files, val_files = file_names[:train_size], file_names[train_size:]

train_data = read_dataset("/kaggle/input/data-2k-cropped/images", "/kaggle/input/data-2k-cropped/masks", train_files)
val_data = read_dataset("/kaggle/input/data-2k-cropped/images", "/kaggle/input/data-2k-cropped/masks", val_files)

# Training Parameters
max_masks = 150
epochs = 10
best_val_iou = 0.0
gradient_accumulation_steps = 4

def process_batch(image, masks, input_point, input_label):
    """Processes a single batch and returns the predicted masks, scores, and ground truth masks."""
    if masks.shape[0] == 0:
        return None, None, None

    predictor.set_image(image)
    
    mask_input, unnorm_coords, labels, _ = predictor._prep_prompts(input_point, input_label, box=None, mask_logits=None, normalize_coords=True)
    sparse_embeddings, dense_embeddings = predictor.model.sam_prompt_encoder(points=(unnorm_coords, labels), boxes=None, masks=None)

    batched_mode = unnorm_coords.shape[0] > 1
    high_res_features = [feat_level[-1].unsqueeze(0) for feat_level in predictor._features["high_res_feats"]]

    low_res_masks, prd_scores, _, _ = predictor.model.sam_mask_decoder(
        image_embeddings=predictor._features["image_embed"][-1].unsqueeze(0),
        image_pe=predictor.model.sam_prompt_encoder.get_dense_pe(),
        sparse_prompt_embeddings=sparse_embeddings,
        dense_prompt_embeddings=dense_embeddings,
        multimask_output=True,
        repeat_image=batched_mode,
        high_res_features=high_res_features
    )

    prd_masks = predictor._transforms.postprocess_masks(low_res_masks, predictor._orig_hw[-1])
    gt_mask = torch.tensor((masks / 255).astype(np.float16), device="cuda")
    prd_mask = torch.sigmoid(prd_masks[:, 0].to(dtype=torch.float16))

    return prd_mask, prd_scores, gt_mask

def compute_iou_loss(prd_mask, prd_scores, gt_mask):
    """Computes IoU, segmentation loss, and score loss."""
    inter = (gt_mask * (prd_mask > 0.5)).sum(dim=[1, 2])
    union = gt_mask.sum(dim=[1, 2]) + (prd_mask > 0.5).sum(dim=[1, 2]) - inter
    iou = inter / (union + 1e-6)
    
    seg_loss = (-gt_mask * torch.log(prd_mask + 1e-6) - (1 - gt_mask) * torch.log((1 - prd_mask) + 1e-6)).mean()
    score_loss = torch.abs(prd_scores[:, 0] - iou).mean()
    
    return iou, seg_loss + score_loss * 0.05

def evaluate():
    """Evaluates the model on the validation dataset."""
    predictor.model.eval()
    total_iou, count = 0, 0

    with torch.no_grad():
        for i in tqdm(range(len(val_files)), desc="Validation Progress"):
            image, masks, input_point, input_label = read_batch(val_data, i, max_masks)
            prd_mask, prd_scores, gt_mask = process_batch(image, masks, input_point, input_label)
            
            if prd_mask is None:
                continue
            
            iou, _ = compute_iou_loss(prd_mask, prd_scores, gt_mask)
            total_iou += iou.mean().item()
            count += 1

    predictor.model.train()
    return total_iou / count if count > 0 else 0

# Training Loop
for epoch in range(epochs):
    mean_iou = 0
    random.shuffle(train_files)
    
    print(f"\nEpoch {epoch+1}/{epochs}")

    for itr in tqdm(range(train_size), desc="Training Progress"):
        with torch.cuda.amp.autocast():
            image, masks, input_point, input_label = read_batch(train_data, itr % train_size, max_masks)
            prd_mask, prd_scores, gt_mask = process_batch(image, masks, input_point, input_label)

            if prd_mask is None:
                continue

            iou, loss = compute_iou_loss(prd_mask, prd_scores, gt_mask)
            loss = loss / gradient_accumulation_steps

            scaler.scale(loss).backward()

            if (itr + 1) % gradient_accumulation_steps == 0:
                scaler.step(optimizer)
                scaler.update()
                predictor.model.zero_grad()

            scheduler.step()
            mean_iou = mean_iou * 0.99 + 0.01 * iou.mean().item()

    val_iou = evaluate()
    print(f"Epoch {epoch+1}: Train IoU = {mean_iou:.4f}, Val IoU = {val_iou:.4f}")

    if val_iou > best_val_iou:
        best_val_iou = val_iou
        torch.save(predictor.model.state_dict(), "best_model.torch")
        print(f"New best model saved, Val IoU = {best_val_iou:.4f}")



Epoch 1/10


Training Progress: 100%|██████████| 1600/1600 [22:37<00:00,  1.18it/s]
Validation Progress: 100%|██████████| 400/400 [03:45<00:00,  1.77it/s]


Epoch 1: Train IoU = 0.8744, Val IoU = 0.8616
New best model saved, Val IoU = 0.8616

Epoch 2/10


Training Progress: 100%|██████████| 1600/1600 [22:07<00:00,  1.21it/s]
Validation Progress: 100%|██████████| 400/400 [03:37<00:00,  1.84it/s]


Epoch 2: Train IoU = 0.8946, Val IoU = 0.8797
New best model saved, Val IoU = 0.8797

Epoch 3/10


Training Progress: 100%|██████████| 1600/1600 [22:08<00:00,  1.20it/s]
Validation Progress: 100%|██████████| 400/400 [03:38<00:00,  1.83it/s]


Epoch 3: Train IoU = 0.9027, Val IoU = 0.8874
New best model saved, Val IoU = 0.8874

Epoch 4/10


Training Progress: 100%|██████████| 1600/1600 [22:11<00:00,  1.20it/s]
Validation Progress: 100%|██████████| 400/400 [03:38<00:00,  1.83it/s]


Epoch 4: Train IoU = 0.9063, Val IoU = 0.8909
New best model saved, Val IoU = 0.8909

Epoch 5/10


Training Progress: 100%|██████████| 1600/1600 [22:10<00:00,  1.20it/s]
Validation Progress: 100%|██████████| 400/400 [03:38<00:00,  1.83it/s]


Epoch 5: Train IoU = 0.9078, Val IoU = 0.8924
New best model saved, Val IoU = 0.8924

Epoch 6/10


Training Progress: 100%|██████████| 1600/1600 [22:09<00:00,  1.20it/s]
Validation Progress: 100%|██████████| 400/400 [03:37<00:00,  1.84it/s]


Epoch 6: Train IoU = 0.9082, Val IoU = 0.8929
New best model saved, Val IoU = 0.8929

Epoch 7/10


Training Progress:  24%|██▍       | 380/1600 [05:17<16:59,  1.20it/s]


KeyboardInterrupt: 

In [14]:
!mkdir /kaggle/working/sam2-fine-tune/results

In [None]:
import numpy as np
import torch
import cv2
import hydra
import matplotlib.pyplot as plt
import os
import time
from PIL import Image
from sam2.build_sam import build_sam2
from sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator
from sam2.test_helper import test_generator
from sam2.train_helper import cleanup

cleanup()

# Configurations
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:256"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

hydra.core.global_hydra.GlobalHydra.instance().clear()
hydra.initialize_config_module('sam2', version_base='1.2')

sam2_model = build_sam2(
    config_file="../sam2_configs/sam2_hiera_t.yaml",
    ckpt_path="/kaggle/input/segment-anything-2/pytorch/sam2-hiera-tiny/1/sam2_hiera_tiny.pt",
    device="cuda",
    apply_postprocessing=False
)

mask_generator = SAM2AutomaticMaskGenerator(
    model=sam2_model,
    points_per_side=32,
    points_per_batch=4,
    pred_iou_thresh=0.75,
    stability_score_thresh=0.92,
    stability_score_offset=0.91,
    mask_threshold=0.4,
    box_nms_thresh=0.7,
    crop_n_layers=2,
    crop_nms_thresh=0.7,
    crop_overlap_ratio=0.3,
    crop_n_points_downscale_factor=2,
    point_grids=None,
    min_mask_region_area=25.0,
    output_mode="binary_mask",
    use_m2m=False,
    multimask_output=True,
    load_model="/kaggle/working/sam2-fine-tune/best_model.torch"
)

import pandas as pd

# Read the CSV file
df = pd.read_csv("/kaggle/input/evaluation-dataset/crops.csv")  # Replace with your actual file path

# Access specific columns
file_names = df["file_name"]
rows = df["rows"]
cols = df["cols"]

# Example: Iterate over the data
for file_name, row, col in zip(file_names, rows, cols):
    print(f"File: {file_name}, Rows: {row}, Cols: {col}")
    start_time = time.time()
    test_generator(
        mask_generator=mask_generator,
        img_path=f"/kaggle/input/evaluation-dataset/evaluation_dataset/images_set/{file_name}.jpg",
        output_path=f"/kaggle/working/sam2-fine-tune/results/{file_name}_{time.time()}.png",
        rows=row,
        cols=col,
        max_mask_crop_region=0.1,
        show_masks=False
    )
    print(f"Time taken: {time.time() - start_time}")


File: butterfly, Rows: 4, Cols: 4
Processing 1 of 16


In [None]:
import optuna
import numpy as np
import time
from sam2.build_sam import build_sam2
from sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator
from sam2.test_helper import test_generator
from evaluate import *

def objective(trial):
    points_per_side = 128
    points_per_batch = 32
    pred_iou_thresh = trial.suggest_float('pred_iou_thresh', 0.5, 0.9)
    stability_score_thresh = trial.suggest_float('stability_score_thresh', 0.7, 0.95)
    stability_score_offset = trial.suggest_float('stability_score_offset', 0.7, 1.2)
    mask_threshold = trial.suggest_float('mask_threshold', 0.0, 0.6)
    box_nms_thresh = 0.7
    crop_n_layers = 2
    crop_nms_thresh = 0.7
    crop_overlap_ratio = 0.3
    crop_n_points_downscale_factor = 2
    min_mask_region_area = 25.0
    use_m2m = False

    sam2_model = build_sam2(
        config_file="../sam2_configs/sam2_hiera_t.yaml",
        ckpt_path="/kaggle/input/segment-anything-2/pytorch/sam2-hiera-tiny/1/sam2_hiera_tiny.pt",
        device="cuda",
        apply_postprocessing=False
    )

    mask_generator = SAM2AutomaticMaskGenerator(
        model=sam2_model,
        points_per_side=points_per_side,
        points_per_batch=points_per_batch,
        pred_iou_thresh=pred_iou_thresh,
        stability_score_thresh=stability_score_thresh,
        stability_score_offset=stability_score_offset,
        mask_threshold=mask_threshold,
        box_nms_thresh=box_nms_thresh,
        crop_n_layers=crop_n_layers,
        crop_nms_thresh=crop_nms_thresh,
        crop_overlap_ratio=crop_overlap_ratio,
        crop_n_points_downscale_factor=crop_n_points_downscale_factor,
        min_mask_region_area=min_mask_region_area,
        use_m2m=use_m2m
    )

    img_path = "/kaggle/input/evaluation-dataset/images_set/butterfly.jpg"
    output_path = "/kaggle/working/sam2-fine-tune/results/butterfly.png"

    start_time = time.time()
    test_generator(
        mask_generator=mask_generator,
        img_path=img_path,
        output_path=output_path,
        rows=1,
        cols=1,
        max_mask_crop_region=0.1,
        show_masks=False
    )
    print(f"Test run took {time.time() - start_time} seconds")

    gt, pred = read_masks("/kaggle/input/evaluation-dataset/masks_set/butterfly.png", output_path)
    metrics = evaluate_pred(gt, pred)
    iou_score = metrics['IoU']

    return iou_score

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

print("Best Hyperparameters:", study.best_params)
print("Best IoU Score:", study.best_value)