In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import os
from dataclasses import dataclass
from transformers import Sam3Processor, Sam3Model
import torch
from PIL import Image
import requests
import cv2
from itertools import combinations
from sklearn.model_selection import train_test_split
import random
from scipy.ndimage import label
from typing import Union

In [None]:
# Imports for embedding models (loaded on-demand to save GPU memory)
from transformers import CLIPProcessor, CLIPModel, AutoImageProcessor, AutoModel, AutoProcessor
import open_clip

# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Load the SAM3 processor and model from the "facebook/sam3" checkpoint.
# The model is moved to the selected device; the processor stays as loaded.
processor = Sam3Processor.from_pretrained("facebook/sam3")
model = Sam3Model.from_pretrained("facebook/sam3").to(device)

In [None]:
from dataclasses import dataclass

@dataclass
class Sam3Results:
    """SAM3 instance-segmentation output for one image, held as NumPy arrays.

    Populated by ``get_sam3_results`` from the ``"masks"``, ``"boxes"`` and
    ``"scores"`` entries of the processor's post-processed results.
    """
    # Masks for the image; iterated one mask at a time by get_candidate_masks
    # (presumably shaped (num_instances, H, W) — TODO confirm).
    masks: np.ndarray
    # Boxes returned alongside the masks (coordinate format not shown in this notebook).
    boxes: np.ndarray
    # Scores returned alongside the masks.
    scores: np.ndarray

@dataclass
class Sample:
    """An image paired with its SAM3 results.

    NOTE(review): not referenced by any other cell visible in this notebook.
    """
    image: Image.Image
    samResults: Sam3Results

@dataclass
class MyImage:
    """An image opened from disk together with its bookkeeping metadata."""
    # The opened PIL image.
    image: Image.Image
    # File name inside the source folder (not the full path).
    file_name: str
    # Position of the file in the folder listing it was loaded from.
    index: int
    # Size of the file on disk in bytes (from os.path.getsize) — not a pixel count.
    size: int
    # Segmentation results; None until get_sam3_results assigns them.
    samResults: Union[Sam3Results, None] = None



In [None]:
def get_sam3_results(
    model: Sam3Model,
    processor: Sam3Processor,
    my_images: list[MyImage],
    batch_size: int = 16,
) -> list[MyImage]:
    """Run SAM3 on every image and attach the results to each ``MyImage``.

    Images are processed in batches with the text prompt ``"distinct object"``.
    Each image's masks, boxes and scores are converted to NumPy arrays and
    stored on ``my_image.samResults`` (the objects are mutated in place).
    Inputs are moved to the notebook-level ``device``.

    Args:
        model: Loaded SAM3 model.
        processor: Matching SAM3 processor (pre- and post-processing).
        my_images: Images to segment.
        batch_size: Number of images per forward pass; must be >= 1.

    Returns:
        The same ``my_images`` list that was passed in.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    total = len(my_images)

    for start in range(0, total, batch_size):
        batch: list[MyImage] = my_images[start:start + batch_size]
        images = [my_image.image for my_image in batch]

        inputs = processor(
            images=images,
            text=["distinct object"] * len(images),
            return_tensors="pt",
        ).to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            results_batch = processor.post_process_instance_segmentation(
                outputs,
                threshold=0.5,
                mask_threshold=0.4,
                target_sizes=inputs.get("original_sizes").tolist(),
            )

        for my_image, results in zip(batch, results_batch):
            my_image.samResults = Sam3Results(
                masks=results["masks"].detach().cpu().numpy(),
                boxes=results["boxes"].detach().cpu().numpy(),
                scores=results["scores"].detach().cpu().numpy(),
            )

        # Drop the batch tensors before the next iteration so GPU memory
        # does not accumulate across batches.
        del inputs, outputs, results_batch
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        print(f"Processed {min(start + batch_size, total)}/{total} samples")

    return my_images



In [None]:

# get a list of images from a folder
def get_images_from_folder(folder_path: str) -> list[MyImage]:
    """Open every regular file in ``folder_path`` as a ``MyImage``.

    Files are visited in sorted name order so that ``index`` (and any later
    positional slicing of the returned list) is reproducible between runs;
    ``os.listdir`` alone returns entries in arbitrary order. Sub-directories
    (for example ``.ipynb_checkpoints``) are skipped.

    Args:
        folder_path: Directory containing the image files.

    Returns:
        One ``MyImage`` per file, carrying the opened PIL image, the file
        name, its position in the sorted listing and its size on disk in bytes.
    """
    file_names = sorted(
        f for f in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, f))
    )

    my_images = []
    for i, f in enumerate(file_names):
        path = os.path.join(folder_path, f)
        my_images.append(MyImage(Image.open(path), f, i, os.path.getsize(path)))
    return my_images



# Open every file in the authentic training folder; the trailing expression
# displays how many were found.
authentic_images = get_images_from_folder("./train_images/authentic/")
len(authentic_images)

In [None]:
# Keep only images whose file on disk is smaller than 2,000,000 bytes
# (MyImage.size is the file size), then display how many remain.
authentic_images = [img for img in authentic_images if img.size < 2_000_000]
len(authentic_images)

In [None]:
# Segment every remaining image. The [:] slice passes a shallow copy of the list,
# but the MyImage objects themselves are shared, so samResults is also set on the
# entries of authentic_images.
sam3_results = get_sam3_results(model, processor, authentic_images[:])

In [None]:
# Pick one processed image (by list position) for the spot checks in the following cells.
example = 8
sample_image = sam3_results[example]

In [None]:
def get_candidate_masks(sample_image: "MyImage"):
    """Return the masks that cover a moderate fraction of the image.

    A mask is kept when its non-zero pixels make up more than 0.5% and less
    than 20% of the pixels of the frame the mask is defined on.

    The ratio is computed against the mask's own pixel count rather than
    ``sample_image.size``: that attribute holds the file size in bytes, so
    dividing by it does not yield an area fraction.

    Args:
        sample_image: Image whose ``samResults.masks`` are filtered.

    Returns:
        List of the qualifying masks, in their original order. Empty when the
        image has no SAM3 results attached.
    """
    sam_results = sample_image.samResults
    if sam_results is None:
        return []

    candidate_masks = []
    for mask in sam_results.masks:
        if mask.size == 0:
            # A zero-pixel frame has no meaningful area ratio.
            continue
        mask_area_ratio = np.count_nonzero(mask) / mask.size
        if 0.005 < mask_area_ratio < 0.2:
            candidate_masks.append(mask)
    return candidate_masks
# Compute the candidate masks for the sample image; the trailing expression
# displays them as the cell output.
candidate_masks = get_candidate_masks(sample_image)
candidate_masks

In [None]:
# Copy-paste one or more segmented objects within an image, with an optional overlap-prevention toggle
def copy_paste_objects(image, masks, num_copies_list, prevent_overlap=True):
    """Paste copies of masked objects at random positions inside the same image.

    Args:
        image: Source image; anything ``np.array`` can convert (e.g. a PIL image).
        masks: One mask per object, with the same height and width as the
            image; non-zero entries mark the object.
        num_copies_list: Number of copies to paste for each mask, paired with
            ``masks`` by position (unpaired entries on either side are ignored).
        prevent_overlap: When True, a copy is placed only where none of its
            pixels touch an original object or a previously pasted copy.

    Returns:
        ``(forged_image, object_masks)``: the forged PIL image and one boolean
        mask per object covering the original object plus every copy that
        was placed.

    Notes:
        Each copy gets up to 1000 random placement attempts; a copy that
        cannot be placed is skipped silently. Empty masks yield an unchanged
        (all-False) object mask.
    """
    img = np.array(image)
    h, w = img.shape[:2]
    occupied = np.zeros((h, w), dtype=bool)
    result = img.copy()
    object_masks = []

    # Reserve every original object up front so copies never cover them.
    if prevent_overlap:
        for mask in masks:
            occupied[mask.astype(bool)] = True

    for mask, num_copies in zip(masks, num_copies_list):
        mask_bool = mask.astype(bool)
        obj_mask = mask_bool.copy()

        rows = np.any(mask_bool, axis=1)
        cols = np.any(mask_bool, axis=0)
        if not rows.any():
            # Nothing to copy for an empty mask.
            object_masks.append(obj_mask)
            continue

        y_min, y_max = np.where(rows)[0][[0, -1]]
        x_min, x_max = np.where(cols)[0][[0, -1]]
        obj_h, obj_w = int(y_max - y_min + 1), int(x_max - x_min + 1)

        # Pixels are always taken from the untouched source image.
        object_crop = img[y_min:y_max + 1, x_min:x_max + 1]
        mask_crop = mask_bool[y_min:y_max + 1, x_min:x_max + 1]

        # randint is inclusive on both ends, so the largest valid offset is
        # exactly (image extent - object extent); anything larger would make
        # the target window run past the image edge.
        max_offset_x = max(0, w - obj_w)
        max_offset_y = max(0, h - obj_h)

        for _ in range(num_copies):
            for _ in range(1000):
                offset_x = random.randint(0, max_offset_x)
                offset_y = random.randint(0, max_offset_y)
                window = (
                    slice(offset_y, offset_y + obj_h),
                    slice(offset_x, offset_x + obj_w),
                )

                if prevent_overlap and np.any(occupied[window][mask_crop]):
                    continue

                result[window][mask_crop] = object_crop[mask_crop]
                # Only switch on the pasted pixels: assigning the whole
                # bounding box would erase already-marked pixels that fall
                # inside the box but outside the object shape.
                obj_mask[window][mask_crop] = True
                if prevent_overlap:
                    occupied[window][mask_crop] = True
                break

        object_masks.append(obj_mask)

    return Image.fromarray(result), object_masks

In [None]:
# Smoke test: forge the sample image from its first two candidate masks
# (two copies of the first object, one of the second) and write the picture to disk.
# The returned mask list (new_mask) is not saved here.
new_image, new_mask = copy_paste_objects(sample_image.image, candidate_masks[:2], [2,1])
new_image.save("./ourtraining/duplicate_2.png")

In [None]:
# Forgery recipes applied to every image. Each entry has a "name" (used as the
# output file suffix) and "num_copies", which lists how many copies to paste
# for each selected object (one list entry per object).
variantions = [
    {"name": label_, "num_copies": copies_}
    for label_, copies_ in (
        # one object
        ("1.1", [1]),
        ("1.2", [1]),
        ("1.3", [1]),
        ("2.1", [2]),
        ("2.2", [2]),
        ("3", [3]),
        # two objects
        ("1-1.1", [1, 1]),
        ("1-1.2", [1, 1]),
        ("1-2", [1, 2]),
        ("2-1", [2, 1]),
        ("2-2", [2, 2]),
        ("1-3", [1, 3]),
        ("2-3", [2, 3]),
        # three objects
        ("1-1-1", [1, 1, 1]),
        ("1-1-2", [1, 1, 2]),
        ("1-1-3", [1, 1, 3]),
        ("1-3-3", [1, 3, 3]),
        # four objects
        ("1-1-1-1", [1, 1, 1, 1]),
        ("1-1-1-2", [1, 1, 1, 2]),
        ("1-1-1-3", [1, 1, 1, 3]),
    )
]

In [None]:
# Generate every forgery variation for each processed image and save the forged
# picture (.png) plus its per-object masks (.npy).

# Position in sam3_results to start from (lets an interrupted run be resumed).
start_index = 993

# Make sure the output folders exist; otherwise every save below would fail
# and only surface as a printed error.
os.makedirs("./ourtraining/images", exist_ok=True)
os.makedirs("./ourtraining/masks", exist_ok=True)

for i, image in enumerate(sam3_results[start_index:], start=start_index):
    try:
        # i is the position in the full list, so the counter matches the total.
        print(f"Processing {i+1}/{len(sam3_results)}")
        candidate_masks = get_candidate_masks(image)
        if len(candidate_masks) > 0:
            # splitext keeps dotted stems intact ("a.b.png" -> "a.b").
            file_name = os.path.splitext(image.file_name)[0]
            for variation in variantions:
                num_copies = variation["num_copies"]
                # Distinct objects are drawn without replacement. When the image
                # has fewer candidates than the variation asks for, only k objects
                # are used and copy_paste_objects pairs them with the first k
                # copy counts.
                k = min(len(num_copies), len(candidate_masks))
                random_masks = random.sample(candidate_masks, k=k)
                new_image, new_mask = copy_paste_objects(image.image, random_masks, num_copies)
                new_image.save(f"./ourtraining/images/{file_name}_{variation['name']}.png")
                np.save(f"./ourtraining/masks/{file_name}_{variation['name']}.npy", new_mask)
    except Exception as e:
        # Best effort: report the failure and continue with the next image.
        print(f"Error processing {i+1}/{len(sam3_results)}: {e}")

In [None]:
import matplotlib.pyplot as plt
# Visual check of one generated sample, identified by "<image stem>_<variation name>".
name  = '2430_2-2'
# Forged image written by the generation loop.
image = Image.open(f"./ourtraining/images/{name}.png")
# Matching authentic image: the part of the name before the first underscore
# is used as the file stem, and a .png extension is assumed.
authentic_image = Image.open(f"./train_images/authentic/{name.split('_')[0]}.png")
plt.imshow(authentic_image)
plt.axis('off')
plt.show()
# Saved masks for this sample; the loop below iterates over the first axis,
# one entry per pasted object.
masks = np.load(f"./ourtraining/masks/{name}.npy")

for mask in masks:
    # One figure per object: forged image on the left, that object's mask on the right.
    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    plt.axis('off')
    plt.imshow(image)
    plt.subplot(1, 2, 2)
    plt.imshow(mask, alpha=0.2)
    plt.axis('off')
    plt.show()

In [None]:
from huggingface_hub import login, upload_folder

# Log in to the Hugging Face Hub (can be skipped when credentials are already configured).
login()

# Upload the generated ./ourtraining folder to the dataset repository.
# NOTE(review): the next cell uploads the same folder with upload_large_folder;
# running both appears redundant — confirm which one is intended.
upload_folder(folder_path="./ourtraining", repo_id="eliplutchok/recod_comp", repo_type="dataset")


In [None]:
# EASIER WAY 1: Using Hugging Face CLI (SIMPLEST - recommended!)
# For large folders, use the large folder upload command:
# huggingface-cli upload-large-folder eliplutchok/recod_comp ./ourtraining --repo-type dataset

# EASIER WAY 2: Using upload_large_folder in Python (for large datasets)
from huggingface_hub import HfApi
import os

# Login (only needed once per session)
# Option A: Use token from environment variable (recommended)
# os.environ["HF_TOKEN"] = "your_token_here"  # Get from https://huggingface.co/settings/tokens

# Option B: Use login() - may need ipywidgets: pip install ipywidgets
# from huggingface_hub import login
# login()

# Create the Hub API client used for the upload below.
api = HfApi()

# Upload ./ourtraining to the dataset repository with the large-folder uploader.
api.upload_large_folder(
    folder_path="./ourtraining",
    repo_id="eliplutchok/recod_comp",
    repo_type="dataset",
    ignore_patterns=["*.ipynb_checkpoints", "__pycache__", "*.pyc"]  # Leave out notebook checkpoints and Python bytecode caches
)
