In [None]:
import fiftyone as fo

In [None]:
# Load the existing FiftyOne dataset by name; every later cell reads from `dataset`.
dataset = fo.load_dataset("segmentation_dataset_v0.10")

In [None]:
# Exploratory check: find the first sample whose ground truth holds more than
# one polygon and print how many polygons it has.
for sample in dataset:
    gt_label = sample['General body shape']
    sam3_label = sample['sam3_segmentation']

    # Either label field can be unset on a sample (the analysis cells below
    # guard against that as well), so treat a missing label as "no polygons"
    # instead of failing on None['polylines'].
    gt_points = (
        [poly['points'][0] for poly in gt_label['polylines']] if gt_label else []
    )
    sam3_points = (
        [poly['points'][0] for poly in sam3_label['polylines']] if sam3_label else []
    )

    if len(gt_points) > 1:
        print(len(gt_points))
        break

In [None]:
from shapely.geometry import Polygon
import numpy as np
from tqdm import tqdm

def calculate_iou(poly1_points, poly2_points):
    """Compute the IoU (intersection over union) of two polygons.

    Args:
        poly1_points: Vertex sequence passed to ``shapely.geometry.Polygon``.
        poly2_points: Vertex sequence passed to ``shapely.geometry.Polygon``.

    Returns:
        Intersection area divided by union area. Returns 0.0 when either
        polygon is invalid, when the union has zero area, or when building
        the polygons / running the geometry operations raises an error.
    """
    try:
        poly1 = Polygon(poly1_points)
        poly2 = Polygon(poly2_points)

        if not poly1.is_valid or not poly2.is_valid:
            return 0.0

        intersection = poly1.intersection(poly2).area
        union = poly1.union(poly2).area

        if union == 0:
            return 0.0

        return intersection / union
    except Exception:
        # Best effort: malformed geometry counts as "no overlap". Catching
        # Exception (not a bare except) lets KeyboardInterrupt and SystemExit
        # propagate, so a long-running loop can still be interrupted.
        return 0.0

def find_unmatched_detections(gt_polylines, sam3_polylines, iou_threshold=0.2):
    """
    Count the detections on each side that have no sufficiently good match.

    A polygon is "unmatched" when its best IoU against every polygon of the
    other set is below ``iou_threshold``.

    Returns: (number of unmatched GT, number of unmatched SAM3, mean IoU over
    all GT/SAM3 pairs)
    """
    # If one of the lists is empty, every detection in the other is unmatched.
    if not (gt_polylines and sam3_polylines):
        return len(gt_polylines or []), len(sam3_polylines or []), 0.0

    gt_rings = [entry['points'][0] for entry in gt_polylines]
    sam3_rings = [entry['points'][0] for entry in sam3_polylines]

    # Pairwise IoU table: one row per GT polygon, one column per SAM3 polygon.
    iou_matrix = np.array(
        [
            [calculate_iou(gt_ring, sam3_ring) for sam3_ring in sam3_rings]
            for gt_ring in gt_rings
        ],
        dtype=float,
    )

    # A GT polygon is unmatched when the best value in its row is too low.
    unmatched_gt = sum(
        1 for best in iou_matrix.max(axis=1) if best < iou_threshold
    )

    # A SAM3 polygon is unmatched when the best value in its column is too low.
    unmatched_sam3 = sum(
        1 for best in iou_matrix.max(axis=0) if best < iou_threshold
    )

    avg_iou = np.mean(iou_matrix) if iou_matrix.size > 0 else 0.0

    return unmatched_gt, unmatched_sam3, avg_iou

# Store per-sample matching statistics as new fields on every sample.
for sample in tqdm(dataset, desc="Calculating IoU and finding unmatched detections"):
    gt_label = sample['General body shape']
    sam3_label = sample['sam3_segmentation']

    # A label field that is unset on the sample is treated as an empty polygon list.
    gt_polylines = gt_label['polylines'] if gt_label else []
    sam3_polylines = sam3_label['polylines'] if sam3_label else []

    unmatched_gt, unmatched_sam3, avg_iou = find_unmatched_detections(
        gt_polylines,
        sam3_polylines,
        iou_threshold=0.2,
    )

    sample['unmatched_gt_count'] = unmatched_gt
    sample['unmatched_sam3_count'] = unmatched_sam3
    sample['avg_iou'] = avg_iou
    sample['has_unmatched'] = unmatched_gt > 0 or unmatched_sam3 > 0
    sample.save()

# Summary of the fields written by the loop above.
for summary_line in (
    "Analysis complete. Fields added:",
    "- unmatched_gt_count: number of GT polygons without a good match",
    "- unmatched_sam3_count: number of SAM3 polygons without a good match",
    "- avg_iou: average IoU between all pairs",
    "- has_unmatched: whether there are unmatched detections",
):
    print(summary_line)

In [None]:
# View 1: SAM3 found detections that did not find a match in GT (IoU < 0.2)
from fiftyone import ViewField as F

# Keep only the samples that have at least one unmatched SAM3 polygon.
view_sam3_unmatched = dataset.match(F("unmatched_sam3_count") > 0)

print(
    "Samples where SAM3 found detections without a good match in GT: "
    f"{len(view_sam3_unmatched)}"
)
sam3_unmatched_total = sum(
    s['unmatched_sam3_count'] for s in view_sam3_unmatched
)
print(f"Total unmatched SAM3 detections: {sam3_unmatched_total}")
session1 = fo.launch_app(view_sam3_unmatched)

In [None]:
# View 2: GT detections that did not find a match in SAM3 (IoU < 0.2)
# Keep only the samples that have at least one unmatched GT polygon.
view_gt_unmatched = dataset.match(F("unmatched_gt_count") > 0)

print(
    "Samples where GT detections did not find a good match in SAM3: "
    f"{len(view_gt_unmatched)}"
)
gt_unmatched_total = sum(s['unmatched_gt_count'] for s in view_gt_unmatched)
print(f"Total unmatched GT detections: {gt_unmatched_total}")
session2 = fo.launch_app(view_gt_unmatched)

# View 3: All samples with any mismatches (GT or SAM3)
# `== True` is deliberate: it is evaluated on the ViewField expression, not on a Python bool.
view_all_unmatched = dataset.match(F("has_unmatched") == True)

print(f"Total samples with mismatches: {len(view_all_unmatched)}")
all_gt_unmatched_total = sum(
    s['unmatched_gt_count'] for s in view_all_unmatched
)
print(f"Total unmatched GT: {all_gt_unmatched_total}")
all_sam3_unmatched_total = sum(
    s['unmatched_sam3_count'] for s in view_all_unmatched
)
print(f"Total unmatched SAM3: {all_sam3_unmatched_total}")
session3 = fo.launch_app(view_all_unmatched)

In [None]:
from shapely.geometry import Polygon
import numpy as np
from tqdm import tqdm

def get_polygon_area(points):
    """Return the area of the polygon described by ``points``.

    Args:
        points: Vertex sequence passed to ``shapely.geometry.Polygon``.

    Returns:
        The polygon area in the units of the input coordinates (a fraction of
        the image when the coordinates are normalized), or 0.0 when the
        polygon is invalid or cannot be constructed.
    """
    try:
        poly = Polygon(points)
        return poly.area if poly.is_valid else 0.0
    except Exception:
        # Malformed input counts as zero area. Catching Exception rather than
        # using a bare except keeps KeyboardInterrupt/SystemExit propagating.
        return 0.0

def iou(poly1_points, poly2_points):
    """Return the Intersection over Union of two polygons.

    Both arguments are vertex sequences passed to ``shapely.geometry.Polygon``.
    The result is 0.0 when either polygon is invalid or the union has zero
    area. Unlike ``calculate_iou``, errors raised by Shapely are not caught.
    """
    first, second = Polygon(poly1_points), Polygon(poly2_points)
    if not (first.is_valid and second.is_valid):
        return 0.0
    overlap_area = first.intersection(second).area
    combined_area = first.union(second).area
    return overlap_area / combined_area if combined_area != 0 else 0.0

# Area accumulators, filled across the whole dataset.
sam3_areas, gt_areas, unique_sam3_areas = [], [], []

# A SAM3 polygon counts as "unique" when its best IoU against GT is below this.
iou_threshold = 0.5

for sample in tqdm(dataset, desc="Analyzing SAM3 and GT polygons"):
    sam3_polys, gt_polys = [], []

    sam3_label = sample['sam3_segmentation']
    gt_label = sample['General body shape']

    # Collect the polygons of each source; zero-area polygons (invalid or
    # degenerate according to get_polygon_area) are skipped.
    for label, all_areas, sample_polys in (
        (sam3_label, sam3_areas, sam3_polys),
        (gt_label, gt_areas, gt_polys),
    ):
        if not (label and label['polylines']):
            continue
        for poly in label['polylines']:
            points = poly['points'][0]
            area = get_polygon_area(points)
            if area > 0:
                all_areas.append(area)
                sample_polys.append(points)

    # Unique SAM3 polygons (no matching GT with IOU >= 0.5)
    for s_poly in sam3_polys:
        max_iou = 0.0
        for g_poly in gt_polys:
            max_iou = max(max_iou, iou(s_poly, g_poly))
        if max_iou < iou_threshold:
            unique_sam3_areas.append(get_polygon_area(s_poly))

# Convert the accumulators to numpy arrays for the statistics cells below.
sam3_areas = np.array(sam3_areas)
gt_areas = np.array(gt_areas)
unique_sam3_areas = np.array(unique_sam3_areas)

def print_detailed_stats(name, areas, hist_max=0.1, bin_width=0.01):
    """Print summary statistics and a fine-grained histogram of polygon areas.

    Args:
        name: Label printed in the section header.
        areas: Sequence or array of polygon areas.
        hist_max: Upper edge of the histogram range; the range starts at 0.
        bin_width: Target bin width. The range is split into
            ``round(hist_max / bin_width)`` equal bins, so the actual width
            equals ``bin_width`` when it divides ``hist_max`` evenly.

    Prints the count, min, max, mean, median, a fixed list of percentiles and
    the per-bin counts with their share of all polygons. When ``areas`` is
    empty only the header and the count are printed.
    """
    # Accept plain lists as well as arrays (.min()/.max() need an ndarray).
    areas = np.asarray(areas, dtype=float)

    print(f"=== {name} ===")
    print(f"Total polygons: {len(areas)}")
    if len(areas) == 0:
        return
    print(f"Minimum: {areas.min():.6f}")
    print(f"Maximum: {areas.max():.6f}")
    print(f"Mean: {areas.mean():.6f}")
    print(f"Median: {np.median(areas):.6f}")

    # Percentile points (values are percentages, e.g. 0.1 means the 0.1th percentile)
    quantiles = [0.1, 0.25, 0.5, 1, 5, 10, 25, 50, 75, 90, 95, 99]
    for q in quantiles:
        print(f"{q}% quantile: {np.percentile(areas, q):.6f}")

    # Build the bin edges with linspace: with a float step, arange can gain or
    # lose the last edge to rounding, which would change the number of bins.
    n_bins = max(1, int(round(hist_max / bin_width)))
    intervals = np.linspace(0.0, hist_max, n_bins + 1)
    counts, _ = np.histogram(areas, bins=intervals)
    print(f"\nDistribution in range 0-{hist_max:g}:")
    for lower, upper, count in zip(intervals[:-1], intervals[1:], counts):
        # Percentages are relative to all polygons, not only those inside the range.
        print(f"  {lower:.2f}-{upper:.2f}: {count} ({count / len(areas) * 100:.2f}%)")

# Print the area statistics for each polygon population.
for population_name, population_areas in (
    ("SAM3", sam3_areas),
    ("GT", gt_areas),
    ("Unique SAM3 (IOU < 0.5)", unique_sam3_areas),
):
    print_detailed_stats(population_name, population_areas)

In [None]:
def print_detailed_stats(name, areas, hist_max=0.7, bin_width=0.01):
    """Print summary statistics and a fine-grained histogram of polygon areas.

    This redefinition uses a wider default histogram range (0-0.7).

    Args:
        name: Label printed in the section header.
        areas: Sequence or array of polygon areas.
        hist_max: Upper edge of the histogram range; the range starts at 0.
        bin_width: Target bin width. The range is split into
            ``round(hist_max / bin_width)`` equal bins, so the actual width
            equals ``bin_width`` when it divides ``hist_max`` evenly.

    Prints the count, min, max, mean, median, a fixed list of percentiles and
    the per-bin counts with their share of all polygons. When ``areas`` is
    empty only the header and the count are printed.
    """
    # Accept plain lists as well as arrays (.min()/.max() need an ndarray).
    areas = np.asarray(areas, dtype=float)

    print(f"=== {name} ===")
    print(f"Total polygons: {len(areas)}")
    if len(areas) == 0:
        return
    print(f"Minimum: {areas.min():.6f}")
    print(f"Maximum: {areas.max():.6f}")
    print(f"Mean: {areas.mean():.6f}")
    print(f"Median: {np.median(areas):.6f}")

    # Percentile points (values are percentages, e.g. 0.1 means the 0.1th percentile)
    quantiles = [0.1, 0.25, 0.5, 1, 5, 10, 25, 50, 75, 90, 95, 99]
    for q in quantiles:
        print(f"{q}% quantile: {np.percentile(areas, q):.6f}")

    # Build the bin edges with linspace: with a float step, arange can gain or
    # lose the last edge to rounding, which would change the number of bins.
    n_bins = max(1, int(round(hist_max / bin_width)))
    intervals = np.linspace(0.0, hist_max, n_bins + 1)
    counts, _ = np.histogram(areas, bins=intervals)
    print(f"\nDistribution in range 0-{hist_max:g}:")
    for lower, upper, count in zip(intervals[:-1], intervals[1:], counts):
        # Percentages are relative to all polygons, not only those inside the range.
        percentage = count / len(areas) * 100
        print(f"  {lower:.2f}-{upper:.2f}: {count} ({percentage:.2f}%)")

# Print the area statistics for each polygon population.
for population_name, population_areas in (
    ("SAM3", sam3_areas),
    ("GT", gt_areas),
    ("Unique SAM3 (IOU < 0.5)", unique_sam3_areas),
):
    print_detailed_stats(population_name, population_areas)

In [None]:
# Area distribution visualization
# matplotlib is not imported by any earlier cell, so import it here; without
# this the cell raises NameError on `plt` when run on a fresh kernel.
import matplotlib.pyplot as plt

# 2x2 figure: SAM3 overall, SAM3 small polygons, SAM3 vs GT histogram, box plot.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# 1. SAM3 Histogram (all data)
axes[0, 0].hist(sam3_areas, bins=50, alpha=0.7, color='blue', edgecolor='black')
axes[0, 0].set_xlabel('Area (normalized)')
axes[0, 0].set_ylabel('Count')
axes[0, 0].set_title('SAM3 Polygon Area Distribution')
axes[0, 0].grid(True, alpha=0.3)

# 2. SAM3 Histogram (small polygons only: areas up to 10x the 1% quantile)
threshold_1pct = np.percentile(sam3_areas, 1)
small_sam3 = sam3_areas[sam3_areas <= threshold_1pct * 10]  # up to 10x the 1% quantile
axes[0, 1].hist(small_sam3, bins=50, alpha=0.7, color='red', edgecolor='black')
axes[0, 1].axvline(threshold_1pct, color='green', linestyle='--', linewidth=2, label=f'1% Quantile: {threshold_1pct:.6f}')
axes[0, 1].set_xlabel('Area (normalized)')
axes[0, 1].set_ylabel('Count')
axes[0, 1].set_title('Distribution of Small SAM3 Polygons')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# 3. Comparison: SAM3 vs GT
axes[1, 0].hist([sam3_areas, gt_areas], bins=50, alpha=0.6,
                label=['SAM3', 'GT'], color=['blue', 'green'], edgecolor='black')
axes[1, 0].set_xlabel('Area (normalized)')
axes[1, 0].set_ylabel('Count')
axes[1, 0].set_title('Comparison: SAM3 vs GT')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# 4. Box plot for comparison
axes[1, 1].boxplot([sam3_areas, gt_areas], labels=['SAM3', 'GT'])
axes[1, 1].set_ylabel('Area (normalized)')
axes[1, 1].set_title('Box Plot: SAM3 vs GT')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Candidate area cut-offs taken from the low end of the SAM3 distribution.
print("\n=== FILTERING THRESHOLD RECOMMENDATIONS ===")
print(f"Smallest 1%: area < {np.percentile(sam3_areas, 1):.6f}")
print(f"Smallest 5%: area < {np.percentile(sam3_areas, 5):.6f}")
print(f"Smallest 10%: area < {np.percentile(sam3_areas, 10):.6f}")

In [None]:
# Analysis: how many polygons will be filtered at different thresholds
# Candidate cut-offs: three data-driven percentiles of the SAM3 areas followed
# by three fixed fractions of the image (0.1%, 0.5% and 1%).
thresholds = [np.percentile(sam3_areas, pct) for pct in (1, 5, 10)] + [
    0.001,
    0.005,
    0.01,
]

total_sam3 = len(sam3_areas)

print("=== NUMBER OF FILTERED POLYGONS AT VARIOUS THRESHOLDS ===\n")
for threshold in thresholds:
    # Polygons strictly below the cut-off are the ones that would be removed.
    count = np.sum(sam3_areas < threshold)
    percent = (count / total_sam3) * 100
    print(f"Threshold {threshold:.6f}: {count:4d} polygons ({percent:5.2f}%)")

In [None]:
# Creating a new dataset with merged polygons
from shapely.geometry import Polygon

def get_polygon_area(points):
    """Return the area of the polygon described by ``points``.

    Args:
        points: Vertex sequence passed to ``shapely.geometry.Polygon``.

    Returns:
        The polygon area in the units of the input coordinates (a fraction of
        the image when the coordinates are normalized), or 0.0 when the
        polygon is invalid or cannot be constructed.
    """
    try:
        poly = Polygon(points)
        return poly.area if poly.is_valid else 0.0
    except Exception:
        # Malformed input counts as zero area. Catching Exception rather than
        # using a bare except keeps KeyboardInterrupt/SystemExit propagating.
        return 0.0

def merge_detections(gt_polylines, sam3_polylines, iou_threshold=0.2, min_area=0.001):
    """
    Combine GT polygons with the SAM3 polygons that GT does not already cover.

    - Every GT polygon is kept.
    - A SAM3 polygon is appended when its IoU with every GT polygon is below
      ``iou_threshold``.
    - SAM3 polygons with area below ``min_area`` are dropped before the IoU check.

    Returns a new list; the input lists are not modified.
    """
    # Start from a copy of the GT polygons (empty when there are none).
    merged_polylines = list(gt_polylines) if gt_polylines else []

    if not sam3_polylines:
        return merged_polylines

    gt_rings = [entry['points'][0] for entry in gt_polylines] if gt_polylines else []

    for candidate in sam3_polylines:
        candidate_ring = candidate['points'][0]

        # Too small to keep, regardless of overlap.
        if get_polygon_area(candidate_ring) < min_area:
            continue

        # any() stops at the first GT polygon that overlaps enough.
        overlaps_gt = any(
            calculate_iou(gt_ring, candidate_ring) >= iou_threshold
            for gt_ring in gt_rings
        )
        if not overlaps_gt:
            merged_polylines.append(candidate)

    return merged_polylines

# Confirmation output so that running this definition-only cell shows a result.
print("Polygon merging functions have been created")

In [None]:
# Creating a new dataset with merged polygons
import fiftyone as fo

# Dataset cloning configuration
new_dataset_name = "segmentation_merged_v0.1_full"

# Delete if it already exists, so re-running this cell always starts from a
# fresh clone. NOTE: this discards any earlier merged result stored under this name.
if new_dataset_name in fo.list_datasets():
    print(f"Deleting dataset: {new_dataset_name}")
    fo.delete_dataset(new_dataset_name)

# Create the new dataset as a copy of `dataset`; the merge cell below edits
# this copy and leaves the original dataset untouched.
new_dataset = dataset.clone(new_dataset_name)
# NOTE(review): presumably keeps the clone in the database after this session ends — confirm against the FiftyOne docs.
new_dataset.persistent = True
print(f"New dataset created: {new_dataset_name}")
print(f"Number of samples: {len(new_dataset)}")

In [None]:
# Updating the "General body shape" field with merged polygons
from tqdm import tqdm

min_area_threshold = 0.005  # Minimum area for SAM3 polygons (0.5% of the image)
merge_iou_threshold = 0.1   # A SAM3 polygon with IoU >= this against any GT polygon is a duplicate

# Running totals over the whole dataset.
stats = {
    'total_samples': 0,
    'gt_polygons': 0,
    'sam3_added': 0,
    'sam3_filtered_small': 0,
    'sam3_filtered_matched': 0
}

for sample in tqdm(new_dataset, desc="Merging polygons"):
    # A label field that is unset on the sample is treated as an empty polygon list.
    gt_polylines = sample['General body shape']['polylines'] if sample['General body shape'] else []
    sam3_polylines = sample['sam3_segmentation']['polylines'] if sample['sam3_segmentation'] else []

    original_gt_count = len(gt_polylines)
    original_sam3_count = len(sam3_polylines)

    # Merge polygons
    merged_polylines = merge_detections(
        gt_polylines,
        sam3_polylines,
        iou_threshold=merge_iou_threshold,
        min_area=min_area_threshold
    )

    # Update "General body shape" field
    if sample['General body shape']:
        sample['General body shape']['polylines'] = merged_polylines
    else:
        # Create a new field if it didn't exist
        sample['General body shape'] = fo.Polylines(polylines=merged_polylines)

    # Statistics
    # merged_polylines holds every GT polygon plus the accepted SAM3 polygons.
    sam3_added = len(merged_polylines) - original_gt_count
    # merge_detections applies the size filter before the GT-match check, so
    # every rejected polygon that is not too small was rejected as a GT match.
    sam3_small = sum(
        1 for poly in sam3_polylines
        if get_polygon_area(poly['points'][0]) < min_area_threshold
    )
    sam3_matched = original_sam3_count - sam3_small - sam3_added

    stats['total_samples'] += 1
    stats['gt_polygons'] += original_gt_count
    stats['sam3_added'] += sam3_added
    stats['sam3_filtered_small'] += sam3_small
    stats['sam3_filtered_matched'] += sam3_matched

    sample.save()

print("Processing complete!")
print(f"Samples processed: {stats['total_samples']}")
print(f"GT polygons: {stats['gt_polygons']}")
print(f"SAM3 polygons added: {stats['sam3_added']}")
print(f"SAM3 polygons filtered (too small): {stats['sam3_filtered_small']}")
print(f"SAM3 polygons filtered (matched GT): {stats['sam3_filtered_matched']}")
print(f"\nNew dataset: {new_dataset_name}")

In [None]:
# Visualization of the new dataset
session_merged = fo.launch_app(new_dataset)

# Describe what the merge cell wrote. The IoU value reported here must agree
# with the threshold used by the merge cell (0.1, not the 0.2 default of
# merge_detections).
print(f"Opened new dataset: {new_dataset_name}")
print("The 'General body shape' field now contains:")
print("  - All original GT polygons")
print("  - Unique SAM3 polygons (IoU < 0.1 with GT)")
print(f"  - Filtered out small SAM3 polygons (area < {min_area_threshold})")

In [None]:
import os
import json
import requests
import argparse
from typing import Any, Dict, List, Optional, Tuple

from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

def read_json_file(file_path: str) -> Dict[str, Any]:
    """
    Load a UTF-8 encoded JSON document from disk.

    Args:
        file_path (str): Location of the JSON file to read.

    Returns:
        Dict[str, Any]: The parsed content of the file.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

# NOTE(review): the COCO export comes from EXPORT_V_0_9 but the image folder
# is under EXPORT_V_0_8 — confirm that this mix of versions is intended.
coco_path = "/home/fishial/Fishial/dataset/EXPORT_V_0_9/Fishial_Export_Jan_08_2026_04_14_Production_AI_Gen_All_Verified.json"
images_data_folder = "/home/fishial/Fishial/dataset/EXPORT_V_0_8/data/data"

# The comparison below reads `data`, so the export has to be loaded here;
# with the load commented out the cell fails with NameError on a fresh kernel.
print(f"Start reading COCO file: {coco_path}")
data = read_json_file(coco_path)
print("Finished reading COCO file")


print(f"images_data_folder: {images_data_folder}")
# Get list of already downloaded images (regular files only, no sub-directories)
downloaded_files = [
    f for f in os.listdir(images_data_folder)
    if os.path.isfile(os.path.join(images_data_folder, f))
]

print("Count of downloaded files: ", len(downloaded_files))

# Count the files on disk that are not listed as images in the COCO export
all_image_filenames = {image['file_name'] for image in data.get('images', [])}
different_files = set(downloaded_files) - all_image_filenames
print("Count of different files: ", len(different_files))
