# Prawn Counting Analysis Notebook

This notebook provides comprehensive analysis tools for counting prawns in underwater images, including:




# Setup & Imports 

Import all necessary libraries for dataset management, model evaluation, and analysis.


In [1]:
# Core libraries
import os
import shutil
import glob
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# FiftyOne for dataset management
import fiftyone as fo
from fiftyone import ViewField as F

# YOLO model
from ultralytics import RTDETR

# Image processing
from PIL import Image
import pytesseract

print("✅ All libraries imported successfully")

✅ All libraries imported successfully


# Configuration

Set up paths and configuration parameters for the analysis.


In [2]:
# Configuration parameters
CONFIG = {
    'dataset_name': 'circle-pond-analysis',
    'dataset_dir': r"/Users/gilbenor/Downloads/circle pond.v20i.yolov8",
    'model_path': r"/Users/gilbenor/Library/CloudStorage/OneDrive-Personal/measurement_paper_images/detection drone/runs-detections-drone-14.08/detect/train/weights/best.pt",
    'data_yaml': r"/Users/gilbenor/Downloads/circle pond.v23i.yolov8/data.yaml",
    'splits': ["test", "valid", "train"]
}

print("📁 Configuration loaded:")
for key, value in CONFIG.items():
    print(f"   {key}: {value}")
    
# Verify paths exist
if os.path.exists(CONFIG['dataset_dir']):
    print(f"✅ Dataset directory exists: {len(os.listdir(CONFIG['dataset_dir']))} items")
else:
    print("❌ Dataset directory not found")


📁 Configuration loaded:
   dataset_name: circle-pond-analysis
   dataset_dir: /Users/gilbenor/Downloads/circle pond.v20i.yolov8
   model_path: /Users/gilbenor/Library/CloudStorage/OneDrive-Personal/measurement_paper_images/detection drone/runs-detections-drone-14.08/detect/train/weights/best.pt
   data_yaml: /Users/gilbenor/Downloads/circle pond.v23i.yolov8/data.yaml
   splits: ['test', 'valid', 'train']
✅ Dataset directory exists: 6 items






# RT-DETR object detection evalutation at multiple confidence thresholds

This code block loads a trained RT-DETR model from a specified path and evaluates its performance on the test split of a YOLO-format dataset at various confidence thresholds. For each threshold (from 0.1 to 1.0), it runs validation, saves the results in JSON format, generates plots, and prints the results. This allows you to analyze how the model's performance changes as the detection confidence threshold varies.

---


In [None]:


from ultralytics import RTDETR

model= RTDETR(r"/Users/gilbenor/Library/CloudStorage/OneDrive-Personal/measurement_paper_images/detection drone/runs-detections-drone-14.08/detect/train/weights/best.pt")


for threshold in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]:
    results = model.val(save_json=True, data=r"/Users/gilbenor/Downloads/circle pond.v23i.yolov8/data.yaml", plots=True,split='test',conf=threshold)
    print(f"threshold: {threshold}")
    print(results)

Ultralytics 8.3.100 🚀 Python-3.9.6 torch-2.0.1 CPU (Apple M4)
rt-detr-l summary: 302 layers, 31,985,795 parameters, 0 gradients, 103.4 GFLOPs


[34m[1mval: [0mScanning /Users/gilbenor/Downloads/circle pond.v23i.yolov8/test/labels.cache... 79 images, 1 backgrounds, 0 corrupt: 100%|██████████| 79/79 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):   0%|          | 0/5 [00:05<?, ?it/s]


KeyboardInterrupt: 



->

# Creating and Inspecting a FiftyOne Dataset from YOLO-Formatted Data

This code block creates a new FiftyOne dataset called `circle-pond-analysis` from a directory containing images and YOLO-format label files, organized into `train`, `valid`, and `test` splits.

- **Dataset Structure:**  
  The code expects the following folder structure for each split:
  ```
  <dataset_dir>/<split>/images/
  <dataset_dir>/<split>/labels/
  ```
  where `<split>` is one of `train`, `valid`, or `test`.

- **Label Parsing:**  
  For each image, the code looks for a corresponding YOLO label file. Each label file is parsed, and the bounding boxes are converted into FiftyOne `Detection` objects (with the label `'ground_truth'`).

- **Sample Creation:**  
  Each image and its detections are added as a sample to the FiftyOne dataset, with the split name added as a tag.

- **Dataset Summary:**  
  After loading, the code prints:
  - The number of files in the test directory
  - The number of samples in the dataset
  - A summary and schema of the dataset
  - Details of the first sample, including its ground truth detections

- **Visualization:**  
  Optionally, you can launch the FiftyOne app to visually inspect the dataset by uncommenting the last two lines.

This setup is useful for preparing and verifying your dataset before training or evaluation.

In [3]:
import fiftyone as fo
import os
import glob

# Set the dataset name
name = "circle-pond-analysis"

# Set the dataset directory
dataset_dir = r"/Users/gilbenor/Downloads/circle pond.v20i.yolov8"

# The splits to load
splits = ["test", "valid","train"]


dataset = fo.Dataset(name,overwrite=True)
print(f"Created new dataset '{name}'")


#print dataset dir
print(len(os.listdir("/Users/gilbenor/Downloads/circle pond.v20i.yolov8/test")))

def read_yolo_label(label_path):
    with open(label_path, 'r') as file:
        lines = file.readlines()
    detections = []
    for line in lines:
        class_id, x_center, y_center, width, height = map(float, line.strip().split())
        detections.append(
            fo.Detection(
                label='ground_truth' , # Convert class_id to string
                bounding_box=[x_center - width/2, y_center - height/2, width, height]
            )
        )
    return detections

for split in splits:
    split_dir = os.path.join(dataset_dir, split)
    images_dir = os.path.join(split_dir, "images")
    labels_dir = os.path.join(split_dir, "labels")
    
    # Add images and labels
    for image_path in glob.glob(os.path.join(images_dir, "*")):
        image_name = os.path.basename(image_path)
        label_name = os.path.splitext(image_name)[0] + ".txt"
        label_path = os.path.join(labels_dir, label_name)
        
        sample = fo.Sample(filepath=image_path)
        sample.tags.append(split)

        if os.path.exists(label_path):
            detections = read_yolo_label(label_path)
            sample["ground_truth"] = fo.Detections(detections=detections,label_field="ground_truth")
           
        #add sample to dataset
        dataset.add_sample(sample)
        print(f"Added {image_name} with label (if exists)")

print(f"\nDataset '{name}' now has {len(dataset)} samples")

# Print detailed information about the dataset
print("\nDataset Summary:")
print(dataset.summary())

# Print schema of the dataset
print("\nDataset Schema:")
for field_name, field_type in dataset.get_field_schema().items():
    print(f"{field_name}: {field_type}")

# If you want to examine a specific sample
if len(dataset) > 0:
    sample = dataset.first()
    print("\nFirst Sample Details:")
    print(sample)

    if "ground_truth" in sample:
        print("\nGround Truth for First Sample:")
        print(sample.ground_truth)
    else:
        print("\nWarning: 'ground_truth' field not found in the first sample")
else:
    print("\nWarning: Dataset is empty")

# Optionally, you can visualize the dataset
# session = fo.launch_app(dataset)
# session.wait()


Created new dataset 'circle-pond-analysis'
3
Added 20230920_120410_jpg.rf.228c5eec6b39d44ad174701845916268.jpg with label (if exists)
Added 20230920_115951_jpg.rf.cd3046bb49355e395045e8631aded018.jpg with label (if exists)
Added 20230920_121523_jpg.rf.dbe991c9c4bd750d730ba2a50ef21087.jpg with label (if exists)
Added 20230920_121416_jpg.rf.afffe41c31395002ec7d0dbca1d9d2e1.jpg with label (if exists)
Added 20230920_115310_jpg.rf.b8a6ccd11c7598ec045e2fda3e7dc524.jpg with label (if exists)
Added 20230920_120814_jpg.rf.f4cbaf661cdcdb67b2b820182fd80a51.jpg with label (if exists)
Added 20230920_120425_jpg.rf.f3d9a39cfeb1b2857480894f05663172.jpg with label (if exists)
Added 20230920_120359_jpg.rf.6c0789ba372afb3eb463288e5a235837.jpg with label (if exists)
Added 20230920_115213_jpg.rf.c73c0a0cec2c595a80e6a7ef7066d63c.jpg with label (if exists)
Added 20230920_121213_jpg.rf.638b72a42e4db7183e192536efd0057a.jpg with label (if exists)
Added 20230920_115946_jpg.rf.76816e35cb54c653ab07be62c6caabab.jpg

# tag each sample for test, valid train #

In [4]:
# Print detection count statistics per image

import numpy as np
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning, 
                       module="numpy.core")

from fiftyone import ViewField as F
#for each split, print the avg number of detections and std
splits = ["test", "valid","train"]

splits_test=[]
splits_val=[]
splits_train=[]

for sample in dataset:
    if "test" in sample.tags:
        splits_test.append(sample)
    elif "valid" in sample.tags:
        splits_val.append(sample)
    elif "train" in sample.tags:
        splits_train.append(sample)

print(f"Average number of detections for split test: {np.mean([len(sample.ground_truth.detections) for sample in splits_test]):.2f}")
print(f"Standard deviation of detections for split test: {np.std([len(sample.ground_truth.detections) for sample in splits_test]):.2f}")

print(f"Average number of detections for split val: {np.mean([len(sample.ground_truth.detections) for sample in splits_val]):.2f}")
print(f"Standard deviation of detections for split val: {np.std([len(sample.ground_truth.detections) for sample in splits_val]):.2f}")


#number of samples in each split
print(f"Number of samples in split test: {len(splits_test)}")
print(f"Number of samples in split val: {len(splits_val)}")
print(f"Number of samples in split train: {len(splits_train)}")

print(f"Average number of detections for split train: {np.mean([len(sample.ground_truth.detections) for sample in splits_train]):.2f}")
print(f"Standard deviation of detections for split train: {np.std([len(sample.ground_truth.detections) for sample in splits_train]):.2f}")


print("\nDetection Count Statistics:")
detection_counts = []
for sample in dataset:
    if "ground_truth" in sample:
        count = len(sample.ground_truth.detections)
        detection_counts.append(count)

if detection_counts:
    print(f'min detection counts that are not 0: {min([count for count in detection_counts if count != 0])}')
    print(f"Min detections per image: {min(detection_counts)}")
    print(f"Max detections per image: {max(detection_counts)}")
    print(f"Average detections per image: {sum(detection_counts)/len(detection_counts):.2f}")
    #std
    #total number of detections
    print(f"Total number of detections: {sum(detection_counts)}")
    print(f"Standard deviation of detections per image: {np.std(detection_counts):.2f}")
else:
    print("No detection counts found in dataset")


#for each tag, print the avg number of detections and std
for tag in dataset.distinct("tags"):
    print(f"Average number of detections for tag {tag}: {np.mean([len(sample.ground_truth.detections) for sample in dataset.filter_labels('ground_truth', F('tags') == tag)]):.2f}")
    print(f"Standard deviation of detections for tag {tag}: {np.std([len(sample.ground_truth.detections) for sample in dataset.filter_labels('ground_truth', F('tags') == tag)]):.2f}")
    

Average number of detections for split test: 7.10
Standard deviation of detections for split test: 4.32
Average number of detections for split val: 2.51
Standard deviation of detections for split val: 2.37
Number of samples in split test: 79
Number of samples in split val: 102
Number of samples in split train: 494
Average number of detections for split train: 4.29
Standard deviation of detections for split train: 3.24

Detection Count Statistics:
min detection counts that are not 0: 1
Min detections per image: 0
Max detections per image: 20
Average detections per image: 4.35
Total number of detections: 2938
Standard deviation of detections per image: 3.48
Average number of detections for tag test: nan
Standard deviation of detections for tag test: nan
Average number of detections for tag train: nan
Standard deviation of detections for tag train: nan
Average number of detections for tag valid: nan
Standard deviation of detections for tag valid: nan


 # Launch app #

In [5]:
import fiftyone as fo

dataset = fo.load_dataset("circle-pond-analysis")


session = fo.launch_app(dataset, port=5160, auto=False)
session.show()

Session launched. Run `session.show()` to open the App in a cell output.




->

# Model Evaluation on Test Set

This code applies the RT-DETR model to the test split of the dataset and evaluates its performance against ground truth detections.

**What it does:**
- Filters the dataset to get only samples tagged with "test"
- Applies the RT-DETR model to generate predictions with confidence threshold 0.3
- Evaluates the model performance using IoU threshold of 0.5
- Compares predicted detections against ground truth detections

**Key correction:** We use `match_tags("test")` to filter samples by their tags, not `filter_labels()` which filters individual detections.

In [6]:
import fiftyone as fo
import fiftyone.utils.eval as foue
from ultralytics import RTDETR
from fiftyone import ViewField as F
from fiftyone.core.odm.dataset import SidebarGroupDocument

# Load the model
model = RTDETR(r"/Users/gilbenor/Library/CloudStorage/OneDrive-Personal/measurement_paper_images/detection drone/runs-detections-drone-14.08/detect/train/weights/best.pt")

# CORRECT WAY: Filter samples (not labels) by tags
test_set_view = dataset.match_tags("test")

results = None
print(f"Number of samples in test set: {len(test_set_view)}")

# Check if we have any samples
if len(test_set_view) == 0:
    print("No test samples found! Available tags:")
    print(dataset.distinct("tags"))
else:
    # Apply the model to the filtered test set
    test_set_view.apply_model(model, label_field="prawn", confidence_thresh=0.3)
    
    # Evaluate the model's performance - using a simpler approach to avoid sidebar validation error
    try:
        foue.evaluate_detections(
        test_set_view,
        pred_field="prawn",
        gt_field="ground_truth",
        eval_key="eval",
        iou=0.1,         # lower threshold to accept slight misalignments
        classwise=False  # don’t require identical labels
    )

        print("Evaluation results:")
        print(results)
        
    except Exception as e:
        print(f"FiftyOne evaluation failed: {e}")
        print("Using manual evaluation instead...")
        
        # Manual evaluation as fallback
        results = None
        print("Applied model predictions successfully. Use manual evaluation in next cell.")

Number of samples in test set: 79
 100% |███████████████████| 79/79 [39.4s elapsed, 0s remaining, 2.1 samples/s]      
Evaluating detections...
 100% |███████████████████| 79/79 [973.3ms elapsed, 0s remaining, 81.2 samples/s]      
Evaluation results:
None


# export dataset

In [7]:

dataset.export(
        export_dir=f"/Users/gilbenor/Documents/code_projects/msc/counting_research_algorithms/fiftyone_datasets/drone_detection",
        dataset_type=fo.types.FiftyOneDataset,
        export_media=True
    )

Exporting samples...
 100% |████████████████████| 675/675 [461.7ms elapsed, 0s remaining, 1.5K docs/s]       




->

# Manual Detection Evaluation System

This code implements a custom evaluation system for object detection results, specifically designed to handle prawn detection scenarios. The system addresses a bug where True Positives (TP) were incorrectly calculated when equal to 0.

## Key Components

### 1. IoU Calculator (`calculate_iou`)

This function:
- Takes two bounding boxes in normalized coordinates (0-1)
- Format: `[x_center, y_center, width, height]`
- Converts center-format to corner-format coordinates
- Calculates intersection and union areas
- Returns IoU value between 0 and 1

Key features:
- Handles edge cases (no intersection, zero union)
- Includes coordinate format conversion
- Uses normalized coordinates for consistency

### 2. Detection Evaluator (`manual_evaluate_detections`)

This function:
- Processes FiftyOne dataset samples
- Compares ground truth vs predicted detections
- Uses IoU threshold (default: 0.1) for matching
- Returns dictionary with:
  - True Positives (TP)
  - False Positives (FP)
  - False Negatives (FN)

Key steps:
1. Extracts ground truth and prediction boxes
2. Converts coordinates to center format
3. Implements matching using a greedy approach
4. Counts unmatched predictions as FP
5. Counts unmatched ground truth as FN

## Notable Implementation Details

1. **Coordinate Handling**:
   - Converts from corner to center format
   - Maintains consistency across calculations

2. **Matching Algorithm**:
   - Uses a greedy approach
   - Tracks matched boxes using boolean arrays
   - Prevents double-matching of ground truth boxes

3. **Error Prevention**:
   - Handles invalid box configurations
   - Returns 0 for non-overlapping boxes

## Usage Notes

- IoU threshold is configurable (default: 0.1)
- Handles FiftyOne dataset format
- Supports normalized coordinates
- Includes validation for edge cases
- Provides clear success/failure indicators in output

In [27]:
# Manual Evaluation Functions - Bug Fix for TP=0 Issue

def calculate_iou(box1, box2):
    """
    Calculate Intersection over Union (IoU) between two bounding boxes.
    
    Args:
        box1, box2: [x_center, y_center, width, height] in normalized coordinates (0-1)
    
    Returns:
        IoU value between 0 and 1
    """
    # Convert from center format to corner format
    def center_to_corners(box):
        x_center, y_center, width, height = box
        x1 = x_center - width / 2
        y1 = y_center - height / 2
        x2 = x_center + width / 2
        y2 = y_center + height / 2
        return [x1, y1, x2, y2]
    
    box1_corners = center_to_corners(box1)
    box2_corners = center_to_corners(box2)
    
    # Calculate intersection
    x1 = max(box1_corners[0], box2_corners[0])
    y1 = max(box1_corners[1], box2_corners[1])
    x2 = min(box1_corners[2], box2_corners[2])
    y2 = min(box1_corners[3], box2_corners[3])
    
    if x2 <= x1 or y2 <= y1:
        return 0.0
    
    intersection = (x2 - x1) * (y2 - y1)
    
    # Calculate union
    area1 = box1[2] * box1[3]  # width * height
    area2 = box2[2] * box2[3]  # width * height
    union = area1 + area2 - intersection
    
    if union <= 0:
        return 0.0
    
    return intersection / union

def manual_evaluate_detections(sample, iou_threshold=0.1):
    """
    Manually evaluate detections for a single sample.
    
    Args:
        sample: FiftyOne sample with 'ground_truth' and 'prawn' fields
        iou_threshold: IoU threshold for considering a detection as TP
    
    Returns:
        dict with 'tp', 'fp', 'fn' counts
    """
    # Get ground truth and predictions
    gt_detections = sample.ground_truth.detections if sample.ground_truth else []
    pred_detections = sample.prawn.detections if sample.prawn else []
    
    # Extract bounding boxes
    gt_boxes = []
    for det in gt_detections:
        # det.bounding_box is [x, y, width, height] where x,y is top-left corner
        # Convert to center format: [x_center, y_center, width, height]
        x, y, w, h = det.bounding_box
        x_center = x + w/2
        y_center = y + h/2
        gt_boxes.append([x_center, y_center, w, h])
    
    pred_boxes = []
    for det in pred_detections:
        x, y, w, h = det.bounding_box
        x_center = x + w/2
        y_center = y + h/2
        pred_boxes.append([x_center, y_center, w, h])
    
    # Match predictions to ground truth using Hungarian algorithm (greedy approach)
    gt_matched = [False] * len(gt_boxes)
    pred_matched = [False] * len(pred_boxes)
    
    tp = 0
    
    # For each prediction, find the best matching ground truth
    for pred_idx, pred_box in enumerate(pred_boxes):
        best_iou = 0
        best_gt_idx = -1
        
        for gt_idx, gt_box in enumerate(gt_boxes):
            if gt_matched[gt_idx]:  # Already matched
                continue
                
            iou = calculate_iou(pred_box, gt_box)
            if iou > best_iou:
                best_iou = iou
                best_gt_idx = gt_idx
        
        # If best IoU is above threshold, it's a match
        if best_iou >= iou_threshold and best_gt_idx != -1:
            tp += 1
            gt_matched[best_gt_idx] = True
            pred_matched[pred_idx] = True
    
    # Count unmatched predictions as FP and unmatched ground truth as FN
    fp = sum(1 for matched in pred_matched if not matched)
    fn = sum(1 for matched in gt_matched if not matched)
    
    return {
        'tp': tp,
        'fp': fp, 
        'fn': fn,
        
    }

print("✅ Manual evaluation functions created!")
print("   - calculate_iou(): Computes IoU between two bounding boxes")
print("   - manual_evaluate_detections(): Evaluates TP/FP/FN for a sample")
print("   - Uses IoU threshold of 0.5 by default")
print("   - Handles center vs corner coordinate conversion")

# Test the manual evaluation on a sample
test_sample = dataset.match_tags("test").first()
if test_sample:
    eval_results = manual_evaluate_detections(test_sample, iou_threshold=0.1)
    print(f"\n🔧 Testing manual evaluation on first test sample:")
    print(f"   TP: {eval_results['tp']}, FP: {eval_results['fp']}, FN: {eval_results['fn']}")
    print(f"   (Compare this to old broken evaluation: {test_sample['eval_tp']}, {test_sample['eval_fp']}, {test_sample['eval_fn']})")
else:
    print("\n⚠️  No test samples found")

✅ Manual evaluation functions created!
   - calculate_iou(): Computes IoU between two bounding boxes
   - manual_evaluate_detections(): Evaluates TP/FP/FN for a sample
   - Uses IoU threshold of 0.5 by default
   - Handles center vs corner coordinate conversion

🔧 Testing manual evaluation on first test sample:
   TP: 7, FP: 2, FN: 0
   (Compare this to old broken evaluation: 0, 9, 7)




->

# Interactive Detection Performance Analysis

This code creates an interactive Plotly visualization showing TP/FP/FN breakdown per image with MAE overlay. 

**Features:**
- Stacked bars showing True Positives, False Positives, and False Negatives per image
- MAE line overlay to show counting errors
- Interactive hover showing image details (filename, counts, MAE)
- Sortable by MAE, FP, or FN to identify problematic cases
- Reveals that images with same MAE can have very different error patterns

**How to interpret:**
- **Bar height** = total detection activity per image
- **Green** = correctly detected objects (TP)
- **Red** = incorrect detections (FP) 
- **Orange** = missed objects (FN)
- **Blue line** = counting error magnitude (MAE)
- **Hover** to see image filename and exact counts

In [28]:
# Fixed Manual Evaluation - Bug Fix for TP=0 Issue

records = []
test_set_view = dataset.match_tags("test")

print("🔧 Using manual evaluation to fix TP=0 bug...")
print("📊 Processing test samples with proper IoU calculation...")

for sample in test_set_view.iter_samples(progress=True):
    # Use our manual evaluation function instead of broken FiftyOne evaluation
    eval_results = manual_evaluate_detections(sample, iou_threshold=0.1)
    
    tp = eval_results['tp']
    fp = eval_results['fp'] 
    fn = eval_results['fn']

    print(tp, fp, fn)
    gt = tp + fn
    pred = tp + fp
    mae = abs(pred - gt)

    records.append({
        "image_id": sample.filepath.split("/")[-1],
        "filepath": sample.filepath.split("/")[-1],
        "TP": tp,
        "FP": fp,
        "FN": fn,
        "GT": gt,
        "Pred": pred,
        "MAE": mae
    })

print(f"✅ Number of records collected: {len(records)}")
print(f"🎯 Total TP across all images: {sum(r['TP'] for r in records)}")
print(f"❌ Total FP across all images: {sum(r['FP'] for r in records)}")  
print(f"⚠️  Total FN across all images: {sum(r['FN'] for r in records)}")

# Quick verification that we fixed the TP=0 bug
tp_count = sum(r['TP'] for r in records)
if tp_count > 0:
    print(f"🎉 SUCCESS: Fixed TP=0 bug! Now have {tp_count} true positives")
else:
    print("⚠️  Still have TP=0 issue - may need to adjust IoU threshold or check coordinate format")


🔧 Using manual evaluation to fix TP=0 bug...
📊 Processing test samples with proper IoU calculation...
7 2 0                                                                        
4 2 0
10 1 0
0 0 0
5 1 0
6 2 0
7 2 1
5 1 0
9 0 0
10 3 3
1 2 0
11 1 4
7 1 1
9 0 0
3 1 0
8 2 1
4 0 0
2 0 0
3 0 0
13 0 1
10 0 0
2 0 3
5 3 0
4 3 0
2 5 2
7 1 0
1 0 1
7 1 1
1 1 0
9 2 3
5 1 0
4 0 2
3 0 0
10 2 0
6 1 0
9 0 2
4 1 0
2 2 0
2 1 0
10 1 0
13 1 0
4 0 0
5 3 0
5 1 0
11 0 2
6 0 0                                                                                  
12 0 7
9 1 4
7 0 1
14 1 0
2 2 0
12 2 8
12 1 3
3 0 0
8 2 0
4 0 0
2 2 0
9 3 0
5 2 2
5 1 0
15 1 1
4 0 0
11 0 4
4 0 0
2 1 0
7 3 1
4 1 0
3 1 0
4 1 0
8 0 0
9 2 1
6 2 0
8 0 0
8 0 1
4 0 0
9 0 1
6 3 1
5 2 0
7 2 0
 100% |███████████████████| 79/79 [189.6ms elapsed, 0s remaining, 416.6 samples/s]     
✅ Number of records collected: 79
🎯 Total TP across all images: 499
❌ Total FP across all images: 87
⚠️  Total FN across all images: 62
🎉 SUCCESS: Fixed TP=0 bug! Now 

In [29]:
import plotly.graph_objects as go
import pandas as pd

# Example: Create and sort DataFrame
df = pd.DataFrame(records)  # You must already have this
df = df.sort_values("MAE", ascending=False).reset_index(drop=True)

# Clean up image names for better readability
df["clean_name"] = df["filepath"].str.replace("20230920_", "").str.replace("_jpg.rf.", "_").str.replace(".jpg", "")


df["clean_name"] = df["clean_name"].str.split("_").str[0]


# Add field to FiftyOne dataset (optional)
for sample in dataset.iter_samples(progress=True):
    sample["clean_name"] = sample["filepath"].split("/")[-1].replace("20230920_", "").replace("_jpg.rf.", "_").replace(".jpg", "").split("_")[0]
    sample.save()






# Create figure
fig = go.Figure()

# TP bar
fig.add_trace(go.Bar(
    x=df.index,
    y=df["TP"],
    name="True Positives",
    marker_color="green",
    hovertemplate="<b>%{customdata[0]}</b><br>True Positives: %{y}<br>MAE: %{customdata[1]:.3f}<extra></extra>",
    customdata=df[["clean_name", "MAE"]],
    text=df["TP"],
    yaxis="y"
))

# FP bar
fig.add_trace(go.Bar(
    x=df.index,
    y=df["FP"],
    name="False Positives",
    base=df["TP"],
    marker_color="orange",
    hovertemplate="<b>%{customdata[0]}</b><br>" +
                  "False Positives: %{text}<br>" +
                  "MAE: %{customdata[1]:.3f}<extra></extra>",
    customdata=list(zip(df["clean_name"], df["MAE"])),
    text=df["FP"]
))
# FN bar (negative)
fig.add_trace(go.Bar(
    x=df.index,
    y=-df["FN"],
    name="False Negatives",
    marker_color="red",
    hovertemplate="<b>%{customdata}</b><br>False Negatives: %{text}<extra></extra>",
    customdata=df["clean_name"],
    text=df["FN"]
))

# MAE line (right-side axis)
# fig.add_trace(go.Scatter(
#     x=df.index,
#     y=df["MAE"],
#     name="MAE",
#     mode="lines+markers",
#     line=dict(color="black", dash="dot", width=2),
#     marker=dict(size=4),
#     yaxis="y2",  # Attach to the separate axis
#     hovertemplate="<b>%{customdata}</b><br>MAE: %{y}<br>Pred: %{text}<extra></extra>",
#     customdata=df["clean_name"],
#     text=df["Pred"].astype(str) + " | GT: " + df["GT"].astype(str)
# ))

# Layout
fig.update_layout(
    title="🎯 Object Detection Performance: TP/FP/FN Breakdown with MAE",
    barmode="relative",
    xaxis=dict(
        title="Images (sorted by MAE: worst → best)",
        showgrid=True,
        gridcolor="lightgray"
    ),
    yaxis=dict(
        title="Detection Counts (TP / FP / FN)",
        showgrid=True,
        gridcolor="lightgray",
        range=[
            -df["FN"].max() * 1.2,
            (df["TP"] + df["FP"]).max() * 1.2
        ]
    ),
    yaxis2=dict(
        title="MAE (Mean Absolute Error)",
        side="right",
        overlaying="y",
        anchor=None,  # <-- This prevents y2 from inheriting y1's range
        showgrid=False,
        range=[0, df["MAE"].max() * 1.2]
    ),
    legend=dict(x=1.05, y=1),
    hovermode="closest",
    height=600,
    width=1200,
    plot_bgcolor="white"
)


# Show plot
fig.show()


 100% |█████████████████| 675/675 [788.3ms elapsed, 0s remaining, 856.3 samples/s]      


# Analysis: Relationship Between Bounding Box Area and False Negatives

This analysis investigates whether smaller objects (prawns) are more likely to be missed by the detection model, which would manifest as false negatives. We'll examine:

1. **Area distribution** of all ground truth bounding boxes
2. **Area distribution** of false negative bounding boxes  
3. **Statistical comparison** to determine if FN objects are significantly smaller
4. **Visualization** showing the relationship between object size and detection success


In [30]:
# Create interactive visualization showing individual prawn areas vs detection status

import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

print("📊 CREATING VISUALIZATION: Individual Prawn Areas vs Detection Success\n")

# Define IoU calculation function
def calculate_iou(box1, box2):
    """Calculate IoU between two boxes in [x_center, y_center, width, height] format"""
    # Convert to corner coordinates
    x1_min, y1_min = box1[0] - box1[2]/2, box1[1] - box1[3]/2
    x1_max, y1_max = box1[0] + box1[2]/2, box1[1] + box1[3]/2
    
    x2_min, y2_min = box2[0] - box2[2]/2, box2[1] - box2[3]/2
    x2_max, y2_max = box2[0] + box2[2]/2, box2[1] + box2[3]/2
    
    # Calculate intersection
    inter_xmin = max(x1_min, x2_min)
    inter_ymin = max(y1_min, y2_min)
    inter_xmax = min(x1_max, x2_max)
    inter_ymax = min(y1_max, y2_max)
    
    if inter_xmax <= inter_xmin or inter_ymax <= inter_ymin:
        return 0.0
    
    inter_area = (inter_xmax - inter_xmin) * (inter_ymax - inter_ymin)
    box1_area = box1[2] * box1[3]
    box2_area = box2[2] * box2[3]
    union_area = box1_area + box2_area - inter_area
    
    return inter_area / union_area if union_area > 0 else 0.0

# Create analysis data directly from the dataset
print("🔍 Analyzing individual prawn areas and detection status...")
individual_prawn_data = []

for sample in test_set_view.iter_samples():
    sample_name = sample.filepath.split('/')[-1]
    
    # Get ground truth boxes
    gt_boxes = []
    if hasattr(sample, 'ground_truth') and sample.ground_truth:
        for detection in sample.ground_truth.detections:
            x, y, w, h = detection.bounding_box
            # Convert from corner format to center format for IoU calculation
            gt_center = [x + w/2, y + h/2, w, h]
            gt_boxes.append({
                'box': gt_center,
                'area': w * h,
                'width': w,
                'height': h,
                'corner_box': [x, y, w, h]
            })
    
    # Get prediction boxes  
    pred_boxes = []
    if hasattr(sample, 'prawn') and sample.prawn:
        for detection in sample.prawn.detections:
            x, y, w, h = detection.bounding_box
            # Convert from corner format to center format for IoU calculation
            pred_center = [x + w/2, y + h/2, w, h]
            pred_boxes.append({
                'box': pred_center,
                'confidence': detection.confidence if hasattr(detection, 'confidence') else 0.5
            })
    
    # Match predictions to ground truth using IoU
    iou_threshold = 0.1
    matched_gt = set()
    
    # For each ground truth, check if it was detected
    for i, gt in enumerate(gt_boxes):
        detected = False
        best_iou = 0
        
        for pred in pred_boxes:
            iou = calculate_iou(gt['box'], pred['box'])
            if iou >= iou_threshold:
                detected = True
                best_iou = max(best_iou, iou)
                matched_gt.add(i)
                break
        
        # Add individual prawn data
        individual_prawn_data.append({
            'image_name': sample_name.replace('.jpg', '').replace('.rf.', '_').split("_")[0],
            'prawn_id': f"{sample_name}_{i+1}",
            'area': gt['area'],
            'width': gt['width'],
            'height': gt['height'],
            'detection_status': 'GT_TP' if detected else 'FN',
            'detected': 1 if detected else 0,
            'missed': 0 if detected else 1,
            'best_iou': best_iou
        })

print(f"✅ Analyzed {len(individual_prawn_data)} individual prawns from {len(test_set_view)} images")

# Convert to DataFrame
df_prawns = pd.DataFrame(individual_prawn_data)
df_prawns["clean_name"] = df_prawns["image_name"].str.split("_").str[0]

print(f"✅ Collected data for {len(df_prawns)} individual prawns")
print(f"   Detected (TP): {df_prawns['detected'].sum()}")
print(f"   Missed (FN): {df_prawns['missed'].sum()}")

# Create the interactive scatter plot
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=[
        "🎯 Individual Prawn Areas vs Detection Status",
        "📊 Area Distribution by Detection Status", 
        "📏 Width vs Height (colored by detection)",
        "📈 Miss Rate by Area Percentile"
    ],
    specs=[[{"secondary_y": False}, {"secondary_y": False}],
           [{"secondary_y": False}, {"secondary_y": False}]]
)

# 1. Scatter plot: Area vs Detection Status
detected_prawns = df_prawns[df_prawns['detection_status'] == 'GT_TP']
missed_prawns = df_prawns[df_prawns['detection_status'] == 'FN']

# Add detected prawns
fig.add_trace(
    go.Scatter(
        x=detected_prawns.index,
        y=detected_prawns['area'],
        mode='markers',
        name='✅ Detected (TP)',
        marker=dict(
            color='green',
            size=6,
            opacity=0.7
        ),
        hovertemplate="<b>✅ Detected Prawn</b><br>" +
                      "Image: %{customdata[0]}<br>" +
                      "Prawn ID: %{customdata[1]}<br>" +
                      "Area: %{y:.6f}<br>" +
                      "Width: %{customdata[2]:.6f}<br>" +
                      "Height: %{customdata[3]:.6f}<br>" +
                      "IoU: %{customdata[4]:.3f}<br>" +
                      "<extra></extra>",
        customdata=list(zip(detected_prawns['clean_name'], 
                           detected_prawns['prawn_id'],
                           detected_prawns['width'],
                           detected_prawns['height'], 
                           detected_prawns['best_iou']))
    ),
    row=1, col=1
)

# Add missed prawns
fig.add_trace(
    go.Scatter(
        x=missed_prawns.index,
        y=missed_prawns['area'],
        mode='markers',
        name='❌ Missed (FN)',
        marker=dict(
            color='red',
            size=8,
            opacity=0.8,
            symbol='x'
        ),
        hovertemplate="<b>❌ Missed Prawn</b><br>" +
                      "Image: %{customdata[0]}<br>" +
                      "Prawn ID: %{customdata[1]}<br>" +
                      "Area: %{y:.6f}<br>" +
                      "Width: %{customdata[2]:.6f}<br>" +
                      "Height: %{customdata[3]:.6f}<br>" +
                      "Best IoU: %{customdata[4]:.3f}<br>" +
                      "<extra></extra>",
        customdata=list(zip(missed_prawns['clean_name'], 
                           missed_prawns['prawn_id'],
                           missed_prawns['width'],
                           missed_prawns['height'], 
                           missed_prawns['best_iou']))
    ),
    row=1, col=1
)

# 2. Box plot: Area distribution by detection status
fig.add_trace(
    go.Box(
        y=detected_prawns['area'],
        name='✅ Detected',
        marker_color='green',
        boxpoints='outliers'
    ),
    row=1, col=2
)

fig.add_trace(
    go.Box(
        y=missed_prawns['area'],
        name='❌ Missed',
        marker_color='red',
        boxpoints='outliers'
    ),
    row=1, col=2
)

# 3. Width vs Height scatter
fig.add_trace(
    go.Scatter(
        x=detected_prawns['width'],
        y=detected_prawns['height'],
        mode='markers',
        name='✅ Detected (W×H)',
        marker=dict(color='green', size=4, opacity=0.6),
        showlegend=False,
        hovertemplate="<b>✅ Detected Prawn</b><br>" +
                      "Image: %{customdata[0]}<br>" +
                      "Width: %{x:.6f}<br>" +
                      "Height: %{y:.6f}<br>" +
                      "Area: %{customdata[1]:.6f}<br>" +
                      "<extra></extra>",
        customdata=list(zip(detected_prawns['clean_name'], 
                           detected_prawns['area']))
    ),
    row=2, col=1
)

fig.add_trace(
    go.Scatter(
        x=missed_prawns['width'],
        y=missed_prawns['height'],
        mode='markers',
        name='❌ Missed (W×H)',
        marker=dict(color='red', size=6, opacity=0.8, symbol='x'),
        showlegend=False,
        hovertemplate="<b>❌ Missed Prawn</b><br>" +
                      "Image: %{customdata[0]}<br>" +
                      "Width: %{x:.6f}<br>" +
                      "Height: %{y:.6f}<br>" +
                      "Area: %{customdata[1]:.6f}<br>" +
                      "<extra></extra>",
        customdata=list(zip(missed_prawns['clean_name'], 
                           missed_prawns['area']))
    ),
    row=2, col=1
)

# 4. Miss rate by area percentile
if len(missed_prawns) > 0:
    # Create area bins based on percentiles
    percentiles = [0, 10, 25, 50, 75, 90, 100]
    area_thresholds = [np.percentile(df_prawns['area'], p) for p in percentiles]
    
    miss_rates = []
    bin_labels = []
    
    for i in range(len(area_thresholds)-1):
        lower = area_thresholds[i]
        upper = area_thresholds[i+1]
        
        prawns_in_bin = df_prawns[(df_prawns['area'] >= lower) & (df_prawns['area'] <= upper)]
        
        if len(prawns_in_bin) > 0:
            miss_rate = prawns_in_bin['missed'].sum() / len(prawns_in_bin) * 100
            miss_rates.append(miss_rate)
            bin_labels.append(f"{percentiles[i]}-{percentiles[i+1]}%")
        else:
            miss_rates.append(0)
            bin_labels.append(f"{percentiles[i]}-{percentiles[i+1]}%")
    
    fig.add_trace(
        go.Bar(
            x=bin_labels,
            y=miss_rates,
            name='Miss Rate %',
            marker_color='orange',
            showlegend=False,
            hovertemplate="<b>%{x} percentile</b><br>" +
                          "Miss Rate: %{y:.1f}%<br>" +
                          "<extra></extra>"
        ),
        row=2, col=2
    )

# Update layout
fig.update_layout(
    title="🔍 Individual Prawn Analysis: Bounding Box Area vs Detection Success",
    height=800,
    width=1400,
    showlegend=True
)

# Update axes labels
fig.update_xaxes(title_text="Prawn Index", row=1, col=1)
fig.update_yaxes(title_text="Area (normalized)", row=1, col=1)

fig.update_xaxes(title_text="Detection Status", row=1, col=2)
fig.update_yaxes(title_text="Area (normalized)", row=1, col=2)

fig.update_xaxes(title_text="Width (normalized)", row=2, col=1)
fig.update_yaxes(title_text="Height (normalized)", row=2, col=1)

fig.update_xaxes(title_text="Area Percentile Bin", row=2, col=2)
fig.update_yaxes(title_text="Miss Rate (%)", row=2, col=2)

fig.show()

# Print summary insights
print(f"\n🔍 INDIVIDUAL PRAWN INSIGHTS:")
print(f"")
print(f"📊 BASIC STATISTICS:")
print(f"   Total prawns analyzed: {len(df_prawns)}")
print(f"   Successfully detected: {detected_prawns.shape[0]} ({detected_prawns.shape[0]/len(df_prawns)*100:.1f}%)")
print(f"   Missed (false negatives): {missed_prawns.shape[0]} ({missed_prawns.shape[0]/len(df_prawns)*100:.1f}%)")

if len(missed_prawns) > 0 and len(detected_prawns) > 0:
    print(f"\n📏 SIZE COMPARISON:")
    print(f"   Detected prawns - Mean area: {detected_prawns['area'].mean():.6f}")
    print(f"   Missed prawns - Mean area: {missed_prawns['area'].mean():.6f}")
    print(f"   Ratio (missed/detected): {missed_prawns['area'].mean()/detected_prawns['area'].mean():.3f}")
    
    if missed_prawns['area'].mean() < detected_prawns['area'].mean():
        print(f"   📉 Missed prawns are on average {((detected_prawns['area'].mean() - missed_prawns['area'].mean())/detected_prawns['area'].mean()*100):.1f}% smaller")
    else:
        print(f"   📈 Missed prawns are on average {((missed_prawns['area'].mean() - detected_prawns['area'].mean())/detected_prawns['area'].mean()*100):.1f}% larger")

print(f"\n💡 KEY INSIGHTS:")
print(f"   • Each point in the scatter plot represents one individual prawn")
print(f"   • Green circles = successfully detected prawns") 
print(f"   • Red X's = missed prawns (false negatives)")
print(f"   • The visualization shows if smaller prawns are more likely to be missed")
print(f"   • Box plots compare the area distributions between detected and missed prawns")


📊 CREATING VISUALIZATION: Individual Prawn Areas vs Detection Success

🔍 Analyzing individual prawn areas and detection status...
✅ Analyzed 561 individual prawns from 79 images
✅ Collected data for 561 individual prawns
   Detected (TP): 503
   Missed (FN): 58



🔍 INDIVIDUAL PRAWN INSIGHTS:

📊 BASIC STATISTICS:
   Total prawns analyzed: 561
   Successfully detected: 503 (89.7%)
   Missed (false negatives): 58 (10.3%)

📏 SIZE COMPARISON:
   Detected prawns - Mean area: 0.009455
   Missed prawns - Mean area: 0.003285
   Ratio (missed/detected): 0.347
   📉 Missed prawns are on average 65.3% smaller

💡 KEY INSIGHTS:
   • Each point in the scatter plot represents one individual prawn
   • Green circles = successfully detected prawns
   • Red X's = missed prawns (false negatives)
   • The visualization shows if smaller prawns are more likely to be missed
   • Box plots compare the area distributions between detected and missed prawns


# Statistical Validation: Area vs False Negatives

This section provides rigorous statistical analysis to validate whether smaller prawns are significantly more likely to be missed by the detection model.


In [17]:
# Statistical Validation: Area vs False Negatives

import scipy.stats as stats
from scipy.stats import mannwhitneyu, chi2_contingency, pearsonr, spearmanr
import seaborn as sns
import matplotlib.pyplot as plt

print("📊 STATISTICAL VALIDATION: Area vs False Negatives\n")

# Separate detected and missed prawns
detected_prawns = df_prawns[df_prawns['detection_status'] == 'GT_TP']
missed_prawns = df_prawns[df_prawns['detection_status'] == 'FN']

print(f"📈 Sample sizes:")
print(f"   Detected prawns (TP): {len(detected_prawns)}")
print(f"   Missed prawns (FN): {len(missed_prawns)}")
print(f"   Total prawns: {len(df_prawns)}")
print(f"   Miss rate: {len(missed_prawns)/len(df_prawns)*100:.1f}%\n")

# 1. DESCRIPTIVE STATISTICS
print("📊 DESCRIPTIVE STATISTICS:")
print("=" * 50)

print("🎯 DETECTED PRAWNS (TP):")
print(f"   Area - Mean: {detected_prawns['area'].mean():.6f}, Std: {detected_prawns['area'].std():.6f}")
print(f"   Area - Median: {detected_prawns['area'].median():.6f}")
print(f"   Area - Range: {detected_prawns['area'].min():.6f} to {detected_prawns['area'].max():.6f}")

print("❌ MISSED PRAWNS (FN):")
print(f"   Area - Mean: {missed_prawns['area'].mean():.6f}, Std: {missed_prawns['area'].std():.6f}")
print(f"   Area - Median: {missed_prawns['area'].median():.6f}")
print(f"   Area - Range: {missed_prawns['area'].min():.6f} to {missed_prawns['area'].max():.6f}")

# Calculate effect size (Cohen's d)
pooled_std = np.sqrt(((len(detected_prawns)-1)*detected_prawns['area'].var() + 
                     (len(missed_prawns)-1)*missed_prawns['area'].var()) / 
                    (len(detected_prawns) + len(missed_prawns) - 2))
cohens_d = (detected_prawns['area'].mean() - missed_prawns['area'].mean()) / pooled_std

print(f"\n📏 EFFECT SIZE:")
print(f"   Cohen's d: {cohens_d:.3f}")
if abs(cohens_d) < 0.2:
    effect_size = "negligible"
elif abs(cohens_d) < 0.5:
    effect_size = "small"
elif abs(cohens_d) < 0.8:
    effect_size = "medium"
else:
    effect_size = "large"
print(f"   Effect size interpretation: {effect_size}")

print("\n" + "="*70)

# 2. NORMALITY TESTS
print("🔍 NORMALITY TESTS:")
print("=" * 50)

# Shapiro-Wilk test for normality (use subset if too large)
def test_normality(data, name, max_samples=5000):
    if len(data) > max_samples:
        sample_data = data.sample(max_samples, random_state=42)
        print(f"   {name} (sampled {max_samples}/{len(data)}):")
    else:
        sample_data = data
        print(f"   {name}:")
    
    stat, p_value = stats.shapiro(sample_data)
    is_normal = p_value > 0.05
    print(f"      Shapiro-Wilk: statistic={stat:.4f}, p-value={p_value:.2e}")
    print(f"      Normal distribution: {'✅ Yes' if is_normal else '❌ No'}")
    return is_normal

detected_normal = test_normality(detected_prawns['area'], "Detected prawns area")
missed_normal = test_normality(missed_prawns['area'], "Missed prawns area")

both_normal = detected_normal and missed_normal
print(f"\n   Both distributions normal: {'✅ Yes' if both_normal else '❌ No'}")

print("\n" + "="*70)

# 3. STATISTICAL TESTS
print("🧪 STATISTICAL TESTS:")
print("=" * 50)

# Choose appropriate test based on normality
if both_normal:
    print("📊 Using parametric tests (both distributions normal)")
    # Independent t-test
    t_stat, t_p_value = stats.ttest_ind(detected_prawns['area'], missed_prawns['area'])
    print(f"   Independent t-test:")
    print(f"      t-statistic: {t_stat:.4f}")
    print(f"      p-value: {t_p_value:.2e}")
else:
    print("📊 Using non-parametric tests (distributions not normal)")

# Mann-Whitney U test (non-parametric alternative to t-test)
u_stat, u_p_value = mannwhitneyu(detected_prawns['area'], missed_prawns['area'], 
                                 alternative='two-sided')
print(f"   Mann-Whitney U test:")
print(f"      U-statistic: {u_stat:.0f}")
print(f"      p-value: {u_p_value:.2e}")

# Correlation between area and detection success
area_detection_corr_pearson, area_p_pearson = pearsonr(df_prawns['area'], df_prawns['detected'])
area_detection_corr_spearman, area_p_spearman = spearmanr(df_prawns['area'], df_prawns['detected'])

print(f"   Correlation (Area vs Detection Success):")
print(f"      Pearson r: {area_detection_corr_pearson:.4f}, p-value: {area_p_pearson:.2e}")
print(f"      Spearman ρ: {area_detection_corr_spearman:.4f}, p-value: {area_p_spearman:.2e}")

print("\n" + "="*70)

# 4. BINNED ANALYSIS
print("📦 BINNED ANALYSIS:")
print("=" * 50)

# Create area quartiles for analysis
df_prawns['area_quartile'] = pd.qcut(df_prawns['area'], q=4, labels=['Q1 (Smallest)', 'Q2', 'Q3', 'Q4 (Largest)'])

# Calculate miss rate by quartile
quartile_analysis = df_prawns.groupby('area_quartile').agg({
    'missed': ['count', 'sum', 'mean'],
    'area': ['min', 'max', 'mean']
}).round(4)

quartile_analysis.columns = ['Total_Prawns', 'Missed_Count', 'Miss_Rate', 'Min_Area', 'Max_Area', 'Mean_Area']

print("   Miss Rate by Area Quartile:")
for quartile in quartile_analysis.index:
    row = quartile_analysis.loc[quartile]
    print(f"      {quartile}: {row['Miss_Rate']*100:.1f}% missed ({row['Missed_Count']:.0f}/{row['Total_Prawns']:.0f})")
    print(f"         Area range: {row['Min_Area']:.6f} - {row['Max_Area']:.6f}")

# Chi-square test for association between quartile and detection
contingency_table = pd.crosstab(df_prawns['area_quartile'], df_prawns['detection_status'])
chi2_stat, chi2_p_value, chi2_dof, chi2_expected = chi2_contingency(contingency_table)

print(f"\n   Chi-square test (Quartile vs Detection):")
print(f"      χ² statistic: {chi2_stat:.4f}")
print(f"      p-value: {chi2_p_value:.2e}")
print(f"      degrees of freedom: {chi2_dof}")

print("\n" + "="*70)

# 5. STATISTICAL CONCLUSIONS
print("🎯 STATISTICAL CONCLUSIONS:")
print("=" * 50)

alpha = 0.05
print(f"   Significance level (α): {alpha}")

# Main hypothesis test conclusion
if both_normal:
    main_p = t_p_value
    test_name = "t-test"
else:
    main_p = u_p_value
    test_name = "Mann-Whitney U test"

print(f"\n   PRIMARY HYPOTHESIS TEST ({test_name}):")
if main_p < alpha:
    print(f"   ✅ SIGNIFICANT DIFFERENCE (p = {main_p:.2e} < {alpha})")
    if detected_prawns['area'].mean() > missed_prawns['area'].mean():
        print("   📊 CONCLUSION: Detected prawns have significantly LARGER areas than missed prawns")
        print("   🔍 INTERPRETATION: Smaller prawns are more likely to be missed by the model")
    else:
        print("   📊 CONCLUSION: Detected prawns have significantly SMALLER areas than missed prawns")
        print("   🔍 INTERPRETATION: Larger prawns are more likely to be missed by the model")
else:
    print(f"   ❌ NO SIGNIFICANT DIFFERENCE (p = {main_p:.2e} ≥ {alpha})")
    print("   📊 CONCLUSION: No significant relationship between prawn area and detection success")

# Correlation conclusion
print(f"\n   CORRELATION ANALYSIS:")
if area_p_spearman < alpha:
    print(f"   ✅ SIGNIFICANT CORRELATION (p = {area_p_spearman:.2e} < {alpha})")
    if area_detection_corr_spearman > 0:
        print(f"   📈 Positive correlation (ρ = {area_detection_corr_spearman:.3f}): Larger area → Higher detection rate")
    else:
        print(f"   📉 Negative correlation (ρ = {area_detection_corr_spearman:.3f}): Larger area → Lower detection rate")
else:
    print(f"   ❌ NO SIGNIFICANT CORRELATION (p = {area_p_spearman:.2e} ≥ {alpha})")

# Quartile analysis conclusion
print(f"\n   QUARTILE ANALYSIS:")
if chi2_p_value < alpha:
    print(f"   ✅ SIGNIFICANT ASSOCIATION (p = {chi2_p_value:.2e} < {alpha})")
    print("   📊 CONCLUSION: Detection success varies significantly across area quartiles")
    
    # Find quartile with highest miss rate
    max_miss_quartile = quartile_analysis['Miss_Rate'].idxmax()
    min_miss_quartile = quartile_analysis['Miss_Rate'].idxmin()
    
    print(f"   🔴 Highest miss rate: {max_miss_quartile} ({quartile_analysis.loc[max_miss_quartile, 'Miss_Rate']*100:.1f}%)")
    print(f"   🟢 Lowest miss rate: {min_miss_quartile} ({quartile_analysis.loc[min_miss_quartile, 'Miss_Rate']*100:.1f}%)")
else:
    print(f"   ❌ NO SIGNIFICANT ASSOCIATION (p = {chi2_p_value:.2e} ≥ {alpha})")

print(f"\n   EFFECT SIZE: {effect_size.upper()} effect (Cohen's d = {cohens_d:.3f})")

print("\n" + "="*70)
print("📋 SUMMARY: Statistical validation complete!")
print(f"   Total prawns analyzed: {len(df_prawns)}")
print(f"   Statistical significance threshold: p < {alpha}")
print(f"   Primary test used: {test_name}")
print("="*70)


📊 STATISTICAL VALIDATION: Area vs False Negatives

📈 Sample sizes:
   Detected prawns (TP): 503
   Missed prawns (FN): 58
   Total prawns: 561
   Miss rate: 10.3%

📊 DESCRIPTIVE STATISTICS:
🎯 DETECTED PRAWNS (TP):
   Area - Mean: 0.009455, Std: 0.010021
   Area - Median: 0.006381
   Area - Range: 0.000702 to 0.117087
❌ MISSED PRAWNS (FN):
   Area - Mean: 0.003285, Std: 0.002563
   Area - Median: 0.002363
   Area - Range: 0.000302 to 0.011450

📏 EFFECT SIZE:
   Cohen's d: 0.647
   Effect size interpretation: medium

🔍 NORMALITY TESTS:
   Detected prawns area:
      Shapiro-Wilk: statistic=0.6127, p-value=4.90e-32
      Normal distribution: ❌ No
   Missed prawns area:
      Shapiro-Wilk: statistic=0.7591, p-value=2.19e-08
      Normal distribution: ❌ No

   Both distributions normal: ❌ No

🧪 STATISTICAL TESTS:
📊 Using non-parametric tests (distributions not normal)
   Mann-Whitney U test:
      U-statistic: 24431
      p-value: 3.73e-17
   Correlation (Area vs Detection Success):
      P