<a href="https://colab.research.google.com/github/donbcolab/AIE3/blob/main/brain_tumor_hf_ds_protoype_analysis_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Brain Tumor Image Dataset - Prototype Analysis

- Hugging Face `datasets` loader: builds a `Dataset` from the COCO annotation file and the source images



## Setup and Initial Checks

### Import Necessary Libraries and Define Constants

In [None]:
# Mount Google Drive at /content/drive; the annotation file and image
# directory configured in SOURCE_JSON / SOURCE_IMAGE_DIR live under this mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install -qU pyarrow==14.0.1 requests==2.31.0

In [None]:
!pip install -qU datasets==2.11.0

In [None]:
import os
import json
import pandas as pd
from datasets import Dataset, Features, ClassLabel, Value, Sequence, Image

In [None]:
# Name used for the Hugging Face dataset built from the source files.
HF_DATASET_NAME = 'brain-tumor-image-dataset-semantic-segmentation'

# Directory holding the training images; the COCO annotation file sits alongside them.
SOURCE_IMAGE_DIR = "/content/drive/MyDrive/kaggle/datasets/brain-tumor-image-dataset-semantic-segmentation/train"
SOURCE_JSON = f"{SOURCE_IMAGE_DIR}/_annotations.coco.json"


### Initial Verification

In [None]:
# Target schema for the Hugging Face dataset: one row per annotation, carrying
# the image it belongs to plus the COCO annotation fields.
features = Features(dict(
    file_name=Value('string'),
    image=Image(),
    id=Value('int64'),
    # Class index 0 is 'Non-Tumor', index 1 is 'Tumor'.
    category_id=ClassLabel(names=['Non-Tumor', 'Tumor']),
    # Fixed-length sequence of four float32 values.
    bbox=Sequence(Value('float32'), length=4),
    # Variable-length list of variable-length float32 lists.
    segmentation=Sequence(Sequence(Value('float32'))),
    area=Value('float32'),
    iscrowd=Value('int64'),
    height=Value('int64'),
    width=Value('int64'),
    date_captured=Value('string'),
))

In [None]:
def verify_source_data():
    """Print a summary of the COCO annotation file at ``SOURCE_JSON``.

    Reports every category record, the number of annotations per
    ``category_id``, how many images carry more than one annotation, and the
    largest number of annotations attached to a single image.
    """
    with open(SOURCE_JSON, 'r') as source_file:
        coco = json.load(source_file)

    print("Categories:")
    for cat in coco['categories']:
        print(f"ID: {cat['id']}, Name: {cat['name']}, Supercategory: {cat['supercategory']}")

    annotation_frame = pd.DataFrame(coco['annotations'])

    per_category = annotation_frame['category_id'].value_counts().sort_index()
    print("\nCategory distribution in annotations:")
    print(per_category)

    # Each annotation row is one bounding box, so counting rows per image_id
    # gives the number of boxes per image.
    boxes_per_image = annotation_frame['image_id'].value_counts()
    multi_box_images = (boxes_per_image > 1).sum()
    print(f"\nImages with multiple bounding boxes: {multi_box_images}")
    print(f"Max bounding boxes in an image: {boxes_per_image.max()}")

In [None]:
from tqdm.auto import tqdm

def load_data_to_df():
    """Load the COCO file at ``SOURCE_JSON`` into one row per annotation.

    Image records are inner-joined to their annotations (image ``id`` ==
    annotation ``image_id``); clashing annotation columns get an ``_ann``
    suffix. ``category_id`` is shifted down by one and an ``image`` column
    holding the full path of each image file is appended.

    Returns:
        pandas.DataFrame: the merged frame.
    """
    with open(SOURCE_JSON, 'r') as source_file:
        coco = json.load(source_file)

    image_rows = pd.DataFrame(coco['images'])
    annotation_rows = pd.DataFrame(coco['annotations'])

    merged = image_rows.merge(
        annotation_rows,
        left_on='id',
        right_on='image_id',
        suffixes=('', '_ann'),
    )

    return merged.assign(
        # Source ids 1/2 become class indices 0 (Non-Tumor) / 1 (Tumor).
        category_id=merged['category_id'] - 1,
        image=merged['file_name'].apply(lambda name: os.path.join(SOURCE_IMAGE_DIR, name)),
    )

In [None]:
import cv2
from tqdm.auto import tqdm

def create_hf_dataset(df, hf_dataset_name):
    """Build a Hugging Face ``Dataset`` from the merged image/annotation frame.

    Args:
        df: DataFrame whose ``image`` column holds image file paths and which
            contains every column declared in the module-level ``features``.
            Extra columns are ignored. The frame is not modified.
        hf_dataset_name: Name the dataset would be pushed under; only used by
            the currently disabled ``push_to_hub`` step.

    Returns:
        datasets.Dataset: the dataset cast to ``features``, with each image
        stored as JPEG-encoded bytes (``None`` where the file was unreadable).

    Raises:
        KeyError: if ``df`` lacks a column declared in ``features``.
    """
    def load_image(image_path):
        """Return the image at ``image_path`` as JPEG bytes, or None if it cannot be read."""
        img = cv2.imread(image_path)
        if img is not None:
            return cv2.imencode('.jpg', img)[1].tobytes()
        return None

    tqdm.pandas(desc="Loading images")
    image_bytes = df['image'].progress_apply(load_image)

    # The cast to `features` requires the frame's columns to match the schema
    # exactly, so keep only the declared columns, in schema order. Working on
    # a copy leaves the caller's path column intact, which keeps the cell
    # safe to re-run.
    dataset_df = df.assign(image=image_bytes)[list(features)]

    # preserve_index=False stops a non-default index from becoming an extra
    # column that would break the schema match again.
    dataset = Dataset.from_pandas(dataset_df, features=features, preserve_index=False)
    # dataset.push_to_hub(hf_dataset_name)
    # print(f"Dataset {hf_dataset_name} created and pushed to hub successfully.")
    return dataset

In [None]:
from tqdm.auto import tqdm

# Execute the process
verify_source_data()

# Overall progress: one tick per stage that does real work.
with tqdm(total=2, desc="Creating dataset") as pbar:
    df = load_data_to_df()
    pbar.update(1)

    # Keep whatever the builder returns so later cells can inspect it.
    hf_dataset = create_hf_dataset(df, HF_DATASET_NAME)
    pbar.update(1)

Categories:
ID: 0, Name: Tumor, Supercategory: none
ID: 1, Name: 0, Supercategory: Tumor
ID: 2, Name: 1, Supercategory: Tumor

Category distribution in annotations:
category_id
1    771
2    731
Name: count, dtype: int64

Images with multiple bounding boxes: 1
Max bounding boxes in an image: 2


Creating dataset:   0%|          | 0/3 [00:00<?, ?it/s]

Loading images:   0%|          | 0/1502 [00:00<?, ?it/s]

ValueError: Couldn't cast
id: int64
license: int64
file_name: string
height: int64
width: int64
date_captured: string
id_ann: int64
image_id: int64
category_id: int64
bbox: list<item: double>
  child 0, item: double
area: double
segmentation: list<item: list<item: double>>
  child 0, item: list<item: double>
      child 0, item: double
iscrowd: int64
image: binary
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 1853
to
{'file_name': Value(dtype='string', id=None), 'image': Image(decode=True, id=None), 'id': Value(dtype='int64', id=None), 'category_id': ClassLabel(names=['Non-Tumor', 'Tumor'], id=None), 'bbox': Sequence(feature=Value(dtype='float32', id=None), length=4, id=None), 'segmentation': Sequence(feature=Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None), length=-1, id=None), 'area': Value(dtype='float32', id=None), 'iscrowd': Value(dtype='int64', id=None), 'height': Value(dtype='int64', id=None), 'width': Value(dtype='int64', id=None), 'date_captured': Value(dtype='string', id=None)}
because column names don't match

In [None]:
# Show every raw category record stored in the COCO annotation file.
with open(SOURCE_JSON, 'r') as source_file:
    coco_data = json.load(source_file)

categories = coco_data['categories']
for category in categories:
    print(category)

In [None]:
import json
from collections import Counter

with open(SOURCE_JSON, 'r') as f:
    coco_data = json.load(f)

category_ids = [ann['category_id'] for ann in coco_data['annotations']]
id_counts = Counter(category_ids)

categories = {cat['id']: cat['name'] for cat in coco_data['categories']}

print("Category ID counts:", id_counts)
print("Category mappings:", categories)

# Automated verification
# The annotation file names its categories 'Tumor' (id 0, the parent entry),
# '0' (id 1) and '1' (id 2). The readable labels below are the ones this
# notebook assigns to ids 1 and 2; they are not stored in the file, so the raw
# names are checked separately from the set of ids the annotations use.
expected_source_names = {0: 'Tumor', 1: '0', 2: '1'}
expected_categories = {1: 'Non-Tumor', 2: 'Tumor'}
assert categories == expected_source_names, f"Category mismatch. Expected {expected_source_names}, got {categories}"
assert set(id_counts.keys()) == set(expected_categories.keys()), f"Unexpected category IDs found: {set(id_counts.keys())}"

In [None]:
def is_rectangle_segmentation(bbox, segmentation, tol=1e-6):
    """Return True when ``segmentation`` is exactly the outline of ``bbox``.

    Args:
        bbox: ``[x, y, width, height]``.
        segmentation: COCO polygon list, i.e. a list of flat
            ``[x0, y0, x1, y1, ...]`` coordinate lists. Anything that is not a
            list holding exactly one polygon is reported as non-rectangular.
        tol: Absolute tolerance applied to each coordinate, so that rounding
            in ``x + w`` / ``y + h`` does not cause a false negative.

    Returns:
        bool: True if the single polygon visits the four bbox corners in
        order. Any start corner and either winding direction is accepted, as
        is a closed ring that repeats its first point at the end.

    Raises:
        ValueError: if ``bbox`` does not unpack into four values.
    """
    x, y, w, h = bbox

    if not isinstance(segmentation, (list, tuple)) or len(segmentation) != 1:
        return False

    coords = segmentation[0]
    if not isinstance(coords, (list, tuple)) or len(coords) % 2 != 0:
        return False

    points = list(zip(coords[0::2], coords[1::2]))

    def _close(p, q):
        return abs(p[0] - q[0]) <= tol and abs(p[1] - q[1]) <= tol

    # Some exporters close the ring by repeating the first vertex.
    if len(points) == 5 and _close(points[0], points[4]):
        points = points[:4]
    if len(points) != 4:
        return False

    corners = [(x, y), (x + w, y), (x + w, y + h), (x, y + h)]

    # Only rotations and reversals of the corner cycle are rectangles; an
    # arbitrary permutation could describe a self-intersecting "bow-tie".
    for start in range(4):
        for step in (1, -1):
            if all(_close(points[k], corners[(start + step * k) % 4]) for k in range(4)):
                return True
    return False

# Inspect only the first 100 annotations as a quick sample.
sample_annotations = coco_data['annotations'][:100]

rectangle_flags = [
    is_rectangle_segmentation(ann['bbox'], ann['segmentation'])
    for ann in sample_annotations
]
rectangle_count = sum(rectangle_flags)

rectangle_percentage = (rectangle_count / len(sample_annotations)) * 100
print(f"{rectangle_percentage:.2f}% of sampled annotations have rectangular segmentations")

# Automated verification: the sample is expected to be almost entirely rectangles.
assert rectangle_percentage >= 99, f"Only {rectangle_percentage:.2f}% of segmentations are rectangular (expected ≥99%)"

In [None]:
import pandas as pd
from datasets import Dataset

# Create small subset: the first 10 images and every annotation attached to them
subset_images = coco_data['images'][:10]
images_by_id = {img['id']: img for img in subset_images}
subset_annotations = [ann for ann in coco_data['annotations']
                      if ann['image_id'] in images_by_id]

# Create DataFrame with one row per annotation. Each row looks up its own
# image, so file_name always matches the annotation even when an image has
# zero or several annotations (the two lists need not have equal length).
subset_columns = ['file_name', 'image_id', 'category_id', 'bbox', 'segmentation']
df_subset = pd.DataFrame(
    [
        {
            'file_name': images_by_id[ann['image_id']]['file_name'],
            'image_id': ann['image_id'],
            'category_id': ann['category_id'],
            'bbox': ann['bbox'],
            'segmentation': ann['segmentation'],
        }
        for ann in subset_annotations
    ],
    columns=subset_columns,
)

# Convert to Parquet
df_subset.to_parquet('test_subset.parquet')

# Load Parquet file
loaded_df = pd.read_parquet('test_subset.parquet')


def _to_plain(value):
    """Recursively turn list-like values (lists, arrays) into plain Python lists."""
    if isinstance(value, (str, bytes)) or not hasattr(value, '__iter__'):
        return value
    return [_to_plain(item) for item in value]


# Verify data. List-valued columns are read back as arrays, and an element-wise
# `==` between arrays and lists has no single truth value, so both sides are
# normalised to plain Python values before comparing.
assert len(loaded_df) == len(df_subset), "Row count mismatch"
for column in df_subset.columns:
    original_values = [_to_plain(value) for value in df_subset[column]]
    loaded_values = [_to_plain(value) for value in loaded_df[column]]
    assert loaded_values == original_values, f"Mismatch in column {column}"

print("Parquet conversion and loading test passed successfully")

In [None]:
from datasets import Dataset
import cv2
import matplotlib.pyplot as plt

def load_image_on_demand(example):
    """Attach the decoded image(s) for the given file name(s) under ``'image'``.

    ``Dataset.set_transform`` calls its transform with a *batch*: a dict whose
    values are lists, even when a single row is requested. A plain
    single-example dict (``file_name`` is a string) is also accepted.

    Args:
        example: dict with a ``'file_name'`` entry that is either one file
            name or a list of file names relative to ``SOURCE_IMAGE_DIR``.

    Returns:
        The same dict with ``'image'`` set to the ``cv2.imread`` result (or a
        list of results for a batch). ``cv2.imread`` yields ``None`` for a
        file it cannot read.
    """
    file_names = example['file_name']
    if isinstance(file_names, str):
        example['image'] = cv2.imread(os.path.join(SOURCE_IMAGE_DIR, file_names))
    else:
        example['image'] = [
            cv2.imread(os.path.join(SOURCE_IMAGE_DIR, name)) for name in file_names
        ]
    return example

# Build the dataset from the Parquet subset and decode images lazily on access.
dataset = Dataset.from_parquet('test_subset.parquet')
dataset.set_transform(load_image_on_demand)

# Access the first three rows and display each decoded image.
for index in range(3):
    item = dataset[index]
    assert 'image' in item, f"Image not loaded for item {index}"
    assert item['image'] is not None, f"Image is None for item {index}"
    print(f"Successfully loaded image for item {index}")

    # OpenCV decodes to BGR; convert to RGB for matplotlib.
    rgb_image = cv2.cvtColor(item['image'], cv2.COLOR_BGR2RGB)

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.imshow(rgb_image)
    ax.set_title(f"Image {index}: {item['file_name']}")
    ax.axis('off')
    plt.show()