<a href="https://colab.research.google.com/github/donbcolab/AIE3/blob/main/brain_tumor_hf_ds_protoype_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Brain Tumor Image Dataset - Prototype Analysis

- HF DataSet Loader



## Set up and Initial Checks

### Import Necessary Libraries and Define Constants

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install -qU pyarrow==14.0.1 requests==2.31.0

In [3]:
!pip install -qU datasets==2.11.0

In [4]:
import os
import json
import pandas as pd
from datasets import Dataset, Features, ClassLabel, Value, Sequence, Image

In [5]:
HF_DATASET_NAME = 'brain-tumor-image-dataset-semantic-segmentation'
SOURCE_JSON = "/content/drive/MyDrive/kaggle/datasets/brain-tumor-image-dataset-semantic-segmentation/train/_annotations.coco.json"
SOURCE_IMAGE_DIR = "/content/drive/MyDrive/kaggle/datasets/brain-tumor-image-dataset-semantic-segmentation/train"


### Initial Verification

In [6]:
features = Features({
    'file_name': Value(dtype='string'),
    'image': Image(decode=True),
    'id': Value(dtype='int64'),
    'category_id': ClassLabel(names=['tumor', 'normal', 'other']),  # Add 'other' or adjust as needed
    'bbox': Sequence(feature=Value(dtype='float32'), length=4),
    'segmentation': Sequence(feature=Sequence(feature=Value(dtype='float32'), length=-1), length=-1),
    'area': Value(dtype='float32'),
    'iscrowd': Value(dtype='int64'),
    'height': Value(dtype='int64'),
    'width': Value(dtype='int64'),
    'date_captured': Value(dtype='string'),
    'license': Value(dtype='int64')
})

In [7]:
def verify_source_data():
    if not os.path.exists(SOURCE_JSON):
        raise FileNotFoundError(f"JSON file not found: {SOURCE_JSON}")

    with open(SOURCE_JSON, 'r') as f:
        data = json.load(f)

    print(f"JSON file contains {len(data['images'])} images and {len(data['annotations'])} annotations")
    print("Sample of images:", data['images'][:2])
    print("Sample of annotations:", data['annotations'][:2])

In [8]:
from tqdm.auto import tqdm

def load_data_to_df():
    with open(SOURCE_JSON, 'r') as f:
        data = json.load(f)

    images = pd.DataFrame(data['images'])
    annotations = pd.DataFrame(data['annotations'])

    df = pd.merge(images, annotations, left_on='id', right_on='image_id', suffixes=('', '_ann'))

    # Drop duplicate columns and rename as needed
    df = df.drop(columns=['id_ann', 'image_id'])

    # Add the full image path
    tqdm.pandas(desc="Processing image paths")
    df['image'] = df['file_name'].progress_apply(lambda x: os.path.join(SOURCE_IMAGE_DIR, x))

    # Ensure all required columns are present
    for column in features.keys():
        if column not in df.columns:
            df[column] = None

    return df

In [9]:
import cv2
from tqdm.auto import tqdm

def create_hf_dataset(df, hf_dataset_name):
    # Convert 'image' column to image data
    def load_image(image_path):
        img = cv2.imread(image_path)
        if img is not None:
            return cv2.imencode('.jpg', img)[1].tobytes()
        return None

    tqdm.pandas(desc="Loading images")
    df['image'] = df['image'].progress_apply(load_image)

    dataset = Dataset.from_pandas(df, features=features)
    # dataset.push_to_hub(hf_dataset_name)
    # print(f"Dataset {hf_dataset_name} created and pushed to hub successfully.")

In [10]:
"""
from tqdm.auto import tqdm

# Execute the process
verify_source_data()
df = load_data_to_df()

# Add overall progress bar
with tqdm(total=3, desc="Creating dataset") as pbar:
    create_hf_dataset(df, HF_DATASET_NAME)
    pbar.update(1)

    # You can add more steps here if needed
    pbar.update(1)

    pbar.update(1)
  """

'\nfrom tqdm.auto import tqdm\n\n# Execute the process\nverify_source_data()\ndf = load_data_to_df()\n\n# Add overall progress bar\nwith tqdm(total=3, desc="Creating dataset") as pbar:\n    create_hf_dataset(df, HF_DATASET_NAME)\n    pbar.update(1)\n\n    # You can add more steps here if needed\n    pbar.update(1)\n\n    pbar.update(1)\n  '

In [11]:
import json
from collections import Counter

with open(SOURCE_JSON, 'r') as f:
    coco_data = json.load(f)

category_ids = [ann['category_id'] for ann in coco_data['annotations']]
id_counts = Counter(category_ids)

categories = {cat['id']: cat['name'] for cat in coco_data['categories']}

print("Category ID counts:", id_counts)
print("Category mappings:", categories)

Category ID counts: Counter({1: 771, 2: 731})
Category mappings: {0: 'Tumor', 1: '0', 2: '1'}


In [12]:
# print category object from SOURCE_JSON
with open(SOURCE_JSON, 'r') as f:
    coco_data = json.load(f)
    categories = coco_data['categories']
    for category in categories:
        print(category)

{'id': 0, 'name': 'Tumor', 'supercategory': 'none'}
{'id': 1, 'name': '0', 'supercategory': 'Tumor'}
{'id': 2, 'name': '1', 'supercategory': 'Tumor'}


In [13]:
def is_rectangle_segmentation(bbox, segmentation):
    x, y, w, h = bbox
    expected = [[x, y, x+w, y, x+w, y+h, x, y+h]]
    return segmentation == expected

sample_annotations = coco_data['annotations'][:100]
rectangle_count = sum(is_rectangle_segmentation(ann['bbox'], ann['segmentation'])
                      for ann in sample_annotations)

print(f"{rectangle_count}% of sampled annotations have rectangular segmentations")

0% of sampled annotations have rectangular segmentations


In [14]:
import json
from collections import Counter

with open(SOURCE_JSON, 'r') as f:
    coco_data = json.load(f)

category_ids = [ann['category_id'] for ann in coco_data['annotations']]
id_counts = Counter(category_ids)

categories = {cat['id']: cat['name'] for cat in coco_data['categories']}

print("Category ID counts:", id_counts)
print("Category mappings:", categories)

# Automated verification
expected_categories = {1: 'Non-Tumor', 2: 'Tumor'}
assert categories == expected_categories, f"Category mismatch. Expected {expected_categories}, got {categories}"
assert set(id_counts.keys()) == set(expected_categories.keys()), f"Unexpected category IDs found: {set(id_counts.keys())}"

Category ID counts: Counter({1: 771, 2: 731})
Category mappings: {0: 'Tumor', 1: '0', 2: '1'}


AssertionError: Category mismatch. Expected {1: 'Non-Tumor', 2: 'Tumor'}, got {0: 'Tumor', 1: '0', 2: '1'}

In [15]:
def is_rectangle_segmentation(bbox, segmentation):
    x, y, w, h = bbox
    expected = [[x, y, x+w, y, x+w, y+h, x, y+h]]
    return segmentation == expected

sample_annotations = coco_data['annotations'][:100]
rectangle_count = sum(is_rectangle_segmentation(ann['bbox'], ann['segmentation'])
                      for ann in sample_annotations)

rectangle_percentage = (rectangle_count / len(sample_annotations)) * 100
print(f"{rectangle_percentage:.2f}% of sampled annotations have rectangular segmentations")

# Automated verification
assert rectangle_percentage >= 99, f"Only {rectangle_percentage:.2f}% of segmentations are rectangular (expected ≥99%)"

0.00% of sampled annotations have rectangular segmentations


AssertionError: Only 0.00% of segmentations are rectangular (expected ≥99%)

In [16]:
import pandas as pd
from datasets import Dataset

# Create small subset
subset_images = coco_data['images'][:10]
subset_annotations = [ann for ann in coco_data['annotations']
                      if ann['image_id'] in [img['id'] for img in subset_images]]

# Create DataFrame
df_subset = pd.DataFrame({
    'file_name': [img['file_name'] for img in subset_images],
    'image_id': [img['id'] for img in subset_images],
    'category_id': [ann['category_id'] for ann in subset_annotations],
    'bbox': [ann['bbox'] for ann in subset_annotations],
    'segmentation': [ann['segmentation'] for ann in subset_annotations]
})

# Convert to Parquet
df_subset.to_parquet('test_subset.parquet')

# Load Parquet file
loaded_df = pd.read_parquet('test_subset.parquet')

# Verify data
assert len(loaded_df) == len(df_subset), "Row count mismatch"
for column in df_subset.columns:
    assert (loaded_df[column] == df_subset[column]).all(), f"Mismatch in column {column}"

print("Parquet conversion and loading test passed successfully")

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [17]:
from datasets import Dataset
import cv2
import matplotlib.pyplot as plt

def load_image_on_demand(example):
    image_path = os.path.join(SOURCE_IMAGE_DIR, example['file_name'])
    example['image'] = cv2.imread(image_path)
    return example

# Create dataset
dataset = Dataset.from_parquet('test_subset.parquet')

# Set transform for on-demand loading
dataset.set_transform(load_image_on_demand)

# Test accessing items
for i in range(3):
    item = dataset[i]
    assert 'image' in item, f"Image not loaded for item {i}"
    assert item['image'] is not None, f"Image is None for item {i}"
    print(f"Successfully loaded image for item {i}")

    # Visualize the image
    plt.figure(figsize=(10, 10))
    plt.imshow(cv2.cvtColor(item['image'], cv2.COLOR_BGR2RGB))
    plt.title(f"Image {i}: {item['file_name']}")
    plt.axis('off')
    plt.show()

Downloading and preparing dataset parquet/default to /root/.cache/huggingface/datasets/parquet/default-1b8d22c6376028e1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/parquet/default-1b8d22c6376028e1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


TypeError: join() argument must be str, bytes, or os.PathLike object, not 'list'