<a href="https://colab.research.google.com/github/donbcolab/AIE3/blob/main/cnmc_leukemia_2019_brain_tumor_hf_datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Brain Tumor Segmentation (Train Split): COCO Annotations to a Hugging Face Dataset

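*   Reads the COCO annotation file (`_annotations.coco.json`) and the `train` images from Google Drive.
*   Merges image metadata with the annotations into one table and pushes it to the Hugging Face Hub as a dataset.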



## Set up and Initial Checks

### Import Necessary Libraries and Define Constants

In [1]:
# Mount Google Drive into the Colab runtime; the source paths configured below
# all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install -qU pyarrow==14.0.1 requests==2.31.0

In [3]:
!pip install -qU datasets==2.11.0

In [4]:
import os
import json
import pandas as pd
from datasets import Dataset, Features, ClassLabel, Value, Sequence, Image

In [5]:
# Repository name the dataset is pushed to on the Hugging Face Hub.
HF_DATASET_NAME = "brain-tumor-image-dataset-semantic-segmentation"

# Directory on the mounted Drive holding the training images.
SOURCE_IMAGE_DIR = "/content/drive/MyDrive/kaggle/datasets/brain-tumor-image-dataset-semantic-segmentation/train"

# Annotation file describing those images (stored inside the image directory).
SOURCE_JSON = "/content/drive/MyDrive/kaggle/datasets/brain-tumor-image-dataset-semantic-segmentation/train/_annotations.coco.json"


### Initial Verification

In [6]:
# Column schema of the pushed dataset. load_data_to_df() uses these keys as the
# list of required columns, and create_hf_dataset() passes the schema to
# Dataset.from_pandas().
features = Features({
    # --- image-level fields -------------------------------------------------
    'file_name': Value('string'),
    'image': Image(),  # decode=True is the library default
    'id': Value('int64'),
    # --- annotation-level fields --------------------------------------------
    # NOTE(review): ClassLabel maps integer 0 -> 'tumor' and 1 -> 'normal';
    # confirm this order against the category list of the annotation file.
    'category_id': ClassLabel(names=['tumor', 'normal']),
    'bbox': Sequence(Value('float32'), length=4),
    # Variable-length list of variable-length float lists.
    'segmentation': Sequence(Sequence(Value('float32'))),
    'area': Value('float32'),
    'iscrowd': Value('int64'),
    # --- remaining image metadata -------------------------------------------
    'height': Value('int64'),
    'width': Value('int64'),
    'date_captured': Value('string'),
    'license': Value('int64'),
})

In [7]:
def verify_source_data(json_path=None, image_dir=None):
    """Sanity-check the annotation JSON and the image directory before loading.

    Prints how many images and annotations the JSON contains together with
    the first two entries of each list, and prints a warning when image
    files listed in the JSON are not present in ``image_dir``.

    Args:
        json_path: Path of the annotation JSON. Defaults to ``SOURCE_JSON``.
        image_dir: Directory holding the image files. Defaults to
            ``SOURCE_IMAGE_DIR``.

    Raises:
        FileNotFoundError: If the JSON file or the image directory is missing.
        KeyError: If the JSON lacks the 'images' or 'annotations' key.
    """
    # Fall back to the notebook-level configuration only when no override is given.
    json_path = SOURCE_JSON if json_path is None else json_path
    image_dir = SOURCE_IMAGE_DIR if image_dir is None else image_dir

    if not os.path.exists(json_path):
        raise FileNotFoundError(f"JSON file not found: {json_path}")
    if not os.path.isdir(image_dir):
        raise FileNotFoundError(f"Image directory not found: {image_dir}")

    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Fail with a descriptive message instead of a bare KeyError further down.
    missing_keys = [key for key in ('images', 'annotations') if key not in data]
    if missing_keys:
        raise KeyError(f"JSON file {json_path} is missing required keys: {missing_keys}")

    print(f"JSON file contains {len(data['images'])} images and {len(data['annotations'])} annotations")
    print("Sample of images:", data['images'][:2])
    print("Sample of annotations:", data['annotations'][:2])

    # Images that cannot be found would otherwise only surface later as empty
    # image entries, so report them up front (warning only, not an error).
    missing_files = [
        img.get('file_name', '')
        for img in data['images']
        if not os.path.isfile(os.path.join(image_dir, img.get('file_name', '')))
    ]
    if missing_files:
        print(
            f"WARNING: {len(missing_files)} of {len(data['images'])} image files listed in the JSON "
            f"were not found in {image_dir}; first few: {missing_files[:5]}"
        )

In [8]:
def load_data_to_df(json_path=None, image_dir=None, required_columns=None):
    """Load the annotation JSON into one DataFrame with a row per annotation.

    Image records are inner-joined to their annotations (image ``id`` ==
    annotation ``image_id``), so images without annotations produce no rows.
    The kept ``id`` column is the image id; the annotation's own id and the
    join key are dropped. The ``image`` column holds the full path of the
    image file.

    Args:
        json_path: Path of the annotation JSON. Defaults to ``SOURCE_JSON``.
        image_dir: Directory the image file names are joined to. Defaults to
            ``SOURCE_IMAGE_DIR``.
        required_columns: Column names that must exist in the result; missing
            ones are added and filled with ``None``. Defaults to the keys of
            the notebook-level ``features`` schema.

    Returns:
        pandas.DataFrame: The merged table. It is empty (but still has the
        required columns) when the JSON lists no images or no annotations.
    """
    # Resolve the notebook-level defaults lazily so explicit arguments work
    # without those globals being defined.
    json_path = SOURCE_JSON if json_path is None else json_path
    image_dir = SOURCE_IMAGE_DIR if image_dir is None else image_dir
    if required_columns is None:
        required_columns = features.keys()

    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    images = pd.DataFrame(data['images'])
    annotations = pd.DataFrame(data['annotations'])

    if images.empty or annotations.empty:
        # An empty list yields a frame without the join columns, which would
        # make the merge fail with a KeyError; nothing can be joined anyway.
        df = pd.DataFrame(columns=['file_name'])
    else:
        df = pd.merge(images, annotations, left_on='id', right_on='image_id', suffixes=('', '_ann'))

        # Drop the annotation id and the join key; the image 'id' column stays.
        df = df.drop(columns=['id_ann', 'image_id'], errors='ignore')

    # Add the full image path
    df['image'] = [os.path.join(image_dir, name) for name in df['file_name']]

    # Ensure all required columns are present
    for column in required_columns:
        if column not in df.columns:
            df[column] = None

    return df

In [9]:
import cv2

def create_hf_dataset(df, hf_dataset_name):
    """Build a Hugging Face dataset from ``df`` and push it to the Hub.

    The ``image`` column of ``df`` is expected to hold image file paths. Each
    path is replaced by the file's original bytes, so the uploaded images are
    not re-compressed. Files that OpenCV cannot read are stored as ``None``
    and reported in a warning. The caller's DataFrame is left unmodified, so
    the function can be re-run on the same frame.

    Args:
        df: DataFrame whose columns match the notebook-level ``features``
            schema, with file paths in the ``image`` column.
        hf_dataset_name: Name of the Hub repository to push to.
    """
    def load_image(image_path):
        # cv2.imread is used purely as a readability check: it yields None for
        # a missing or undecodable file.
        if cv2.imread(image_path) is None:
            return None
        # Keep the original encoded bytes instead of re-encoding to JPEG.
        with open(image_path, 'rb') as f:
            return f.read()

    image_bytes = df['image'].apply(load_image)

    unreadable = int(image_bytes.isna().sum())
    if unreadable:
        print(f"WARNING: {unreadable} of {len(df)} images could not be read and will be stored as None")

    # assign() returns a new frame; the caller's 'image' column keeps its paths.
    prepared = df.assign(image=image_bytes)

    dataset = Dataset.from_pandas(prepared, features=features)
    dataset.push_to_hub(hf_dataset_name)
    print(f"Dataset {hf_dataset_name} created and pushed to hub successfully.")

In [10]:
# Execute the process
verify_source_data()
df = load_data_to_df()
create_hf_dataset(df, HF_DATASET_NAME)

JSON file contains 5 images and 5 annotations
Sample of images: [{'id': 0, 'license': 1, 'file_name': '2256_jpg.rf.3afd7903eaf3f3c5aa8da4bbb928bc19.jpg', 'height': 640, 'width': 640, 'date_captured': '2023-08-19T04:37:54+00:00'}, {'id': 1, 'license': 1, 'file_name': '2871_jpg.rf.3b6eadfbb369abc2b3bcb52b406b74f2.jpg', 'height': 640, 'width': 640, 'date_captured': '2023-08-19T04:37:54+00:00'}]
Sample of annotations: [{'id': 0, 'image_id': 0, 'category_id': 1, 'bbox': [145, 239, 168.75, 162.5], 'area': 27421.875, 'segmentation': [[313.75, 238.75, 145, 238.75, 145, 401.25, 313.75, 401.25, 313.75, 238.75]], 'iscrowd': 0}, {'id': 1, 'image_id': 1, 'category_id': 1, 'bbox': [194, 176, 148.75, 233.75], 'area': 34770.313, 'segmentation': [[342.5, 176.25, 193.75, 176.25, 193.75, 410, 342.5, 410, 342.5, 176.25]], 'iscrowd': 0}]


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/535 [00:00<?, ?B/s]



Dataset brain-tumor-image-dataset-semantic-segmentation created and pushed to hub successfully.
