<a href="https://colab.research.google.com/github/donbcolab/composable_vlms/blob/main/roboflow_bccd_hf_load.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q roboflow datasets datasets[vision]

In [None]:
from google.colab import userdata
from roboflow import Roboflow

# The API key is read through google.colab's userdata rather than being hardcoded
ROBOFLOW_API_KEY = userdata.get('ROBOFLOW_API_KEY')
rf = Roboflow(api_key=ROBOFLOW_API_KEY)

# Version 2 of the "bccd-ouzjz" project in the "roboflow-100" workspace
project = rf.workspace("roboflow-100").project("bccd-ouzjz")
version = project.version(2)

# (export format, target folder) pairs, downloaded in this order.
# `dataset` ends up bound to the last download (florence2-od).
export_targets = (
    ("coco", "/content/coco/"),
    ("paligemma", "/content/paligemma/"),
    ("florence2-od", "/content/florence2_od/"),
)
for model_format, location in export_targets:
    dataset = version.download(model_format=model_format, location=location, overwrite=True)

In [None]:
# for each of the dataset model formats (coco / paligemma / florence2_od)
# do a "ls -l" of the json files
!ls -l coco/train/*.json
!ls -l coco/test/*.json
!ls -l coco/valid/*.json

!ls -l florence2_od/test/*.json*
!ls -l florence2_od/train/*.json*
!ls -l florence2_od/valid/*.json*

!ls -l paligemma/dataset/*.json*

In [None]:
import os

def count_jpg_files(directory):
    """Return how many entries directly inside ``directory`` have a name ending in '.jpg'.

    The match is case-sensitive and does not descend into sub-directories.
    ``os.listdir`` raises ``FileNotFoundError`` when ``directory`` does not exist.
    """
    jpg_total = 0
    for entry_name in os.listdir(directory):
        if entry_name.endswith('.jpg'):
            jpg_total += 1
    return jpg_total

# Image folders created by the three Roboflow exports
directories = [
    "/content/coco/train", "/content/coco/test", "/content/coco/valid",
    "/content/florence2_od/test", "/content/florence2_od/train", "/content/florence2_od/valid",
    "/content/paligemma/dataset",
]

# Report the .jpg count per folder; a missing folder is reported instead of raising
for directory in directories:
    try:
        count = count_jpg_files(directory)
    except FileNotFoundError:
        print(f"{directory}: Directory not found")
    else:
        print(f"{directory}: {count} files")


In [None]:
import os
import json
from datasets import Dataset, Features, Image, Sequence, Value, ClassLabel, DatasetDict

def load_coco_annotations(annotation_file, image_dir):
    """Read a COCO annotation file and build one flat record per image.

    NOTE: a later cell in this notebook defines another function with this same
    name and a different record layout; once that cell has run, this version is
    no longer reachable by name.

    Args:
        annotation_file: Path to a JSON file with top-level 'images',
            'categories' and 'annotations' lists.
        image_dir: Directory that is joined with each image's 'file_name'.

    Returns:
        A list of dicts, one per entry in 'images' (in file order), with keys
        'image_id' (the image's 'id'), 'image' (joined path), 'width', 'height'
        and 'objects'. 'objects' holds parallel lists 'id', 'area', 'bbox' and
        'category' (the category *name*); the lists are empty for an image
        that has no annotations.

    Raises:
        KeyError: If an annotation's 'category_id' is not listed in 'categories'
            or an expected key is missing.
    """
    # Explicit encoding so the result does not depend on the platform locale.
    with open(annotation_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Category id -> category name
    id_to_category = {cat['id']: cat['name'] for cat in data['categories']}

    # Group annotations by the image they belong to
    image_annotations = {}
    for ann in data['annotations']:
        image_annotations.setdefault(ann['image_id'], []).append(ann)

    # One record per image; images without annotations get empty lists
    dataset_entries = []
    for img in data['images']:
        anns = image_annotations.get(img['id'], [])
        dataset_entries.append({
            'image_id': img['id'],
            'image': os.path.join(image_dir, img['file_name']),
            'width': img['width'],
            'height': img['height'],
            'objects': {
                'id': [ann['id'] for ann in anns],
                'area': [ann['area'] for ann in anns],
                'bbox': [ann['bbox'] for ann in anns],
                'category': [id_to_category[ann['category_id']] for ann in anns],
            },
        })

    return dataset_entries

# Parse each split's COCO file into per-image records
train_data = load_coco_annotations(
    annotation_file='/content/coco/train/_annotations.coco.json',
    image_dir='/content/coco/train',
)
test_data = load_coco_annotations(
    annotation_file='/content/coco/test/_annotations.coco.json',
    image_dir='/content/coco/test',
)
valid_data = load_coco_annotations(
    annotation_file='/content/coco/valid/_annotations.coco.json',
    image_dir='/content/coco/valid',
)

# Schema of the per-object columns; category names are encoded as class labels.
# NOTE(review): 'area' is int64 here, while the later COCO cell in this notebook
# declares it float32 — confirm which one matches the exported values.
object_columns = {
    'id': Sequence(Value('int64')),
    'area': Sequence(Value('int64')),
    'bbox': Sequence(Sequence(Value('float32'), length=4)),
    'category': Sequence(ClassLabel(names=['RBC', 'WBC', 'Platelets'])),
}

# Full record schema
features = Features({
    'image_id': Value('int64'),
    'image': Image(),
    'width': Value('int32'),
    'height': Value('int32'),
    'objects': object_columns,
})

# One Dataset per split, all sharing the same schema
coco_train_dataset = Dataset.from_list(train_data, features=features)
coco_test_dataset = Dataset.from_list(test_data, features=features)
coco_valid_dataset = Dataset.from_list(valid_data, features=features)

# Bundle the splits; Roboflow's "valid" folder becomes the 'validation' split
coco_ds = DatasetDict({
    'train': coco_train_dataset,
    'test': coco_test_dataset,
    'validation': coco_valid_dataset,
})

In [None]:
# Upload every split to the Hub repository
coco_ds.push_to_hub("dwb2023/bccd-coco")

# Summarise the dataset: split names, per-split sizes and the schema
print(
    f"Dataset splits: {coco_ds.keys()}",
    f"Number of training examples: {len(coco_ds['train'])}",
    f"Number of testing examples: {len(coco_ds['test'])}",
    f"Number of validation examples: {len(coco_ds['validation'])}",
    f"Features: {coco_ds['train'].features}",
    sep="\n",
)

In [None]:
import os
import json
from datasets import Dataset, Features, Image, Value, Sequence, DatasetDict

def load_coco_annotations(annotation_file, image_dir):
    """Read a COCO annotation file and build one nested record per image.

    This definition replaces the same-named function from the earlier cell;
    here 'image_id' is the image *file name* and the original COCO structures
    are kept under 'annotations'.

    Args:
        annotation_file: Path to a JSON file with top-level 'images',
            'categories' and 'annotations' lists.
        image_dir: Directory that is joined with each image's 'file_name'.

    Returns:
        A list of dicts (one per image) with keys 'image_id', 'image',
        'width', 'height' and 'annotations'. The nested 'annotations' dict
        holds the raw image dict under 'image', the full category table as
        parallel lists under 'categories', and that image's annotations as
        parallel lists under 'annotations'.
    """
    def _column(records, key):
        # Pull one field out of every record, preserving order
        return [record[key] for record in records]

    with open(annotation_file, 'r') as f:
        data = json.load(f)

    # image id -> {'image': <image dict>, 'annotations': [<annotation dicts>]}
    grouped = {}
    for img in data['images']:
        grouped[img['id']] = {'image': img, 'annotations': []}
    for ann in data['annotations']:
        grouped[ann['image_id']]['annotations'].append(ann)

    category_keys = ('id', 'name', 'supercategory')
    annotation_keys = ('id', 'image_id', 'category_id', 'bbox', 'area', 'segmentation', 'iscrowd')

    dataset_entries = []
    for bundle in grouped.values():
        image_info = bundle['image']
        anns = bundle['annotations']
        dataset_entries.append({
            'image_id': image_info['file_name'],
            'image': os.path.join(image_dir, image_info['file_name']),
            'width': image_info['width'],
            'height': image_info['height'],
            'annotations': {
                'image': image_info,
                # The category table is repeated in full on every record
                'categories': {key: _column(data['categories'], key) for key in category_keys},
                'annotations': {key: _column(anns, key) for key in annotation_keys},
            },
        })

    return dataset_entries

# Parse each split's COCO file into nested per-image records
train_data = load_coco_annotations(
    annotation_file='/content/coco/train/_annotations.coco.json',
    image_dir='/content/coco/train',
)
test_data = load_coco_annotations(
    annotation_file='/content/coco/test/_annotations.coco.json',
    image_dir='/content/coco/test',
)
valid_data = load_coco_annotations(
    annotation_file='/content/coco/valid/_annotations.coco.json',
    image_dir='/content/coco/valid',
)

# Schema of the raw COCO image dict kept on every record
image_columns = {
    'id': Value('int64'),
    'license': Value('int64'),
    'file_name': Value('string'),
    'height': Value('int64'),
    'width': Value('int64'),
    'date_captured': Value('string'),
}

# Schema of the category table (parallel lists)
category_columns = {
    'id': Sequence(Value('int64')),
    'name': Sequence(Value('string')),
    'supercategory': Sequence(Value('string')),
}

# Schema of the per-image annotation lists (parallel lists)
annotation_columns = {
    'id': Sequence(Value('int64')),
    'image_id': Sequence(Value('int64')),
    'category_id': Sequence(Value('int64')),
    'bbox': Sequence(Sequence(Value('float32'), length=4)),
    'area': Sequence(Value('float32')),
    'segmentation': Sequence(Sequence(Value('float32'))),
    'iscrowd': Sequence(Value('int64')),
}

# Full record schema; here 'image_id' is the file name, hence a string
features = Features({
    'image_id': Value('string'),
    'image': Image(),
    'width': Value('int32'),
    'height': Value('int32'),
    'annotations': {
        'image': image_columns,
        'categories': category_columns,
        'annotations': annotation_columns,
    },
})

# One Dataset per split, all sharing the same schema
coco_train_dataset = Dataset.from_list(train_data, features=features)
coco_test_dataset = Dataset.from_list(test_data, features=features)
coco_valid_dataset = Dataset.from_list(valid_data, features=features)

# Bundle the splits; Roboflow's "valid" folder becomes the 'validation' split
coco_dataset = DatasetDict({
    'train': coco_train_dataset,
    'test': coco_test_dataset,
    'validation': coco_valid_dataset,
})

In [None]:
# Upload every split to the Hub repository
coco_dataset.push_to_hub("dwb2023/roboflow100-bccd-coco")

# Summarise the dataset: split names, per-split sizes and the schema
print(
    f"Dataset splits: {coco_dataset.keys()}",
    f"Number of training examples: {len(coco_dataset['train'])}",
    f"Number of testing examples: {len(coco_dataset['test'])}",
    f"Number of validation examples: {len(coco_dataset['validation'])}",
    f"Features: {coco_dataset['train'].features}",
    sep="\n",
)

# Show the first training record as a sample
print("\nSample entry:", coco_dataset['train'][0], sep="\n")

In [None]:
import os
import json
from datasets import Dataset, Features, Image, Value, DatasetDict

def load_florence2_annotations(annotation_file, image_dir):
    """Read a JSON Lines annotation file and build one record per line.

    Args:
        annotation_file: Path to a file holding one JSON object per line; each
            object must have an 'image' key with the image file name.
        image_dir: Directory that is joined with each 'image' file name.

    Returns:
        A list of dicts with keys 'image_id' (the file name), 'image' (the
        joined path) and 'annotations' (the parsed JSON object, unchanged).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a parsed object has no 'image' key.
    """
    dataset_entries = []
    # Explicit encoding so the result does not depend on the platform locale.
    with open(annotation_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing empty line) are not records;
            # json.loads would raise on them.
            if not line.strip():
                continue
            ann = json.loads(line)
            dataset_entries.append({
                'image_id': ann['image'],
                'image': os.path.join(image_dir, ann['image']),
                'annotations': ann,  # Keep all original fields
            })
    return dataset_entries

# Parse each split's JSONL file into per-image records
train_data = load_florence2_annotations(
    annotation_file='/content/florence2_od/train/annotations.jsonl',
    image_dir='/content/florence2_od/train',
)
test_data = load_florence2_annotations(
    annotation_file='/content/florence2_od/test/annotations.jsonl',
    image_dir='/content/florence2_od/test',
)
valid_data = load_florence2_annotations(
    annotation_file='/content/florence2_od/valid/annotations.jsonl',
    image_dir='/content/florence2_od/valid',
)

# Schema of the original JSONL object kept on every record
annotation_columns = {
    'image': Value('string'),
    'prefix': Value('string'),
    'suffix': Value('string'),
}

# Full record schema
features = Features({
    'image_id': Value('string'),
    'image': Image(),
    'annotations': annotation_columns,
})

# One Dataset per split, all sharing the same schema
florence2_train_dataset = Dataset.from_list(train_data, features=features)
florence2_test_dataset = Dataset.from_list(test_data, features=features)
florence2_valid_dataset = Dataset.from_list(valid_data, features=features)

# Bundle the splits; Roboflow's "valid" folder becomes the 'validation' split
florence2_dataset = DatasetDict({
    'train': florence2_train_dataset,
    'test': florence2_test_dataset,
    'validation': florence2_valid_dataset,
})

In [None]:
# Upload every split to the Hub repository
florence2_dataset.push_to_hub("dwb2023/roboflow100-bccd-florence2")

# Summarise the dataset: split names, per-split sizes and the schema
print(
    f"Dataset splits: {florence2_dataset.keys()}",
    f"Number of training examples: {len(florence2_dataset['train'])}",
    f"Number of testing examples: {len(florence2_dataset['test'])}",
    f"Number of validation examples: {len(florence2_dataset['validation'])}",
    f"Features: {florence2_dataset['train'].features}",
    sep="\n",
)

# Show the first training record as a sample
print("\nSample entry:", florence2_dataset['train'][0], sep="\n")

In [None]:
# Build the PaliGemma DatasetDict before pushing it: no other cell in this
# notebook defines `paligemma_dataset`, so on a fresh kernel this cell would
# otherwise stop with a NameError.
import os
import json
from datasets import Dataset, Features, Image, Value, DatasetDict


def load_paligemma_annotations(annotation_file, image_dir):
    """Read a JSON Lines annotation file and build one record per line.

    Args:
        annotation_file: Path to a file holding one JSON object per line with
            'image', 'prefix' and 'suffix' keys.
        image_dir: Directory that is joined with each 'image' file name.

    Returns:
        A list of dicts with keys 'image_id' (the file name), 'image' (the
        joined path) and 'annotations' (the 'image'/'prefix'/'suffix' fields).
    """
    dataset_entries = []
    with open(annotation_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines are not records; json.loads would raise on them
            if not line.strip():
                continue
            ann = json.loads(line)
            dataset_entries.append({
                'image_id': ann['image'],
                'image': os.path.join(image_dir, ann['image']),
                'annotations': {key: ann[key] for key in ('image', 'prefix', 'suffix')},
            })
    return dataset_entries


# All PaliGemma images and annotation files live in this single folder
paligemma_dir = '/content/paligemma/dataset'

# NOTE(review): the per-split file names below assume Roboflow's PaliGemma
# export layout (_annotations.<split>.jsonl) — confirm against the `ls` output
# of /content/paligemma/dataset earlier in this notebook.
paligemma_split_files = {
    'train': '_annotations.train.jsonl',
    'test': '_annotations.test.jsonl',
    'validation': '_annotations.valid.jsonl',
}

# Same record layout as the Florence-2 dataset built earlier in this notebook
paligemma_features = Features({
    'image_id': Value('string'),
    'image': Image(),
    'annotations': {
        'image': Value('string'),
        'prefix': Value('string'),
        'suffix': Value('string')
    }
})

paligemma_dataset = DatasetDict({
    split_name: Dataset.from_list(
        load_paligemma_annotations(os.path.join(paligemma_dir, file_name), paligemma_dir),
        features=paligemma_features,
    )
    for split_name, file_name in paligemma_split_files.items()
})

# Push to Hub
paligemma_dataset.push_to_hub("dwb2023/roboflow100-bccd-paligemma")

# Print some information about the dataset
print(f"Dataset splits: {paligemma_dataset.keys()}")
print(f"Number of training examples: {len(paligemma_dataset['train'])}")
print(f"Number of testing examples: {len(paligemma_dataset['test'])}")
print(f"Number of validation examples: {len(paligemma_dataset['validation'])}")
print(f"Features: {paligemma_dataset['train'].features}")

# Print a sample entry
print("\nSample entry:")
print(paligemma_dataset['train'][0])

In [None]:
# Search for same image across datasets
image_id_to_find = 'BloodImage_00343_jpg.rf.d8c56063ce5e40c50efb00a7e0c83c3b.jpg'

# Function to find a record by image_id
def find_record_by_image_id(dataset, image_id, split='train'):
    """Return the first record of one split whose 'image_id' equals ``image_id``.

    Only the 'image_id' column is scanned; the full record is fetched just for
    the matching row, so non-matching rows are never materialised.

    Args:
        dataset: Mapping from split name to a split that supports column
            access by name (``split['image_id']``) and row access by integer
            index (``split[i]``).
        image_id: Value to compare against each row's 'image_id'. It must have
            the same type as the stored ids, otherwise nothing can match.
        split: Name of the split to search; defaults to 'train'.

    Returns:
        The matching record, or None when no row matches.
    """
    split_rows = dataset[split]
    for row_index, candidate in enumerate(split_rows['image_id']):
        if candidate == image_id:
            return split_rows[row_index]
    return None

In [None]:
# Find the COCO record
# coco_ds stores the numeric COCO image id in 'image_id' (the other datasets in
# this notebook store the file name), so comparing it with the file name can
# never match. Translate the file name into its numeric id via the train
# annotation file first.
import json

with open('/content/coco/train/_annotations.coco.json', 'r', encoding='utf-8') as f:
    coco_train_images = json.load(f)['images']

coco_image_id = next(
    (img['id'] for img in coco_train_images if img['file_name'] == image_id_to_find),
    None,
)

# Skip the dataset scan when the file name is not part of the train split
record = (
    find_record_by_image_id(coco_ds, coco_image_id)
    if coco_image_id is not None
    else None
)

# Display the record
if record:
    print(f"Record found: {record}")
else:
    print("Record not found")

In [None]:
# Look the image up in the Roboflow COCO dataset (its 'image_id' is the file name)
record = find_record_by_image_id(coco_dataset, image_id_to_find)

# Show the record, or say that nothing matched
print(f"Record found: {record}" if record else "Record not found")

In [None]:
# Look the image up in the Roboflow Florence-2 dataset
record = find_record_by_image_id(florence2_dataset, image_id_to_find)

# Show the record, or say that nothing matched
print(f"Record found: {record}" if record else "Record not found")

In [None]:
# Look the image up in the Roboflow PaliGemma dataset
record = find_record_by_image_id(paligemma_dataset, image_id_to_find)

# Show the record, or say that nothing matched
print(f"Record found: {record}" if record else "Record not found")