In [1]:
import pandas as pd

import torch

from transformers import AutoImageProcessor
from datasets import Dataset, DatasetDict, Features, Image, Sequence, Value
import albumentations as A
from albumentations.pytorch import ToTensorV2

import os

In [None]:
# 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

In [3]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
data_path = "output/"

In [4]:
train_df = pd.read_csv("train_labels.csv")
valid_df = pd.read_csv("valid_labels.csv")
test_df = pd.read_csv("test_labels.csv")

In [5]:
label_columns = ['Crack', 'Red-Dots', 'Toothmark']

In [6]:
# Convert binary-encoded features to array of labels
def binary_to_labels(row, label_cols):
    return [float(row[col]) for col in label_cols]

for df in [train_df, valid_df, test_df]:
    df['labels'] = df.apply(lambda row: binary_to_labels(row, label_columns), axis=1)
    df.drop(columns=label_columns, inplace=True)

In [7]:
# Append image path to filename
def add_image_path(df, split):
    df['image'] = data_path + split + "_segmented/" + df['filename']
    # Check if file exists and remove row if it doesn't
    df = df[df['image'].apply(os.path.exists)]
    return df[['image', 'labels']]

train_dataset_df = add_image_path(train_df, "train")
valid_dataset_df = add_image_path(valid_df, "valid")
test_dataset_df = add_image_path(test_df, "test")

In [8]:
train_dataset_df

Unnamed: 0,image,labels


In [9]:
num_classes = 3
class_names = ['Crack', 'Red-Dots', 'Toothmark']

# Labels is an array of floats
features = Features({
    'image': Image(),
    'labels': Sequence(feature=Value('float32'), length=num_classes)
})

In [10]:
print("Train labels example:", train_dataset_df.iloc[0])
print("Validation labels example:", valid_dataset_df.iloc[0])
print("Test labels example:", test_dataset_df.iloc[0])

IndexError: single positional indexer is out-of-bounds

In [11]:
train_dataset = Dataset.from_pandas(train_dataset_df, features=features, preserve_index=False)
valid_dataset = Dataset.from_pandas(valid_dataset_df, features=features, preserve_index=False)
test_dataset = Dataset.from_pandas(test_dataset_df, features=features, preserve_index=False)

In [12]:
print(len(train_dataset))
print(len(valid_dataset))
print(len(test_dataset))

619
181
94


In [13]:
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': valid_dataset,
    'test': test_dataset
})

In [14]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['image', 'labels'],
        num_rows: 619
    })
    validation: Dataset({
        features: ['image', 'labels'],
        num_rows: 181
    })
    test: Dataset({
        features: ['image', 'labels'],
        num_rows: 94
    })
})

In [15]:
# Modify depending on model
image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-384")

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


In [None]:
train_transforms = A.Compose([
    A.RandomRotate90(),
    A.Flip(),
    A.GaussianBlur(p=0.5),
    A.RandomBrightnessContrast(p=0.5),
    A.HueSaturationValue(p=0.5),
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ToTensorV2(),
])

val_test_transforms = A.Compose([
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ToTensorV2(),
])


In [16]:
def preprocess_function(examples, is_train=False):
    images = [np.array(img) for img in examples['image']]
    
    if is_train:
        augmented = [train_transforms(image=img)['image'] for img in images]
    else:
        augmented = [val_test_transforms(image=img)['image'] for img in images]
    
    labels = torch.tensor(examples['labels'], dtype=torch.float)
    
    return {
        'pixel_values': torch.stack(augmented),
        'labels': labels
    }


In [17]:
dataset_dict['train'] = dataset_dict['train'].map(
    lambda examples: preprocess_function(examples, is_train=True),
    batched=True,
    remove_columns=['image']
)

for split in ['validation', 'test']:
    dataset_dict[split] = dataset_dict[split].map(
        preprocess_function,
        batched=True,
        remove_columns=['image']
    )


Map:   0%|          | 0/619 [00:00<?, ? examples/s]

Map:   0%|          | 0/181 [00:00<?, ? examples/s]

Map:   0%|          | 0/94 [00:00<?, ? examples/s]

In [18]:
dataset_dict.push_to_hub("e1010101/tongue-images-384-segmented", private=True)

Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/207 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/206 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/206 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/181 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/94 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/e1010101/tongue-images-384-segmented/commit/a2775d05c03ef313d1376c8085f1fb609ee9da4f', commit_message='Upload dataset', commit_description='', oid='a2775d05c03ef313d1376c8085f1fb609ee9da4f', pr_url=None, pr_revision=None, pr_num=None)