In [1]:
import pandas as pd

import torch

from transformers import AutoImageProcessor
from datasets import Dataset, DatasetDict, Features, Image, Sequence, Value

In [2]:
# Authenticate with the Hugging Face Hub through the interactive login widget.
# NOTE(review): presumably required for the private push_to_hub call at the
# end of the notebook — confirm.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
# Directory prefix for the CSV files and image files. Paths are built by plain
# string concatenation, so the trailing slash is required.
data_path = "Data/"

In [4]:
# Read the three split CSV files located under data_path, in
# train / valid / test order.
train_df, valid_df, test_df = (
    pd.read_csv(data_path + csv_name)
    for csv_name in ("train.csv", "valid.csv", "test.csv")
)

In [5]:
# Names of the per-class label columns in the split frames. Their order
# determines the order of the values inside each label vector.
label_columns = ['Crack', 'Red-Dots', 'Toothmark']

In [6]:
# Turn the binary-encoded label columns of one row into a list of float labels
def binary_to_labels(row, label_cols):
    """Return the values of ``row`` at ``label_cols`` as a list of floats.

    Args:
        row: Record indexable by column name (e.g. a DataFrame row).
        label_cols: Iterable of column names, in the desired output order.

    Returns:
        A list with one ``float`` per entry of ``label_cols``.
    """
    label_values = []
    for column in label_cols:
        label_values.append(float(row[column]))
    return label_values

# Collapse the per-class label columns of every split into a single `labels`
# column (a list of floats per row), then remove the source columns.
for df in [train_df, valid_df, test_df]:
    # The frames are modified in place, so re-running this cell after a
    # successful pass would raise a KeyError on the already-dropped label
    # columns. Skip frames that have been converted to keep the cell idempotent.
    already_converted = (
        'labels' in df.columns
        and not any(col in df.columns for col in label_columns)
    )
    if already_converted:
        continue
    df['labels'] = df.apply(lambda row: binary_to_labels(row, label_columns), axis=1)
    df.drop(columns=label_columns, inplace=True)

In [7]:
# Build the image path column from the file names
def add_image_path(df):
    """Add an ``image`` column (``data_path`` + ``filename``) and select the dataset columns.

    The ``image`` column is written onto ``df`` itself, so the caller's frame
    gains that column as a side effect.

    Args:
        df: DataFrame providing ``filename`` and ``labels`` columns.

    Returns:
        The ``image`` and ``labels`` columns of ``df``, in that order.
    """
    kept_columns = ['image', 'labels']
    df['image'] = data_path + df['filename']
    return df[kept_columns]

# Run every split through add_image_path, keeping the train / valid / test order.
train_dataset_df, valid_dataset_df, test_dataset_df = map(
    add_image_path, (train_df, valid_df, test_df)
)

In [8]:
# Preview the prepared training frame: one image path and one label vector per row.
train_dataset_df

Unnamed: 0,image,labels
0,Data/CC_211_jpg.rf.31153880eb2dbd26c2a84ec2063...,"[0.0, 0.0, 1.0]"
1,Data/CC_1873_jpg.rf.30bd76e7beefb2ee1bb72b6c33...,"[1.0, 1.0, 1.0]"
2,Data/CC_1584_jpg.rf.3074bed75cb82a848747c8534c...,"[0.0, 1.0, 0.0]"
3,Data/CC_1130_jpg.rf.321c01c097f04c8f20de88d709...,"[0.0, 1.0, 0.0]"
4,Data/CC_526_jpg.rf.33619b9cdb0f9b4fd0725af2847...,"[0.0, 1.0, 1.0]"
...,...,...
741,Data/CC_162_jpg.rf.fc096c7c27c0ae53b58160ddad5...,"[1.0, 0.0, 1.0]"
742,Data/CC_1059_jpg.rf.fd96a222061b07c745d63ceeb7...,"[1.0, 1.0, 1.0]"
743,Data/CC_1035_jpg.rf.fb05385f3eb3212f397c42d36a...,"[0.0, 1.0, 1.0]"
744,Data/CC_1355_jpg.rf.ffb1d2f17400373be04d0afc8f...,"[0.0, 1.0, 1.0]"


In [9]:
# Derive the class metadata from label_columns instead of repeating it by
# hand, so the declared label length always matches the vectors that
# binary_to_labels builds from those same columns.
class_names = list(label_columns)
num_classes = len(class_names)

# Dataset schema: `image` uses the datasets Image feature, `labels` is a
# fixed-length array of float32 values (one per class).
features = Features({
    'image': Image(),
    'labels': Sequence(feature=Value('float32'), length=num_classes)
})

In [10]:
# Sanity check: print the first row (image path and label vector) of each split.
print("Train labels example:", train_dataset_df.iloc[0])
print("Validation labels example:", valid_dataset_df.iloc[0])
print("Test labels example:", test_dataset_df.iloc[0])

Train labels example: image     Data/CC_211_jpg.rf.31153880eb2dbd26c2a84ec2063...
labels                                      [0.0, 0.0, 1.0]
Name: 0, dtype: object
Validation labels example: image     Data/CC_1930_jpg.rf.c3eed44b2a8b24b72f69fc19ba...
labels                                      [1.0, 1.0, 1.0]
Name: 0, dtype: object
Test labels example: image     Data/CC_1401_jpg.rf.10a517516c5386991c4133e502...
labels                                      [0.0, 1.0, 0.0]
Name: 0, dtype: object


In [11]:
# Convert each split frame into a Dataset with the declared schema; the pandas
# index is not carried over as a column.
train_dataset, valid_dataset, test_dataset = (
    Dataset.from_pandas(frame, features=features, preserve_index=False)
    for frame in (train_dataset_df, valid_dataset_df, test_dataset_df)
)

In [12]:
# Bundle the three splits under the names 'train', 'validation' and 'test'.
dataset_dict = DatasetDict(dict(
    train=train_dataset,
    validation=valid_dataset,
    test=test_dataset,
))

In [13]:
# Show the split names, feature names and row counts.
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['image', 'labels'],
        num_rows: 746
    })
    validation: Dataset({
        features: ['image', 'labels'],
        num_rows: 214
    })
    test: Dataset({
        features: ['image', 'labels'],
        num_rows: 106
    })
})

In [14]:
# Load the image processor of the target model checkpoint; it is used by
# preprocess_function to produce `pixel_values`.
# Modify depending on model
image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-384")

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


In [15]:
def preprocess_function(examples):
    """Convert a batch of examples into model-ready tensors.

    Args:
        examples: Batch dict with an ``image`` list and a ``labels`` list of
            per-class float vectors. The images are assumed to be PIL images
            (as decoded by the ``Image`` feature) — confirm if the dataset
            format is changed.

    Returns:
        dict with ``pixel_values`` (output of ``image_processor``) and
        ``labels`` (float tensor built from ``examples['labels']``).
    """
    # Normalise every image to 3-channel RGB before processing: grayscale,
    # palette or RGBA files would otherwise reach the processor with a
    # different channel count. RGB images keep their pixel values.
    rgb_images = [img.convert('RGB') for img in examples['image']]
    images = image_processor(rgb_images, return_tensors='pt')
    labels = torch.tensor(examples['labels'], dtype=torch.float)

    return {
        'pixel_values': images['pixel_values'],
        'labels': labels
    }

In [16]:
# Apply preprocess_function to every split in batches; the mapped result
# replaces dataset_dict.
dataset_dict = dataset_dict.map(preprocess_function, batched=True)

Map:   0%|          | 0/746 [00:00<?, ? examples/s]

Map:   0%|          | 0/214 [00:00<?, ? examples/s]

Map:   0%|          | 0/106 [00:00<?, ? examples/s]

In [17]:
# Upload all splits to the Hugging Face Hub as a private dataset repository.
dataset_dict.push_to_hub("e1010101/tongue-images-384-no-augmentation", private=True)

Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/214 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/106 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/e1010101/tongue-images-384-no-augmentation/commit/5fdf03fe4958f3027cc87abce34f0712ecdafc99', commit_message='Upload dataset', commit_description='', oid='5fdf03fe4958f3027cc87abce34f0712ecdafc99', pr_url=None, pr_revision=None, pr_num=None)