In [1]:
import pandas as pd

import torch

from transformers import AutoImageProcessor
from datasets import Dataset, DatasetDict, Features, Image, Sequence, Value

import os

In [2]:
from huggingface_hub import notebook_login

# Authenticate interactively through the login widget.
# SECURITY: never paste an access token into the notebook source or a comment.
# Any token that has appeared in a saved/shared copy of this notebook must be
# treated as compromised and revoked in the Hugging Face account settings.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [17]:
# Root directory containing the split CSVs and the image files. Other cells
# build paths from it by string concatenation, so the trailing slash matters.
data_path = "Data/"

In [18]:
# Load the train/validation/test tables. os.path.join keeps the paths valid
# whether or not `data_path` ends with a path separator.
train_df = pd.read_csv(os.path.join(data_path, "train.csv"))
valid_df = pd.read_csv(os.path.join(data_path, "valid.csv"))
test_df = pd.read_csv(os.path.join(data_path, "test.csv"))

In [19]:
# Columns of the split tables that hold the per-label values. Their order here
# is the order of the entries in each row's `labels` vector.
label_columns = ['Crack', 'Red-Dots', 'Toothmark']

In [20]:
# Convert binary-encoded features to array of labels
def binary_to_labels(row, label_cols):
    """Collect a row's label values as a list of floats.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by column name.
        label_cols: Iterable of column names to read, in output order.

    Returns:
        A list with ``float(row[col])`` for every ``col`` in ``label_cols``.
    """
    values = []
    for column in label_cols:
        values.append(float(row[column]))
    return values

# Replace the individual label columns of each split with a single `labels`
# column holding a list of floats. The frames are modified in place.
for df in [train_df, valid_df, test_df]:
    # Re-run guard: once converted, the source columns no longer exist, so
    # running this cell a second time would otherwise raise a KeyError.
    already_converted = (
        'labels' in df.columns
        and not any(col in df.columns for col in label_columns)
    )
    if already_converted:
        continue
    df['labels'] = df.apply(lambda row: binary_to_labels(row, label_columns), axis=1)
    df.drop(columns=label_columns, inplace=True)

In [21]:
# Append image path to filename
def add_image_path(df, base_path=None):
    """Build the (image, labels) frame for one split.

    Args:
        df: DataFrame with a ``filename`` column and a ``labels`` column.
        base_path: String prefix prepended to every filename. Defaults to the
            notebook-level ``data_path`` when omitted.

    Returns:
        A new DataFrame with exactly the columns ``image`` and ``labels``.
        The input frame is left untouched, so the cell calling this function
        can be re-run without accumulating state on the source frames.
    """
    if base_path is None:
        base_path = data_path
    # assign() returns a new frame instead of writing a column into `df`.
    with_paths = df.assign(image=base_path + df['filename'])
    return with_paths[['image', 'labels']]

# Build the (image, labels) frame for each split, in train/valid/test order.
train_dataset_df, valid_dataset_df, test_dataset_df = (
    add_image_path(split_df) for split_df in (train_df, valid_df, test_df)
)

In [22]:
# Inspect the resulting (image path, labels) frame for the training split.
train_dataset_df

Unnamed: 0,image,labels
0,Data/CC_1311_jpg.rf.1c021f815e3f13ff1da57eb0a4...,"[0.0, 1.0, 0.0]"
1,Data/CC_1485_jpg.rf.1be90431f70fd51920fe477341...,"[0.0, 1.0, 0.0]"
2,Data/CC_1493_jpg.rf.1bedd1b74145dacfc72c8b7c81...,"[0.0, 1.0, 0.0]"
3,Data/CC_1276_jpg.rf.1c049ea503b73e285d2afa5024...,"[0.0, 0.0, 0.0]"
4,Data/CC_1683_jpg.rf.1c0a08050f49ed2c914b58c90e...,"[1.0, 0.0, 0.0]"
...,...,...
5217,Data/CC_1760_jpg.rf.ffda035a88a751b4dba66cde6c...,"[1.0, 0.0, 1.0]"
5218,Data/CC_1479_jpg.rf.fffb0de56ce44913c300488fb6...,"[0.0, 1.0, 1.0]"
5219,Data/CC_11_jpg.rf.ff034723d95b2ff4bc00aa898351...,"[0.0, 0.0, 0.0]"
5220,Data/CC_859_jpg.rf.ff5a7ff6015d0ce3f803428b15a...,"[1.0, 0.0, 0.0]"


In [23]:
# Derive the class names and their count from `label_columns` so the schema
# cannot drift from the columns that were used to build the label vectors.
class_names = list(label_columns)
num_classes = len(class_names)

# Labels is an array of floats: one float32 entry per class, fixed length.
features = Features({
    'image': Image(),
    'labels': Sequence(feature=Value('float32'), length=num_classes)
})

In [24]:
# Print the first row of each split's frame as a quick sanity check.
for split_name, split_df in (
    ("Train", train_dataset_df),
    ("Validation", valid_dataset_df),
    ("Test", test_dataset_df),
):
    print(f"{split_name} labels example:", split_df.iloc[0])

Train labels example: image     Data/CC_1311_jpg.rf.1c021f815e3f13ff1da57eb0a4...
labels                                      [0.0, 1.0, 0.0]
Name: 0, dtype: object
Validation labels example: image     Data/CC_1050_jpg.rf.0cef530431d371ae88db9fd71e...
labels                                      [0.0, 0.0, 0.0]
Name: 0, dtype: object
Test labels example: image     Data/CC_2534_jpg.rf.0b530d0adbf544c42236e0c6fd...
labels                                      [1.0, 0.0, 0.0]
Name: 0, dtype: object


In [25]:
# Convert each split's frame into a Dataset with the declared feature schema,
# discarding the pandas index.
train_dataset, valid_dataset, test_dataset = (
    Dataset.from_pandas(split_df, features=features, preserve_index=False)
    for split_df in (train_dataset_df, valid_dataset_df, test_dataset_df)
)

In [26]:
# Bundle the three splits under their split names.
split_names = ('train', 'validation', 'test')
split_datasets = (train_dataset, valid_dataset, test_dataset)
dataset_dict = DatasetDict(dict(zip(split_names, split_datasets)))

In [27]:
# Show the splits, their features and their row counts.
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['image', 'labels'],
        num_rows: 5222
    })
    validation: Dataset({
        features: ['image', 'labels'],
        num_rows: 214
    })
    test: Dataset({
        features: ['image', 'labels'],
        num_rows: 106
    })
})

In [28]:
# Image processor belonging to the target checkpoint. Change the checkpoint
# name here when preparing the data for a different model.
image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-384")

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


In [29]:
def preprocess_function(examples):
    """Turn a batch of examples into model inputs.

    Args:
        examples: Batch mapping with an ``image`` list and a ``labels`` list
            (assumes the images are decoded PIL images — TODO confirm).

    Returns:
        Dict with ``pixel_values`` (the processor output for the batch) and
        ``labels`` (a float tensor built from the batch's label lists).
    """
    # Normalise the colour mode first so grayscale/RGBA/CMYK files cannot reach
    # the processor with an unexpected channel count. For images that are
    # already RGB this only makes a copy.
    rgb_images = [img.convert("RGB") for img in examples['image']]
    encoded = image_processor(rgb_images, return_tensors='pt')
    labels = torch.tensor(examples['labels'], dtype=torch.float)

    return {
        'pixel_values': encoded['pixel_values'],
        'labels': labels
    }

In [30]:
# Run the preprocessing over every split in batches.
# NOTE(review): this rebinds `dataset_dict`, so re-running the cell maps the
# already-processed dataset again instead of the original one.
dataset_dict = dataset_dict.map(preprocess_function, batched=True)

Map:   0%|          | 0/5222 [00:00<?, ? examples/s]

Map:   0%|          | 0/214 [00:00<?, ? examples/s]

Map:   0%|          | 0/106 [00:00<?, ? examples/s]

In [31]:
# Upload all splits to a private dataset repository on the Hugging Face Hub
# (presumably relies on the credentials from the earlier notebook_login()).
dataset_dict.push_to_hub("e1010101/tongue-images-384", private=True)

Uploading the dataset shards:   0%|          | 0/19 [00:00<?, ?it/s]

Map:   0%|          | 0/275 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/275 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/275 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/275 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/275 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/275 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/275 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/275 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/275 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/275 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/275 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/275 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/275 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/275 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/275 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/275 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/274 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/274 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/274 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/214 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/106 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/e1010101/tongue-images-384/commit/ba44fdcf3bb334a1e33d0ea247d334587e88988f', commit_message='Upload dataset', commit_description='', oid='ba44fdcf3bb334a1e33d0ea247d334587e88988f', pr_url=None, pr_revision=None, pr_num=None)