In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch

from transformers import AutoImageProcessor
from datasets import Dataset, DatasetDict, Features, Image, Sequence, Value

import os

In [28]:
from huggingface_hub import notebook_login

# Authenticate interactively: the widget prompts for the access token, so the
# token never has to be written into the notebook.
# SECURITY: an access token was previously pasted into this cell as a comment.
# It has been removed here; treat it as compromised and revoke it in the
# Hugging Face account settings, since it remains in any earlier copy/history.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [30]:
# Load the three split manifests (train / validation / test), in that order,
# from CSV files located in the current working directory.
train_df, valid_df, test_df = map(
    pd.read_csv,
    ("train.csv", "valid.csv", "test.csv"),
)

In [31]:
# Names of the per-class columns read from each CSV row. Their order here is
# the order of the values inside each row's `labels` list built below.
label_columns = ['Crack', 'Red-Dots', 'Toothmark']

In [32]:
# Convert binary-encoded features to array of labels
def binary_to_labels(row, label_cols):
    """Collect the values of ``label_cols`` from ``row`` as a list of floats.

    Args:
        row: Any mapping-like object indexable by column name (e.g. a
            DataFrame row).
        label_cols: Column names to read, in the order the output should use.

    Returns:
        A list with ``float(row[col])`` for every ``col`` in ``label_cols``.
    """
    values = []
    for name in label_cols:
        values.append(float(row[name]))
    return values

# Collapse the per-class columns of every split into one list-valued `labels`
# column, then remove the per-class columns.
for df in [train_df, valid_df, test_df]:
    # The frames are modified in place, so a second run of this cell would hit
    # a KeyError on the columns dropped by the first run. Skip frames that
    # have already been converted so the cell can be re-run safely.
    already_converted = (
        'labels' in df.columns
        and not any(col in df.columns for col in label_columns)
    )
    if already_converted:
        continue

    df['labels'] = df.apply(lambda row: binary_to_labels(row, label_columns), axis=1)
    df.drop(columns=label_columns, inplace=True)

In [33]:
# Build the two-column (image, labels) frame used to create the datasets
def add_image_path(df, image_dir=None):
    """Return a new frame with an ``image`` column and the ``labels`` column.

    The ``image`` column holds the values of ``df['filename']``. When
    ``image_dir`` is given, each filename is joined onto that directory with
    ``os.path.join``; when it is ``None`` (the default) the filenames are used
    unchanged, which matches the previous behaviour.

    The input frame is not modified: the column is added on a copy, so calling
    this function repeatedly on the same frame is safe.

    Args:
        df: DataFrame containing at least ``filename`` and ``labels`` columns.
        image_dir: Optional directory to prepend to every filename.

    Returns:
        A DataFrame with exactly the columns ``['image', 'labels']``.

    Raises:
        KeyError: If ``filename`` or ``labels`` is missing from ``df``.
    """
    image_paths = df['filename']
    if image_dir is not None:
        image_paths = image_paths.map(lambda name: os.path.join(image_dir, name))
    return df.assign(image=image_paths)[['image', 'labels']]

# Reduce each split to its (image, labels) view, keeping train/valid/test order.
train_dataset_df, valid_dataset_df, test_dataset_df = map(
    add_image_path,
    (train_df, valid_df, test_df),
)

In [34]:
# Sanity check: show the training frame, which should contain only the
# `image` and `labels` columns at this point.
train_dataset_df

Unnamed: 0,image,labels
0,CC_2058_jpg.rf.84dfa601d2425b42f63f221fa7b4f3c...,"[1.0, 0.0, 0.0]"
1,CC_2385_jpg.rf.8578f0b85eecd5a55855e0c849dd0e2...,"[1.0, 0.0, 0.0]"
2,CC_17_jpg.rf.8611dcd760638488cce652f89d2c38ab.jpg,"[0.0, 0.0, 1.0]"
3,CC_1328_jpg.rf.86155b7ff3da3fd5a7eec3b5850ec26...,"[0.0, 1.0, 1.0]"
4,CC_866_jpg.rf.868371c679ec8a0fec13874fa9c752c8...,"[1.0, 0.0, 0.0]"
...,...,...
2225,CC_310_jpg.rf.ffa68b8cd5cf6bfef4551400f2f2407d...,"[0.0, 0.0, 1.0]"
2226,CC_1409_jpg.rf.ffced042b6ce657929100554ca2eed9...,"[1.0, 0.0, 1.0]"
2227,CC_1355_jpg.rf.ffb1d2f17400373be04d0afc8f7bec0...,"[0.0, 1.0, 1.0]"
2228,CC_1758_jpg.rf.ff8ec540bbf6c88965e840dd4e161bc...,"[1.0, 0.0, 1.0]"


In [35]:
class_names = ['Crack', 'Red-Dots', 'Toothmark']
# One label slot per class name.
num_classes = len(class_names)

# Schema for the datasets: an image column plus a fixed-length
# float32 label sequence with `num_classes` entries.
label_feature = Sequence(feature=Value('float32'), length=num_classes)
features = Features({'image': Image(), 'labels': label_feature})

In [36]:
# Print the first row of each split's frame as a quick visual check.
for caption, split_df in (
    ("Train labels example:", train_dataset_df),
    ("Validation labels example:", valid_dataset_df),
    ("Test labels example:", test_dataset_df),
):
    print(caption, split_df.iloc[0])

In [37]:
# Turn each split frame into a Dataset using the explicit `features` schema;
# the pandas index is not carried over as a column.
train_dataset, valid_dataset, test_dataset = (
    Dataset.from_pandas(split_df, features=features, preserve_index=False)
    for split_df in (train_dataset_df, valid_dataset_df, test_dataset_df)
)

In [38]:
# Bundle the three datasets into one DatasetDict keyed by split name.
split_names = ('train', 'validation', 'test')
split_datasets = (train_dataset, valid_dataset, test_dataset)
dataset_dict = DatasetDict(dict(zip(split_names, split_datasets)))

In [39]:
# Display the split names, column names and row counts of the bundled datasets.
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['image', 'labels'],
        num_rows: 2230
    })
    validation: Dataset({
        features: ['image', 'labels'],
        num_rows: 214
    })
    test: Dataset({
        features: ['image', 'labels'],
        num_rows: 106
    })
})

In [40]:
# Modify depending on model: the processor is loaded from this checkpoint name.
checkpoint = "google/vit-base-patch16-384"
image_processor = AutoImageProcessor.from_pretrained(checkpoint)

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


In [41]:
def preprocess_function(examples):
    """Convert one batch of examples into model-ready tensors.

    Intended for use with ``map(..., batched=True)``, where ``examples`` is a
    dict of column name -> list of values for the batch.

    Args:
        examples: Batch dict with an ``image`` entry (presumably a list of PIL
            images decoded by the ``Image()`` feature; confirm if the schema
            changes) and a ``labels`` entry (list of per-class float lists).

    Returns:
        A dict with ``pixel_values`` (output of ``image_processor``) and
        ``labels`` (a float tensor built from ``examples['labels']``).
    """
    # Force three channels before processing: grayscale, palette or RGBA files
    # would otherwise reach the processor with a different channel count than
    # the RGB images and fail or be normalised incorrectly.
    rgb_images = [img.convert('RGB') for img in examples['image']]

    images = image_processor(rgb_images, return_tensors='pt')
    labels = torch.tensor(examples['labels'], dtype=torch.float)

    return {
        'pixel_values': images['pixel_values'],
        'labels': labels
    }

In [42]:
# Run `preprocess_function` over every split, one batch at a time.
# NOTE: this rebinds `dataset_dict` to the processed result, so running the
# cell again preprocesses the already-processed datasets a second time.
dataset_dict = dataset_dict.map(preprocess_function, batched=True)

Map:   0%|          | 0/2230 [00:00<?, ? examples/s]

Map:   0%|          | 0/214 [00:00<?, ? examples/s]

Map:   0%|          | 0/106 [00:00<?, ? examples/s]

In [44]:
# Upload all splits to a private dataset repository on the Hugging Face Hub.
hub_repo_id = "e1010101/tongue-images-384"
dataset_dict.push_to_hub(hub_repo_id, private=True)

Uploading the dataset shards:   0%|          | 0/9 [00:00<?, ?it/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/247 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Map:   0%|          | 0/247 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/214 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/106 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/e1010101/tongue-images-384/commit/59ebce38755aa449a75ee8d780ef466e598b85bd', commit_message='Upload dataset', commit_description='', oid='59ebce38755aa449a75ee8d780ef466e598b85bd', pr_url=None, pr_revision=None, pr_num=None)