In [1]:
import pandas as pd
import numpy as np
from datasets import Dataset, Features, load_metric

from PIL import Image
import torch
from torchvision import transforms
from transformers import ViTForImageClassification, ViTFeatureExtractor, Trainer, TrainingArguments, ViTImageProcessor

from src import dataloader as ds




In [2]:
# Dataset archive; the value returned by the project helper is used as the CSV path in the next cell.
file_path = 'fer2013.tar.gz'
data_path = ds.unpack_tar_gz(file_path)

Target folder 'data' is not empty. Assuming the file is already unpacked.


In [3]:
# Load the unpacked CSV; later cells read its 'emotion', 'pixels' and 'Usage' columns.
df = pd.read_csv(data_path)

In [4]:
def process_for_huggingface_dataset(data, image_size=48):
    """Convert rows of space-separated grayscale pixels into RGB nested lists.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide an ``emotion`` column (castable to ``int``) and a
        ``pixels`` column where every entry is a string of space-separated
        integer intensities describing one square image.
    image_size : int, default 48
        Side length of each square image; a row must therefore hold
        ``image_size ** 2`` values.

    Returns
    -------
    pandas.DataFrame
        Two columns with a fresh ``0..n-1`` index: ``img`` holds nested
        ``image_size x image_size x 3`` lists of ints (the gray value repeated
        in each channel) and ``label`` holds the integer emotion labels.

    Raises
    ------
    ValueError
        If a row does not contain exactly ``image_size ** 2`` pixel values.
    """
    expected_pixels = image_size * image_size
    image_labels = data['emotion'].astype(int).tolist()
    image_list = []

    for row_label, pixels in data['pixels'].items():
        # Parse the whitespace-separated text into a flat integer array.
        flat = np.fromstring(pixels, dtype=int, sep=' ')
        # Fail early with the offending row instead of an opaque reshape error.
        if flat.size != expected_pixels:
            raise ValueError(
                f"Row {row_label!r} has {flat.size} pixel values; expected "
                f"{expected_pixels} for a {image_size}x{image_size} image."
            )
        gray = flat.reshape((image_size, image_size))
        # Grayscale -> RGB by repeating the same plane along a new last axis.
        rgb = np.stack([gray] * 3, axis=-1)
        # Plain nested lists keep the column convertible by Dataset.from_pandas.
        image_list.append(rgb.tolist())

    return pd.DataFrame({'img': image_list, 'label': image_labels})

In [5]:
# Slow step: every pixel string is parsed in Python, once per 'Usage' split.
train_hf, val_hf, test_hf = [
    process_for_huggingface_dataset(df[df['Usage'] == usage])
    for usage in ('Training', 'PublicTest', 'PrivateTest')
]

In [6]:
# Preview the first rows of the converted training frame ('img' and 'label' columns).
train_hf.head()

Unnamed: 0,img,label
0,"[[[70, 70, 70], [80, 80, 80], [82, 82, 82], [7...",0
1,"[[[151, 151, 151], [150, 150, 150], [147, 147,...",0
2,"[[[231, 231, 231], [212, 212, 212], [156, 156,...",2
3,"[[[24, 24, 24], [32, 32, 32], [36, 36, 36], [3...",4
4,"[[[4, 4, 4], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...",6


In [7]:
# Wrap each converted pandas frame in a Hugging Face Dataset.
train_dataset, val_dataset, test_dataset = [
    Dataset.from_pandas(frame) for frame in (train_hf, val_hf, test_hf)
]

In [8]:
# Display the dataset summary (feature names and row count).
train_dataset

Dataset({
    features: ['img', 'label'],
    num_rows: 28709
})

In [9]:
# Feature type recorded for 'label', and the number of distinct labels in the training frame.
labels = train_dataset.features['label']
num_classes = len(train_hf['label'].unique())
num_classes, labels

(7, Value(dtype='int64', id=None))

In [10]:
# Show the first training image as a nested list: rows x columns x 3 identical channel values.
train_dataset[0]['img']

[[[70, 70, 70],
  [80, 80, 80],
  [82, 82, 82],
  [72, 72, 72],
  [58, 58, 58],
  [58, 58, 58],
  [60, 60, 60],
  [63, 63, 63],
  [54, 54, 54],
  [58, 58, 58],
  [60, 60, 60],
  [48, 48, 48],
  [89, 89, 89],
  [115, 115, 115],
  [121, 121, 121],
  [119, 119, 119],
  [115, 115, 115],
  [110, 110, 110],
  [98, 98, 98],
  [91, 91, 91],
  [84, 84, 84],
  [84, 84, 84],
  [90, 90, 90],
  [99, 99, 99],
  [110, 110, 110],
  [126, 126, 126],
  [143, 143, 143],
  [153, 153, 153],
  [158, 158, 158],
  [171, 171, 171],
  [169, 169, 169],
  [172, 172, 172],
  [169, 169, 169],
  [165, 165, 165],
  [129, 129, 129],
  [110, 110, 110],
  [113, 113, 113],
  [107, 107, 107],
  [95, 95, 95],
  [79, 79, 79],
  [66, 66, 66],
  [62, 62, 62],
  [56, 56, 56],
  [57, 57, 57],
  [61, 61, 61],
  [52, 52, 52],
  [43, 43, 43],
  [41, 41, 41]],
 [[65, 65, 65],
  [61, 61, 61],
  [58, 58, 58],
  [57, 57, 57],
  [56, 56, 56],
  [69, 69, 69],
  [75, 75, 75],
  [70, 70, 70],
  [65, 65, 65],
  [56, 56, 56],
  [54, 54, 54]

In [11]:
# Checkpoint shared by the image processor and the model.
model_id = 'google/vit-base-patch16-224'
processor = ViTImageProcessor.from_pretrained(model_id)
# Size the classification head for this dataset's labels instead of keeping the
# checkpoint's own head; ignore_mismatched_sizes lets the differently shaped
# classifier weights be freshly initialised rather than raising on load.
model = ViTForImageClassification.from_pretrained(
    model_id,
    num_labels=num_classes,
    ignore_mismatched_sizes=True,
)

In [12]:
# Batch preprocessing used with Dataset.map
def preprocess_function(examples):
    """Prepare one batch of examples for the model.

    Reads the nested-list images from ``examples['img']``, turns each into a
    224x224 RGB array, runs the whole batch through the module-level
    ``processor`` and attaches ``examples['label']`` under the ``labels`` key.
    Returns the processor output with that extra key.
    """
    def _to_rgb_array(pixels):
        # nested list -> uint8 array -> PIL image, resized and forced to RGB
        pil_image = Image.fromarray(np.array(pixels, dtype=np.uint8))
        return np.array(pil_image.resize((224, 224)).convert("RGB"))

    batch = processor(
        images=[_to_rgb_array(pixels) for pixels in examples['img']],
        return_tensors="pt",
    )
    batch['labels'] = examples['label']
    return batch

In [None]:
def _encode_split(split):
    """Apply ``preprocess_function`` to a split unless it was already encoded.

    The mapped dataset no longer has the 'img' column, and the split variables
    are rebound to the mapped result, so re-running this cell (or resuming
    after a partial failure) would otherwise fail on the missing column.
    """
    if "img" not in split.column_names:
        return split
    return split.map(preprocess_function, batched=True, remove_columns=["img"])


train_dataset = _encode_split(train_dataset)
val_dataset = _encode_split(val_dataset)
test_dataset = _encode_split(test_dataset)


Map:   0%|          | 0/28709 [00:00<?, ? examples/s]

Map:   0%|          | 0/3589 [00:00<?, ? examples/s]

In [None]:
# Display the training split after preprocessing to check its columns.
train_dataset

In [ ]:
# Define the metrics
def compute_metrics(p):
    """Return classification accuracy for a Trainer evaluation pass.

    Parameters
    ----------
    p : object with ``predictions`` and ``label_ids`` attributes
        ``predictions`` holds one row of class scores per example and
        ``label_ids`` the matching integer labels.

    Returns
    -------
    dict
        ``{"accuracy": float}``, the fraction of examples whose highest-scoring
        class equals the label.
    """
    # Computed directly with NumPy so no metric script has to be loaded.
    predicted = np.argmax(p.predictions, axis=1)
    return {"accuracy": float(np.mean(predicted == np.asarray(p.label_ids)))}


In [None]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    # NOTE(review): recent transformers releases spell this `eval_strategy`;
    # confirm against the installed version before renaming.
    evaluation_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=100,
    # A checkpoint is written every 100 steps; cap how many are kept so
    # ./results does not grow for the whole run.
    save_total_limit=2,
)

# Trainer: trains on the training split and evaluates on the validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

In [ ]:
# Evaluate the model
# No dataset is passed, so this scores the eval_dataset given to the Trainer (val_dataset);
# test_dataset is not evaluated here.
trainer.evaluate()