In [1]:
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
from datasets import DatasetDict, Dataset
import os
from PIL import Image
import numpy as np

In [3]:
from huggingface_hub import login
# Interactive Hugging Face Hub login; renders the widget shown in this cell's output.
# NOTE(review): presumably required because training is configured to push to the Hub — confirm.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Dataset load

In [None]:
def load_data_from_dir(directory, label2id):
    """Collect images and labels from a folder-per-class directory tree.

    Every sub-directory of ``directory`` is treated as one class, and every
    file inside it whose name ends with ``.jpg`` as one sample of that class.
    Non-directory entries directly under ``directory`` are ignored.

    Args:
        directory: Path whose immediate sub-directories are named after classes.
        label2id: Mapping from class (sub-directory) name to integer label id.

    Returns:
        A dict of four equally long lists:
        ``image`` (objects returned by ``Image.open``),
        ``label`` (ids looked up in ``label2id``),
        ``image_id`` (the class directory name of each sample) and
        ``label_cat_dog`` (0 when the class name is ``'cat'``, otherwise 1).

    Raises:
        KeyError: If a class sub-directory name is missing from ``label2id``.
    """
    images = []
    labels = []
    image_ids = []
    label_cat_dogs = []

    for class_name in os.listdir(directory):
        class_dir = os.path.join(directory, class_name)
        if not os.path.isdir(class_dir):
            continue

        for filename in os.listdir(class_dir):
            if not filename.endswith(".jpg"):
                continue

            # Resolve the id before touching any list so an unknown class
            # cannot leave the four output lists with different lengths.
            label = label2id[class_name]

            images.append(Image.open(os.path.join(class_dir, filename)))
            labels.append(label)
            image_ids.append(class_name)
            # Compare the class *name*: `label` is an integer id, so testing
            # it against 'cat' could never be true.
            label_cat_dogs.append(0 if class_name == 'cat' else 1)

    return {
        'image': images,
        'label': labels,
        'image_id': image_ids,
        'label_cat_dog': label_cat_dogs
    }

In [None]:
# Root folder of the dataset; its "train" sub-folder holds one sub-folder per class.
data_dir =  "/home/dell/Desktop/DATASETS/agegender"

# Class names are the sub-directories of "train". Stray files are skipped so
# they do not become bogus classes, and the names are sorted because
# os.listdir() returns entries in arbitrary order, which would otherwise make
# the label ids differ between runs or machines.
labels = sorted(
    entry
    for entry in os.listdir(os.path.join(data_dir, "train"))
    if os.path.isdir(os.path.join(data_dir, "train", entry))
)

# Forward and reverse lookup tables between class name and integer id.
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for i, label in enumerate(labels)}
print(label2id,id2label)

In [None]:
# Read the "train" and "val" folders with the shared class-name -> id mapping.
train_data, test_data = (
    load_data_from_dir(os.path.join(data_dir, split_folder), label2id)
    for split_folder in ("train", "val")
)

# Wrap both splits in a DatasetDict; the "val" folder is exposed as the "test" split.
dataset_dict = DatasetDict(
    {
        'train': Dataset.from_dict(train_data),
        'test': Dataset.from_dict(test_data),
    }
)


In [None]:
# Display the DatasetDict built above (splits: 'train' and 'test').
dataset_dict

In [None]:
# Display the 'train' split on its own.
dataset_dict['train']

In [None]:
# Take the first training record and show its 'label_cat_dog' field.
example = dataset_dict["train"][0]
example["label_cat_dog"]

In [None]:
# Show the 'image' field of the record selected above.
example["image"]


In [None]:
# Show every field of the first training record.
dataset_dict["train"][0]

### Model load

In [None]:
from transformers import AutoImageProcessor, AutoModelForImageClassification

# Hub checkpoint used for both the image processor and the classifier.
model_name = "facebook/dinov2-base"

# Image processor that ships with the checkpoint.
processor = AutoImageProcessor.from_pretrained(model_name)

# Image-classification model from the same checkpoint, configured with this
# notebook's class-name <-> id mappings.
model = AutoModelForImageClassification.from_pretrained(
    model_name,
    label2id=label2id,
    id2label=id2label,
)

### Prepare dataset for the model

In [None]:
from torchvision.transforms import Compose, RandomResizedCrop, RandomHorizontalFlip, ColorJitter, ToTensor, Normalize
import torch

# Take the normalisation statistics and the resampling filter from the
# inference processor so the training pipeline agrees with it.
mean, std = processor.image_mean, processor.image_std
interpolation = processor.resample

# Training-time augmentation built from Torchvision transforms
# (other libraries such as Albumentations or Kornia would work here too).
train_transform = Compose(
    [
        RandomResizedCrop(
            size=(224, 224),
            scale=(0.08, 1.0),
            ratio=(0.75, 1.3333),
            interpolation=interpolation,
        ),
        RandomHorizontalFlip(p=0.5),
        ColorJitter(
            brightness=(0.6, 1.4),
            contrast=(0.6, 1.4),
            saturation=(0.6, 1.4),
        ),
        ToTensor(),
        Normalize(mean=mean, std=std),
    ]
)

In [None]:
def prepare(batch, mode="train"):
    """Turn a batch of raw records into model inputs.

    Args:
        batch: Mapping with an ``"image"`` list (objects exposing
            ``.convert("RGB")``) and a parallel ``"label"`` list.
        mode: ``"train"`` applies ``train_transform`` to each image and stacks
            the results; ``"test"`` runs the images through ``processor``.

    Returns:
        A dict with ``"pixel_values"`` (the stacked or processed images) and
        ``"labels"`` (``batch["label"]`` as a tensor).

    Raises:
        ValueError: If ``mode`` is neither ``"train"`` nor ``"test"``.
    """
    if mode not in ("train", "test"):
        raise ValueError(f"Mode {mode} not supported")

    # Convert in BOTH modes: previously only the train branch did, so
    # grayscale / RGBA / palette images reached the processor unconverted.
    images = [image.convert("RGB") for image in batch["image"]]

    if mode == "train":
        pixel_values = torch.stack([train_transform(image) for image in images])
    else:
        pixel_values = processor(images, return_tensors="pt").pixel_values

    return {
        "pixel_values": pixel_values,
        "labels": torch.tensor(batch["label"]),
    }

In [None]:
# Pre-compute model inputs for both splits, 10 records per call to prepare().
# num_proc is the number of worker processes; it is 1 here and can be raised
# towards the machine's CPU core count, see
# https://docs.python.org/3/library/multiprocessing.html#multiprocessing.cpu_count
# NOTE(review): map() appears to run prepare() once up front, so the random
# training augmentations look frozen instead of re-sampled every epoch —
# confirm this is intended.
train_dataset = dataset_dict["train"].map(
    prepare,
    fn_kwargs={"mode":"train"},
    batched=True,
    batch_size=10,
    num_proc=1,
)
eval_dataset = dataset_dict["test"].map(
    prepare,
    fn_kwargs={"mode":"test"},
    batched=True,
    batch_size=10,
    num_proc=1,
)

In [None]:
# Switch both prepared splits to the "torch" output format.
train_dataset.set_format(type="torch")
eval_dataset.set_format(type="torch")

In [None]:
# Sanity check: shape of the first prepared training image tensor.
train_dataset[0]["pixel_values"].shape

In [None]:
# Sanity check: label stored alongside the first prepared training record.
train_dataset[0]["labels"]


### Train the model


In [None]:
from sklearn.metrics import accuracy_score
import numpy as np

# compute_metrics receives a named tuple with two fields:
#   predictions - the model's logits as a Numpy array
#   label_ids   - the ground-truth labels as a Numpy array
def compute_metrics(eval_pred):
    """Return ``{"accuracy": ...}`` for a set of predictions.

    The predicted class of each row is the index of its largest logit
    (argmax along axis 1), which is then scored against ``label_ids``.
    """
    logits = eval_pred.predictions
    predicted_classes = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(
            y_true=eval_pred.label_ids,
            y_pred=predicted_classes,
        )
    }

In [None]:

from transformers import TrainingArguments, Trainer

# Training configuration, grouped by concern.
args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-oxford",
    # --- optimisation ---
    learning_rate=5e-5,
    warmup_ratio=0.1,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # 4 x 4 = 16 samples per optimizer step per device
    per_device_eval_batch_size=4,
    # --- evaluation and checkpointing: both once per epoch ---
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # key returned by compute_metrics
    # --- logging and publishing ---
    logging_steps=10,
    push_to_hub=True,
)
   

In [None]:
import torch

def collate_fn(examples):
    """Merge individual records into one batch.

    Each record must provide a ``"pixel_values"`` tensor and a ``"label"``.
    The tensors are stacked along a new first dimension and the labels are
    gathered into a single tensor.

    Returns:
        ``{"pixel_values": <stacked tensor>, "labels": <label tensor>}``
    """
    stacked_images = torch.stack([record["pixel_values"] for record in examples])
    targets = [record["label"] for record in examples]
    return {
        "pixel_values": stacked_images,
        "labels": torch.tensor(targets),
    }

# Assemble the Trainer from the model, its configuration, both prepared
# splits, the accuracy metric and the custom batch collator.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=processor,
)