In [None]:
from transformers import ViTForImageClassification, ViTFeatureExtractor
from transformers import TrainingArguments, Trainer
from datasets import load_dataset
from PIL import Image
import os
from peft import get_peft_model, LoraConfig
import torch

In [45]:
model_name = "google/vit-base-patch16-224"
model = ViTForImageClassification.from_pretrained(model_name)
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name)

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Move the model to the GPU
model.to(device)

Using device: cpu


ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTSdpaAttention(
            (attention): ViTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fe

In [46]:
# Load the feature extractor
feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")

# Load the dataset from the specified directory
dataset = load_dataset('imagefolder', data_dir='/home/ernestos/ws/downloads/mtsd_v2_fully_annotated/processed/regulatory_stop_g1')

# Preprocess the dataset
def preprocess_function(examples):
    images = [Image.open(image_file).convert("RGB") if isinstance(image_file, str) else image_file.convert("RGB") for image_file in examples['image']]
    inputs = feature_extractor(images, return_tensors="pt")
    inputs['labels'] = [0] * len(images)  # Assuming the label is always stop_sign
    return {"pixel_values": inputs["pixel_values"], "labels": inputs["labels"]}

# Apply preprocessing
dataset = dataset.map(preprocess_function, batched=True)
dataset = dataset['train'].train_test_split(test_size=0.05)

train_dataset = dataset['train']
eval_dataset = dataset['test']

# Remove the 'image' column as it is no longer needed
train_dataset = train_dataset.remove_columns(["image"])
eval_dataset = eval_dataset.remove_columns(["image"])

Resolving data files:   0%|          | 0/1386 [00:00<?, ?it/s]

Map:   0%|          | 0/1386 [00:00<?, ? examples/s]

In [47]:
lora_config = LoraConfig(
    r=8,  # Rank of the low-rank matrices
    lora_alpha=32,  # Scaling factor
    target_modules=["query", "value"],  # Target modules to apply LoRA
    lora_dropout=0.1,  # Dropout rate
)

model = get_peft_model(model, lora_config)

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Move the model to the GPU
model.to(device)

Using device: cpu


PeftModel(
  (base_model): LoraModel(
    (model): ViTForImageClassification(
      (vit): ViTModel(
        (embeddings): ViTEmbeddings(
          (patch_embeddings): ViTPatchEmbeddings(
            (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
          )
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (encoder): ViTEncoder(
          (layer): ModuleList(
            (0-11): 12 x ViTLayer(
              (attention): ViTSdpaAttention(
                (attention): ViTSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_features=8, bias=False)
                    )
                    (lora_B): ModuleDict(
               

In [48]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    remove_unused_columns=False  # Ensure columns are not removed
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=feature_extractor,  # Pass the feature extractor as tokenizer
    data_collator=None,  # Default data collator will work for image data
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,No log
2,No log,No log
3,No log,No log


TrainOutput(global_step=249, training_loss=7.085639922973143, metrics={'train_runtime': 4281.872, 'train_samples_per_second': 0.922, 'train_steps_per_second': 0.058, 'total_flos': 3.0972648968729395e+17, 'train_loss': 7.085639922973143, 'epoch': 3.0})

In [38]:
model.save_pretrained("./fine-tuned-vit")
feature_extractor.save_pretrained("./fine-tuned-vit")

print(train_dataset.features)


{'pixel_values': Sequence(feature=Sequence(feature=Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None), length=-1, id=None), length=-1, id=None), 'labels': Value(dtype='int64', id=None)}


In [36]:
print(training_args.device)

cpu
