# Import libraries

In [1]:
from transformers import ViTForImageClassification, ViTFeatureExtractor
from transformers import TrainingArguments, Trainer
from datasets import load_dataset
from PIL import Image
import os
from peft import get_peft_model, LoraConfig
import torch

2024-12-10 13:41:37.324470: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Init Google-Vit

In [2]:
model_name = "google/vit-base-patch16-224"
model = ViTForImageClassification.from_pretrained(model_name)
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name)

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Move the model to the GPU
model.to(device)

Using device: cpu


  return torch._C._cuda_getDeviceCount() > 0


ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTSdpaAttention(
            (attention): ViTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fe

# Load Dataset

In [3]:
# Load the feature extractor
feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")

# Load the dataset from the specified directory
dataset = load_dataset('imagefolder', data_dir='/home/ernestos/ws/downloads/mtsd_v2_fully_annotated/processed/regulatory_stop_g1')

# Preprocess the dataset
def preprocess_function(examples):
    images = [Image.open(image_file).convert("RGB") if isinstance(image_file, str) else image_file.convert("RGB") for image_file in examples['image']]
    inputs = feature_extractor(images, return_tensors="pt")
    inputs['labels'] = [0] * len(images)  # Assuming the label is always stop_sign
    return {"pixel_values": inputs["pixel_values"], "labels": inputs["labels"]}

# Apply preprocessing
dataset = dataset.map(preprocess_function, batched=True)
dataset = dataset['train'].train_test_split(test_size=0.05)

train_dataset = dataset['train']
eval_dataset = dataset['test']

# Remove the 'image' column as it is no longer needed
train_dataset = train_dataset.remove_columns(["image"])
eval_dataset = eval_dataset.remove_columns(["image"])

Resolving data files:   0%|          | 0/1386 [00:00<?, ?it/s]

# get_perf to add Lora weights

In [4]:
lora_config = LoraConfig(
    r=8,  # Rank of the low-rank matrices
    lora_alpha=32,  # Scaling factor
    target_modules=["query", "value"],  # Target modules to apply LoRA
    lora_dropout=0.1,  # Dropout rate
)

model = get_peft_model(model, lora_config)

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Move the model to the GPU
model.to(device)

Using device: cpu


PeftModel(
  (base_model): LoraModel(
    (model): ViTForImageClassification(
      (vit): ViTModel(
        (embeddings): ViTEmbeddings(
          (patch_embeddings): ViTPatchEmbeddings(
            (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
          )
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (encoder): ViTEncoder(
          (layer): ModuleList(
            (0-11): 12 x ViTLayer(
              (attention): ViTSdpaAttention(
                (attention): ViTSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_features=8, bias=False)
                    )
                    (lora_B): ModuleDict(
               

In [5]:
from transformers import DefaultDataCollator

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    remove_unused_columns=False,  # Ensure columns are not removed
    logging_dir='./logs',  # Directory for storing logs
    logging_strategy="epoch"  # Log at the end of each epoch
)

data_collator = DefaultDataCollator()

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=feature_extractor,  # Pass the feature extractor as tokenizer
    data_collator=data_collator,  # Use the default data collator
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,8.1532,No log
2,7.0075,No log
3,5.9926,No log


TrainOutput(global_step=249, training_loss=7.051098314155058, metrics={'train_runtime': 4254.9529, 'train_samples_per_second': 0.928, 'train_steps_per_second': 0.059, 'total_flos': 3.0972648968729395e+17, 'train_loss': 7.051098314155058, 'epoch': 3.0})

In [6]:
model.save_pretrained("./fine-tuned-vit")
feature_extractor.save_pretrained("./fine-tuned-vit")

print(train_dataset.features)


{'pixel_values': Sequence(feature=Sequence(feature=Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None), length=-1, id=None), length=-1, id=None), 'labels': Value(dtype='int64', id=None)}


- - - - - - - - - - - - - - - - - 

In [7]:
from transformers import AutoModelForImageClassification, AutoImageProcessor

model_name = "google/vit-base-patch16-224"
model = AutoModelForImageClassification.from_pretrained(model_name)
image_processor = AutoImageProcessor.from_pretrained(model_name)


Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


In [8]:
from datasets import load_dataset

dataset_path = "/home/ernestos/ws/downloads/mtsd_v2_fully_annotated/processed/regulatory_stop_g1"
dataset = load_dataset("imagefolder", data_dir=dataset_path)

# Split the dataset into train and validation sets
train_test_split = dataset['train'].train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
val_dataset = train_test_split['test']


Resolving data files:   0%|          | 0/1386 [00:00<?, ?it/s]

In [None]:
# Add new traffic sign category
new_category = "new_traffic_sign"
new_category_label = 1000  # Assuming 0 is for stop_sign and 1 is for new_traffic_sign

# Update the preprocess function to handle the new category
def preprocess_function(examples):
    images = [Image.open(image_file).convert("RGB") if isinstance(image_file, str) else image_file.convert("RGB") for image_file in examples['image']]
    inputs = feature_extractor(images, return_tensors="pt")
    inputs['labels'] = [new_category_label if "new_traffic_sign" in image_file else 0 for image_file in examples['image']]  # Update labels based on the new category
    return {"pixel_values": inputs["pixel_values"], "labels": inputs["labels"]}

# Apply preprocessing again with the updated function
dataset = dataset.map(preprocess_function, batched=True)
dataset = dataset['train'].train_test_split(test_size=0.05)

train_dataset = dataset['train']
eval_dataset = dataset['test']

# Remove the 'image' column as it is no longer needed
train_dataset = train_dataset.remove_columns(["image"])
eval_dataset = eval_dataset.remove_columns(["image"])