Changes since May 13
- Added L2 regularization (weight decay in the training args)
- Training image preprocessing changed a lot

In [1]:
from datasets import load_dataset, Image
"""
.venv/Scripts/activate

python -m image_process
"""
import ntpath

# Root folder for everything this run writes (checkpoints, metrics, previews).
base_output_dir = "models/may14_VIT2"

# Load the dataset once. A second, non-decoding view of the same data exposes
# each image's source path through x['image']['path'].
dataset = load_dataset("potato_train/train")
filenames_ds = dataset.cast_column("image", Image(decode=False))

# Keep only the file name. ntpath.basename splits on both "\\" and "/", so the
# result is the same whether the paths use Windows or POSIX separators
# (splitting on "\\" alone returned the whole path on non-Windows systems).
filename_col = [ntpath.basename(x['image']['path']) for x in filenames_ds['train']]
dataset['train'] = dataset['train'].add_column("filename", filename_col)

#print(dataset['train'][0])
#base_output_dir

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import ViTImageProcessor

# Checkpoint that supplies the image processor here and the config/weights
# when the classifier is built later on.
model_id = 'google/vit-base-patch16-224-in21k'
feature_extractor = ViTImageProcessor.from_pretrained(model_id)

In [3]:
import torch
import numpy as np

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [4]:
import torch
from PIL import Image  # PIL's Image module; rebinds the name imported from datasets earlier
from torchvision.transforms import (
    CenterCrop,
    ColorJitter,
    Compose,
    InterpolationMode,
    Normalize,
    Pad,
    RandomAffine,
    RandomCrop,
    RandomHorizontalFlip,
    RandomRotation,
    RandomVerticalFlip,
    Resize,
    ToTensor,
)

# Seed torch's global RNG before the random augmentations below are used.
torch.manual_seed(42)

# Normalisation statistics and target size are taken from the image processor.
normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)
size = (feature_extractor.size["height"], feature_extractor.size["width"])

# Augmentation pipeline applied to training images only.
training_transforms = Compose([
    Resize(size),
    RandomRotation((-30, 30)),
    RandomHorizontalFlip(),
    Pad(10),  # Add padding before random crop
    RandomCrop(size),  # crop back to the target size after padding
    ColorJitter(brightness=(0.8, 1.2), contrast=(0.9, 1.1), saturation=(0.9, 1.1)),
    # InterpolationMode is torchvision's documented type for `interpolation`;
    # it selects the same bilinear resampling as the PIL integer constant and
    # does not depend on which `Image` name is currently bound.
    RandomAffine(
        degrees=10,
        translate=(0.05, 0.05),
        scale=(0.95, 1.05),
        interpolation=InterpolationMode.BILINEAR,
    ),
    ToTensor(),
    normalize,
])

def training_image_preprocess(batch):
    """Run the training augmentation pipeline over a batch of PIL images.

    Args:
        batch: Mapping whose "image" entry is a list of PIL images.

    Returns:
        The same ``batch`` with a "pixel_values" entry added, holding the
        transformed images stacked into a single tensor.
    """
    # Force RGB first: Normalize is built from the processor's per-channel
    # mean/std, so grayscale, palette or RGBA images would otherwise fail with
    # a channel mismatch. For images that are already RGB this is a no-op copy.
    batch["pixel_values"] = torch.stack(
        [training_transforms(img.convert("RGB")) for img in batch["image"]]
    )
    return batch

def preprocess(batch):
    """Convert a batch of PIL images to model inputs without augmentation.

    The image processor is called on ``batch['image']`` with PyTorch tensors
    requested, and the batch's labels are attached under ``'label'``.
    """
    encoded = feature_extractor(batch['image'], return_tensors='pt')
    encoded['label'] = batch['label']
    return encoded

In [5]:
# Hold out 20% of the data for evaluation; the fixed seed keeps the split stable.
train_test_split = dataset["train"].train_test_split(
    test_size=0.2,
    shuffle=True,
    seed=42,
)
dataset_train, dataset_test = train_test_split["train"], train_test_split["test"]

In [6]:
# Take the class count from the ClassLabel feature itself rather than from the
# labels that happen to occur in the training split: it matches how the model
# head is sized later (from the feature's names) and avoids reading the whole
# label column.
labels = dataset_train.features['label']
num_classes = labels.num_classes
num_classes, labels

(6,
 ClassLabel(names=['Bacteria', 'Fungi', 'Healthy', 'Pest', 'Phytopthora', 'Virus'], id=None))

In [7]:
# Attach the preprocessing functions; with_transform applies them on the fly
# whenever rows are read.
# Evaluation data: image processor only, no augmentation.
prepared_test = dataset_test.with_transform(preprocess)
# Training data: random augmentation pipeline.
prepared_train = dataset_train.with_transform(training_image_preprocess)

Save a few sample images after preprocessing (from both the train and test sets)

In [8]:
import os
from torchvision.transforms.functional import to_pil_image

# Write the first 10 augmented training samples to disk for a visual check.
os.makedirs(base_output_dir, exist_ok=True)
output_dir = f"{base_output_dir}/preprocessed_train_images"
os.makedirs(output_dir, exist_ok=True)

# pixel_values were normalised with the processor's mean/std, so they are not
# in the [0, 1] range that to_pil_image expects for float tensors. Undo the
# normalisation first; otherwise the saved previews have garbled colours.
preview_mean = torch.tensor(feature_extractor.image_mean).view(-1, 1, 1)
preview_std = torch.tensor(feature_extractor.image_std).view(-1, 1, 1)

for index, item in enumerate(prepared_train):
    if index >= 10:
        break
    pixel_values = item["pixel_values"]
    image = to_pil_image((pixel_values * preview_std + preview_mean).clamp(0, 1))

    # Name the preview after the source file: pp_<original stem>.png
    label_filename = dataset_train[index]["filename"]
    name_without_extension = os.path.splitext(label_filename)[0]
    filename = f"pp_{name_without_extension}.png"

    filepath = os.path.join(output_dir, filename)
    image.save(filepath)

In [9]:
# Write the first 10 preprocessed test samples to disk for a visual check.
output_dir = f"{base_output_dir}/preprocessed_test_images"
os.makedirs(output_dir, exist_ok=True)

# The image processor normalises with its own mean/std, so pixel_values are
# not in the [0, 1] range that to_pil_image expects for float tensors. Undo
# the normalisation first; otherwise the saved previews have garbled colours.
preview_mean = torch.tensor(feature_extractor.image_mean).view(-1, 1, 1)
preview_std = torch.tensor(feature_extractor.image_std).view(-1, 1, 1)

for index, item in enumerate(prepared_test):
    if index >= 10:
        break
    pixel_values = item["pixel_values"]
    image = to_pil_image((pixel_values * preview_std + preview_mean).clamp(0, 1))

    # Name the preview after the source file: pp_<original stem>.png
    label_filename = dataset_test[index]["filename"]
    name_without_extension = os.path.splitext(label_filename)[0]
    filename = f"pp_{name_without_extension}.png"

    filepath = os.path.join(output_dir, filename)
    image.save(filepath)

In [10]:
import evaluate

def collate_fn(batch):
    """Merge a list of examples into one batch dict for the Trainer.

    Args:
        batch: List of examples, each with a 'pixel_values' tensor and an
            integer 'label'.

    Returns:
        Dict with 'pixel_values' (the tensors stacked along a new first
        dimension) and 'labels' (a tensor of the labels).
    """
    images = []
    targets = []
    for example in batch:
        images.append(example['pixel_values'])
        targets.append(example['label'])
    return {
        'pixel_values': torch.stack(images),
        'labels': torch.tensor(targets),
    }

# Metric objects are loaded once and reused on every evaluation pass.
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(p):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        p: Prediction object exposing ``predictions`` (per-class scores) and
            ``label_ids`` (reference labels).

    Returns:
        Dict combining the outputs of the accuracy and F1 metrics.
    """
    # Predicted class = index of the highest score along axis 1.
    predicted = np.argmax(p.predictions, axis=1)
    accuracy = accuracy_metric.compute(predictions=predicted, references=p.label_ids)
    f1 = f1_metric.compute(predictions=predicted, references=p.label_ids, average="weighted")
    return {**accuracy, **f1}
#

In [11]:
from transformers import ViTForImageClassification, Trainer, TrainingArguments, ViTConfig

training_args = TrainingArguments(
    # Output location and checkpoint retention.
    output_dir=base_output_dir,
    save_total_limit=2,
    push_to_hub=False,
    # Optimisation settings.
    num_train_epochs=8,
    per_device_train_batch_size=16,
    learning_rate=5e-5,
    weight_decay=0.01,  # weight decay, added for regularization
    seed=42,
    # Log every 10 steps; evaluate and checkpoint every 100 steps.
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=100,
    save_steps=100,
    load_best_model_at_end=True,
    # Keep all dataset columns so the on-the-fly transforms still receive "image".
    remove_unused_columns=False,
)

config = ViTConfig.from_pretrained(model_id)

# To change dropout, set it here, BEFORE the model is built from this config:
#config.hidden_dropout_prob = 0.2
#config.attention_probs_dropout_prob = 0.2

# Size the classification head from the dataset's classes and record their
# names in the config, so the saved model reports real class names instead of
# the generic LABEL_<n> placeholders.
class_names = dataset_train.features['label'].names
config.num_labels = len(class_names)
config.id2label = dict(enumerate(class_names))
config.label2id = {name: index for index, name in enumerate(class_names)}

# Show the dropout values actually in effect.
print(config.hidden_dropout_prob)
print(config.attention_probs_dropout_prob)

# Pretrained backbone plus a new classification head sized by `config`.
model = ViTForImageClassification.from_pretrained(
    model_id,
    config=config,
)

model.to(device)

# Trainer wiring: augmented data for training, processor-only data for eval.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=prepared_train,
    eval_dataset=prepared_test,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    processing_class=feature_extractor,
)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0.0
0.0


In [12]:
train_results = trainer.train()
train_metrics = train_results.metrics

# Persist the model (plus the processing class handed to the Trainer).
trainer.save_model()

# Print the training metrics and write them to the output directory.
trainer.log_metrics("train", train_metrics)
trainer.save_metrics("train", train_metrics)

# Save the trainer state.
trainer.save_state()

Step,Training Loss,Validation Loss,Accuracy,F1
100,0.7728,0.742843,0.815498,0.808637
200,0.4314,0.500782,0.867159,0.867078
300,0.4186,0.516793,0.830258,0.834309
400,0.3045,0.445508,0.861624,0.860376
500,0.1916,0.399145,0.878229,0.877705
600,0.1146,0.442838,0.852399,0.852371
700,0.1371,0.3768,0.881919,0.880662
800,0.1396,0.475556,0.850554,0.852158
900,0.1186,0.407936,0.883764,0.882607
1000,0.1048,0.444724,0.863469,0.86245


***** train metrics *****
  epoch                    =          8.0
  total_flos               = 1250029893GF
  train_loss               =       0.3178
  train_runtime            =   0:27:34.51
  train_samples_per_second =       10.468
  train_steps_per_second   =        0.658


In [14]:
from transformers import Trainer, ViTForImageClassification, ViTFeatureExtractor

# Reload the trained model and its image processor from the output directory.
model = ViTForImageClassification.from_pretrained(base_output_dir)
# ViTImageProcessor replaces the deprecated ViTFeatureExtractor and is the
# same class used to create the processor earlier in this notebook.
feature_extractor = ViTImageProcessor.from_pretrained(base_output_dir)

# Fresh Trainer around the reloaded model, used here only for evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    train_dataset=prepared_train,
    eval_dataset=prepared_test,
    processing_class=feature_extractor,
)

# Evaluate on the held-out split.
eval_results = trainer.evaluate()

# Print the evaluation metrics and write them to the output directory.
trainer.log_metrics("eval", eval_results)
trainer.save_metrics("eval", eval_results)

print(eval_results)

***** eval metrics *****
  eval_accuracy               =     0.8819
  eval_f1                     =     0.8807
  eval_loss                   =     0.3768
  eval_model_preparation_time =      0.001
  eval_runtime                = 0:00:23.24
  eval_samples_per_second     =     23.316
  eval_steps_per_second       =      2.925
{'eval_loss': 0.37680020928382874, 'eval_model_preparation_time': 0.001, 'eval_accuracy': 0.8819188191881919, 'eval_f1': 0.8806623861457068, 'eval_runtime': 23.2454, 'eval_samples_per_second': 23.316, 'eval_steps_per_second': 2.925}
