Changes since May 14
- keep dropout
- better reproducibility (seeds)

In [2]:
import ntpath

from datasets import load_dataset, Image

# How to run this outside the notebook:
#   .venv/Scripts/activate
#   python -m image_process

# Root folder for everything this run writes (checkpoints, metrics, preview images).
base_output_dir = "models/may15_VIT1"
dataset = load_dataset("potato_train/train")
# Second view of the same data with image decoding switched off, so each row
# exposes the source file path of the image.
filenames_ds = load_dataset("potato_train/train").cast_column("image", Image(decode=False))

# Keep only the file name of every image. ntpath.basename treats both "\\" and
# "/" as separators, so this works whether the paths were recorded on Windows
# or on a POSIX system (splitting on "\\" alone returned the whole path there).
filename_col = [ntpath.basename(x['image']['path']) for x in filenames_ds['train']]
dataset['train'] = dataset['train'].add_column("filename", filename_col)

#print(dataset['train'][0])
#base_output_dir

In [3]:
from transformers import ViTImageProcessor

# Checkpoint identifier; the image processor is loaded from it here and the
# same identifier is reused further down when the model itself is created.
model_id = 'google/vit-base-patch16-224-in21k'
feature_extractor = ViTImageProcessor.from_pretrained(model_id)

In [4]:
from torchvision.transforms import (
    CenterCrop,
    Compose,
    Normalize,
    RandomHorizontalFlip,
    RandomVerticalFlip,
    RandomRotation,
    Resize,
    ToTensor,
    ColorJitter,
    RandomAffine,
    Pad,
    RandomCrop
)
from PIL import Image  # Import PIL for RandomAffine's resample
import torch
import numpy as np

def set_seeds(seed):
    """Seed every random number generator this notebook can draw from.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU, plus all CUDA devices when CUDA is available).

    Args:
        seed: Integer seed applied to all generators.
    """
    # Local import: `random` is not part of this cell's import list.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        # Full cuDNN determinism is deliberately left off (too slow here):
        # torch.backends.cudnn.deterministic = True
        # torch.backends.cudnn.benchmark = False

from torchvision.transforms import InterpolationMode

seed = 42
set_seeds(seed)

# Reuse the statistics and input resolution stored with the pretrained image
# processor so training inputs match what the checkpoint expects.
normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)
size = (feature_extractor.size["height"], feature_extractor.size["width"])

# Augmentation pipeline applied to training images only.
training_transforms = Compose([
    Resize(size),
    # CenterCrop(size),  (disabled)
    RandomRotation((-30, 30)),
    RandomHorizontalFlip(),
    Pad(10),  # Add padding before random crop
    RandomCrop(size),  # crop back to the model resolution after padding
    # RandomVerticalFlip(),  (disabled)
    ColorJitter(brightness=(0.8, 1.2), contrast=(0.9, 1.1), saturation=(0.9, 1.1)),
    # torchvision expects an InterpolationMode here; passing the PIL integer
    # constant is a deprecated form of the same setting.
    RandomAffine(degrees=10, translate=(0.05, 0.05), scale=(0.95, 1.05), interpolation=InterpolationMode.BILINEAR),
    ToTensor(),
    normalize
])

def training_image_preprocess(batch):
    """Apply the training augmentation pipeline to a batch of images.

    Args:
        batch: Mapping with an "image" entry holding a list of PIL images.

    Returns:
        The same mapping with an added "pixel_values" entry: the transformed
        images stacked into a single tensor.
    """
    # Force three channels first: the pipeline ends with a 3-channel Normalize,
    # which fails on grayscale, palette or RGBA inputs. For images that are
    # already RGB the conversion leaves the pixel data unchanged.
    batch["pixel_values"] = torch.stack(
        [training_transforms(img.convert("RGB")) for img in batch["image"]]
    )
    return batch

def preprocess(batch):
    """Run the pretrained image processor over a batch (no augmentation).

    Args:
        batch: Mapping with an "image" entry (list of PIL images) and a
            "label" entry.

    Returns:
        The image processor's output (PyTorch tensors) with the batch labels
        attached under "label".
    """
    encoded = feature_extractor(batch['image'], return_tensors='pt')
    encoded['label'] = batch['label']
    return encoded

In [5]:
import torch
import numpy as np

# Train on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [6]:
# Hold out 20% of the labelled data for evaluation. The split reuses the
# notebook-wide `seed` (instead of a second hard-coded 42) so that changing
# the seed in one place changes every seeded step consistently.
train_test_split = dataset["train"].train_test_split(test_size=0.2, shuffle=True, seed=seed)
dataset_train = train_test_split["train"]
dataset_test = train_test_split["test"]

In [7]:
# Take the class count from the label feature's schema rather than from the
# distinct values found in the training split: the schema count stays correct
# even if a class ends up entirely in the test split, and it avoids reading
# the whole label column.
labels = dataset_train.features['label']
num_classes = labels.num_classes
num_classes, labels

(6,
 ClassLabel(names=['Bacteria', 'Fungi', 'Healthy', 'Pest', 'Phytopthora', 'Virus'], id=None))

In [8]:
# Attach the per-batch transforms to each split:
#   - the test split only goes through the pretrained image processor,
#   - the training split goes through the augmentation pipeline.
prepared_test = dataset_test.with_transform(preprocess)
prepared_train = dataset_train.with_transform(training_image_preprocess)

Save a few sample preprocessed images (both train and test) for visual inspection

In [9]:
import os
from torchvision.transforms.functional import to_pil_image

os.makedirs(base_output_dir, exist_ok=True)
output_dir = f"{base_output_dir}/preprocessed_train_images"
os.makedirs(output_dir, exist_ok=True)

# The training pipeline ends with Normalize, so pixel values are no longer in
# [0, 1]. to_pil_image scales float tensors by 255, which corrupts out-of-range
# values, so undo the normalisation before saving.
channel_mean = torch.tensor(feature_extractor.image_mean).view(-1, 1, 1)
channel_std = torch.tensor(feature_extractor.image_std).view(-1, 1, 1)

for index, item in enumerate(prepared_train):
    if index >= 10:  # a handful of samples is enough for a visual check
        break
    pixel_values = item["pixel_values"]
    image = to_pil_image((pixel_values * channel_std + channel_mean).clamp(0, 1))
    label_filename = dataset_train[index]["filename"]

    # Previews are always written as PNG, so the original extension is dropped.
    name_without_extension, _ = os.path.splitext(label_filename)
    filename = f"pp_{name_without_extension}.png"

    filepath = os.path.join(output_dir, filename)
    image.save(filepath)

In [10]:
output_dir = f"{base_output_dir}/preprocessed_test_images"
os.makedirs(output_dir, exist_ok=True)

# NOTE(review): the image processor is assumed to normalise its output (its
# default behaviour), so pixel values are not in [0, 1]. to_pil_image scales
# float tensors by 255, which corrupts out-of-range values; undo the
# normalisation before saving.
channel_mean = torch.tensor(feature_extractor.image_mean).view(-1, 1, 1)
channel_std = torch.tensor(feature_extractor.image_std).view(-1, 1, 1)

for index, item in enumerate(prepared_test):
    if index >= 10:  # a handful of samples is enough for a visual check
        break
    pixel_values = item["pixel_values"]
    image = to_pil_image((pixel_values * channel_std + channel_mean).clamp(0, 1))
    # The transformed item only carries pixel values and the label, so the
    # file name is looked up in the untransformed split.
    label_filename = dataset_test[index]["filename"]

    # Previews are always written as PNG, so the original extension is dropped.
    name_without_extension, _ = os.path.splitext(label_filename)
    filename = f"pp_{name_without_extension}.png"

    filepath = os.path.join(output_dir, filename)
    image.save(filepath)

In [11]:
import evaluate

def collate_fn(batch):
    """Assemble individual examples into one model-ready batch.

    Args:
        batch: List of examples, each with a "pixel_values" tensor and a
            "label" value.

    Returns:
        Dict with "pixel_values" (the example tensors stacked along a new
        first dimension) and "labels" (a tensor built from the labels).
    """
    pixel_values = torch.stack([example['pixel_values'] for example in batch])
    labels = torch.tensor([example['label'] for example in batch])
    return {'pixel_values': pixel_values, 'labels': labels}

# Metric objects are loaded once and reused on every evaluation pass.
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(p):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores; the predicted
            class is the argmax over axis 1) and ``label_ids`` (references).

    Returns:
        Dict merging the accuracy metric's result with the F1 metric's result.
    """
    predicted_classes = np.argmax(p.predictions, axis=1)
    accuracy = accuracy_metric.compute(
        predictions=predicted_classes,
        references=p.label_ids,
    )
    weighted_f1 = f1_metric.compute(
        predictions=predicted_classes,
        references=p.label_ids,
        average="weighted",
    )
    return {**accuracy, **weighted_f1}

In [12]:
from transformers import ViTForImageClassification, Trainer, TrainingArguments, ViTConfig

training_args = TrainingArguments(
    # Output location and checkpoint retention.
    output_dir=base_output_dir,
    save_total_limit=2,
    push_to_hub=False,
    # Optimisation settings.
    num_train_epochs=8,
    per_device_train_batch_size=16,
    learning_rate=5e-5,
    weight_decay=0.01,
    seed=seed,
    # Evaluate and checkpoint every 100 steps, log every 10, and restore the
    # best checkpoint once training finishes.
    eval_strategy="steps",
    eval_steps=100,
    save_steps=100,
    logging_steps=10,
    load_best_model_at_end=True,
    # The dataset transforms read the raw "image" column, so it must be kept.
    remove_unused_columns=False,
)

config = ViTConfig.from_pretrained(model_id)

# Dropout has to be set on the config BEFORE the model is built from it.
config.hidden_dropout_prob = 0.2
config.attention_probs_dropout_prob = 0.2

# Size the classification head from the dataset's label schema and record the
# class names in the config, so the saved model maps predictions to the real
# class names instead of generic LABEL_<i> placeholders.
class_names = dataset_train.features['label'].names
config.num_labels = len(class_names)
config.id2label = dict(enumerate(class_names))
config.label2id = {name: index for index, name in enumerate(class_names)}

print(config.hidden_dropout_prob)
print(config.attention_probs_dropout_prob)

# Build the classifier from the pretrained checkpoint using the customised
# config (dropout and label count), then move it onto the selected device.
model = ViTForImageClassification.from_pretrained(model_id, config=config).to(device)

# Wire model, data, batching and metrics into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=prepared_train,
    eval_dataset=prepared_test,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    processing_class=feature_extractor,
)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0.2
0.2


In [13]:
# Fine-tune the model using the settings in training_args.
train_results = trainer.train()

# Save the model through the trainer; the evaluation cell below reloads both
# the model and the image processor from base_output_dir.
trainer.save_model()
# Report the training metrics and persist them alongside the model.
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)

# save the trainer state
trainer.save_state()

Step,Training Loss,Validation Loss,Accuracy,F1
100,1.0411,0.953686,0.734317,0.726322
200,0.6418,0.644705,0.813653,0.816346
300,0.571,0.55643,0.819188,0.82375
400,0.5132,0.455889,0.846863,0.849575
500,0.2836,0.47963,0.839483,0.839348
600,0.2331,0.426909,0.845018,0.848124
700,0.2688,0.423351,0.852399,0.856389
800,0.2532,0.421326,0.865314,0.867861
900,0.1889,0.432182,0.852399,0.854611
1000,0.2286,0.431955,0.852399,0.855673


***** train metrics *****
  epoch                    =          8.0
  total_flos               = 1250029893GF
  train_loss               =       0.4553
  train_runtime            =   0:27:00.33
  train_samples_per_second =       10.689
  train_steps_per_second   =        0.671


In [14]:
from transformers import Trainer, ViTForImageClassification, ViTFeatureExtractor

# Reload the fine-tuned model and its image processor from disk, so the
# evaluation below measures the saved artefacts rather than in-memory state.
model = ViTForImageClassification.from_pretrained(base_output_dir)
# ViTFeatureExtractor is deprecated; ViTImageProcessor (already imported in the
# image-processor cell near the top of the notebook) is its replacement and is
# the class used everywhere else in this notebook.
feature_extractor = ViTImageProcessor.from_pretrained(base_output_dir)

# Fresh Trainer around the reloaded model, reusing the training configuration,
# datasets, collator and metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    train_dataset=prepared_train,
    eval_dataset=prepared_test,
    processing_class=feature_extractor,
)

# Evaluate on the held-out split.
eval_results = trainer.evaluate()

# Log and save the evaluation metrics.
trainer.log_metrics("eval", eval_results)
trainer.save_metrics("eval", eval_results)

print(eval_results)



***** eval metrics *****
  eval_accuracy               =     0.8653
  eval_f1                     =     0.8679
  eval_loss                   =     0.4213
  eval_model_preparation_time =        0.0
  eval_runtime                = 0:00:23.06
  eval_samples_per_second     =     23.498
  eval_steps_per_second       =      2.948
{'eval_loss': 0.42132607102394104, 'eval_model_preparation_time': 0.0, 'eval_accuracy': 0.8653136531365314, 'eval_f1': 0.8678605637128383, 'eval_runtime': 23.0655, 'eval_samples_per_second': 23.498, 'eval_steps_per_second': 2.948}
