### Dependecies

In [None]:
!pip3 install datasets==2.9.0

In [None]:
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 

In [None]:
!pip3 install transformers==4.26.1

In [None]:
!pip3 install evaluate==0.4.0

## Data gathering

In [1]:
test_save_path = './data/processed_dataset/test/'
train_save_path = './data/processed_dataset/train/'
val_save_path = './data/processed_dataset/val/'

In [2]:
import pandas as pd
import datasets
from datasets import Dataset, DatasetDict, load_dataset, load_from_disk
 
test_dataset = load_from_disk(test_save_path)
val_dataset = load_from_disk(val_save_path)
train_dataset = load_from_disk(train_save_path)

In [2]:
import evaluate
import json
import numpy as np
import os
import pandas as pd
import pyarrow as pa
import requests
import torch
from datasets import load_dataset, load_from_disk, Dataset, Features, Array3D
from io import BytesIO
from typing import Tuple
from PIL import Image

In [3]:
from transformers import AutoProcessor, ViTFeatureExtractor, ViTForImageClassification, Trainer, TrainingArguments, default_data_collator

#### Check GPU support

In [5]:
torch.cuda.is_available(),torch.cuda.device_count(),torch.cuda.current_device()

(True, 1, 0)

In [6]:
torch.cuda.device(0),torch.cuda.get_device_name(0)

(<torch.cuda.device at 0x220c084e710>, 'NVIDIA GeForce RTX 3080')

#### Model initalization

In [7]:
val_size = 0.2
test_size = 0.1
model_name = "google/vit-base-patch16-224-in21k"
num_classes = train_dataset.features["label"].num_classes

In [8]:
# Download model from model hub
# model = ViTForImageClassification(num_labels=num_classes).to("cuda")
model = ViTForImageClassification.from_pretrained(model_name, num_labels=num_classes).to("cuda")
# Download feature extractor from hub
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name)

Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# K for top accuracy metric
k_for_top_acc = 1

# Compute metrics function for binary classification
acc_metric = evaluate.load("accuracy", module_type="metric")

def compute_metrics(eval_pred):
    predicted_probs, labels = eval_pred
    # Accuracy
    predicted_labels = np.argmax(predicted_probs, axis=1)
    acc = acc_metric.compute(predictions=predicted_labels, references=labels)
    # Top-K Accuracy
    top_k_indexes = [np.argpartition(row, -k_for_top_acc)[-k_for_top_acc:] for row in predicted_probs]
    top_k_classes = [top_k_indexes[i][np.argsort(row[top_k_indexes[i]])] for i, row in enumerate(predicted_probs)]
    top_k_classes = np.flip(np.array(top_k_classes), 1)
    acc_k = {
        f"accuracy_k" : sum([label in predictions for predictions, label in zip(top_k_classes, labels)]) / len(labels)
    }
    # Merge metrics
    acc.update(acc_k)
    return acc

In [10]:
# Change labels
id2label = {key:train_dataset.features["label"].names[index] for index,key in enumerate(model.config.id2label.keys())}
label2id = {train_dataset.features["label"].names[index]:value for index,value in enumerate(model.config.label2id.values())}
model.config.id2label = id2label
model.config.label2id = label2id

#### Parameters tuning

In [11]:
import datetime

In [12]:
unique_part = f"{num_classes}_{format(datetime.datetime.now(), '%d%m%y_%H%M%S')}"
model_dir = f"./model_large_{unique_part}"
output_data_dir = f"./outputs_large_{unique_part}"

# Total number of training epochs to perform
num_train_epochs = 30
# The batch size per GPU/TPU core/CPU for training
per_device_train_batch_size = 32
# The batch size per GPU/TPU core/CPU for evaluation
per_device_eval_batch_size = 64
# The initial learning rate for AdamW optimizer
learning_rate = 2e-5
# Number of steps used for a linear warmup from 0 to learning_rate
warmup_steps = 500
# The weight decay to apply to all layers except all bias and LayerNorm weights in AdamW optimizer
weight_decay = 0.01

unique_part = f"{num_classes}_{num_train_epochs}_{format(datetime.datetime.now(), '%d%m%y_%H%M%S')}"
model_dir = f"./model_large_{unique_part}"
output_data_dir = f"./outputs_large_{unique_part}"


main_metric_for_evaluation = "accuracy"

#### Train

In [13]:
training_args = TrainingArguments(
    output_dir = model_dir,
    num_train_epochs = num_train_epochs,
    per_device_train_batch_size = per_device_train_batch_size,
    per_device_eval_batch_size = per_device_eval_batch_size,
    warmup_steps = warmup_steps,
    weight_decay = weight_decay,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    logging_strategy = "epoch",
    logging_dir = f"{output_data_dir}/logs",
    learning_rate = float(learning_rate),
    load_best_model_at_end = True,
    metric_for_best_model = main_metric_for_evaluation,
)

# Create Trainer instance
trainer = Trainer(
    model = model,
    args = training_args,
    compute_metrics = compute_metrics,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    data_collator = default_data_collator,
    tokenizer = feature_extractor
)

In [14]:
trainer.train()

***** Running training *****
  Num examples = 34421
  Num Epochs = 30
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 32280
  Number of trainable parameters = 85820957


Epoch,Training Loss,Validation Loss,Accuracy,Accuracy K
1,1.7665,0.805173,0.843175,0.843175
2,0.5741,0.5131,0.878849,0.878849
3,0.326,0.386145,0.902734,0.902734
4,0.1988,0.364157,0.909544,0.909544
5,0.1236,0.348223,0.912593,0.912593
6,0.0752,0.383202,0.912389,0.912389
7,0.0464,0.403834,0.911881,0.911881
8,0.0281,0.460707,0.911983,0.911983
9,0.0227,0.466238,0.91117,0.91117
10,0.015,0.497549,0.911475,0.911475


***** Running Evaluation *****
  Num examples = 9839
  Batch size = 64
Saving model checkpoint to ./model_large_29_30_280723_200223\checkpoint-1076
Configuration saved in ./model_large_29_30_280723_200223\checkpoint-1076\config.json
Model weights saved in ./model_large_29_30_280723_200223\checkpoint-1076\pytorch_model.bin
Image processor saved in ./model_large_29_30_280723_200223\checkpoint-1076\preprocessor_config.json
***** Running Evaluation *****
  Num examples = 9839
  Batch size = 64
Saving model checkpoint to ./model_large_29_30_280723_200223\checkpoint-2152
Configuration saved in ./model_large_29_30_280723_200223\checkpoint-2152\config.json
Model weights saved in ./model_large_29_30_280723_200223\checkpoint-2152\pytorch_model.bin
Image processor saved in ./model_large_29_30_280723_200223\checkpoint-2152\preprocessor_config.json
***** Running Evaluation *****
  Num examples = 9839
  Batch size = 64
Saving model checkpoint to ./model_large_29_30_280723_200223\checkpoint-3228
Conf

TrainOutput(global_step=32280, training_loss=0.10811014241011539, metrics={'train_runtime': 10771.479, 'train_samples_per_second': 95.867, 'train_steps_per_second': 2.997, 'total_flos': 8.00399176062306e+19, 'train_loss': 0.10811014241011539, 'epoch': 30.0})

#### Evaluation

In [15]:
log_history = pd.DataFrame(trainer.state.log_history)
log_history = log_history.fillna(0)
log_history = log_history.groupby(['epoch']).sum()
log_history

Unnamed: 0_level_0,loss,learning_rate,step,eval_loss,eval_accuracy,eval_accuracy_k,eval_runtime,eval_samples_per_second,eval_steps_per_second,train_runtime,train_samples_per_second,train_steps_per_second,total_flos,train_loss
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1.0,1.7665,1.963751e-05,2152,0.805173,0.843175,0.843175,36.283,271.174,4.244,0.0,0.0,0.0,0.0,0.0
2.0,0.5741,1.896035e-05,4304,0.5131,0.878849,0.878849,35.2488,279.13,4.369,0.0,0.0,0.0,0.0,0.0
3.0,0.326,1.82832e-05,6456,0.386145,0.902734,0.902734,35.4342,277.67,4.346,0.0,0.0,0.0,0.0,0.0
4.0,0.1988,1.760604e-05,8608,0.364157,0.909544,0.909544,35.5309,276.914,4.334,0.0,0.0,0.0,0.0,0.0
5.0,0.1236,1.692889e-05,10760,0.348223,0.912593,0.912593,35.3684,278.186,4.354,0.0,0.0,0.0,0.0,0.0
6.0,0.0752,1.625173e-05,12912,0.383202,0.912389,0.912389,35.3165,278.595,4.361,0.0,0.0,0.0,0.0,0.0
7.0,0.0464,1.557458e-05,15064,0.403834,0.911881,0.911881,35.3514,278.32,4.356,0.0,0.0,0.0,0.0,0.0
8.0,0.0281,1.489742e-05,17216,0.460707,0.911983,0.911983,35.2646,279.005,4.367,0.0,0.0,0.0,0.0,0.0
9.0,0.0227,1.422026e-05,19368,0.466238,0.91117,0.91117,35.2627,279.02,4.367,0.0,0.0,0.0,0.0,0.0
10.0,0.015,1.354311e-05,21520,0.497549,0.911475,0.911475,35.2677,278.98,4.367,0.0,0.0,0.0,0.0,0.0


In [16]:
trainer.save_model(model_dir)

Saving model checkpoint to ./model_large_29_30_280723_200223
Configuration saved in ./model_large_29_30_280723_200223\config.json
Model weights saved in ./model_large_29_30_280723_200223\pytorch_model.bin
Image processor saved in ./model_large_29_30_280723_200223\preprocessor_config.json


In [11]:
# Load dataset
model_dir = 'C:\\research\\model_large_29_30_280723_200223'
test_save_path = 'C:\\research\\data\\processed_dataset\\test'
test_dataset = load_from_disk(test_save_path)
# Load trained model
model = ViTForImageClassification.from_pretrained(model_dir)

# Load feature extractor
feature_extractor = ViTFeatureExtractor.from_pretrained(model_dir)

# Create Trainer instance
trainer = Trainer(
    model=model,
    compute_metrics=compute_metrics,
    data_collator=default_data_collator,
    tokenizer=feature_extractor
)

# Evaluate model
eval_results = trainer.evaluate(eval_dataset=test_dataset)

print(eval_results)

loading configuration file C:\research\model_large_29_30_280723_200223\config.json
Model config ViTConfig {
  "_name_or_path": "google/vit-base-patch16-224-in21k",
  "architectures": [
    "ViTForImageClassification"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": {
    "0": "aksessuary",
    "1": "belye",
    "2": "bluzyi",
    "3": "bodi",
    "4": "bryuki",
    "5": "chasy",
    "6": "dublyenki_i_shuby",
    "7": "futbolki",
    "8": "jaketyi",
    "9": "jiletyi",
    "10": "kardiganyi",
    "11": "kombinezony",
    "12": "korsetyi",
    "13": "kyuloty",
    "14": "legginsy",
    "15": "nakidki",
    "16": "obuv",
    "17": "platya",
    "18": "polo",
    "19": "shorty",
    "20": "sumki",
    "21": "topyi",
    "22": "trikotaj",
    "23": "verkhnyaya",
    "24": "vintazh",
    "25": "vodolazki",
    "26": "women",
    "27": "yubki",
    "28": "yuvelirnye"
  },
  "image_size

{'eval_loss': 0.6878405213356018, 'eval_accuracy': 0.9198046000407083, 'eval_accuracy_k': 0.9198046000407083, 'eval_runtime': 21.742, 'eval_samples_per_second': 225.968, 'eval_steps_per_second': 28.286}
