# Training and Testing A Visual Transformer Model
Here we test a visual Transfrom ViT from [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929)

In [None]:
import torch
import torchvision as tv
import torchvision.transforms.v2 as v2
from our_datasets import Country_images
#from Country_dict import comp_country_dict
from ViT import VisionTransformer

USE_GPU = True
if USE_GPU and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
#get models
print(f"Using {device} device")

Using cuda device


## Load our Dataset and Create our dataloaders

In [None]:
batch_size = 32
weights = tv.models.ViT_B_16_Weights.DEFAULT
transform = v2.Compose([weights.transforms(), ])


dataset_path = "data\\compressed_dataset\\"
dataset = Country_images("data\\compressed_dataset\\country_comp.csv",dataset_path,transform=transform)
num_classes = dataset.get_num_classes()
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(dataset,lengths=[0.7,0.1,0.2])
data_loader_params = {
    'batch_size': batch_size,  # Batch size for data loading
    'num_workers': 10,  # Number of subprocesses to use for data loading
    'persistent_workers': True,  # If True, the data loader will not shutdown the worker processes after a dataset has been consumed once. This allows to maintain the worker dataset instances alive.
    'pin_memory': True,  # If True, the data loader will copy Tensors into CUDA pinned memory before returning them. Useful when using GPU.
    'pin_memory_device': 'cuda' ,  # Specifies the device where the data should be loaded. Commonly set to use the GPU.
}
train_dataloader      = torch.utils.data.DataLoader(train_dataset, **data_loader_params, shuffle=True)
val_dataloader        = torch.utils.data.DataLoader(val_dataset, **data_loader_params, shuffle=True)
test_dataloader       = torch.utils.data.DataLoader(test_dataset, **data_loader_params, shuffle=False,in_order=True)

## Load our model

In [3]:
#hyperparameters:
lr = 0.001
#maximum learning rate we will let our model train in order to train faster at the start
max_lr = 0.1
weight_decay = 0.00001
EPOCHS = 100
#end hyperparameters

#model and optimizers
model = tv.models.vit_b_16(weights=weights)
#model = VisionTransformer(num_classes=num_classes)
model.device = device
model.name = "ViT_l_16"
model.path = "Trained_Models\\ViT\\" #where to save our best model
print(model.path)
#redfine our output layer to output our classes
model.heads.head = torch.nn.Linear(in_features=model.heads.head.in_features,out_features=num_classes,bias=True)
print(model)

loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
#scales the gradients, neccessary for mixed precision data types to properly converge
scaler = torch.amp.GradScaler(device=device)
#change our learning rate based on far we are in training and if we are improving
lr_scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=max_lr, total_steps=EPOCHS*len(train_dataloader))

#added data to our model for ease of use (and to prevent passing so many variables to our training function)


Trained_Models\ViT\
VisionTransformer(
  (conv_proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
  (encoder): Encoder(
    (dropout): Dropout(p=0.0, inplace=False)
    (layers): Sequential(
      (encoder_layer_0): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (self_attention): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (dropout): Dropout(p=0.0, inplace=False)
        (ln_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Dropout(p=0.0, inplace=False)
          (3): Linear(in_features=3072, out_features=768, bias=True)
          (4): Dropout(p=0.0, inplace=False)
        )
      )
      (encoder_layer_1): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=T

## Train our Model

In [4]:
#we will call the function we defined in "Training_Functions.py"
from Training_Functions import TrainModel
model_train = True

if model_train:
    model = model.to(device)
    TrainModel(model,EPOCHS, loss_fn, train_dataloader, val_dataloader, optimizer, lr_scheduler, scaler)
if not model_train:
    checkpoint = torch.load(model.path+model.name+"-best")
    model.load_state_dict(checkpoint)
    model = model.to(device)

EPOCH 1:
  batch 100 loss: 3.5072414875030518
  batch 200 loss: 3.363983631134033
  batch 300 loss: 3.3229148387908936
  batch 400 loss: 3.288853168487549
  batch 500 loss: 3.258984088897705
  batch 600 loss: 3.2367053031921387
  batch 700 loss: 3.2282090187072754
  batch 800 loss: 3.2222955226898193
  batch 900 loss: 3.2156665325164795
  batch 1000 loss: 3.2076709270477295


TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "c:\Users\jacob\AppData\Local\Programs\Python\Python311\Lib\site-packages\torch\utils\data\_utils\worker.py", line 349, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
           ^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\jacob\AppData\Local\Programs\Python\Python311\Lib\site-packages\torch\utils\data\_utils\fetch.py", line 50, in fetch
    data = self.dataset.__getitems__(possibly_batched_index)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\jacob\AppData\Local\Programs\Python\Python311\Lib\site-packages\torch\utils\data\dataset.py", line 416, in __getitems__
    return [self.dataset[self.indices[idx]] for idx in indices]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\jacob\AppData\Local\Programs\Python\Python311\Lib\site-packages\torch\utils\data\dataset.py", line 416, in <listcomp>
    return [self.dataset[self.indices[idx]] for idx in indices]
            ~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^
  File "c:\Users\jacob\Documents\GitHub\ECE-228-Group-Project\our_datasets.py", line 30, in __getitem__
    label = self.target_transform(comp_country_dict[self.labels.iloc[idx, 1]],self.num_classes)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: Country_images.target_transform() takes 2 positional arguments but 3 were given


## Test our Model

In [None]:
#we will call the function we defined in "Training_Functions.py"
from Training_Functions import TestModel
TestModel(model, test_dataloader, loss_fn)