# Training and Testing A Visual Transformer Model
Here we train and test a Vision Transformer (ViT) from [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929)

In [2]:
import torch
import torchvision as tv

# Set to False to force CPU execution even when a CUDA GPU is present.
USE_GPU = True

# Use CUDA only when it is both requested and available; otherwise fall back to the CPU.
device = torch.device('cuda' if (USE_GPU and torch.cuda.is_available()) else 'cpu')
print(f"Using {device} device")

Using cuda device


## Load our Dataset and Create our dataloaders

In [3]:
# Presumably the number of samples per batch for the dataloaders -- confirm.
# NOTE(review): train_loader / val_loader / test_loader are used by later cells
# but are not created in this cell as shown; verify where they are defined.
batch_size = 64

## Load our model

In [None]:
# --- Hyperparameters ---
# Learning rate handed to Adam.
# NOTE(review): OneCycleLR (below) drives the learning rate once it is
# constructed, starting from max_lr / div_factor, so this value is presumably
# overridden in practice -- confirm.
lr = 0.001
# Peak learning rate of the one-cycle schedule; a higher peak lets the model
# train faster at the start.
max_lr = 0.01
weight_decay = 0.00001
EPOCHS = 100
# --- End hyperparameters ---

# Model, loss and optimizer.
# No `weights` argument is passed, so ViT-B/16 is built without pretrained
# weights (`progress` only affects a weight download).
model = tv.models.vit_b_16(progress = True)
# NOTE(review): MSELoss is a regression loss; if the task is classification,
# CrossEntropyLoss is the usual choice -- confirm against the dataset/labels.
loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
# Scales the loss/gradients, which is necessary for mixed-precision training to
# converge properly. GradScaler expects the device *type* as a string
# ("cuda"/"cpu"), not a torch.device object, so pass device.type.
scaler = torch.amp.GradScaler(device=device.type)
# Changes the learning rate based on how far we are into training.
# NOTE(review): total_steps=EPOCHS is only valid if the scheduler is stepped
# once per epoch. If TrainModel steps it once per batch, total_steps must be
# EPOCHS * len(train_loader) -- confirm against Training_Functions.py.
lr_scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=max_lr, total_steps=EPOCHS)

# Extra attributes attached to the model for ease of use (avoids passing so
# many separate variables to the training function).
model.device = device
model.name = "ViT_b_16"
# Where the best model is saved. NOTE(review): backslash separators make this
# path Windows-specific -- confirm before running on other platforms.
model.path = "Trained_Models\\ViT\\"
print(model.path)

Trained_Models\ViT\


## Train our Model

In [None]:
# Training is delegated to the helper defined in "Training_Functions.py".
from Training_Functions import TrainModel

# Arguments are positional; keep this order in sync with TrainModel's signature.
TrainModel(
    model,
    EPOCHS,
    loss_fn,
    train_loader,
    val_loader,
    optimizer,
    lr_scheduler,
    scaler,
)

## Test our Model

In [None]:
# Evaluation is delegated to the helper defined in "Training_Functions.py".
from Training_Functions import TestModel

# Arguments are positional; keep this order in sync with TestModel's signature.
TestModel(
    model,
    test_loader,
    loss_fn,
)