# Training

This notebook walks through an example of using Faeyon for model fine-tuning. We take a pre-trained ViT model and fine-tune it on a custom dataset. 

## Initialize model

We will use the pre-trained model `vit/vit-base-patch16-224`. This model is available from the built-in configurations. 

In [1]:
import requests
import torch
from PIL import Image
from torch import nn

from faeyon import X, Op
from faeyon.io import load
from faeyon.models import Pipeline
from faeyon.models.tasks import ClassifyTask
from faeyon.training import Recipe

In [19]:
# NOTE(review): the recorded output of this cell is
# `TypeError: 'PosixPath' object is not subscriptable`, i.e. the assignment did
# not complete in the saved run. The meaning of the second positional argument
# (`True`) is not visible here -- TODO confirm against `faeyon.io.load`.
model = load("vit/vit-base-patch16-224", True)

TypeError: 'PosixPath' object is not subscriptable

In [6]:
# Combine the loaded model with a classification task in a single Pipeline.
# NOTE(review): num_hidden=768 and num_labels=1000 presumably correspond to the
# ViT-base hidden size and an ImageNet-1k label set, and pooling=0 presumably
# selects token index 0 -- confirm against ClassifyTask.
pipeline = Pipeline(
    model=model,
    task=ClassifyTask(num_hidden=768, num_labels=1000, pooling=0)
)

In [7]:
# Standard-normal random tensor of shape (1, 3, 224, 224); presumably a dummy
# single-image RGB batch at 224x224. It is not used by any later cell shown here.
x = torch.randn(1, 3, 224, 224)

In [21]:

# Download a sample image and open it with PIL.
url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
# A timeout stops the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() surfaces HTTP failures here rather than as an opaque PIL
# decoding error on an error page.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

In [44]:
from transformers import AutoImageProcessor, ViTMAEForPreTraining, ViTForImageClassification
from PIL import Image
import requests

In [22]:
# Load the image processor and the MAE pre-training model from the
# `facebook/vit-mae-large` checkpoint (downloaded on first use, see progress output).
# NOTE(review): this rebinds `model`, replacing whatever the earlier
# `load(...)` cell assigned to that name.
processor = AutoImageProcessor.from_pretrained('facebook/vit-mae-large')
model = ViTMAEForPreTraining.from_pretrained('facebook/vit-mae-large')

preprocessor_config.json:   0%|          | 0.00/217 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/677 [00:00<?, ?B/s]

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


pytorch_model.bin:   0%|          | 0.00/1.32G [00:00<?, ?B/s]

In [45]:
# Rebind `model` to a ViT image classifier initialised from the same MAE checkpoint.
# Per the recorded warnings, the checkpoint is of type `vit_mae` (not `vit`) and the
# classifier head (`classifier.weight`, `classifier.bias`) is newly initialised; the
# library advises training it on a downstream task before using it for predictions.
model = ViTForImageClassification.from_pretrained('facebook/vit-mae-large')

You are using a model of type vit_mae to instantiate a model of type vit. This is not supported for all configurations of models and can yield errors.
Some weights of ViTForImageClassification were not initialized from the model checkpoint at facebook/vit-mae-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
# Run the image through the processor, requesting PyTorch tensors (`return_tensors="pt"`).
inputs = processor(images=image, return_tensors="pt")

In [30]:
# Forward pass: the processor output is unpacked as keyword arguments to the model.
# NOTE(review): going by the execution counts, this cell (30) ran after the MAE
# pre-training model was loaded (22) and before the classifier was loaded (45), so
# which `model` is called here depends on the order the cells are run in.
outputs = model(**inputs)

In [52]:
# Draw one uniform [0, 1) score per (sample, position) pair and display it.
batch_size, seq_length = 3, 8
noise = torch.rand((batch_size, seq_length))
noise

tensor([[0.6318, 0.1906, 0.5343, 0.2344, 0.9375, 0.7655, 0.5219, 0.0964],
        [0.3034, 0.3704, 0.3058, 0.0647, 0.6987, 0.5331, 0.5707, 0.7920],
        [0.2304, 0.2124, 0.7848, 0.7270, 0.9876, 0.4346, 0.3693, 0.8762]])

In [None]:
def random_masking(sequence, noise=None, mask_ratio=0.75):
    """Keep a random subset of positions in each sequence and mask the rest.

    Positions are ranked per sample by ``noise``; the ``len_keep`` positions with
    the smallest noise values are kept, where
    ``len_keep = int(seq_length * (1 - mask_ratio))``.

    Args:
        sequence: Tensor of shape ``(batch_size, seq_length, dim)``.
        noise: Optional tensor of shape ``(batch_size, seq_length)`` used to rank
            the positions. When omitted, it is drawn uniformly from [0, 1) on
            ``sequence.device``. Passing it makes the result deterministic.
        mask_ratio: Fraction of positions to mask, in ``[0, 1]``. Defaults to
            0.75, the value that used to be hard-coded.

    Returns:
        A tuple ``(sequence_unmasked, mask, ids_restore)``:
            * ``sequence_unmasked`` -- ``(batch_size, len_keep, dim)`` tensor with
              the kept entries, ordered by ascending noise.
            * ``mask`` -- ``(batch_size, seq_length)`` float tensor in the original
              position order; 0 marks a kept position, 1 a removed one.
            * ``ids_restore`` -- ``(batch_size, seq_length)`` index tensor that
              maps the noise-sorted order back to the original order.

    Raises:
        ValueError: If ``mask_ratio`` is outside ``[0, 1]``.
    """
    if not 0.0 <= mask_ratio <= 1.0:
        raise ValueError(f"mask_ratio must be in [0, 1], got {mask_ratio}")

    batch_size, seq_length, dim = sequence.shape
    len_keep = int(seq_length * (1 - mask_ratio))

    if noise is None:
        noise = torch.rand(batch_size, seq_length, device=sequence.device)  # noise in [0, 1)

    # Sort noise for each sample (ascending): small is keep, large is remove.
    # A caller-supplied noise may live on another device, so move the indices over.
    ids_shuffle = torch.argsort(noise, dim=1).to(sequence.device)
    # argsort of a permutation is its inverse: ids_restore undoes the shuffle.
    ids_restore = torch.argsort(ids_shuffle, dim=1)

    # Keep the first subset. expand() broadcasts the index over the feature axis
    # without materialising a copy (repeat() would allocate one).
    ids_keep = ids_shuffle[:, :len_keep]
    gather_index = ids_keep.unsqueeze(-1).expand(-1, -1, dim)
    sequence_unmasked = torch.gather(sequence, dim=1, index=gather_index)

    # Generate the binary mask in sorted order: 0 is keep, 1 is remove ...
    mask = torch.ones([batch_size, seq_length], device=sequence.device)
    mask[:, :len_keep] = 0
    # ... then unshuffle it back to the original position order.
    mask = torch.gather(mask, dim=1, index=ids_restore)

    return sequence_unmasked, mask, ids_restore

In [67]:
# Random input of shape (batch_size, seq_length, 4), i.e. (3, 8, 4) with the
# values assigned in the earlier noise cell.
sequence = torch.randn(batch_size, seq_length, 4)

# Passing the precomputed `noise` fixes which positions are kept. With the
# function's mask ratio of 0.75 and seq_length 8, int(8 * 0.25) = 2 positions
# per row are kept.
res, mask, ids_restore = random_masking(sequence, noise)

In [72]:
import numpy as np

In [75]:
# Build the x/y coordinate arrays of a grid_size x grid_size grid.
grid_size = 4
grid_w = np.arange(0, grid_size, dtype=np.float32)
grid_h = np.arange(0, grid_size, dtype=np.float32)
# "xy" is meshgrid's default indexing: the width axis is passed first, so
# grid[0] varies along columns and grid[1] varies along rows.
grid = np.meshgrid(grid_w, grid_h, indexing="xy")
# The pair is deliberately left unstacked here; a later cell stacks it.

In [76]:
# Display the pair returned by meshgrid: per the output below, the first array
# varies along columns and the second along rows.
grid

(array([[0., 1., 2., 3.],
        [0., 1., 2., 3.],
        [0., 1., 2., 3.],
        [0., 1., 2., 3.]], dtype=float32),
 array([[0., 0., 0., 0.],
        [1., 1., 1., 1.],
        [2., 2., 2., 2.],
        [3., 3., 3., 3.]], dtype=float32))

In [79]:
# Stack the two coordinate arrays along a new leading axis (np.stack's default)
# and insert a singleton axis, giving shape (2, 1, grid_size, grid_size).
np.stack(grid).reshape((2, 1, grid_size, grid_size))

array([[[[0., 1., 2., 3.],
         [0., 1., 2., 3.],
         [0., 1., 2., 3.],
         [0., 1., 2., 3.]]],


       [[[0., 0., 0., 0.],
         [1., 1., 1., 1.],
         [2., 2., 2., 2.],
         [3., 3., 3., 3.]]]], dtype=float32)