# Creating a Simple SeqTrack Model

First we have to import all the required libraries for creating a simple
SeqTrack model. These are mostly standard PyTorch imports, along with the ViT encoder we will use for the model.

In [2]:
from zipfile import ZipFile, BadZipFile
import os

from torchvision import models, datasets, tv_tensors
from torchvision.transforms import v2
import pathlib
from torchvision.models import vit_b_16, vit_b_32
import matplotlib.pyplot as plt
from torchvision.utils import draw_bounding_boxes
import torch
import torch.nn as nn
import torch.optim as optim

torch.manual_seed(0)

<torch._C.Generator at 0x7a0011d38c70>

# Data Setup
Next we import the data and transform it for training. For this model, I am using the readily available Common Objects in Context (COCO) dataset. I am using the standard dataset as the template image in the SeqTrack model and I am using a distorted image to act as the search image. I use several standard pytorch transforms to distort the image.

In [None]:
def extract_zip_file(extract_path):
    """Extract ``<extract_path>.zip`` into ``extract_path`` and delete the archive.

    Problems are reported with ``print`` instead of being raised, so a
    notebook cell that calls this helper keeps running.

    Args:
        extract_path: Destination directory. The archive is expected at the
            same path with ``.zip`` appended.

    Returns:
        None.
    """
    zip_path = f"{extract_path}.zip"

    # ZipFile() raises FileNotFoundError for a missing archive, so the
    # existence check has to happen before the archive is opened.
    if not os.path.isfile(zip_path):
        print("Error: %s file not found" % zip_path)
        return

    try:
        with ZipFile(zip_path) as zfile:
            zfile.extractall(extract_path)
    except BadZipFile as e:
        # Keep the corrupt archive on disk so it can be inspected.
        print("Error:", e)
        return

    # Only remove the archive once extraction has succeeded.
    os.remove(zip_path)
# Download and exact the cocodataset.
!wget http://images.cocodataset.org/zips/train2017.zip -O coco_train2017.zip
!wget http://images.cocodataset.org/zips/val2017.zip -O coco_val2017.zip
!wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip -O coco_ann2017.zip

extract_train_path = "./coco_train2017"
extract_val_path = "./coco_val2017"
extract_ann_path="./coco_ann2017"
extract_zip_file(extract_train_path)
extract_zip_file(extract_val_path)
extract_zip_file(extract_ann_path)

# For the search images, we distort the images during the transform to act as a next
# video frame in the sequence. This is our search target
transform_search = v2.Compose(
    [
        v2.ToImage(),
        v2.RandomPhotometricDistort(p=1),
        v2.RandomZoomOut(fill={tv_tensors.Image: (123, 117, 104), "others": 0}),
        v2.RandomHorizontalFlip(p=1),
        v2.ToDtype(torch.float32, scale=True),
    ]
)
# We use a simple transform for the template images
transform_template = v2.Compose(
    [
        v2.ToImage(),
        v2.SanitizeBoundingBoxes(),
        v2.ToDtype(torch.float32, scale=True),
    ]
)

coco_train_root = 'coco_train2017/train2017'
coco_train_ann = 'coco_ann2017/annotations/instances_train2017.json'

# The pipelines are passed as `transforms=` (joint) rather than `transform=`:
# `transform=` hands a pipeline the image only, so the zoom-out / flip would
# move the pixels but not the boxes, and SanitizeBoundingBoxes would have no
# target to sanitize.
# The search dataset is wrapped so its targets become tv_tensors that the
# geometric transforms update together with the image.
coco_dataset_search = datasets.wrap_dataset_for_transforms_v2(
    datasets.CocoDetection(root=coco_train_root,
                           annFile=coco_train_ann,
                           transforms=transform_search)
)
# The template dataset is left unwrapped here because later cells wrap it
# themselves with wrap_dataset_for_transforms_v2 before reading samples.
coco_dataset = datasets.CocoDetection(root=coco_train_root,
                                      annFile=coco_train_ann,
                                      transforms=transform_template)
# Combine the data sets into one dataset for use
combined_dataset = torch.utils.data.StackDataset(search=coco_dataset_search, template=coco_dataset)

--2024-11-11 03:13:47--  http://images.cocodataset.org/zips/train2017.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 54.231.204.129, 16.182.105.65, 3.5.28.182, ...
Connecting to images.cocodataset.org (images.cocodataset.org)|54.231.204.129|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19336861798 (18G) [application/zip]
Saving to: ‘coco_train2017.zip’


2024-11-11 03:22:28 (35.4 MB/s) - ‘coco_train2017.zip’ saved [19336861798/19336861798]

--2024-11-11 03:22:28--  http://images.cocodataset.org/zips/val2017.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 52.216.76.116, 16.15.177.171, 52.217.136.33, ...
Connecting to images.cocodataset.org (images.cocodataset.org)|52.216.76.116|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 815585330 (778M) [application/zip]
Saving to: ‘coco_val2017.zip’


2024-11-11 03:22:51 (34.5 MB/s) - ‘coco_val2017.zip’ saved [815585330/815585330]

--2024-11-11 03:22:51--  http:

# Visualize the Dataset
For this next part of the code, I visualize the dataset as well as verify that everything is set up correctly.

In [1]:
!pip freeze > requirements.txt
# Create a PIL Image to Tensor transform that will be used to visualize the data
pil_2_tensor = v2.Compose([v2.PILToTensor(),])
# the following transform is needed to make all the images the
# same size when inputted into the model
pil_2_tensor_resize = v2.Compose([v2.PILToTensor(),v2.Resize((224,224)), v2.ToDtype(torch.float32, scale=True)])
# Wrap the dataset for use with pytorch transformers
dataset = datasets.wrap_dataset_for_transforms_v2(coco_dataset, target_keys=["boxes", "labels", "masks"])
print(len(coco_dataset))
print(type(coco_dataset))
print(len(dataset))
print(type(dataset))

# Visualize the transformer wrapped dataset
for x in dataset:
    img = pil_2_tensor(x[0])
    img_with_boxes = draw_bounding_boxes(img, x[1]['boxes'], width=3)
    plt.imshow(img_with_boxes.numpy().transpose(1, 2, 0))
    plt.show()
    print(x[1]['boxes'])
    break

# Visualize an image from the search dataset
for img, target in coco_dataset_search:
    plt.imshow(img)#.permute(1, 2, 0))
    plt.show()
    print(target)
    break

# Create the SeqTrack Module
Below is the core part of the code. The code itself is relatively straightforward. This simple implementation is one of SeqTrack's strengths: it achieves state-of-the-art results for visual tracking while also being significantly faster than more complicated models.

In [None]:
# Here is the core of the project. The code itself is relatively straightforward
# The module uses a pretrained ViT encoder with a causal decoder
class SeqTrack(nn.Module):
    """Frozen pretrained ViT encoder followed by a small transformer decoder.

    The encoder output is projected to two ``hidden_dim``-wide memory tokens
    per image. The decoder attends to them from the embedded box tokens and a
    linear head scores each of the 4000 coordinate bins per token.
    """

    def __init__(self):
        super().__init__()
        self.hidden_dim = 256
        self.encoder = vit_b_16(weights='DEFAULT')
        # The encoder is used as a fixed feature extractor.
        for param in self.encoder.parameters():
            param.requires_grad = False
        # 1000 encoder outputs per image -> two memory tokens of size hidden_dim.
        self.encoder_2_decoder = nn.Linear(1000, self.hidden_dim * 2)
        self.embedding = nn.Embedding(4000, self.hidden_dim)
        # batch_first=True because forward() builds (batch, tokens, hidden) tensors.
        decoder_layer = nn.TransformerDecoderLayer(d_model=self.hidden_dim,
                                                   nhead=8, dim_feedforward=1024,
                                                   dropout=0.1, activation="relu",
                                                   batch_first=True)
        self.decoder = nn.TransformerDecoder(decoder_layer, 2)
        self.fc = nn.Linear(self.hidden_dim, 4000)  # convert output to n bins

    def forward(self, x, tgt_boxes):
        """Score the coordinate bins for every box token.

        Args:
            x: Batch of images accepted by the ViT encoder.
            tgt_boxes: Sequence with one 1-D integer tensor of bin indices per
                image; all tensors must have the same length.

        Returns:
            Tensor of shape ``(batch, tokens, 4000)`` holding softmax
            probabilities over the bins (each row sums to 1).
        """
        features = self.encoder(x)
        # Any batch size works: each image contributes exactly two memory
        # tokens, so samples never share memory with each other.
        memory = self.encoder_2_decoder(features).view(-1, 2, self.hidden_dim)
        tgt = torch.stack([self.embedding(box) for box in tgt_boxes])
        # No tgt_mask is passed, so every token attends to all box tokens.
        decoded = self.decoder(tgt, memory)
        logits = self.fc(decoded)
        return nn.Softmax(dim=-1)(logits)


# Instantiate the tracker and give all of its parameters to Adam.
model = SeqTrack()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
# Cross-entropy is the training criterion.
criterion = nn.CrossEntropyLoss()


In [None]:
def normalize_bbox(bbox):
    """Scale box coordinates by the canvas size into the range [0, 1].

    Columns 0 and 2 are divided by the canvas width and columns 1 and 3 by the
    canvas height. Results are clamped to [0, 1] so that boxes reaching past
    the canvas cannot produce an out-of-range bin index later.

    Args:
        bbox: ``(N, 4)`` box tensor with a ``canvas_size`` attribute given as
            ``(height, width)``.

    Returns:
        A new float32 tensor of shape ``(N, 4)``. The input is not modified.
    """
    height = bbox.canvas_size[0]
    width = bbox.canvas_size[1]
    scale = torch.tensor([width, height, width, height],
                         dtype=torch.float32, device=bbox.device)
    # Convert first so integer boxes are not truncated to zero by the division.
    return (bbox.to(torch.float32) / scale).clamp(0.0, 1.0)

# Train the Model
The data needs to be processed a little before it can be inputted into the model. First I load the dataset into a standard pytorch DataLoader object. While iterating through the data loader for training, I first extract the bounding box and convert it to the 4000 quantized bins. The images are also converted to tensors in order to be inputted into the model.

In [None]:
num_epochs = 10
NUM_BINS = 4000  # coordinate vocabulary; matches the model's embedding and output size

# Only the boxes are consumed below, so masks are not requested from the wrapper.
dataset = datasets.wrap_dataset_for_transforms_v2(coco_dataset, target_keys=["boxes", "labels"])

data_loader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=True, num_workers=2,
                                          collate_fn=lambda batch: tuple(zip(*batch)),)
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    batches_used = 0
    batches_skipped = 0

    for images, targets in data_loader:
        # Keep only samples that have at least one annotated box.
        samples = [(img, tgt["boxes"]) for img, tgt in zip(images, targets)
                   if "boxes" in tgt and len(tgt["boxes"]) > 0]
        if not samples:
            batches_skipped += 1
            continue

        # Quantize the first box of every image into NUM_BINS bins -> (batch, 4).
        # The clamp keeps indices valid for the embedding table.
        target_bins = torch.stack([
            (normalize_bbox(boxes)[0] * (NUM_BINS - 1)).long().clamp(0, NUM_BINS - 1)
            for _, boxes in samples
        ])
        images_tensor = torch.stack([pil_2_tensor_resize(img) for img, _ in samples])

        optimizer.zero_grad()
        try:
            # NOTE(review): the decoder is fed the same tokens it is asked to
            # predict (no start token / shift) - confirm this is intended.
            outputs = model(images_tensor, list(target_bins.unbind(0)))

            # The model returns softmax probabilities of shape (batch, 4, bins).
            # CrossEntropyLoss expects scores with the class axis second, so
            # flatten to (batch*4, bins) and pass log-probabilities; on
            # normalized probabilities this is the negative log-likelihood of
            # the target bin.
            log_probs = torch.log(outputs.clamp_min(1e-12))
            loss = criterion(log_probs.flatten(0, 1), target_bins.flatten())

            # update the model params
            loss.backward()
            optimizer.step()
        except RuntimeError as e:
            # Best effort: report the failing batch and carry on, but count it
            # so that it does not silently distort the epoch loss.
            batches_skipped += 1
            print(f"Skipping batch: {e}")
            continue

        running_loss += loss.item()
        batches_used += 1

    # Average over the batches that actually contributed a loss value.
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / max(batches_used, 1):.4f}")
    if batches_skipped:
        print(f"Skipped {batches_skipped} batches in epoch {epoch + 1}")


# Finally Test the model
Load the validation dataset and run the model on it to find the loss and collect the output bounding box probabilities.

In [None]:
NUM_BINS = 4000  # coordinate vocabulary; matches the model's embedding and output size

# The validation images live under ./coco_val2017, mirroring the
# 'coco_train2017/train2017' layout used for training.
# No dataset-level transform is attached: pil_2_tensor_resize converts and
# resizes every image below, and transform_template contains
# SanitizeBoundingBoxes, which cannot run on the image alone (all that
# `transform=` would give it).
val_coco_dataset = datasets.CocoDetection(root='coco_val2017/val2017',
                                          annFile='coco_ann2017/annotations/instances_val2017.json')

val_dataset = datasets.wrap_dataset_for_transforms_v2(val_coco_dataset, target_keys=["boxes", "labels"])
# shuffle=False keeps all_outputs in dataset order.
val_data_loader = torch.utils.data.DataLoader(val_dataset, batch_size=8, shuffle=False, num_workers=2,
                                              collate_fn=lambda batch: tuple(zip(*batch)),)
all_outputs = []
running_loss = 0.0
val_batches_used = 0
val_batches_skipped = 0

model.eval()
# Gradients are not needed for evaluation.
with torch.no_grad():
    for images, targets in val_data_loader:
        # Keep only samples that have at least one annotated box.
        samples = [(img, tgt["boxes"]) for img, tgt in zip(images, targets)
                   if "boxes" in tgt and len(tgt["boxes"]) > 0]
        if not samples:
            val_batches_skipped += 1
            continue

        # Quantize the first box of every image into NUM_BINS bins -> (batch, 4).
        val_target_bins = torch.stack([
            (normalize_bbox(boxes)[0] * (NUM_BINS - 1)).long().clamp(0, NUM_BINS - 1)
            for _, boxes in samples
        ])
        val_images_tensor = torch.stack([pil_2_tensor_resize(img) for img, _ in samples])

        try:
            outputs = model(val_images_tensor, list(val_target_bins.unbind(0)))
        except RuntimeError as e:
            # Report and count batches the model cannot process.
            val_batches_skipped += 1
            print(f"Skipping batch: {e}")
            continue

        all_outputs.append(outputs)
        # The model returns probabilities of shape (batch, 4, bins); flatten to
        # (batch*4, bins) and pass log-probabilities so CrossEntropyLoss
        # scores the target bin of each coordinate.
        loss = criterion(torch.log(outputs.clamp_min(1e-12)).flatten(0, 1), val_target_bins.flatten())
        running_loss += loss.item()
        val_batches_used += 1

print(f"Loss: {running_loss / max(val_batches_used, 1):.4f}")
if val_batches_skipped:
    print(f"Skipped {val_batches_skipped} validation batches")
