This notebook fine-tunes the GOTURN model on our dataset. It relies on the code provided by **https://github.com/amoudgl/pygoturn**. It was written to run on Google Colab, but you can run it elsewhere as long as you adjust the paths.


In [None]:
# Colab only: expose Google Drive inside the runtime's filesystem so the
# files stored there can be reached from the following cells.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import os

# Location of the pygoturn sources on the mounted Drive, relative to the
# directory the notebook starts in.
SRC_DIR = './drive/MyDrive/pygoturn/src'

# Only change directory when we are not already inside SRC_DIR. Without this
# guard, re-running the cell raises FileNotFoundError, because the relative
# path no longer resolves once the working directory has moved.
if not os.getcwd().endswith(os.sep + os.path.normpath(SRC_DIR)):
    os.chdir(SRC_DIR)
print(os.getcwd())

/content/drive/MyDrive/pygoturn/src


# Fine-Tuning

## Dataset

In [None]:
import torch
from torch.utils.data import Dataset, ConcatDataset
import os
import numpy as np
import cv2
from torchvision import transforms
from PIL import Image
import random

class GOTURN_Dataset(Dataset):
    """Pairs of consecutive frames from one image sequence, with a target box.

    Each item is ``(prev_frame, curr_frame, bbox)``: two adjacent ``.bmp``
    files (in sorted filename order) and the ground-truth row that belongs to
    ``curr_frame``.

    Expected layout::

        <sequence>/groundtruth_rect.txt   # comma-separated, one row per frame
        <sequence>/img/*.bmp              # ``root_dir`` points at this folder

    NOTE(review): the bounding box is returned exactly as read from the file.
    It is not rescaled, flipped or cropped to follow any geometric change that
    ``transform`` applies to the frames -- confirm that the targets are still
    valid for the transforms in use.
    """

    def __init__(self, root_dir, transform=None):
        """Index the frames in ``root_dir`` and load the matching boxes.

        Args:
            root_dir: Folder containing the ``.bmp`` frames. The ground-truth
                file ``groundtruth_rect.txt`` is looked up in its parent
                folder.
            transform: Optional callable applied to each frame (as a PIL
                image) before it is returned.

        Raises:
            FileNotFoundError: If the ground-truth file does not exist.
            ValueError: If fewer than two frames or two boxes are available.
        """
        import warnings  # local import: only used for the mismatch notice

        self.root_dir = root_dir
        self.transform = transform

        # Sorted filename order defines the temporal order of the frames.
        self.frame_files = sorted(
            name for name in os.listdir(root_dir) if name.endswith(".bmp")
        )

        bbox_path = os.path.join(os.path.dirname(root_dir), "groundtruth_rect.txt")
        if not os.path.exists(bbox_path):
            raise FileNotFoundError(f"Bounding box file not found at {bbox_path}")

        # np.loadtxt returns a 1-D array for a single-row file; force 2-D so
        # the column check below cannot fail with an IndexError and the
        # "not enough data" check is the one that reports the problem.
        self.bboxes = np.atleast_2d(np.loadtxt(bbox_path, delimiter=","))

        # If bbox file contains frame indices, remove first column
        if self.bboxes.shape[1] == 5:
            self.bboxes = self.bboxes[:, 1:]

        if len(self.frame_files) < 2 or len(self.bboxes) < 2:
            raise ValueError("Not enough frames or bounding boxes in the dataset")

        if len(self.frame_files) != len(self.bboxes):
            usable = min(len(self.frame_files), len(self.bboxes))
            warnings.warn(
                f"{len(self.frame_files)} frames but {len(self.bboxes)} bounding "
                f"boxes for {root_dir}; only the first {usable} of each are used",
                stacklevel=2,
            )

        # Store image pairs and corresponding bounding boxes
        self.data = self._load_data()

    def _load_data(self):
        """Build the list of ``(prev_path, curr_path, bbox)`` samples.

        Frame ``i`` is paired with frame ``i + 1`` and the box at row
        ``i + 1``. Pairing stops at the shorter of the frame list and the box
        array, so a ground-truth file with fewer rows than frames does not
        cause an out-of-range lookup.
        """
        pair_count = min(len(self.frame_files), len(self.bboxes)) - 1
        return [
            (
                os.path.join(self.root_dir, self.frame_files[i]),
                os.path.join(self.root_dir, self.frame_files[i + 1]),
                self.bboxes[i + 1],  # box of the later frame
            )
            for i in range(pair_count)
        ]

    def __len__(self):
        """Return the number of frame pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load one frame pair and its target box.

        Returns:
            ``(prev_frame, curr_frame, bbox)``. The frames are RGB PIL images,
            or whatever ``transform`` returns for them; ``bbox`` is a float32
            tensor holding the ground-truth row.

        Raises:
            ValueError: If either image file cannot be read.
        """
        prev_frame_path, curr_frame_path, bbox = self.data[idx]

        prev_frame = cv2.imread(prev_frame_path)
        curr_frame = cv2.imread(curr_frame_path)

        # cv2.imread signals failure by returning None instead of raising.
        if prev_frame is None or curr_frame is None:
            raise ValueError(f"Error loading image: {prev_frame_path}, {curr_frame_path}")

        # OpenCV loads BGR; convert to RGB before wrapping as PIL images.
        prev_frame = Image.fromarray(cv2.cvtColor(prev_frame, cv2.COLOR_BGR2RGB))
        curr_frame = Image.fromarray(cv2.cvtColor(curr_frame, cv2.COLOR_BGR2RGB))

        if self.transform:
            # Both frames must receive the same random draw. Replaying the
            # saved generator state achieves that without reseeding the global
            # RNG on every sample, which would also disturb every other
            # consumer of torch's random numbers.
            # Assumes the transform draws from torch's default CPU generator
            # -- TODO confirm for custom transforms.
            rng_state = torch.get_rng_state()
            prev_frame = self.transform(prev_frame)
            torch.set_rng_state(rng_state)
            curr_frame = self.transform(curr_frame)

        return prev_frame, curr_frame, torch.tensor(bbox, dtype=torch.float32)


# Every pipeline resizes the frame to the same (height, width) first and ends
# with ToTensor; the augmented variants insert one random operation in between.
def _resized_pipeline(*middle_steps):
    """Compose Resize((227, 360)) -> *middle_steps -> ToTensor()."""
    return transforms.Compose([
        transforms.Resize((227, 360)),
        *middle_steps,
        transforms.ToTensor(),
    ])


# Plain resize, no augmentation.
base_transform = _resized_pipeline()

# NOTE(review): the flip and the random crop change the frame geometry, while
# GOTURN_Dataset returns the bounding box exactly as read from the file --
# confirm that the targets stay valid for these two pipelines.
augmentations = [
    _resized_pipeline(transforms.RandomHorizontalFlip(p=0.5)),
    _resized_pipeline(
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1)
    ),
    _resized_pipeline(transforms.RandomResizedCrop((227, 360), scale=(0.8, 1.0))),
]

# One dataset per (sequence, pipeline) combination: for each sequence the base
# version comes first, followed by the augmented versions in list order.
sequences = ["swan", "bag", "bear", "rhino", "book"]
datasets = [
    GOTURN_Dataset(f"../data/OTB/{seq}/img", transform=pipeline)
    for seq in sequences
    for pipeline in [base_transform, *augmentations]
]

# Expose everything as a single dataset for the DataLoader.
combined_dataset = ConcatDataset(datasets)

print(f"Total dataset size: {len(combined_dataset)}")


Total dataset size: 1628


## Fine Tuning


In [None]:
# !pip install got10k #if needed

Collecting got10k
  Downloading got10k-0.1.3.tar.gz (31 kB)
  Preparing metadata (setup.py) ... done
Collecting fire (from got10k)
  Downloading fire-0.7.0.tar.gz (87 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 87.2/87.2 kB 7.7 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collecting wget (from got10k)
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: got10k, fire, wget
  Building wheel for got10k (setup.py) ... done
  Created wheel for got10k: filename=got10k-0.1.3-py3-none-any.whl size=43858 sha256=693a3775a81c69b212a0ca891bd91508b3c1a1cdd1a26f73150c2e75d191fe10
  Stored in directory: /root/.cache/pip/wheels/0c/75/da/81b64122700ec083d162c374aba1922beb523d542c429ed8ca
  Building wheel for fire (setup.py) ... done
  Created wheel for fire: filename=fire-0.7.0-py3-none-any.whl size=114249 sha256=4ed7

In [None]:
import torch.optim as optim
from goturn import TrackerGOTURN

from torch.utils.data import DataLoader

# Batches are drawn in shuffled order from the concatenated dataset.
# num_workers=0 keeps loading in the main process.
train_loader = DataLoader(combined_dataset, batch_size=256, shuffle=True, num_workers=0)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_epochs = 50

# Build the tracker from the checkpoint at net_path; fine-tuning starts from
# these weights.
tracker = TrackerGOTURN(net_path="./pytorch_goturn.pth")

# The batches below are moved to `device`, so the network has to live there
# as well. This is a no-op if the tracker already placed it on that device.
# It is done before the optimizer is created so the optimizer references the
# parameters on their final device.
tracker.net.to(device)

# Set model to training mode
tracker.net.train()

# Adam over all network parameters; Smooth L1 between predicted and
# ground-truth boxes.
optimizer = optim.Adam(tracker.net.parameters(), lr=1e-4)  # You can adjust the learning rate
criterion = torch.nn.SmoothL1Loss()

for epoch in range(num_epochs):
    for i, (prev_frame, curr_frame, bbox) in enumerate(train_loader):
        prev_frame, curr_frame, bbox = prev_frame.to(device), curr_frame.to(device), bbox.to(device)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass: the network takes both frames and returns a box.
        pred_bbox = tracker.net(prev_frame, curr_frame)

        loss = criterion(pred_bbox, bbox)

        # Backpropagate and update the weights.
        loss.backward()
        optimizer.step()

        if i % 10 == 0:  # Print every 10 steps
            print(f'Epoch {epoch}/{num_epochs}, Step {i}/{len(train_loader)}, Loss: {loss.item()}')

# Save only the network weights (a bare state_dict).
torch.save(tracker.net.state_dict(), "fine_tuned_goturn.pth")

Downloading: "https://download.pytorch.org/models/alexnet-owt-7be5be79.pth" to /root/.cache/torch/hub/checkpoints/alexnet-owt-7be5be79.pth
100%|██████████| 233M/233M [00:02<00:00, 84.0MB/s]


Epoch 0/300, Step 0/7, Loss: 110.74299621582031
Epoch 1/300, Step 0/7, Loss: 86.9543685913086
Epoch 2/300, Step 0/7, Loss: 51.60289764404297
Epoch 3/300, Step 0/7, Loss: 43.509971618652344
Epoch 4/300, Step 0/7, Loss: 45.34307098388672
Epoch 5/300, Step 0/7, Loss: 41.39385986328125
Epoch 6/300, Step 0/7, Loss: 40.110633850097656
Epoch 7/300, Step 0/7, Loss: 40.62428283691406
Epoch 8/300, Step 0/7, Loss: 39.310997009277344
Epoch 9/300, Step 0/7, Loss: 37.426395416259766
Epoch 10/300, Step 0/7, Loss: 34.927513122558594
Epoch 11/300, Step 0/7, Loss: 26.967416763305664
Epoch 12/300, Step 0/7, Loss: 22.292936325073242
Epoch 13/300, Step 0/7, Loss: 17.704883575439453
Epoch 14/300, Step 0/7, Loss: 15.726051330566406
Epoch 15/300, Step 0/7, Loss: 14.655476570129395
Epoch 16/300, Step 0/7, Loss: 14.180479049682617
Epoch 17/300, Step 0/7, Loss: 13.946524620056152
Epoch 18/300, Step 0/7, Loss: 13.535764694213867
Epoch 19/300, Step 0/7, Loss: 12.867097854614258
Epoch 20/300, Step 0/7, Loss: 12.735

Re-save the new weights in the checkpoint format of the original model

In [None]:
# Load on the GPU when one is available and fall back to the CPU otherwise,
# so this cell also works on a runtime without CUDA (a hard-coded 'cuda'
# map_location fails there).
map_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the fine-tuned checkpoint (a bare state_dict, as written by the
# training cell).
checkpoint = torch.load('./fine_tuned_goturn.pth', map_location=map_device)

# Wrap the weights under a 'state_dict' key to mimic the original model's
# checkpoint structure.
new_checkpoint = {
    'state_dict': checkpoint
}

# Save the new checkpoint
torch.save(new_checkpoint, './fine_tuned_goturn_reformatted.pth')

# Reload the file just written and print its top-level keys as a sanity check.
new_checkpoint = torch.load('./fine_tuned_goturn_reformatted.pth', map_location=map_device)
print(new_checkpoint.keys())

You can test the fine-tuned model by pointing the main notebook (*Project_test.ipynb*) in our GitHub repository at the new weights. (The results aren't great.)