# Computer Vision HT 2025 - Optional Practical 4 (v1.0)

**This practical is entirely optional.**

## Instructions

1. Start Google Colab: https://colab.research.google.com. A modal dialog should have appeared to open a new notebook. If not, go to "File>Open notebook".
2. From the open notebook dialog, select the GitHub "tab" and enter this URL: https://github.com/chrirupp/cv_course
3. The notebook(s) should appear (*.ipynb). Select the one for the current practical.
4. To run a notebook on Colab you will typically need some data files (e.g., images). As Colab only loads the notebook itself, these other files need to be downloaded separately. The following cell is a `%%sh` block that downloads the required files. You can inspect the downloaded files by clicking on the "Files" tab on the left.

## Practicalities

The signing-off happens in the last half hour of each session or at the beginning of the following one.
As usual, when checking your work the demonstrator will want to see a working version of the program in action, as well as appropriate comments in your code. Try to make your report as concise as possible, perhaps in the form of appropriate comments in your code.

Since this is a new practical task, any errors, ambiguities or suggestions for improvement should be flagged as soon as possible.

If you are not familiar with the way practicals run, there are department-wide [rules](https://www.cs.ox.ac.uk/teaching/courses/2023-2024/practicals/). There you will find how the compulsory part, the optional tasks, and your report will factor into your mark.

## Advice

* You will need to look at the code for the lectures. There you will find many related computations that you can reuse and adapt to solve the practicals.
* The compulsory part of this practical is designed to give you additional understanding of the concepts taught in the lectures. It should be achievable in one session.

Here we import some libraries that we will need to process images, do maths, and to visualise results.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torchvision
import torch
import cv2
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import json
from torchvision import transforms
from tqdm import tqdm
from torchvision import datasets, transforms
from IPython.display import display, clear_output
%matplotlib inline

The usual set of helper functions:

In [None]:
class Visualizer():
    """Thin wrapper around a grid of matplotlib subplots.

    The grid is always stored as a 2-D array (``squeeze=False``), so every
    ``add_*`` method addresses a subplot as ``(i, j)`` = (row, column).
    """

    def __init__(self, num_rows=1, num_cols=1, figsize=(5,5), axis_off=True, title='', tight=False, cm=None):
        """Create the figure and its ``num_rows`` x ``num_cols`` axes.

        Args:
            num_rows: number of subplot rows.
            num_cols: number of subplot columns.
            figsize: figure size passed to ``plt.subplots``.
            axis_off: if true, remove the x/y ticks from every subplot.
            title: figure-level title (suptitle).
            tight: if true, set the top of the subplot area to 0.88.
            cm: optional colormap, handed to ``plt.set_cmap``.
        """
        self.fig, self.axs = plt.subplots(num_rows, num_cols, figsize=figsize, squeeze=False)
        # remove ticks (on this figure's own axes, not on whatever figure is current)
        if axis_off:
            plt.setp(self.fig.get_axes(), xticks=[], yticks=[])
        # set colormap (a pyplot-level setting, so it is not limited to this figure)
        if cm is not None:
            plt.set_cmap(cm)
        # set supertitle
        self.fig.suptitle(title)
        if tight:
            self.fig.subplots_adjust(top=0.88)

    def add_image_subplot(self, i, j, image, normalize=False, title_str=''):
        """Show ``image`` in subplot ``(i, j)``.

        Arrays with three dimensions have their last axis reversed before
        display (BGR -> RGB), so callers must pass colour images in BGR order.
        If ``normalize`` is true the image is first rescaled with
        ``normalize_image``.
        """
        if normalize:
            image = self.normalize_image(image)
        if len(image.shape) == 3:
            #BGR -> RGB
            image = image[:, :, ::-1]
        self.axs[i, j].imshow(image)
        self.axs[i, j].set_title(title_str)

    def show(self):
        """Display the figure in the notebook.

        ``clear_output(wait=True)`` defers clearing until new output arrives,
        so repeated calls replace the previously shown figure.
        """
        display(self.fig)
        clear_output(wait = True)
        plt.pause(0.05)

    def add_stem_subplot(self, i, j, x, y, title_str=''):
        """Draw a stem plot of ``y`` over ``x`` in subplot ``(i, j)``."""
        self.axs[i, j].stem(x, y)
        self.axs[i, j].set_title(title_str)

    def add_subplot(self, i, j, data, title_str=''):
        """Draw a line plot of ``data`` in subplot ``(i, j)``."""
        self.axs[i, j].plot(data)
        self.axs[i, j].set_title(title_str)

    def add_subplot_xy(self, i, j, x, y, title_str=''):
        """Draw a line plot of ``y`` over ``x`` in subplot ``(i, j)``."""
        self.axs[i, j].plot(x,y)
        self.axs[i, j].set_title(title_str)

    def add_bar_subplot(self, i, j, x, y, title_str=''):
        """Draw a bar chart of ``y`` over ``x`` in subplot ``(i, j)``."""
        self.axs[i, j].bar(x, y)
        self.axs[i, j].set_title(title_str)

    @staticmethod
    def normalize_image(image):
        """Rescale ``image`` linearly to the range [0, 1] as float64.

        A constant image has no range to stretch; it is returned as all
        zeros instead of being divided by zero (which would yield NaNs).
        """
        img = np.float64(image) - np.min(image)
        peak = np.max(img)
        if peak > 0:
            img /= peak
        return img

In [None]:
%%sh
# Download the data - you need to do this only once
# The image is written to the current working directory as ./image_cc3.jpg
wget --no-verbose --output-document=./image_cc3.jpg https://github.com/chrirupp/cv_course/raw/main/data/image_cc3.jpg

## Problem 4.1 - Coordinate Networks

We have seen that neural rendering methods (e.g. NeRFs) learn a coordinate network that maps from coordinates to colours. Here we will train a small model that learns to memorise a single 2D image $f(x,y) = (r,g,b)$.
Define a small 3 layer MLP and write a function to sample an image from it. We will normalise pixel coordinates to lie in the range -1 to 1.

In [None]:
# The download cell saves the file as 'image_cc3.jpg' (no underscore before the 3).
image_path = 'image_cc3.jpg'
image = cv2.imread(image_path)
if image is None:
    # cv2.imread reports a missing/unreadable file by returning None, not by raising
    raise FileNotFoundError(f"Could not read '{image_path}' - run the download cell first.")
#downsample the image by a factor of 8
image = cv2.resize(image, (0,0), fx=0.125, fy=0.125)
# OpenCV loads images as BGR; the rest of the notebook keeps `image` in RGB
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# (H, W, 3) in 0..255  ->  (3, H, W) float32 in 0..1
image = torch.tensor(image, dtype=torch.float32).permute(2,0,1)/255.0

class CoordinateNetwork(nn.Module):
    """Exercise skeleton for the small 3-layer coordinate MLP.

    As given, the three layer attributes are ``None`` placeholders and
    ``forward`` returns its input unchanged; both are meant to be filled in.
    """

    def __init__(self):
        super().__init__()
        # TODO: replace the placeholders with the three layers of the MLP
        self.fc1 = None 
        self.fc2 = None
        self.fc3 = None

    def forward(self, x):
        # TODO: implement the forward pass (currently the identity).
        # NOTE(review): generate_image transposes the output with permute(1,0)
        # and writes it into one row of a 3-channel image, so the output is
        # presumably expected to be (num_pixels, 3) - confirm.
        return x
    
# module-level instance used by the visualisation code further down this cell
model = CoordinateNetwork()

def generate_image(model):
    """Render an image by querying ``model`` one pixel row at a time.

    The output has the same shape as the module-level ``image`` tensor, which
    this function reads directly.

    Args:
        model: callable that receives the concatenated coordinates of one row
            and returns the colours for that row.

    Returns:
        A tensor shaped like ``image`` with values clipped to [0, 1].
    """
    img = torch.zeros_like(image)

    # generate one row at a time
    for y in range(image.shape[1]):
        # TODO: build the coordinates of every pixel in row y.
        # NOTE(review): torch.cat(..., dim=1) below needs 2-D tensors here,
        # presumably of shape (width, 1) with values normalised to [-1, 1] - confirm.
        y_coordinates = None
        x_coordinates = None
        pixel_coordinates = torch.cat([x_coordinates, y_coordinates], dim=1)
        # inference only: no gradients are recorded for the model call
        with torch.no_grad():
            colors = model(pixel_coordinates)
        # transpose the model output and write it into row y of every channel
        img[:, y, :] = colors.permute(1,0).detach().cpu()
    
    return torch.clip(img, 0, 1)

# Render the freshly created model next to the target image.
gen_image = generate_image(model)
vis = Visualizer(num_rows=1, num_cols=2, figsize=(10,5), axis_off=True)
panels = [(image, 'Target image'), (gen_image, 'Generated image')]
for panel_col, (picture, caption) in enumerate(panels):
    # add_image_subplot reverses the channel order of 3-D arrays, so the
    # channel-last array is reversed once here to end up in the original order.
    vis.add_image_subplot(0, panel_col, picture.permute(1,2,0).numpy()[:, :, ::-1], title_str=caption)


Load `image_cc3.jpg` and train the network to reconstruct the image. For this, randomly sample a batch of pixel coordinates. You can use `torch.nn.functional.grid_sample` (imported above as `F.grid_sample`) to find the colors for these pixels. It supports bilinear interpolation, so you do not need to sample exact pixel coordinates.

In [None]:
def train(model, optimizer, iterations=10000):
    """Run ``iterations`` optimisation steps of ``model`` with an L1 colour loss.

    NOTE: a later cell of this notebook defines another function named
    ``train`` with a different signature; re-run this cell before training a
    coordinate network if that later cell has already been executed.

    Args:
        model: network mapping pixel coordinates to predicted colours.
        optimizer: optimiser that updates the model (``zero_grad``/``step``).
        iterations: number of optimisation steps to run.
    """
    bar = tqdm(range(iterations))
    for _ in bar:
        # TODO: sample a random batch of pixel coordinates and look up the
        # matching target colours in the image
        pixel_coordinates = None
        target_colors = None
        pred_colors = model(pixel_coordinates)
        # mean absolute difference between target and predicted colours
        loss = torch.mean(torch.abs(target_colors - pred_colors))

        # show the current loss on the progress bar
        bar.set_description(f'Loss: {loss.item()}', refresh=False)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Train the plain coordinate network, showing the reconstruction after
# every chunk of 10000 optimisation steps.
model = CoordinateNetwork()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
for steps in range(100):
    train(model, optimizer, iterations=10000)
    gen_image = generate_image(model)
    vis = Visualizer(num_rows=1, num_cols=2, figsize=(10,5), axis_off=True)
    panels = [(image, 'Target image'), (gen_image, 'Generated image')]
    for panel_col, (picture, caption) in enumerate(panels):
        # add_image_subplot reverses the channel order again, so reverse it once here
        vis.add_image_subplot(0, panel_col, picture.permute(1,2,0).numpy()[:, :, ::-1], title_str=caption)
    vis.show()

The generated image is very bad and blurry as the network has difficulties learning high frequency components. Positional encoding comes to our rescue. Define a second network that embeds the pixel coordinates using multiple octaves of sin and cos. Separately encode each channel using a positional encoding. The lowest frequency should be `torch.pi`, and each frequency thereafter should be double the previous frequency. For each frequency, you should encode the input signal using both sine and cosine.

In [None]:
class CoordinateNetworkPE(nn.Module):
    """Exercise skeleton: coordinate MLP with a sin/cos positional encoding.

    As given, the layers are ``None`` placeholders and ``forward`` returns
    ``None``; the positional encoding and the forward pass are to be filled in.

    Args:
        num_octaves: number of frequencies used by the positional encoding.
    """

    def __init__(self, num_octaves=6):
        super().__init__()
        self.num_octaves = num_octaves
        # todo - precompute as much as you can

        # TODO: replace the placeholders with the layers of the MLP
        self.fc1 = None
        self.fc2 = None
        self.fc3 = None

    def forward(self, x):
        
        # todo - implement positional encoding
        # (placeholder: discards the input until the encoding is implemented)
        x = None
        
        # todo - implement forward pass
        return x
    
# Train the positional-encoding network, showing the reconstruction after
# every chunk of 10000 optimisation steps.
model = CoordinateNetworkPE()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
for steps in range(100):
    train(model, optimizer, iterations=10000)
    gen_image = generate_image(model)
    vis = Visualizer(num_rows=1, num_cols=2, figsize=(10,5), axis_off=True)
    panels = [(image, 'Target image'), (gen_image, 'Generated image')]
    for panel_col, (picture, caption) in enumerate(panels):
        # add_image_subplot reverses the channel order again, so reverse it once here
        vis.add_image_subplot(0, panel_col, picture.permute(1,2,0).numpy()[:, :, ::-1], title_str=caption)
    vis.show()

Play around with the MLP size, number of octaves, learning rate, loss function, etc. 

* Which configuration gives the best results?
* Why does the model learn a grayscale image first?

## Problem 4.2 - RotNet

In practical 2 you have trained a classifier on CIFAR10. Now train an unsupervised network using only the images. We will follow the idea of [RotNet](https://arxiv.org/pdf/1803.07728.pdf) and learn to predict rotations.
To do this, you will need to randomly rotate the images in your batch by multiples of 90 degrees (this gives you 4 classes) and use the rotation as the class label.

To evaluate your model, you can implement nearest neighbour lookup: For each validation sample check which training sample is closest in feature space. Then use that training sample's label to classify the validation set. This is slightly unrealistic (if we had training labels we could have trained with supervision) but a good baseline to understand the quality of your embedding space.

For fast NN lookup you can use the `faiss` library. It needs these steps to install on Colab.
```
!apt install libomp-dev
!python -m pip install --upgrade faiss faiss-gpu
import faiss
```

Compare the performance of your classifier to an untrained feature learning network and your supervised model from practical 2. 

You can start with this supervised template:

In [None]:
# CIFAR10 train/test splits; ToTensor is the only preprocessing step
transform = transforms.Compose([transforms.ToTensor()])
cifar_kwargs = dict(root='../data', download=True, transform=transform)
train_dataset = datasets.CIFAR10(train=True, **cifar_kwargs)
test_dataset = datasets.CIFAR10(train=False, **cifar_kwargs)

# mini-batches of 128; only the training split is shuffled
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=128, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=128, shuffle=False)

def evaluate(model, test_loader):
    """Return the top-1 accuracy of ``model`` on ``test_loader`` in percent.

    The model is switched to eval mode and left in that mode. The accuracy is
    computed over the samples the loader actually yields, so it stays correct
    for loaders that skip samples (e.g. ``drop_last=True`` or a sampler).

    Args:
        model: network returning one score per class for each sample.
        test_loader: iterable of ``(data, target)`` batches.

    Returns:
        Accuracy in the range [0, 100]; 0.0 if the loader yields no samples.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data, target in test_loader:
            output = model(data)
            # index of the highest score per sample, kept as a column vector
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
            total += pred.shape[0]

    if total == 0:
        # nothing was evaluated; avoid dividing by zero
        return 0.0
    accuracy = 100. * correct / total
    return accuracy

def train(model, train_loader, test_loader, criterion, optimizer, num_epochs=10):
    """Train ``model`` for ``num_epochs`` epochs, evaluating after each one.

    Args:
        model: network to optimise.
        train_loader: iterable of ``(data, target)`` training batches.
        test_loader: loader handed to ``evaluate`` after every epoch.
        criterion: loss called as ``criterion(output, target)``.
        optimizer: optimiser updating the model's parameters.
        num_epochs: number of passes over ``train_loader``.

    Returns:
        ``(train_losses, test_accuracies)``: one loss value per training
        batch and one test accuracy per epoch.
    """
    train_losses, test_accuracies = [], []
    for epoch in range(num_epochs):
        # evaluate() puts the model into eval mode, so re-enable training mode each epoch
        model.train()
        for inputs, labels in tqdm(train_loader):
            optimizer.zero_grad()
            batch_loss = criterion(model(inputs), labels)
            batch_loss.backward()
            optimizer.step()
            train_losses += [batch_loss.item()]

        test_acc = evaluate(model, test_loader)
        test_accuracies += [test_acc]
        print(f'Epoch {epoch+1}/{num_epochs}, Test accuracy: {test_acc:.2f}%')

    return train_losses, test_accuracies

class MiniCNN(torch.nn.Module):
    """Small CNN: two conv + ReLU + 2x2 max-pool stages and a linear classifier.

    The classifier expects 32 * 8 * 8 = 2048 features per sample, which is
    what the two pooling stages produce for 32x32 inputs (e.g. CIFAR10).

    Args:
        num_classes: number of output scores per sample (default 10, e.g. use
            4 when predicting one of four rotations).
    """

    def __init__(self, num_classes=10):
        super().__init__()
        self.conv1 = torch.nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = torch.nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        # 32 channels x 8 x 8 spatial positions after two 2x2 poolings of a 32x32 input
        self.fc = torch.nn.Linear(32 * 8 * 8, num_classes)

    def forward(self, x):
        """Return the class scores, shape ``(batch, num_classes)``."""
        x = torch.nn.functional.relu(self.conv1(x))
        x = torch.nn.functional.max_pool2d(x, kernel_size=2, stride=2)
        x = torch.nn.functional.relu(self.conv2(x))
        x = torch.nn.functional.max_pool2d(x, kernel_size=2, stride=2)
        # Flatten everything except the batch dimension. Unlike view(-1, 2048),
        # this never folds surplus spatial positions into the batch dimension:
        # an input of the wrong size fails loudly in the linear layer instead.
        x = torch.flatten(x, start_dim=1)
        x = self.fc(x)
        return x
    
# Supervised baseline: MiniCNN trained with cross-entropy and Adam for 5 epochs.
model = MiniCNN()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
train_losses, test_accuracies = train(
    model, train_loader, test_loader, criterion, optimizer, num_epochs=5
)