## Q1: SiamFC Object Tracking

In [9]:
import numpy as np
import tensorflow as tf
import cv2
from tensorflow.keras import layers, models
from PIL import Image


# ---------- SiamFC Network Definition ----------
class SiamFC(tf.keras.Model):
    """Siamese network that scores a search patch against a template.

    A single shared backbone embeds both inputs; the response map is the
    cross-correlation of the search embedding with the template embedding.
    The backbone has no padding and a total down-sampling factor of 8
    (one stride-2 convolution followed by two stride-2 poolings).
    """

    def __init__(self):
        super(SiamFC, self).__init__()
        self.backbone = models.Sequential([
            # Height and width are left unspecified: the same weights must
            # embed the template patch and the (larger) search patch, and
            # every layer below is convolutional so any size is valid.
            layers.Input(shape=(None, None, 3)),
            layers.Conv2D(96, kernel_size=11, strides=2, activation='relu'),
            layers.MaxPooling2D(pool_size=3, strides=2),
            layers.Conv2D(256, kernel_size=5, activation='relu'),
            layers.MaxPooling2D(pool_size=3, strides=2),
            layers.Conv2D(384, kernel_size=3, activation='relu'),
            layers.Conv2D(384, kernel_size=3, activation='relu'),
            # No activation on the last layer: raw features are correlated.
            layers.Conv2D(256, kernel_size=3),
        ])

    def xcorr(self, z, x):
        """Cross-correlate search features ``x`` with template features ``z``.

        ``z`` is transposed from (batch, height, width, channels) to
        (height, width, channels, batch) and used as the filter bank of a
        regular ``tf.nn.conv2d`` (not a depthwise convolution). Each template
        in the batch therefore becomes one output channel, and the feature
        channels are summed.

        Args:
            z: Template features, shape (Bz, Hz, Wz, C).
            x: Search features, shape (Bx, Hx, Wx, C).

        Returns:
            Response maps of shape (Bx, Hx - Hz + 1, Wx - Wz + 1, Bz).
            With a single template (Bz == 1) this is one response map per
            search patch. With Bz > 1 every search patch is scored against
            every template, not only the one at the same batch index.
        """
        # conv2d expects filters as (filter_height, filter_width, in_channels, out_channels)
        filters = tf.transpose(z, [1, 2, 3, 0])
        return tf.nn.conv2d(x, filters, strides=1, padding='VALID')

    def call(self, template, search):
        """Embed both patches with the shared backbone and correlate them.

        Args:
            template: Batch of template patches, channels last.
            search: Batch of search patches, channels last.

        Returns:
            The response map produced by :meth:`xcorr`.
        """
        z = self.backbone(template)
        x = self.backbone(search)
        out = self.xcorr(z, x)
        return out


# ---------- Helper Functions ----------
def get_patch(img, center, size, output_size=127):
    """Crop a box centred on ``center`` and resize it to a square patch.

    The part of the box that falls outside the image is filled with the mean
    colour of the visible part, so the returned patch always stays centred on
    ``center`` and keeps the requested extent.

    Args:
        img: Image array indexed as ``img[row, col]`` (height first).
        center: ``(cx, cy)`` centre of the crop in pixel coordinates.
        size: ``(w, h)`` of the crop in pixels; non-integer values are
            rounded to the nearest integer.
        output_size: Side length of the returned square patch.

    Returns:
        The resized crop as ``float32`` with values divided by 255. If the
        box does not overlap the image at all (or has no area), an all-zero
        array of shape ``(output_size, output_size, 3)`` is returned.
    """
    cx, cy = center
    w, h = size
    # Slice bounds must be integers; rounding lets callers pass float sizes.
    w, h = int(round(w)), int(round(h))
    x1 = int(cx - w / 2)
    y1 = int(cy - h / 2)
    x2 = x1 + w
    y2 = y1 + h

    imgh, imgw = img.shape[:2]
    # Portion of the requested box that actually lies inside the image.
    vis_x1, vis_y1 = max(0, x1), max(0, y1)
    vis_x2, vis_y2 = min(imgw, x2), min(imgh, y2)
    if vis_x2 <= vis_x1 or vis_y2 <= vis_y1:
        return np.zeros((output_size, output_size, 3), dtype=np.float32)

    visible = img[vis_y1:vis_y2, vis_x1:vis_x2]
    if (vis_x1, vis_y1, vis_x2, vis_y2) == (x1, y1, x2, y2):
        patch = visible
    else:
        # Pad instead of clipping: clipping would move the patch centre and
        # stretch the remaining pixels when resizing.
        patch = np.empty((h, w) + img.shape[2:], dtype=img.dtype)
        patch[...] = visible.mean(axis=(0, 1))
        patch[vis_y1 - y1:vis_y2 - y1, vis_x1 - x1:vis_x2 - x1] = visible

    patch = cv2.resize(patch, (output_size, output_size))
    patch = patch.astype(np.float32) / 255.0
    return patch


# ---------- Load Video and Template ----------
video_path = 'sample_tracking_video.mp4'

# Patch sizes fed to the network, and the geometry needed to map a cell of
# the response map back to frame pixels.
TEMPLATE_SIZE = 127
SEARCH_SIZE = 255
SEARCH_SCALE = 2   # the search crop is this many times the initial box
TOTAL_STRIDE = 8   # backbone down-sampling: one stride-2 conv, two stride-2 pools

cap = cv2.VideoCapture(video_path)
try:
    ret, first_frame = cap.read()
    if not ret:
        raise IOError(f"Could not read a frame from {video_path!r}")

    init_bbox = cv2.selectROI("Frame", first_frame, fromCenter=False, showCrosshair=True)
    box_w, box_h = int(init_bbox[2]), int(init_bbox[3])
    if box_w <= 0 or box_h <= 0:
        raise ValueError("No region was selected; cannot initialise the tracker")
    cx, cy = init_bbox[0] + box_w // 2, init_bbox[1] + box_h // 2

    # ---------- Prepare SiamFC ----------
    # NOTE: no weights are loaded here, so the network runs with its initial
    # weights. The layers are built lazily on the first call below; an
    # explicit model.build() is not used because call() takes two inputs.
    model = SiamFC()

    # Extract template from initial frame and embed it once: it never changes,
    # so there is no need to push it through the backbone on every frame.
    template = get_patch(first_frame, (cx, cy), (box_w, box_h), TEMPLATE_SIZE)
    template_tensor = np.expand_dims(template, axis=0)
    template_features = model.backbone(template_tensor)

    search_w, search_h = box_w * SEARCH_SCALE, box_h * SEARCH_SCALE

    # ---------- Main Tracking Loop ----------
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Extract search patch
        search = get_patch(frame, (cx, cy), (search_w, search_h), SEARCH_SIZE)
        search_tensor = np.expand_dims(search, axis=0)

        # Run SiamFC
        response = model.xcorr(template_features, model.backbone(search_tensor))
        response_np = response.numpy().squeeze()

        # Find peak response. A completely flat map carries no localisation
        # information, so the previous position is kept in that case.
        if response_np.max() > response_np.min():
            peak_row, peak_col = np.unravel_index(np.argmax(response_np), response_np.shape)
            # Offset from the map centre, converted from response cells to
            # search-patch pixels (stride) and then to frame pixels (crop scale).
            dy = (peak_row - response_np.shape[0] // 2) * TOTAL_STRIDE * search_h / SEARCH_SIZE
            dx = (peak_col - response_np.shape[1] // 2) * TOTAL_STRIDE * search_w / SEARCH_SIZE
            frame_h, frame_w = frame.shape[:2]
            # Keep the centre inside the frame so the next crop still overlaps it.
            cx = min(max(float(cx + dx), 0.0), frame_w - 1.0)
            cy = min(max(float(cy + dy), 0.0), frame_h - 1.0)

        # Draw updated bounding box
        top_left = (int(cx - box_w / 2), int(cy - box_h / 2))
        bottom_right = (int(cx + box_w / 2), int(cy + box_h / 2))
        cv2.rectangle(frame, top_left, bottom_right, (0, 255, 0), 2)
        cv2.imshow("Tracking", frame)

        if cv2.waitKey(30) & 0xFF == ord('q'):
            break
finally:
    # Always free the capture and close the windows, even after an error.
    cap.release()
    cv2.destroyAllWindows()