# VO Pipeline
_Vision Algorithms for Mobile Robotics | Fall 2025_<br><br>
_David Jensen, Alessandro Pirini, Matteo Rubini, Alessandro Ferranti_

## Notes on writing code
For now, try to make each block a function; see below for format.

In [None]:
def this_is_a_function(state, params, whatever_else_function_needs):
    """Template for a pipeline step.

    Mutate ``state`` in place and return only values that are not already
    stored in the state.
    """
    return


# Calling the function directly below its definition keeps debugging easy and
# lets the cell produce data for the next step.
this_is_a_function(None, None, None)

This way debugging still works easily, but then we can have a couple main blocks at the very end that handle everything nicely.<br>
For the blocks in the _Operation_ section, probably pass in the state dictionary and parameter class and then whatever else might be relevant

Also, when you need to add a parameter (e.g. a confidence value for RANSAC), you have two options. If it is a global parameter that is the same for all datasets, add it [here](#paramaters-for-all-datasets). If it could change based on the dataset, add it [here](#paramaters-for-specific-datasets). After adding the parameter value in the appropriate place, add it to the params class where required [here](#paramaters).

Another cool thing is `Jupyter Variables`: click on it in the top toolbar, and it shows name, type, size, and value for all variables. Nice for debugging.

## Imports

### Libraries

In [None]:
import os
from glob import glob

import cv2
import skimage

import numpy as np

import matplotlib as plt

### Data
_Ensure that all datasets have been downloaded and unzipped into their respective folders_

In [None]:
# Dataset selector; the cells below branch on this value
# Dataset -> 0: KITTI, 1: Malaga, 2: Parking, 3: Own Dataset
DATASET = 0

In [None]:
# Define dataset paths
# (Set these variables before running)
kitti_path = "kitti/kitti05/kitti"
malaga_path = "malaga/malaga-urban-dataset-extract-07"
parking_path = "parking/parking"
# own_dataset_path = "/path/to/own_dataset"

# Load the frame list, the camera intrinsics K and (where available) the ground
# truth for the selected dataset. Frame lists are sorted because glob() returns
# paths in arbitrary order, while later cells index `images` by frame number.
if DATASET == 0:
    assert 'kitti_path' in locals(), "You must define kitti_path"
    img_dir = os.path.join(kitti_path, '05/image_0')
    images = sorted(glob(os.path.join(img_dir, '*.png')))
    last_frame = 4540
    K = np.array([
        [7.18856e+02, 0, 6.071928e+02],
        [0, 7.18856e+02, 1.852157e+02],
        [0, 0, 1]
    ])
    ground_truth = np.loadtxt(os.path.join(kitti_path, 'poses', '05.txt'))
    ground_truth = ground_truth[:, [-9, -1]]  # same as MATLAB(:, [end-8 end])
elif DATASET == 1:
    assert 'malaga_path' in locals(), "You must define malaga_path"
    img_dir = os.path.join(malaga_path, 'malaga-urban-dataset-extract-07_rectified_800x600_Images')
    # NOTE(review): confirm the frames in this folder really are .png files;
    # the pattern below matches nothing otherwise
    images = sorted(glob(os.path.join(img_dir, '*.png')))
    last_frame = len(images)
    K = np.array([
        [621.18428, 0, 404.0076],
        [0, 621.18428, 309.05989],
        [0, 0, 1]
    ])
elif DATASET == 2:
    assert 'parking_path' in locals(), "You must define parking_path"
    # Parking frames must come from the parking dataset, not the KITTI folder.
    # Assumes the frames sit in an `images/` subfolder next to K.txt and
    # poses.txt -- TODO confirm against the unzipped dataset.
    img_dir = os.path.join(parking_path, 'images')
    images = sorted(glob(os.path.join(img_dir, '*.png')))
    last_frame = 598
    K = np.loadtxt(os.path.join(parking_path, 'K.txt'), delimiter=",", usecols=(0, 1, 2))
    ground_truth = np.loadtxt(os.path.join(parking_path, 'poses.txt'))
    ground_truth = ground_truth[:, [-9, -1]]
elif DATASET == 3:
    # Own Dataset
    # TODO: define your own dataset and load K obtained from calibration of own camera
    assert 'own_dataset_path' in locals(), "You must define own_dataset_path"

else:
    raise ValueError("Invalid dataset index")

## Parameters

### Paramaters for all datasets

In [None]:
# Parameters for Shi-Tomasi corner detection
feature_params = {
    "maxCorners": 10,
    "qualityLevel": 0.3,
    "minDistance": 7,
    "blockSize": 7,
}

# Parameters for Lucas-Kanade (KLT) tracking
lk_params = {
    "winSize": (15, 15),
    "maxLevel": 2,
    "criteria": (cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, 10, 0.03),
}


### Paramaters for specific datasets

In [None]:

# Index of the next keyframe to use for bootstrapping, per dataset
KITTI_BS_KF = 3
MALAGA_BS_KF = 5
PARKING_BS_KF = 5
CUSTOM_BS_KF = 5

# Feature-detection grid per dataset: the image is divided into ROWS x COLS
# cells, and NUM_FEATURES is the number of features to track in each cell
KITTI_ST_ROWS = 2
KITTI_ST_COLS = 4
KITTI_NUM_FEATURES = 20

MALAGA_ST_ROWS = 2
MALAGA_ST_COLS = 4
MALAGA_NUM_FEATURES = 20

PARKING_ST_ROWS = 2
PARKING_ST_COLS = 4
PARKING_NUM_FEATURES = 20

CUSTOM_ST_ROWS = 2
CUSTOM_ST_COLS = 4
CUSTOM_NUM_FEATURES = 20

### Instantiate params class

#### Generate masks for feature tracking

In [None]:
def get_feature_masks(img_path, rows, cols) -> list[np.ndarray]:
    """Build one binary mask per cell of a ``rows`` x ``cols`` grid over an image.

    Args:
        img_path (str): path to an image; only its height and width are used.
        rows (int): number of grid rows.
        cols (int): number of grid columns.

    Returns:
        list[np.ndarray]: ``rows * cols`` uint8 masks of shape (H, W), ordered
        left to right, top to bottom. Each mask is 255 inside its cell and 0
        everywhere else.

    Raises:
        FileNotFoundError: if the image at ``img_path`` cannot be read.
    """
    img = cv2.imread(img_path)
    # cv2.imread reports failure by returning None instead of raising, which
    # would otherwise surface as an opaque AttributeError on `.shape`
    if img is None:
        raise FileNotFoundError(f"Could not read image: {img_path}")
    H, W = img.shape[:2]

    # cell boundaries in pixels (rows + 1 / cols + 1 edges, both ends included)
    row_boundaries = np.linspace(0, H, rows + 1, dtype=int)
    col_boundaries = np.linspace(0, W, cols + 1, dtype=int)

    # create masks left to right, top to bottom
    masks = []
    for r_s, r_e in zip(row_boundaries[:-1], row_boundaries[1:]):
        for c_s, c_e in zip(col_boundaries[:-1], col_boundaries[1:]):
            mask = np.zeros((H, W), dtype="uint8")
            mask[r_s:r_e, c_s:c_e] = 255
            masks.append(mask)

            # visualization (debug only)
            # vis = np.zeros_like(img)
            # vis[mask > 0] = img[mask > 0]
            # cv2.imshow("masked", vis)
            # cv2.waitKey(0)
            # cv2.destroyAllWindows()

    return masks

#### Paramaters

In [None]:
class VO_Params:
    """Plain container bundling the parameters passed around the VO pipeline."""

    # paths to the first and second keyframes used for bootstrapping the dataset
    bs_kf_1: str
    bs_kf_2: str
    # masks splitting the image into regions for feature tracking
    feature_masks: list[np.ndarray]
    shi_tomasi_params: dict
    klt_params: dict
    # camera intrinsics matrix
    k: np.ndarray
    # ADD NEW PARAMS HERE

    def __init__(self, bs_kf_1, bs_kf_2, feature_masks, shi_tomasi_params, klt_params, k):
        """Store every argument unchanged under the attribute of the same name."""
        # bootstrap keyframes
        self.bs_kf_1, self.bs_kf_2 = bs_kf_1, bs_kf_2
        # feature detection
        self.feature_masks = feature_masks
        self.shi_tomasi_params = shi_tomasi_params
        # tracking
        self.klt_params = klt_params
        # camera model
        self.k = k
        # ADD NEW PARAMS HERE

# Pick the bootstrap keyframes and the feature grid for the selected dataset
if DATASET == 0:
    assert 'kitti_path' in locals(), "You must define kitti_path"
    bs_kf_1 = images[0]
    bs_kf_2 = images[KITTI_BS_KF]
    feature_masks = get_feature_masks(bs_kf_1, KITTI_ST_ROWS, KITTI_ST_COLS)
    # ADD NEW PARAMS HERE

elif DATASET == 1:
    assert 'malaga_path' in locals(), "You must define malaga_path"
    bs_kf_1 = images[0]
    bs_kf_2 = images[MALAGA_BS_KF]
    feature_masks = get_feature_masks(bs_kf_1, MALAGA_ST_ROWS, MALAGA_ST_COLS)
    # ADD NEW PARAMS HERE

elif DATASET == 2:
    assert 'parking_path' in locals(), "You must define parking_path"
    bs_kf_1 = images[0]
    bs_kf_2 = images[PARKING_BS_KF]
    feature_masks = get_feature_masks(bs_kf_1, PARKING_ST_ROWS, PARKING_ST_COLS)
    # ADD NEW PARAMS HERE

elif DATASET == 3:
    # Own Dataset
    # TODO: define your own dataset and load K obtained from calibration of own camera
    assert 'own_dataset_path' in locals(), "You must define own_dataset_path"
    # Fail here rather than building VO_Params below from bootstrap values that
    # are undefined (or left over from an earlier run with another dataset).
    # Replace this raise once bs_kf_1, bs_kf_2 and feature_masks are set above.
    raise NotImplementedError("Bootstrap parameters for the own dataset are not defined yet")

else:
    raise ValueError("Invalid dataset index")

# ADD NEW PARAMS HERE TO THE INIT
params = VO_Params(bs_kf_1, bs_kf_2, feature_masks, feature_params, lk_params, K)

# Initialization
- Select two keyframes with large enough baseline
- Use an indirect (feature-based) or direct (KLT) method to establish keypoint correspondences between frames
- Estimate relative pose and triangulate points to bootstrap point cloud (5-pt RANSAC)
- Initialize VO pipeline with inlier keypoints and their associated landmarks

### Initialization,Feature extraction

Detect a set of 2D keypoints in the first bootstrap keyframe (bs_kf_1) that are well distributed across the image.

Implementation:
- We use `cv2.goodFeaturesToTrack` (Shi–Tomasi) to detect corners.
- To avoid clustering of keypoints in high-texture areas, we apply Shi–Tomasi separately in multiple image regions using `params.feature_masks` and then concatenate the results.

Output:
- `st_corners`: array of shape `(N, 1, 2)` (float32), suitable for KLT tracking with `cv2.calcOpticalFlowPyrLK`.


In [None]:
def extract_features(img_grayscale, params):
    """
    Step 1 (Initialization): run Shi–Tomasi corner detection once per feature
    mask and stack the per-region results.

    Args:
        img_grayscale (np.ndarray): bootstrap keyframe 1 in grayscale (H x W).
        params (VO_Params): provides feature_masks and shi_tomasi_params.

    Returns:
        st_corners (np.ndarray): (N, 1, 2) float32 corners for KLT tracking;
        empty (0, 1, 2) when no region produced any corner.
    """
    # Seed with an empty array so the stacked result always has shape (N, 1, 2)
    per_region = [np.empty((0, 1, 2), dtype=np.float32)]
    for n, region_mask in enumerate(params.feature_masks):
        features = cv2.goodFeaturesToTrack(
            img_grayscale, mask=region_mask, **params.shi_tomasi_params
        )
        if features is None:
            # Nothing detected in this region: report it and move on
            print(f"No features found for mask {n+1}!")
            continue
        if features.shape[0] < 10:
            # Few features is only worth a warning, not an error
            print(f"Only {features.shape[0]} features found for mask {n+1}!")
        per_region.append(features)
    return np.vstack(per_region)


### Initialization, KLT tracking across intermediate frames

Establish 2D–2D correspondences between the two bootstrap keyframes (bs_kf_1 → bs_kf_2).

Implementation:
- Tracking the detected keypoints using KLT optical flow (`cv2.calcOpticalFlowPyrLK`).
- Instead of tracking directly from keyframe 1 to keyframe 2 in one shot, we track *frame-by-frame* across the intermediate frames, which is more stable when the motion between keyframes is large.
- We maintain a boolean mask `still_detected` that keeps track of keypoints successfully tracked at every frame.

Output:
- `points[still_detected]`: tracked keypoints in `bs_kf_2`
- `initial_points`: initially detected keypoints in `bs_kf_1`
- `still_detected`: boolean mask (relative to the original set) indicating which keypoints survived the entire bootstrap tracking.



In [None]:
def _read_grayscale(img_path):
    """Read an image as grayscale, raising if the file cannot be loaded."""
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread reports failure by returning None instead of raising
    if img is None:
        raise FileNotFoundError(f"Could not read image: {img_path}")
    return img


def feature_tracking_klt(images, params, st_corners_kf_1):
    """
    Module 2 - Initialization: track keypoints from bs_kf_1 to bs_kf_2 with KLT,
    frame by frame across the intermediate frames.

    Args:
        images (list[str]): full dataset sequence (paths).
        params (VO_Params): provides bs_kf_1, bs_kf_2 and klt_params.
        st_corners_kf_1 (np.ndarray): (N,1,2) keypoints in bs_kf_1.

    Returns:
        points[still_detected] (np.ndarray): (M,1,2) tracked points in bs_kf_2.
        initial_points (np.ndarray): (N,1,2) copy of the points in bs_kf_1.
        still_detected (np.ndarray): (N,) boolean mask of the points that were
            tracked successfully in every frame.

    Raises:
        ValueError: if bs_kf_1 or bs_kf_2 is not an element of ``images``.
        FileNotFoundError: if one of the frames cannot be read.
    """
    img_bs_kf_1_index = images.index(params.bs_kf_1)
    img_bs_kf_2_index = images.index(params.bs_kf_2)

    initial_points = st_corners_kf_1.copy()
    points = st_corners_kf_1.copy()
    still_detected = np.ones(st_corners_kf_1.shape[0], dtype=bool)

    # Nothing to track: no keypoints at all, or no frame after keyframe 1.
    # Returning early keeps an empty point set away from OpenCV.
    if points.shape[0] == 0 or img_bs_kf_1_index >= img_bs_kf_2_index:
        return points[still_detected], initial_points, still_detected

    # Each frame is read from disk once and carried over as the "previous"
    # image of the next step
    current_image = _read_grayscale(images[img_bs_kf_1_index])
    for next_path in images[img_bs_kf_1_index + 1:img_bs_kf_2_index + 1]:
        next_image = _read_grayscale(next_path)
        points, status, _ = cv2.calcOpticalFlowPyrLK(
            current_image, next_image, points, None, **params.klt_params
        )
        # A point only survives if it was tracked in every frame so far
        still_detected &= status.flatten() == 1
        current_image = next_image

    return points[still_detected], initial_points, still_detected

In [None]:
# Checkpoint for steps 1 and 2 of the initialization
# Load both bootstrap keyframes as grayscale images
img_bs_kf_1 = cv2.imread(params.bs_kf_1, cv2.IMREAD_GRAYSCALE)
img_bs_kf_2 = cv2.imread(params.bs_kf_2, cv2.IMREAD_GRAYSCALE)

# 1) Extract features in the first keyframe
st_corners_kf_1 = extract_features(img_bs_kf_1, params)

# 2) KLT inter-frame tracking up to the second keyframe.
#    st_corners_kf_2 holds only the surviving tracks, while st_corners_kf_1 is
#    rebound to the full initial set; index it with still_detected to pair them.
(
    st_corners_kf_2,
    st_corners_kf_1,
    still_detected,
) = feature_tracking_klt(images, params, st_corners_kf_1)

print("Initial corners:", st_corners_kf_1.shape[0])
print("Tracked corners:", st_corners_kf_2.shape[0])


In [None]:
# Simple visualization of the surviving tracked points on the first keyframe
vis = cv2.cvtColor(img_bs_kf_1, cv2.COLOR_GRAY2BGR)
# Draw at most the first 200 surviving corners as small filled green dots
for p in st_corners_kf_1[still_detected][:200]:
    x, y = p.ravel()
    cv2.circle(vis, (int(x), int(y)), 2, (0, 255, 0), -1)
cv2.imshow("Tracked corners (alive) on kf1", vis)
cv2.waitKey(0)
# Close the window once a key has been pressed; otherwise it is left open
cv2.destroyAllWindows()


In [None]:
# track features to second keyframe
# NOTE(review): this single-shot track from keyframe 1 to keyframe 2 rebinds
# st_corners_kf_2, and the status array `st` is not used here to drop points
# that failed to track. If the frame-by-frame result of feature_tracking_klt is
# the one meant to be kept, confirm whether this cell is still needed.
st_corners_kf_2, st, err = cv2.calcOpticalFlowPyrLK(img_bs_kf_1, img_bs_kf_2, st_corners_kf_1, None, **params.klt_params)

### Transformation
TODO: find transformation between the two frames using cv2.findHomography()

In [None]:
# make sure to use ransac

### Triangulate points to get point cloud
TODO: find 3d points using triangulatePoints(); the projection matrices are $K*[R|T]$ where $[R|T]$ is $[I|0]$ for the first image and the homography from above for the second image

In [None]:
def triangulate_bootstrap(params: VO_Params, H: np.ndarray, points_1: np.ndarray, points_2: np.ndarray) -> np.ndarray:
    """Bootstrap the initial 3D point cloud, assuming the first frame is the origin

    Args:
        params (VO_Params): params object for the dataset being used
        H (np.ndarray): [3 x 4] transformation from bootstrap keyframe 1 to 2; it is
            multiplied by the intrinsics to form the second projection matrix
        points_1 (np.ndarray): [k x 1 x 2] (or [k x 2]) keypoints detected in bootstrap keyframe 1
        points_2 (np.ndarray): matching keypoints tracked in bootstrap keyframe 2,
            same shape and order as points_1

    Returns:
        np.ndarray: [3 x k] array of triangulated points
    """

    # projection matrices: K [I|0] for the first frame, K H for the second
    proj_1 = params.k @ np.hstack([np.eye(3), np.zeros((3, 1))])
    proj_2 = params.k @ H

    # cv2.triangulatePoints documents its point inputs as 2 x k arrays, so pass
    # that layout explicitly instead of relying on how (k, 1, 2) is interpreted
    pts_1 = np.ascontiguousarray(points_1.reshape(-1, 2).T)
    pts_2 = np.ascontiguousarray(points_2.reshape(-1, 2).T)

    # triangulate homogeneous coordinates using DLT
    points_homo = cv2.triangulatePoints(proj_1, proj_2, pts_1, pts_2)

    # convert back to 3D by dividing out the homogeneous coordinate
    points_3d = points_homo[:3, :] / points_homo[3, :]

    return points_3d


# Smoke test with a placeholder transformation [I | 1] until the real one is estimated.
# NOTE(review): both point arrays must hold the same keypoints in the same
# order -- confirm st_corners_kf_1 and st_corners_kf_2 have matching lengths here.
pts = triangulate_bootstrap(params, np.hstack([np.eye(3), np.ones((3, 1))]), st_corners_kf_1, st_corners_kf_2)

### Build state to initialize the algorithm
The state is $(P^i, X^i, C^i, F^i, \Tau^i)$ where<br>
$P^i$ is a `[k x 1 x 2]` matrix of the pixel coordinates of the initial features in the second keyframe of the dataset<br>
$X^i$ is a `[3 x k]` matrix of the 3D coordinates of the corresponding landmarks<br>
$C^i$ is a `[m x 1 x 2]` matrix of the current locations of the candidate keypoints (empty to start, so `m=0`)<br>
$F^i$ is a `[m x 1 x 2]` matrix of the initial observations of the candidate keypoints (empty to start, so `m=0`)<br>
$\Tau^i$ is a `[m x 12]` matrix of the camera pose at the initial observation of each candidate keypoint (empty to start, so `m=0`)<br>
This can be stored in a dict $S = \{P: [k \times 1 \times 2], X: [3 \times k], ...\}$

In [None]:
# note that only P and X should be populated; the rest should be initialized as empty (ex for Ci: np.empty((0, 1, 2)))

# Operation
- Match keypoints in current image to existing landmarks
    - Extract keypoints (Harris)
    - Track (KLT)
- Estimate pose
    - Estimate pose and handle outliers (P3P plus RANSAC)
- Add new landmarks as needed by triangulating new features
    - Keep track of candidate landmarks
        - Keypoint itself
        - Observation when first seen
        - Pose when first seen
    - Only add when they have been tracked for long enough and the baseline is large enough
    - Discard if track fails<br>

_NOTE: this starts at the frame after the second keyframe (`bs_kf_2`) and goes until the last frame in the dataset_


### Track keypoints forward one frame
TODO: Use cv2.calcOpticalFlowPyrLK() with previous frame, current frame, $P^i$ and $C^i$,  - see use in initialization for example and then update $P^i$ and $X^i$ as well as $C^i$, $F^i$ and $\Tau^i$ based on the features that were successfully tracked (ie remove any features that were not tracked)

### Estimate pose
TODO: use cv2.solvePnPRansac() with the updated $P^i$, $X^i$ and $K$ to find the pose of the camera at the current position, and then update $P^i$ and $X^i$ with the inliers

### Try triangulating
TODO: Check the angle between the first observation ($F^i$ and $\Tau^i$) of the candidate keypoints and the current observation ($C^i$ and current pose) - triangulate features using cv2.triangulatePoints() if the angle is above a threshold

### Find new features
TODO: use cv2.goodFeaturesToTrack() - see use in initialization for example

In [None]:
# NOTE(review): looks like a leftover debug print in a not-yet-implemented cell -- confirm and remove
print("tstt")

### Add new features
TODO: if the feature is not already being tracked as an actual keypoint or candidate keypoint, update $C^i$ and $F^i$ with the keypoint location and $\Tau^i$ with the current camera pose

In [None]:
def keypoints2set(keypoints: np.ndarray) -> set:
    """Turn a [k x 1 x 2] keypoint array into a set of (u, v) tuples

    Duplicate keypoints collapse into a single entry.

    Args:
        keypoints (np.ndarray): keypoint array

    Returns:
        set: keypoint set
    """
    # unwrap the singleton middle axis of every entry before building the tuple
    first_points = (row[0] for row in keypoints.tolist())
    return {(point[0], point[1]) for point in first_points}

def set2keypoints(keypoint_set: set, dtype=np.float32) -> np.ndarray:
    """Convert keypoint set (u, v) to a numpy array [k x 1 x 2]

    The result always has three dimensions, so an empty set gives a
    [0 x 1 x 2] array that can still be stacked onto existing keypoints.
    The row order follows the iteration order of the set.

    Args:
        keypoint_set (set): keypoint set
        dtype: element type of the result; defaults to float32 like the other
            [k x 1 x 2] keypoint arrays in this notebook

    Returns:
        np.ndarray: keypoint array with shape [k x 1 x 2]
    """
    coords = [[keypoint[0], keypoint[1]] for keypoint in keypoint_set]
    # reshape fixes the layout even when coords is empty (plain np.array([])
    # would be one-dimensional)
    return np.array(coords, dtype=dtype).reshape(-1, 1, 2)

# find which features are actually new: candidates that are neither tracked
# keypoints (P) nor already-registered candidate keypoints (C)
# NOTE(review): membership is tested by exact coordinate equality -- confirm
# that this is strict enough for points whose coordinates come from tracking
already_tracked = keypoints2set(S["P"]) | keypoints2set(S["C"])
new_features = set2keypoints(keypoints2set(candidate_new_features) - already_tracked)

# append new features to current points, first observed points, and first observed camera pose
if len(new_features) > 0:
    # keep the dtype of the existing state arrays unchanged when stacking
    S["C"] = np.vstack((S["C"], new_features.astype(S["C"].dtype)))
    # assumes the first observations are stored under the key "F" -- TODO confirm
    S["F"] = np.vstack((S["F"], new_features.astype(S["F"].dtype)))
    # one copy of the flattened current pose per new candidate (one row each)
    new_poses = np.repeat(cur_pose.flatten()[None, :], len(new_features), axis=0)
    S["T"] = np.vstack((S["T"], new_poses))
