The **Part 1** of the project is divided into three sections:

1- Feature Extraction (Using SIFT)

2- Outlier Removal (Using RANSAC)

3- Computing the Homographies (Using DLT)


**pip install opencv-python**

**pip install opencv-contrib-python**

**Part 1**

In [1]:
#Imports
from numpy.linalg import eig
import numpy as np
import cv2
import os
import sys
import pickle
from src.extract_features import *
from src.matching_features import *
from src.homography import *
from src.ransac import *
from src.parsing import *
from src.display_video import *
from main import*

In [5]:
config_data = parse_configuration_file('config/part_1.cfg') #Parse the configuration file
match_img1 , match_map = parse_points(config_data) #Parse the points from the configuration file
video_path = config_data[0].split(' ')[1].strip() #Get the video path
H_frame1_to_map =compute_homography(match_img1, match_map)
print(H_frame1_to_map)
print("Condition: ", np.linalg.cond(H_frame1_to_map), '\n')

sift_points, kp_list = extract_features(video_path)
#homography_two_frames(img1, img2, sift_points, kp_list, 1) #option 1 - with openCV; option 2 - with numpy

match = matching_features_SCIKITLEARN(sift_points)
# print(match2)

H_sequential = create_sequential_homographies(match, sift_points)
print('H_sequential' , H_sequential)
H_output = homography_to_map(H_sequential, H_frame1_to_map)
print('H_output', H_output)


image matches:  [('225', '131'), ('580', '120'), ('626', '305'), ('133', '303')]
map matches:  [('225', '131'), ('580', '120'), ('626', '305'), ('133', '303')] 

[[ 1.00000000e+00 -1.83423599e-13  1.22165870e-11]
 [ 8.31845281e-14  1.00000000e+00  7.08329962e-12]
 [ 2.88444403e-16 -9.61481343e-16  1.00000000e+00]]
Condition:  1.0000000000141258 

Total frames of the video:  1901
(Nº features, Nº descriptors per feature):  (5000, 128)
Nº of frames extracted:  20
H_sequential [[ 1.00000000e+00  2.00000000e+00  3.00000000e+00  4.00000000e+00
   5.00000000e+00  6.00000000e+00  7.00000000e+00  8.00000000e+00
   9.00000000e+00  1.00000000e+01  1.10000000e+01  1.20000000e+01
   1.30000000e+01  1.40000000e+01  1.50000000e+01  1.60000000e+01
   1.70000000e+01  1.80000000e+01  1.90000000e+01]
 [ 2.00000000e+00  3.00000000e+00  4.00000000e+00  5.00000000e+00
   6.00000000e+00  7.00000000e+00  8.00000000e+00  9.00000000e+00
   1.00000000e+01  1.10000000e+01  1.20000000e+01  1.30000000e+01
   1.400

In [6]:

H_output=np.empty([11,0])
H_i = np.vstack((np.array([[0], [1]]) , H_frame1_to_map.reshape(9,1) )) #first part of the array is 0 and 1 - which means homography from frame 1 to map (frame 0)
H_output = np.hstack([H_output, H_i])
                        
for i in range(1, len(H_sequential)):
    T_to_map= np.matmul(  H_output[2:,i-1].reshape(3,3), H_sequential[2:,i-1].reshape(3,3)) 
    #for frame n, H_output[2:,i-1] should be the homography from frame n-1 to the map. 
    # H_sequential[2:,i-1] should be the homography from frame n to n-1
    # So T_to_Map should be the homography from frame n to map

    H_i = np.vstack(( np.array([[0],[H_sequential[1,i-1]]] ), T_to_map.reshape(9,1) ))

    H_output = np.hstack([H_output, H_i])


In [7]:

def extract_features_frames(video_path= 'video/trymefirst_lisbon.mp4'):
    """Extracts the features from the video and stores them in a list"""
    print(video_path)
    capture = cv2.VideoCapture(os.path.abspath(video_path))
    kp_list = []
    sift_points = [] #nome a definir no config
    sift = cv2.SIFT_create(5000) #number of sift points
    img1, img2 = None, None
    k = 0
    frames=[]
    count_frames(video_path)
    while k <= 1900:
        capture.set(cv2.CAP_PROP_POS_FRAMES, k)
        success, frame = capture.read() #read the video
        if success:
            if (k == 0):
                img1 = frame
            if (k == 1900):
                img2 = frame
            frame_points = []
            frames.append(frame)
            gray = cv2.cvtColor(frame,cv2.COLOR_BGR2GRAY) #convert image to gray
            key_points, descriptors = sift.detectAndCompute(gray,None) 
            kp_list.append(key_points)
            frame_points = ([key_points[0].pt[0],key_points[0].pt[1]]+descriptors[0].tolist())
            for i in range(1,len(key_points)):
                 temp_column = ([key_points[i].pt[0],key_points[i].pt[1]]+descriptors[i].tolist())
                 frame_points = np.column_stack((frame_points,temp_column))  
        sift_points.append(frame_points) #append everything into a list 
        k += 100
    print("(Nº features, Nº descriptors per feature): ", descriptors.shape)
    print("Nº of frames extracted: ", len(sift_points))
    return frames
    
def show_pixel_value(event, x, y, flags, param):
    if event == cv2.EVENT_LBUTTONDOWN:
        # Get the BGR values at the clicked position
        b, g, r = img[y, x]
        print(f"Pixel value at (x={x}, y={y}): B={b}, G={g}, R={r}")






def display(frame1, frame2,homography_de2_para1 ):
    

    # Display frame 1
    H= homography_de2_para1.reshape((3,3))
    

    height, width = frame2.shape[:2]
    warped_frame2 = cv2.warpPerspective(frame2, H, (width, height))

    # Apply homography to frame 2

    # Display frame 2 with homography applied
    while True:
        cv2.namedWindow('Image')
        cv2.setMouseCallback('Image', show_pixel_value)
        cv2.imshow("Frame 1", frame1)
        cv2.namedWindow('Image2')
        cv2.setMouseCallback('Image2', show_pixel_value)
        cv2.imshow("Frame 2 with Homography", warped_frame2)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    cv2.destroyAllWindows()

video/trymefirst_lisbon.mp4
Total frames of the video:  1901
(Nº features, Nº descriptors per feature):  (5000, 128)
Nº of frames extracted:  20


In [8]:
frames= extract_features_frames()

video/trymefirst_lisbon.mp4
Total frames of the video:  1901
(Nº features, Nº descriptors per feature):  (5000, 128)
Nº of frames extracted:  20


In [9]:
#to test homography
img1=frames[0]
for i in range(1,len(frames)-1):
    img2= frames[i]
    display(img1, img2, H_output[2:,i] )

IndexError: index 11 is out of bounds for axis 1 with size 11

In [24]:
from main import*
H_output = homography_to_map(H_sequential, H_frame1_to_map)
print('H_output', H_output)

TypeError: 'tuple' object is not callable

In [2]:
def count_frames(video_path):
    """Displays the video and counts the number of frames"""
    capture = cv2.VideoCapture(os.path.abspath(video_path))
    total_frames = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
    print("Total frames of the video: ", total_frames)

The following code **extracts SIFT features** from each frame of the input video

In [11]:

capture = cv2.VideoCapture(os.path.abspath('Video/trymefirst_lisbon.mp4'))
kp_list = []
sift_points = [] #nome a definir no config
t = 0 
sift = cv2.SIFT_create(5000) #number of sift points
img1, img2 = None, None
k = 0
count_frames(os.path.abspath('Video/trymefirst_lisbon.mp4'))
while k <= 1900:
        capture.set(cv2.CAP_PROP_POS_FRAMES, k)
        success, frame = capture.read() #read the video
        if success:
            if (k == 0):
                img1 = frame
            if (k == 1900):
                img2 = frame
            frame_points = []
            gray = cv2.cvtColor(frame,cv2.COLOR_BGR2GRAY) #convert image to gray
            key_points, descriptors = sift.detectAndCompute(gray,None) 
            kp_list.append(key_points)
            frame_points = ([key_points[0].pt[0],key_points[0].pt[1]]+descriptors[0].tolist())
            for i in range(1,len(key_points)):
                 temp_column = ([key_points[i].pt[0],key_points[i].pt[1]]+descriptors[i].tolist())
                 frame_points = np.column_stack((frame_points,temp_column))  
        sift_points.append(frame_points) #append everything into a list 
        k += 100
print("(Nº features, Nº descriptors per feature): ", descriptors.shape)
print("Nº of frames extracted: ", len(sift_points))
#print(des.shape)
#print(len(sift_points))
#The keypoint is a point of interest in the image, the descriptor is a vector that describes the image patch around the keypoint

Total frames of the video:  0


NameError: name 'frame_points' is not defined

The following code **matches SIFT features** between the frames

In [4]:
#Brute force method
bf = cv2.BFMatcher(crossCheck=True) #crossCheck is set to true so that the match is symmetric
all_matches = []
match = []
for s in range(len(sift_points)-1):
    point_matches = []

    des1 = (((sift_points[s])[2:,:])).astype('float32')  # descriptors of the first frame
    des2 = (((sift_points[s+1])[2:,:])).astype('float32')  # descriptors of the second
    des1 = np.reshape(des1,(np.shape(des1)[1],128))
    des2 = np.reshape(des2,(np.shape(des2)[1],128))

    if np.shape(des1)[0] > np.shape(des2)[0]:
             des1 = des1[:-abs(np.shape(des1)[0]-np.shape(des2)[0]),:]  # we are removing the last points so that we have an equal amount of SIFT features between two frames
    if np.shape(des1)[0] < np.shape(des2)[0]:
             des2 = des2[:-abs(np.shape(des1)[0]-np.shape(des2)[0]),:]
    matches = bf.match(des1,des2)  # an error occurs if two frames have different amounts of SIFT features

    for i in range(len(matches)):
        match.append(matches)
        point_matches.append([matches[i].queryIdx,matches[i].trainIdx])

    all_matches.append(point_matches)

    
#Feature detection: opencv
#Matching : sklearn , numpy
#RANSAC: numpy
#Create Homography: numpy

In [3]:
"""Feature matching using nearest neighbours, for pairs of consecutive frames"""
    
matches=[]
Threshold=0.75

for s in range(len(sift_points)-1):
    frame1_descriptors = sift_points[s][2:,:] #descriptor values of every feature point for video frame s (current shape: 128x5000)
    frame1_descriptors = np.transpose(frame1_descriptors) # transpose -> current shape: 5000x128 - > 5000 points/queries each with 128 features/columns
    #fit data of features from frame 1 to NearestNeighbour. When we ask for matches from this method, it should give us the 2 closest points to the point given
    nbrs = NearestNeighbors(n_neighbors=2, algorithm='auto').fit(frame1_descriptors) 

    #predict matches for the other frame:
    
    frame_drescriptors = np.transpose(sift_points[s+1][2:,:]) #the same as done some lines above but for frame s+1
    # Find the 2 nearest neighbors
    distances, indices = nbrs.kneighbors(frame_drescriptors) 
    # indices is a 5000x2 shape matrix -> for each of the 5000 given feature points of frame_drescriptors it gives the 2 closest features from video frame 1
    # distances is a measure of distance between the feature points of frame_drescriptors and each of the two givenneighbours from the indices matrix - it has the same size as indices
    
    features_matches=np.empty([4,0])
    features_not_mateched=[]
    for i in range(len(distances)): 
        if distances[i,0]< Threshold*distances[i,1] and distances[i,0]< 700:
            #match is good for first neighbour found
            features_matches= np.hstack((  features_matches   , np.array([[int(i)],[int(indices[i,0])], [distances[i,0]],[distances[i,1]]])  ))
        else:                                                       # indice do frame s+1, indice do frame s, distâncias
            #point is not good
            features_not_mateched.append(i) #features from this frame that were not matched
    
    features_matches = features_matches[:, features_matches[1, :].argsort()] # this sorts the check_for_duplicates matrix in accordance to the values of it's second line
    features_matches_deletedColumns= features_matches.copy()

    for i in reversed (range (1, features_matches.shape[1])): #loop that starts in the last feature - because it deletes elements with their indexes from list check_for_duplicates_deletedColumns
        # this has to be done starting from the end to not change the index of columns

        # duplicates are adjacent because of sort
        if features_matches[1,i-1] == features_matches[1,i]:
            # if the value of the indice i and i-1 are equal, then there is one feature matched to 2 features of the new frame - we need to delete one of the matches
            if features_matches[2,i-1] <= features_matches[2,i]: #check distance of i and i-1. And remove the one with the most distance
                features_matches_deletedColumns= np.delete(features_matches_deletedColumns, i-1, 1) #remove duplicate feature matching (deletes one column - np dimension 1)
                features_not_mateched.append(features_matches[0,i-1]) #append number of feature that was deleted to features not matched
            else:
                features_matches_deletedColumns= np.delete(features_matches_deletedColumns, i, 1) 
                features_not_mateched.append(features_matches[0,i]) 
    
    matched_inThis_frame = features_matches_deletedColumns[:, features_matches_deletedColumns[0, :].argsort()] #to be in order in acoordance to index of frame s

    matches.append( (matched_inThis_frame[0:2,:]))

In [11]:
matches[2]

array([[0.000e+00, 2.000e+00, 3.000e+00, ..., 4.986e+03, 4.996e+03,
        4.999e+03],
       [6.650e+02, 4.133e+03, 1.332e+03, ..., 4.721e+03, 2.964e+03,
        4.553e+03]])

The following code **computes the Homography** between the frames of the video

In [113]:
import numpy as np

def normalize(points):
    mean = np.mean(points, axis=0)
    std_dev = np.std(points)
    T = np.array([[std_dev, 0, mean[0]], [0, std_dev, mean[1]], [0, 0, 1]])
    T_inv = np.linalg.inv(T)
    normalized_points = np.dot(T_inv, np.append(points, np.ones((points.shape[0], 1)), axis=1).T).T
    return normalized_points, T

def construct_matrix_A(points1, points2):
    A = []
    for i in range(points1.shape[0]):
        x1, y1 = points1[i, 0], points1[i, 1]
        x2, y2 = points2[i, 0], points2[i, 1]
        A.append([-x1, -y1, -1, 0, 0, 0, x2*x1, x2*y1, x2])
        A.append([0, 0, 0, -x1, -y1, -1, y2*x1, y2*y1, y2])
    return np.array(A)

def compute_homography(points1, points2):
    points1_norm, T1 = normalize(points1)
    points2_norm, T2 = normalize(points2)
    A = construct_matrix_A(points1_norm, points2_norm)
    _, _, V = np.linalg.svd(A)
    H = V[-1].reshape(3, 3)
    H = np.dot(np.linalg.inv(T2), np.dot(H, T1))
    return H / H[2, 2]

def normalize_points(points):
    # Normalize points to have zero mean and unit variance
    mean = np.mean(points, axis=0)
    std = np.std(points, axis=0)
    normalized_points = (points - mean) / std
    return normalized_points, mean, std


def denormalize_homography(H, mean_src, std_src, mean_dst, std_dst):
    # Denormalize the homography matrix based on mean and standard deviation
   # T_src = np.array([[1 / std_src[0], 0, -mean_src[0] / std_src[0]],
            #          [0, 1 / std_src[1], -mean_src[1] / std_src[1]],
            #          [0, 0, 1]])

   # T_dst = np.array([[1 / std_dst[0], 0, -mean_dst[0] / std_dst[0]],
   #                   [0, 1 / std_dst[1], -mean_dst[1] / std_dst[1]],
   #                   [0, 0, 1]])
    T_src = np.array([[std_src[0], 0, mean_src[0]], [0, std_src[1], mean_src[1]], [0, 0, 1]])
    T_dst = np.array([[std_dst[0], 0, mean_dst[0]], [0, std_src[1], mean_dst[1]], [0, 0, 1]])

    Homography = np.dot(np.linalg.inv(T_dst), np.dot(H, T_src))
    
    return  Homography


In [35]:
from sklearn import preprocessing
kp1 = kp_list[0]
kp2 = kp_list[19]
src_pts = np.float32([ kp1[q[0].queryIdx].pt for q in good ]).reshape(-1,1,2)
dst_pts = np.float32([ kp2[t[0].trainIdx].pt for t in good ]).reshape(-1,1,2)
src = np.reshape(src_pts,(np.shape(src_pts)[0],2))
dst = np.reshape(dst_pts,(np.shape(dst_pts)[0],2))

#src_pts_normalized, mean_src, std_src = normalize_points(src)
#dst_pts_normalized, mean_dst, std_dst = normalize_points(dst)
#src = preprocessing.normalize(src)   #Normalization
#dst = preprocessing.normalize(dst)


In [32]:
def Comp_H(src,dst):
        A = []
        for p, q in zip(src, dst):
            x1 = p[0]
            y1 = p[1]
            x2 = q[0]
            y2 = q[1]
            A.append([-x1, -y1, -1, 0, 0, 0, x2*x1, x2*y1, x2])
            A.append([0, 0, 0, -x1, -y1, -1, y2*x1, y2*y1, y2])

        _, _, Vt = np.linalg.svd(A, full_matrices=True)
        x = Vt[-1]
        homography = x.reshape(3, -1) #/ x[-1]
        return homography

def RANSAC(Comp_H,src,dst,iter,threshold):
      best_homography = None
      inliers = [0]
      for t in range(iter):
            sample_indices = np.random.choice(int(len(src)), size=4, replace=False)
            # Compute the Homography
            H = Comp_H(src[sample_indices],dst[sample_indices])
           # H = denormalize_homography(homography)
            inl = 0
            for p, q in zip(src, dst):
                x1 = p[0]
                y1 = p[1]
                x2 = q[0]
                y2 = q[1]
            # Transform the point using the estimated homography
                transformed_point = np.dot(H, np.array([x1, y1, 1]))

            # Normalize the transformed point
                transformed_point /= transformed_point[2]

            # Calculate the Euclidean distance between the transformed point and the actual point
                distance = np.linalg.norm(np.array([x2, y2, 1]) - transformed_point)
                if distance < threshold:
                   inl += 1
            if inl > inliers[0]:
                 best_homography = H
                 inliers[0] = inl
      return best_homography, inliers[0] 
      

In [37]:
H, inliers = RANSAC(Comp_H,src,dst,143,0.5)
print('condition:',np.linalg.cond(H), 'inliers: ', inliers )

condition: 30609.206718393925 inliers:  44


In [39]:
#src_pts = np.float32([ kp1[all_matches[0][i][0]].pt for i in range(len(all_matches[0])) ]).reshape(-1,1,2)
#dst_pts = np.float32([ kp2[all_matches[0][i][1]].pt for i in range(len(all_matches[0])) ]).reshape(-1,1,2)
#M2, mask = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC,5.0)
#np.linalg.cond(M2)

#kp1 = kp_list[0]
#kp2 = kp_list[1]
#src_pts = np.float32([ kp1[q.queryIdx].pt for q in match[0] ]).reshape(-1,1,2)
#dst_pts = np.float32([ kp2[t.trainIdx].pt for t in match[1] ]).reshape(-1,1,2)
M, _ = cv2.findHomography(src, dst, cv2.RANSAC)
np.linalg.cond(M)


#picture2 = np.reshape(np.array([552,59,1]),(3,1))
#picture1 = np.reshape(np.array([549,56,1]),(3,1))
#pic_h = np.matmul(H,picture1)
#print('the error is: \n', pic_h/pic_h[2]-picture1)

26395.908156677036

In [44]:
homography = H

def apply_homography(image, H):
    """ Apply homography to the image """
    warped_img = cv2.warpPerspective(image, H, (image.shape[1], image.shape[0]))
    return warped_img

img_src = img1
img_dest = img2

warped_src = apply_homography(img_src, homography)
#print('condition:',np.linalg.cond(H),'inliers: ', inliers)
cv2.imshow('Warped Source Image', warped_src)
cv2.imshow('Source Image', img_src)
cv2.imshow('Destination Image', img_dest)
cv2.waitKey(0)
cv2.destroyAllWindows()

**Testing Zone**

In [42]:
def getPerspectiveTransform(src, dst):
    if len(src) == len(dst):
        # Make homogeneous coordiates if necessary
        if src.shape[1] == 2:
            src = np.hstack((src, np.ones((len(src), 1), dtype=src.dtype)))
        if dst.shape[1] == 2:
            dst = np.hstack((dst, np.ones((len(dst), 1), dtype=dst.dtype)))

        # Solve 'Ax = 0'
        A = []
        for p, q in zip(src, dst):
            A.append([0, 0, 0, q[2]*p[0], q[2]*p[1], q[2]*p[2], -q[1]*p[0], -q[1]*p[1], -q[1]*p[2]])
            A.append([q[2]*p[0], q[2]*p[1], q[2]*p[2], 0, 0, 0, -q[0]*p[0], -q[0]*p[1], -q[0]*p[2]])

        eigenvalue,eigenvector=eig(np.matmul(np.transpose(A),A))
        _, _, Vt = np.linalg.svd(A, full_matrices=True)
        x = Vt[-1]
         

        # Reorganize `x` as a matrix
        H = x.reshape(3, -1) / x[-1] # Normalize the last element as 1
        return H
    

H_slides = getPerspectiveTransform(np.reshape(src_pts,(np.shape(src_pts)[0],2)), np.reshape(dst_pts,(np.shape(dst_pts)[0],2)))
np.linalg.cond(H_slides)

861174.1382601217

In [49]:
def select_point(event,x,y,flags,param):
    global ix,iy
    if event == cv2.EVENT_LBUTTONDBLCLK: # captures left button double-click
        print('x = %d, y = %d'%(x, y))

cv2.namedWindow('frame')
cv2.setMouseCallback('frame', select_point)
cv2.imshow('frame',warped_src)
cv2.waitKey(0)
cv2.destroyAllWindows()

x = 422, y = 56
x = 422, y = 56
x = 422, y = 56
x = 423, y = 56


In [None]:


capture = cv2.VideoCapture(os.path.abspath('trymefirst_lisbon.mp4'))
framenr = 0 
list_points = []
while True:
    success, frame = capture.read()
    if success:
        print('Current Frame!')
        cv2.namedWindow('frame')
        cv2.setMouseCallback('frame', select_point)
        cv2.imshow('frame',frame)
        if cv2.waitKey(0) & 0xFF == ord('q'):  ##press q if you want the video to stop 
             break
        key = cv2.waitKey(0) & 0xFF == ord('k')
        
        print('New Frame!')

capture.release()
cv2.destroyAllWindows()