## Averaging Frames

In this notebook I want to write some code that will take in a video, split it up into frames, run each frame through CLIP, and then average the resulting frame embeddings into a single vector for the whole video.

In [3]:
import cv2
import numpy as np
import os
%matplotlib inline 
from matplotlib import pyplot as plt
from PIL import Image
import os
import clip
import torch

In [7]:
def get_saving_frames_durations(cap, saving_fps):
    '''Return the list of timestamps (in seconds) at which frames should be saved.

    The clip duration is computed as (frame count / FPS), both as reported by
    ``cap``. Timestamps start at 0 and advance in steps of ``1 / saving_fps``
    up to (but not including) the clip duration.

    Args:
        cap: object exposing ``get(prop_id)`` for ``cv2.CAP_PROP_FRAME_COUNT``
            and ``cv2.CAP_PROP_FPS`` (in this notebook, a ``cv2.VideoCapture``).
        saving_fps: number of frames to keep per second of video; must be > 0.

    Returns:
        A list of timestamps in ascending order (empty if the clip has no frames).

    Raises:
        ValueError: if the FPS reported by ``cap`` or ``saving_fps`` is not a
            positive number. Previously this surfaced as a ZeroDivisionError.
    '''
    fps = cap.get(cv2.CAP_PROP_FPS)
    # `not x > 0` (rather than `x <= 0`) also rejects NaN.
    # A capture that failed to open typically reports 0 here.
    if not fps > 0:
        raise ValueError(
            f"Video reports a non-positive FPS ({fps!r}); was it opened successfully?"
        )
    if not saving_fps > 0:
        raise ValueError(f"saving_fps must be positive, got {saving_fps!r}")

    # get the clip duration by dividing number of frames by the number of frames per second
    clip_duration = cap.get(cv2.CAP_PROP_FRAME_COUNT) / fps
    # np.arange() is used because range() cannot take floating-point steps
    return list(np.arange(0, clip_duration, 1 / saving_fps))

In [30]:
def save_video_frames(video_file, SAVING_FRAMES_PER_SECOND):
    '''Sample frames from a video and return them as a list of RGB numpy arrays.

    Frames are read sequentially. A frame is kept when its timestamp
    (frame index / video FPS) reaches the next pending sampling timestamp
    produced by ``get_saving_frames_durations``; at most one sampling timestamp
    is consumed per frame.

    Args:
        video_file: path of the video, passed to ``cv2.VideoCapture``.
        SAVING_FRAMES_PER_SECOND: desired number of frames to keep from each
            second of video. Values above the video's own FPS are capped to it.

    Returns:
        A list of frames in playback order. Each frame has its last axis
        reversed (BGR -> RGB) and is stored as a contiguous array.

    Raises:
        IOError: if the video cannot be opened.
        ValueError: if the video reports a non-positive FPS or
            ``SAVING_FRAMES_PER_SECOND`` is not positive.
    '''
    # read the video file
    cap = cv2.VideoCapture(video_file)
    try:
        if not cap.isOpened():
            raise IOError(f"Could not open video file: {video_file!r}")

        # get the fps of the video
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:
            raise ValueError(f"Video {video_file!r} reports a non-positive FPS ({fps!r})")
        if not SAVING_FRAMES_PER_SECOND > 0:
            raise ValueError(
                f"SAVING_FRAMES_PER_SECOND must be positive, got {SAVING_FRAMES_PER_SECOND!r}"
            )

        # if SAVING_FRAMES_PER_SECOND is above the video fps, then set it to fps (as maximum)
        saving_frames_per_second = min(fps, SAVING_FRAMES_PER_SECOND)

        # get the list of timestamps at which a frame should be kept
        saving_frames_durations = get_saving_frames_durations(cap, saving_frames_per_second)

        all_frames = []
        count = 0      # index of the frame currently being examined
        next_spot = 0  # index of the earliest sampling timestamp not yet satisfied
        # Stop as soon as every sampling timestamp has been satisfied.
        while next_spot < len(saving_frames_durations):
            is_read, frame = cap.read()
            if not is_read:
                # there are no more frames to read
                break
            # the frame's timestamp is its index divided by the FPS
            if count / fps >= saving_frames_durations[next_spot]:
                # Reverse the channel axis to get RGB. The slice alone is a
                # negative-stride view, which some consumers (e.g. torch.from_numpy)
                # reject, so materialise it as a contiguous array.
                all_frames.append(np.ascontiguousarray(frame[..., ::-1]))
                # advance by index instead of list.pop(0), which is O(n) per call
                next_spot += 1
            count += 1

        return all_frames
    finally:
        # Always free the underlying decoder/file handle, even on error.
        cap.release()

In [10]:
# Load the CLIP "ViT-B/32" model together with its image preprocessing function, on the CPU.
model, preprocess = clip.load("ViT-B/32", device = 'cpu')

In [14]:
def apply_clip(frames, model, preprocess, device = "cpu"):
    '''Encode every frame with the given model and return one encoding per frame.

    Each frame is turned into a PIL image with ``Image.fromarray``, passed
    through ``preprocess``, given a leading batch dimension of size 1 and moved
    to ``device``. Once all frames are prepared, each one is passed to
    ``model.encode_image`` with gradient tracking disabled.

    Args:
        frames: iterable of image arrays accepted by ``Image.fromarray``.
        model: object exposing ``encode_image`` (here, the loaded CLIP model).
            NOTE(review): the model itself is not moved to ``device`` here, so
            it is presumably expected to already live there.
        preprocess: callable mapping a PIL image to a tensor.
        device: device the prepared inputs are moved to.

    Returns:
        A list holding the output of ``model.encode_image`` for each frame,
        in the same order as ``frames``.
    '''
    # Stage 1: prepare a single-image batch for every frame.
    prepared_batches = []
    for frame in frames:
        pil_image = Image.fromarray(frame)
        single_batch = preprocess(pil_image).unsqueeze(0)
        prepared_batches.append(single_batch.to(device))

    # Stage 2: encode the batches one at a time; no gradients are needed for inference.
    encodings = []
    with torch.no_grad():
        for single_batch in prepared_batches:
            encodings.append(model.encode_image(single_batch))

    return encodings

In [23]:
def average_vectors(encoded_frames):
    '''Return the element-wise mean of a sequence of encoded frame vectors.

    Args:
        encoded_frames: non-empty sequence of vectors (e.g. the tensors returned
            by ``apply_clip``) that support division by an int and ``+=``.

    Returns:
        A new vector equal to the average of all entries. The inputs are not
        modified: the accumulator is a fresh object created by the first division.

    Raises:
        ValueError: if ``encoded_frames`` is empty (there is nothing to average).
    '''
    number_of_frames = len(encoded_frames)
    if number_of_frames == 0:
        raise ValueError("Cannot average an empty list of encoded frames")

    # Each vector is scaled by 1/N before being added, so the running total
    # never grows beyond the magnitude of the final mean.
    average_vector = encoded_frames[0] / number_of_frames
    for i in range(1, number_of_frames):
        average_vector += encoded_frames[i] / number_of_frames

    return average_vector

In [25]:
def process_video(video_file, model, preprocess, saving_frames_per_second = 10, device = "cpu"):
    '''Run the whole pipeline on one video and return its averaged encoding.

    The video is sampled with ``save_video_frames``, the sampled frames are
    encoded with ``apply_clip``, and the encodings are combined with
    ``average_vectors``.

    Args:
        video_file: path of the video to process.
        model: encoder forwarded to ``apply_clip``.
        preprocess: image preprocessing callable forwarded to ``apply_clip``.
        saving_frames_per_second: sampling rate forwarded to ``save_video_frames``.
        device: device forwarded to ``apply_clip``.

    Returns:
        Whatever ``average_vectors`` returns for the encoded frames.
    '''
    return average_vectors(
        apply_clip(
            save_video_frames(video_file, saving_frames_per_second),
            model,
            preprocess,
            device,
        )
    )

In [27]:
# Sample "football.mp4" at (up to) 1 frame per second, then encode every sampled frame with CLIP.
frames = save_video_frames("football.mp4", 1)
encoded_frames = apply_clip(frames, model, preprocess)

In [20]:
# Number of encodings produced (one per sampled frame).
len(encoded_frames)

10

In [31]:
# Full pipeline: sample at (up to) 10 frames per second, encode each frame, and average into one vector.
process_video("football.mp4", model, preprocess, 10)

tensor([[-1.3518e-01, -3.2092e-01,  2.0781e-01, -1.0628e-01,  5.1675e-01,
         -6.7412e-02,  2.1520e-02, -5.8846e-01, -8.6637e-02, -2.7533e-01,
          1.9898e-03, -2.0490e-01,  7.1309e-01,  6.4703e-01, -2.5003e-01,
         -8.0517e-02, -6.9651e-01,  7.0320e-01,  3.8460e-01, -9.3695e-02,
         -1.5936e-01,  1.8842e-01, -1.3201e-02,  3.4823e-01,  1.0230e-02,
         -2.9162e-01, -2.6660e-01, -2.5026e-01, -7.0919e-02, -1.4804e-01,
         -5.2722e-01,  3.9126e-01, -6.5792e-01, -3.8299e-01,  4.0105e-01,
          3.1995e-01, -1.8066e-01,  7.2472e-02,  3.6513e-01,  1.8472e+00,
         -5.8928e-01, -2.0333e-01,  3.9182e-01, -1.1459e-01, -4.3590e-01,
          1.9570e+00,  3.8751e-01, -3.5395e-01,  3.6742e-01, -2.0511e-01,
          3.6374e-05,  2.0662e-01,  1.4818e-01,  4.4389e-02, -1.6615e-01,
          9.9178e-03,  1.2454e-01,  4.5020e-02, -2.7934e-01, -6.8343e-02,
          9.4762e-01, -2.0876e-01,  1.5982e-01,  6.9297e-01, -4.6680e-01,
          2.5823e-01,  5.9937e-01,  9.