In [None]:
import os

In [None]:
# Mount Google Drive so the shared dataset folders are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')
print('mount success')

Mounted at /content/drive
mount success


In [None]:
# Root folder of the MSVD dataset on the shared drive.
dataset_path = '/content/drive/Shareddrives/msvd-dataset'

# Exactly one (train_path, output_path) pair below is active at a time.
# Note that the name `train_path` is reused for every split: it is the folder
# the videos are read from, and `output_path` is where the features are saved.

# for training dataset
train_path = os.path.join(dataset_path, 'train')
output_path = '/content/drive/Shareddrives/msvd-train-feats/train/custom_feat'

# for validation dataset
#train_path = os.path.join(dataset_path, 'val')
#output_path = '/content/drive/Shareddrives/msvd-test-feats/val/custom_feat'

# for test dataset
#train_path = os.path.join(dataset_path, 'test')
#output_path = '/content/drive/Shareddrives/msvd-test-feats/test/custom_feat'

In [None]:
# Name of the sub-folder of `train_path` that holds the video files.
video_folder = 'video'

## FEATURE EXTRACTION using VGG16 and YOLOv8 (for training)

In [None]:
import shutil
import numpy as np
import cv2
import os
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.models import Model

In [None]:
!pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.1.47-py3-none-any.whl (750 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/750.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m276.5/750.4 kB[0m [31m8.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m747.5/750.4 kB[0m [31m11.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m750.4/750.4 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Collecting thop>=0.1.1 (from ultralytics)
  Downloading thop-0.1.1.post2209072238-py3-none-any.whl (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.8.0->ultralytics)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.8.0->ultralytics)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none

In [None]:
from ultralytics import YOLO

In [None]:
#splitting video clip into frames
# returns list of frame names

def video_to_frames(video):
    """Split one video clip into JPEG frames inside a scratch directory.

    The scratch directory ``<train_path>/temporary_images`` is deleted and
    recreated on every call, so frames of a previously processed video never
    mix with the current one.

    :param video: file name of the clip inside ``<train_path>/<video_folder>``
    :return: list of paths of the written frame images, in reading order
             (``frame0.jpg``, ``frame1.jpg``, ...); empty if no frame could be read
    """
    frames_dir = os.path.join(train_path, 'temporary_images')
    if os.path.exists(frames_dir):
        shutil.rmtree(frames_dir)
    os.makedirs(frames_dir)

    video_path = os.path.join(train_path, video_folder, video)
    image_list = []

    cap = cv2.VideoCapture(video_path)
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # End of stream (or an unreadable frame): stop extracting.
                break
            # The running length of the list doubles as the frame counter.
            frame_path = os.path.join(frames_dir, 'frame%d.jpg' % len(image_list))
            cv2.imwrite(frame_path, frame)
            image_list.append(frame_path)
    finally:
        # Always free the capture handle, even if writing a frame raised.
        cap.release()

    print('Frames extracted')
    return image_list

VGG16 CNN MODEL

In [None]:
def model_cnn_load():
    """Build the VGG16-based feature extractor used for the per-frame CNN features.

    The ImageNet-pretrained VGG16 is loaded with its classifier head and then
    cut at its second-to-last layer, so the returned model outputs that layer's
    activations instead of the final class predictions.

    :return: Keras ``Model`` taking 224x224x3 inputs and returning the
             second-to-last layer's output
    """
    vgg = VGG16(weights="imagenet", include_top=True, input_shape=(224, 224, 3))
    penultimate_layer = vgg.layers[-2]
    return Model(inputs=vgg.input, outputs=penultimate_layer.output)

In [None]:
def load_image(path):
    """Read an image from disk and resize it to 224x224 pixels.

    :param path: path of the image file to load
    :return: the image as returned by ``cv2.imread`` (OpenCV's default BGR
             channel order), resized to 224x224
    :raises OSError: if the file is missing or cannot be decoded
    """
    img = cv2.imread(path)
    if img is None:
        # cv2.imread reports failure by returning None rather than raising,
        # which would otherwise surface as an opaque error inside cv2.resize.
        raise OSError(f"Could not read image: {path}")
    return cv2.resize(img, (224, 224))

YOLOv8 OBJECT DETECTION MODEL

In [None]:
# Load the YOLOv8 weights once at module level; extract_features() reads this
# global `yolo_model` to run object detection on each sampled frame.
yolo_model_name = "yolov8n.pt"
yolo_model = YOLO(yolo_model_name)

Downloading https://github.com/ultralytics/assets/releases/download/v8.1.0/yolov8n.pt to 'yolov8n.pt'...


100%|██████████| 6.23M/6.23M [00:00<00:00, 111MB/s]


FEATURE EXTRACTION

80 evenly spaced frames are sampled from each video.
From each frame, 4096 features are extracted using VGG16, and 30 detection features are extracted using YOLO (the 5 highest-scoring detections × 6 values each).

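The per-frame features are stacked to form an array of shape (80, 4096 + 30). Note that this notebook saves the VGG16 and YOLO features as two separate `.npy` files per video.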

In [None]:
# extracting features for each video

def extract_features(video, model, num_frames=80, max_objects=5):
    """
    Extract per-frame CNN features and YOLO detection features for one video.

    The video is split into frames, ``num_frames`` evenly spaced frames are
    sampled, and each sampled frame is (a) run through the global ``yolo_model``
    and (b) collected into a batch that is passed to ``model``.

    Frames are fed to both models exactly as returned by ``load_image``
    (resized raw pixels); no further preprocessing is applied here.
    NOTE(review): confirm that this matches what downstream consumers of the
    saved features expect before changing it.

    :param video: file name of the video inside ``<train_path>/<video_folder>``
    :param model: the pretrained vgg16 model (see ``model_cnn_load``)
    :param num_frames: number of evenly spaced frames to sample per video
    :param max_objects: number of highest-scoring detections kept per frame
    :return: tuple ``(img_feats, yolo_img_feats)``:
             ``img_feats`` is the output of ``model`` for the sampled frames,
             one row per frame (80 x 4096 with the defaults and the VGG16 model);
             ``yolo_img_feats`` is a flat 1-D float array holding
             ``num_frames * max_objects * 6`` detection values, zero-padded
             where a frame has fewer than ``max_objects`` detections
    :raises ValueError: if no frame could be read from the video
    """
    video_id = video.split(".")[0]
    print(video_id)
    print(f'Processing video {video}')

    temp_images_dir = os.path.join(train_path, 'temporary_images')
    # Each detection row holds 6 values; index 4 is used as the score and the
    # last entry as the class id.
    values_per_object = 6

    try:
        image_list = video_to_frames(video)  # get frame list for the video
        if not image_list:
            # Without this guard the sampling below fails with an obscure IndexError.
            raise ValueError(f"No frames could be read from video '{video}'")

        # Evenly spaced frame indices over the whole clip (repeats if the clip is short).
        samples = np.round(np.linspace(0, len(image_list) - 1, num_frames)).astype(int)
        image_list = [image_list[idx] for idx in samples]

        images = np.zeros((len(image_list), 224, 224, 3))
        # Pre-filled with zeros, so frames with few or no detections are padded for free.
        yolo_feats = np.zeros((len(image_list), max_objects * values_per_object))

        for i, image_path in enumerate(image_list):
            images[i] = load_image(image_path)

            # Object detection on a single frame. save=False / verbose=False avoid
            # writing an annotated image and printing a log block for every frame.
            results = yolo_model.predict(source=images[i], save=False, verbose=False)
            detections = results[0].boxes.data.tolist()  # detected objects for 1 frame
            for det in detections:
                # Class id 0 is stored as 80 - presumably so that it stays
                # distinguishable from the all-zero padding; confirm with consumers.
                if det[-1] == 0:
                    det[-1] = 80

            # Keep the highest-scoring detections only.
            detections.sort(key=lambda det: det[4], reverse=True)
            top = detections[:max_objects]
            if top:
                yolo_feats[i, :len(top) * values_per_object] = np.array(top).flatten()

        # CNN features for all sampled frames in one batched call.
        img_feats = np.array(model.predict(images, batch_size=128))
    finally:
        # Delete the frame image files, also when extraction failed part-way.
        try:
            shutil.rmtree(temp_images_dir)
        except FileNotFoundError:
            print(f"Directory '{temp_images_dir}' not found.")

    return (img_feats, yolo_feats.ravel())

In [None]:
# Saves the numpy features from all the videos. Passes videos one by one to extract_features() fn, and saves the received features into npy file

def extract_feats_pretrained_cnn():
    """Extract and save features for every video in ``<train_path>/<video_folder>``.

    For each video, ``extract_features`` is called and its two results are saved
    as ``<output_path>/feat/<name>.npy`` and ``<output_path>/yolo-feat/<name>.npy``.
    Videos for which both files already exist are skipped, so an interrupted
    run can simply be restarted.

    ``<name>`` is the part of the file name before its first ``.``.
    """
    model = model_cnn_load()
    print('VGG16 model loaded')

    # Output folders for the two feature sets. makedirs (rather than mkdir) also
    # creates a missing `output_path` and tolerates folders that already exist.
    feat_dir = os.path.join(output_path, 'feat')
    yolo_feat_dir = os.path.join(output_path, 'yolo-feat')
    os.makedirs(feat_dir, exist_ok=True)
    os.makedirs(yolo_feat_dir, exist_ok=True)

    video_list = os.listdir(os.path.join(train_path, video_folder))
    for video in video_list:
        video_name = video.split(".")[0]
        npy_file = os.path.join(feat_dir, video_name + '.npy')
        yolo_npy_file = os.path.join(yolo_feat_dir, video_name + '.npy')

        # Skip only when BOTH outputs exist; a half-finished video is redone.
        if os.path.exists(npy_file) and os.path.exists(yolo_npy_file):
            print(f"Skipping {video_name} as features already exist.")
            continue

        # Extract features for the video and save them into the output folders.
        img_feats, yolo_img_feats = extract_features(video, model)
        np.save(npy_file, img_feats)
        np.save(yolo_npy_file, yolo_img_feats)

In [None]:
# Run the full extraction when this notebook/script is executed directly.
if __name__ == "__main__":
    extract_feats_pretrained_cnn()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Speed: 9.4ms preprocess, 8.6ms inference, 0.6ms postprocess per image at shape (1, 3, 640, 640)
Results saved to [1mruns/detect/predict[0m

0: 640x640 (no detections), 9.8ms
Speed: 7.9ms preprocess, 9.8ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 640)
Results saved to [1mruns/detect/predict[0m

0: 640x640 1 vase, 9.4ms
Speed: 10.8ms preprocess, 9.4ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 640)
Results saved to [1mruns/detect/predict[0m

0: 640x640 (no detections), 8.6ms
Speed: 9.0ms preprocess, 8.6ms inference, 0.6ms postprocess per image at shape (1, 3, 640, 640)
Results saved to [1mruns/detect/predict[0m

0: 640x640 1 person, 15.8ms
Speed: 10.3ms preprocess, 15.8ms inference, 1.7ms postprocess per image at shape (1, 3, 640, 640)
Results saved to [1mruns/detect/predict[0m

0: 640x640 (no detections), 8.9ms
Speed: 11.6ms preprocess, 8.9ms inference, 0.6ms postprocess per im