### Purpose of this notebook
The purpose of this notebook is to convert 10 videos and its parquet into video frames to determine time taken for loading

In [1]:
#Install missing packages
#!pip install boto3
#!pip install pyarrow
#!pip install fastparquet
#!pip install s3fs
#!pip install mediapipe

In [1]:
# Import required libraries
import boto3 #Video files get read through this
import cv2
import os
import pandas as pd
import math
import numpy as np
import random
import io
import s3fs # Parquet files get read through this
import zlib # For compression
import time # To calculate download time
import configparser
import requests
import psutil # Checks memory usage
import mediapipe as mp


2023-10-30 02:48:13.864199: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Let's read in the credentials file
config = configparser.ConfigParser()
config.read('/home/ec2-user/.aws/credentials')

['/home/ec2-user/.aws/credentials']

In [3]:
aws_access_key_id = config["default"]['aws_access_key_id']
aws_secret_access_key = config["default"]['aws_secret_access_key']
bucket_name = 'asl-capstone'
prefix = 'youtube-asl/test_sample/'
save_path = '/content/temp_folder'
s3_URI = 's3://asl-capstone/'

In [4]:
# Create an s3 object
s3 = boto3.client('s3',aws_access_key_id = aws_access_key_id, aws_secret_access_key = aws_secret_access_key,
                  region_name = 'us-west-2')

In [5]:
# List files in the S3 bucket
response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
video_files = []
for content in response.get('Contents', []):
    if content['Key'].endswith(('.mkv','.webm','.mp4')):
        video_files.append(content['Key'])

In [6]:
print(video_files)

['youtube-asl/test_sample/06kKvQp4SfM.webm', 'youtube-asl/test_sample/0XGfpv6PUq4.mkv', 'youtube-asl/test_sample/5J8a2CaXWvk.webm', 'youtube-asl/test_sample/8VTAvocbMpI.mkv', 'youtube-asl/test_sample/Dmvi3_q3gMc.mkv', 'youtube-asl/test_sample/UEryLHbfb0M.webm', 'youtube-asl/test_sample/aoLQ0VchSec.mkv', 'youtube-asl/test_sample/dzWgVm2oY44.webm', 'youtube-asl/test_sample/esx9dGfUuto.webm', 'youtube-asl/test_sample/oOkSSJbNv68.webm']


In [7]:
# Let's capture the parquet files into a list
caption_files = []
for content in response.get('Contents',[]):
    if content['Key'].endswith(('.parquet')):
        caption_files.append(content['Key'])

In [8]:
print(caption_files)

['youtube-asl/test_sample/06kKvQp4SfM.en-0hllRZe4s5s.parquet', 'youtube-asl/test_sample/0XGfpv6PUq4.en.parquet', 'youtube-asl/test_sample/5J8a2CaXWvk.en.parquet', 'youtube-asl/test_sample/8VTAvocbMpI.en.parquet', 'youtube-asl/test_sample/Dmvi3_q3gMc.en-CA.parquet', 'youtube-asl/test_sample/UEryLHbfb0M.en-eEY6OEpapPo.parquet', 'youtube-asl/test_sample/aoLQ0VchSec.en.parquet', 'youtube-asl/test_sample/dzWgVm2oY44.en.parquet', 'youtube-asl/test_sample/esx9dGfUuto.en-xDMNrYfabLQ.parquet', 'youtube-asl/test_sample/oOkSSJbNv68.en.parquet']


In [9]:
# Let's read in one parquet file using S3FS
fs = s3fs.S3FileSystem(key=aws_access_key_id, secret=aws_secret_access_key)
with fs.open(f"{bucket_name}/{caption_files[0]}", 'rb') as f:
    df = pd.read_parquet(f)
df.head()

Unnamed: 0,File Name,Start Timestamp,End Timestamp,Caption,frame_rate,start_time_seconds,end_time_seconds,start_frame,end_frame,start_frame_float,end_frame_float
0,06kKvQp4SfM.en-0hllRZe4s5s.vtt,00:00:00.000,00:00:02.236,Welcome to the third round,30.0,0.0,2.236,0.0,67.0,0.0,67.08
1,06kKvQp4SfM.en-0hllRZe4s5s.vtt,00:00:02.236,00:00:04.304,of the Pearls announcement.,30.0,2.236,4.304,67.0,129.0,67.08,129.12
2,06kKvQp4SfM.en-0hllRZe4s5s.vtt,00:00:04.304,00:00:07.207,Today's category is called the,30.0,4.304,7.207,129.0,216.0,129.12,216.21
3,06kKvQp4SfM.en-0hllRZe4s5s.vtt,00:00:07.207,00:00:09.610,Hidden Pearls. What is it?,30.0,7.207,9.61,216.0,288.0,216.21,288.3
4,06kKvQp4SfM.en-0hllRZe4s5s.vtt,00:00:09.610,00:00:12.379,"Hidden Pearls...in other words,",30.0,9.61,12.379,288.0,371.0,288.3,371.37


In [16]:
# Define function to process video
def video_frame_capturer(video_name, start_frame, end_frame):
  #frame_holder = []
  current_frame = start_frame
  filename = os.path.basename(video_name)
  video_path = "s3://asl-capstone/youtube-asl/test_sample/"+filename
  print(video_path)
  video_capture = cv2.VideoCapture(video_path)
  frames = []
  while True:
      ret, frame = video_capture.read()
  video_capture.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
  while video_capture.isOpened() and current_frame <= end_frame:
    ret, frame = video_capture.read()
    #print("In frame number ",video_capture.get(cv2.CAP_PROP_POS_FRAMES))
    if not ret:
      break
    #frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    frame = cv2.resize(frame, (455, 256))
    current_frame+=1
    yield frame # Does not terminate the function call; but comes back to it
    #frame_holder.append(frame)
  video_capture.release()
  #return frame_holder

In [52]:
def crop_center_square(frame):
    y, x = frame.shape[0:2]
    if x > y:
        start_x = (x-y)/2
        end_x = start_x + y
        start_x = int(start_x)
        end_x = int(end_x)
        return frame[:, start_x:end_x]
    else:
        return frame

In [57]:
def load_video(path, max_frames=0, resize=(256,256)):
    #url = s3.generate_presigned_url(ClientMethod='get_object', Params={ 'Bucket': bucket_name, 'Key': path })
    #print(url)
    s3_uri = "s3://asl-capstone/youtube-asl/test_sample/"+path
    cap = cv2.VideoCapture(url)
    frames = []
    try:
      while True:
        ret, frame = cap.read()
        if not ret:
          print("Did not read the video")
          break
        frame = crop_center_square(frame)
        frame = cv2.resize(frame, resize)
        frame = frame[:, :, [2, 1, 0]]
        frames.append(frame)

        if len(frames) == max_frames:
          break
    finally:
      cap.release()
    return np.array(frames) / 255.0

In [58]:
def read_caption(caption_file):
    with fs.open(f"{bucket_name}/{caption_file}", 'rb') as f:
        df = pd.read_parquet(f)
    return df

In [59]:
def save_frame_file(video_array, filename, caption, start_frame, end_frame):
    filename = filename.split('.')[0]
    caption_array = np.array(caption, dtype = str)
    s3_uri = "s3://asl-capstone/youtube-asl/test_sample/numpy_files/RGB/"
    numpy_array = video_array[start_frame:end_frame]
    with fs.open(f"{s3_uri+filename}.npy","wb") as f:
        np.save(f,numpy_array)
    with fs.open(f"{s3_uri+filename}_cap.npy","wb") as f:
        np.save(f,caption_array)

In [60]:
for video, caption in zip(video_files, caption_files):
    filename = os.path.basename(video)
    video_path = filename
    video_array = load_video(video_path)
    temp_df = read_caption(caption)
    for _, row in temp_df.iterrows():
        save_frame_file(video_array, filename, row['Caption'],int(row['start_frame']), int(row['end_frame']))
        

https://asl-capstone.s3.amazonaws.com/youtube-asl/test_sample/06kKvQp4SfM.webm?AWSAccessKeyId=AKIAS6WKZTC4DEVCM7WY&Signature=I8YM1juQK3cB8c1qqY6mu1Gyh2w%3D&Expires=1698640322
Did not read the video


KeyboardInterrupt: 

In [18]:
with fs.open("s3://asl-capstone/youtube-asl/test_sample/numpy_files/RGB/aoLQ0VchSec.npy", "rb") as f:
    my_np = np.load(f)
print(my_np.shape)

(0,)


In [18]:
def save_frames_file(frame_list, filename, caption_list):
    numpy_array = np.array(frame_list, dtype = object)
    #caption_array = np.array(caption_list[0], dtype = str)
    caption_array = np.array(caption_list, dtype = str)
    print(f"Frame size is {numpy_array.shape} and Caption size is {caption_array.shape}")
    filename = filename.split('.')[0]
    s3_file_path = f'{prefix}numpy_files/RGB/{filename}.npy'
    s3_cap_path = f'{prefix}numpy_files/RGB/{filename}_cap.npy'
    file_path = "temp_folder/"+filename+".npy"
    caption_path = "temp_folder/"+filename+"_cap.npy"
    #file_path = "s3://asl-capstone/youtube-asl/test_sample/numpy_files/"+filename
    s3_uri = "s3://asl-capstone/youtube-asl/test_sample/numpy_files/RGB/"
    with fs.open(f"{s3_uri+filename}.npy","wb") as f:
        np.save(f,numpy_array)
    with fs.open(f"{s3_uri+filename}_cap.npy","wb") as f:
        np.save(f,caption_array)
    #np.save(f"s3://asl-capstone/youtube-asl/test_sample/numpy_files/RGB/{filename}.npy", numpy_array)
    #np.save(f"s3://asl-capstone/youtube-asl/test_sample/numpy_files/RGB/{filename}_cap.npy", caption_array)
    #np.save(f"{file_path}",numpy_array)
    #np.save(f"{caption_path}",caption_array)
    #s3.upload_file(file_path, bucket_name, s3_file_path)
    #s3.upload_file(caption_path, bucket_name, s3_cap_path)
    #os.remove(file_path)
    #os.remove(caption_path)

In [19]:
#vid_frames = [] #Empty list to store video frames
#master_caption = [] # Empty list to store captions
download_time = []
for video, caption in zip(video_files, caption_files):
    vid_frames = []
    master_caption = []
    start_time = time.time()
    filename = os.path.basename(video)
    #local_video_path = "temp_folder/"+filename
    #if os.path.exists(local_video_path):
    #    print("Video already exists. Not downloading again")
    #else:
    #    #s3.download_file(bucket_name,video,local_video_path)
    #    print(video)
        
    temp_df = read_caption(caption)
    for _, row in temp_df.iterrows():
        #print(f"Reading {video} at frame number {row['start_frame']}")
        temp_frame = video_frame_capturer(video,int(row['start_frame']), int(row['end_frame']))
        for frame in temp_frame:
            vid_frames.append(frame)
        #vid_frames.append(temp_frame)
        master_caption.append(row['Caption'])
    save_frames_file(vid_frames,filename, master_caption)
    #print(len(temp_frame), len(master_caption))
    #print(vid_frames)
    #file_size = os.path.getsize(local_video_path)/(1024*1024)
    #os.remove(local_video_path)
    end_time = time.time()
    download_time.append(end_time-start_time)
    #print(f"Time taken to process {filename} of size {file_size:.2f} MB is {((end_time - start_time)):.2f} seconds")

s3://asl-capstone/youtube-asl/test_sample/06kKvQp4SfM.webm
s3://asl-capstone/youtube-asl/test_sample/06kKvQp4SfM.webm
s3://asl-capstone/youtube-asl/test_sample/06kKvQp4SfM.webm
s3://asl-capstone/youtube-asl/test_sample/06kKvQp4SfM.webm
s3://asl-capstone/youtube-asl/test_sample/06kKvQp4SfM.webm
s3://asl-capstone/youtube-asl/test_sample/06kKvQp4SfM.webm
s3://asl-capstone/youtube-asl/test_sample/06kKvQp4SfM.webm
s3://asl-capstone/youtube-asl/test_sample/06kKvQp4SfM.webm
s3://asl-capstone/youtube-asl/test_sample/06kKvQp4SfM.webm
s3://asl-capstone/youtube-asl/test_sample/06kKvQp4SfM.webm
s3://asl-capstone/youtube-asl/test_sample/06kKvQp4SfM.webm
s3://asl-capstone/youtube-asl/test_sample/06kKvQp4SfM.webm
s3://asl-capstone/youtube-asl/test_sample/06kKvQp4SfM.webm
s3://asl-capstone/youtube-asl/test_sample/06kKvQp4SfM.webm
s3://asl-capstone/youtube-asl/test_sample/06kKvQp4SfM.webm
s3://asl-capstone/youtube-asl/test_sample/06kKvQp4SfM.webm
s3://asl-capstone/youtube-asl/test_sample/06kKvQp4SfM.we

In [15]:
time_download = sum(download_time)/len(download_time)
print(f"Average processing time per video is {time_download:.2f} seconds")

Average processing time per video is 0.19 seconds
