### Purpose of this notebook
The purpose of this notebook is to convert 10 videos and its parquet into video frames to determine time taken for loading

In [1]:
#Install missing packages
#!pip install boto3
#!pip install pyarrow
#!pip install fastparquet
#!pip install s3fs

In [1]:
# Import required libraries
import boto3 #Video files get read through this
import cv2
import os
import pandas as pd
import math
import numpy as np
import random
import io
import s3fs # Parquet files get read through this
import zlib # For compression
import time # To calculate download time

In [2]:
# Setup the S3 object
aws_access_key_id = 'AWS_KEY'
aws_secret_access_key = 'AWS_SECRET'
bucket_name = 'asl-capstone'
prefix = 'youtube-asl/test_sample/'
s3 = boto3.client(
    's3',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    region_name='us-west-2'
)

In [3]:
# List files in the S3 bucket
response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
video_files = []
for content in response.get('Contents', []):
    if content['Key'].endswith(('.mkv','.webm','.mp4')):
        video_files.append(content['Key'])

In [4]:
print(video_files)

['youtube-asl/test_sample/06kKvQp4SfM.webm', 'youtube-asl/test_sample/0XGfpv6PUq4.mkv', 'youtube-asl/test_sample/5J8a2CaXWvk.webm', 'youtube-asl/test_sample/8VTAvocbMpI.mkv', 'youtube-asl/test_sample/Dmvi3_q3gMc.mkv', 'youtube-asl/test_sample/UEryLHbfb0M.webm', 'youtube-asl/test_sample/aoLQ0VchSec.mkv', 'youtube-asl/test_sample/dzWgVm2oY44.webm', 'youtube-asl/test_sample/esx9dGfUuto.webm', 'youtube-asl/test_sample/oOkSSJbNv68.webm']


In [5]:
# Let's capture the parquet files into a list
caption_files = []
for content in response.get('Contents',[]):
    if content['Key'].endswith(('.parquet')):
        caption_files.append(content['Key'])

In [6]:
print(caption_files)

['youtube-asl/test_sample/06kKvQp4SfM.en-0hllRZe4s5s.parquet', 'youtube-asl/test_sample/0XGfpv6PUq4.en.parquet', 'youtube-asl/test_sample/5J8a2CaXWvk.en.parquet', 'youtube-asl/test_sample/8VTAvocbMpI.en.parquet', 'youtube-asl/test_sample/Dmvi3_q3gMc.en-CA.parquet', 'youtube-asl/test_sample/UEryLHbfb0M.en-eEY6OEpapPo.parquet', 'youtube-asl/test_sample/aoLQ0VchSec.en.parquet', 'youtube-asl/test_sample/dzWgVm2oY44.en.parquet', 'youtube-asl/test_sample/esx9dGfUuto.en-xDMNrYfabLQ.parquet', 'youtube-asl/test_sample/oOkSSJbNv68.en.parquet']


In [7]:
# Let's read in one parquet file using S3FS
fs = s3fs.S3FileSystem(key=aws_access_key_id, secret=aws_secret_access_key)
with fs.open(f"{bucket_name}/{caption_files[0]}", 'rb') as f:
    df = pd.read_parquet(f)
df.head()

Unnamed: 0,File Name,Start Timestamp,End Timestamp,Caption,frame_rate,start_time_seconds,end_time_seconds,start_frame,end_frame,start_frame_float,end_frame_float
0,06kKvQp4SfM.en-0hllRZe4s5s.vtt,00:00:00.000,00:00:02.236,Welcome to the third round,30.0,0.0,2.236,0.0,67.0,0.0,67.08
1,06kKvQp4SfM.en-0hllRZe4s5s.vtt,00:00:02.236,00:00:04.304,of the Pearls announcement.,30.0,2.236,4.304,67.0,129.0,67.08,129.12
2,06kKvQp4SfM.en-0hllRZe4s5s.vtt,00:00:04.304,00:00:07.207,Today's category is called the,30.0,4.304,7.207,129.0,216.0,129.12,216.21
3,06kKvQp4SfM.en-0hllRZe4s5s.vtt,00:00:07.207,00:00:09.610,Hidden Pearls. What is it?,30.0,7.207,9.61,216.0,288.0,216.21,288.3
4,06kKvQp4SfM.en-0hllRZe4s5s.vtt,00:00:09.610,00:00:12.379,"Hidden Pearls...in other words,",30.0,9.61,12.379,288.0,371.0,288.3,371.37


In [18]:
# Define function to process video
def video_frame_capturer(video_name, start_frame, end_frame):
  current_frame = start_frame
  filename = os.path.basename(video_name)
  video_path = "temp_folder/" + filename
  #download_from_s3('asl-capstone', video_name, local_video_path, aws_access_key_id, aws_secret_access_key)
  #print(f"Reading {video_path}") 
  video_capture = cv2.VideoCapture(video_path)
  video_capture.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
  while video_capture.isOpened() and current_frame <= end_frame:
    ret, frame = video_capture.read()
    #print("In frame number ",video_capture.get(cv2.CAP_PROP_POS_FRAMES))
    if not ret:
      break
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    frame = cv2.resize(frame, (1280, 720))
    current_frame+=1
    yield frame # Does not terminate the function call; but comes back to it
  video_capture.release()

In [14]:
def read_caption(caption_file):
    with fs.open(f"{bucket_name}/{caption_file}", 'rb') as f:
        df = pd.read_parquet(f)
    return df

In [21]:
def save_frames_file(frame_list, filename, caption_list):
    numpy_array = frame_list[0]
    caption_array = np.array(caption_list[0], dtype = str)
    filename = filename.split('.')[0]
    s3_file_path = f'{prefix}numpy_files/{filename}.npy'
    s3_cap_path = f'{prefix}numpy_files/{filename}_cap.npy'
    file_path = "temp_folder/"+filename+".npy"
    caption_path = "temp_folder/"+filename+"_cap.npy"
    #file_path = "s3://asl-capstone/youtube-asl/test_sample/numpy_files/"+filename
    np.save(f"{file_path}",numpy_array)
    np.save(f"{caption_path}",caption_array)
    s3.upload_file(file_path, bucket_name, s3_file_path)
    s3.upload_file(caption_path, bucket_name, s3_cap_path)
    os.remove(file_path)
    os.remove(caption_path)

In [22]:
#vid_frames = [] #Empty list to store video frames
#master_caption = [] # Empty list to store captions
download_time = []
for video, caption in zip(video_files, caption_files):
    vid_frames = []
    master_caption = []
    start_time = time.time()
    filename = os.path.basename(video)
    local_video_path = "temp_folder/"+filename
    if os.path.exists(local_video_path):
        print("Video already exists. Not downloading again")
    else:
        s3.download_file(bucket_name,video,local_video_path)
    temp_df = read_caption(caption)
    for _, row in temp_df.iterrows():
        #print(f"Reading {video} at frame number {row['start_frame']}")
        temp_frame = video_frame_capturer(video,int(row['start_frame']), int(row['end_frame']))
        for frame in temp_frame:
            vid_frames.append(frame)
        master_caption.append(row['Caption'])
    save_frames_file(vid_frames,filename, master_caption)
    file_size = os.path.getsize(local_video_path)/(1024*1024)
    os.remove(local_video_path)
    end_time = time.time()
    download_time.append(end_time-start_time)
    print(f"Time taken to process {filename} of size {file_size:.2f} MB is {((end_time - start_time)):.2f} seconds")

Time taken to process 06kKvQp4SfM.webm of size 17.86 MB is 9.05 seconds
Time taken to process 0XGfpv6PUq4.mkv of size 59.04 MB is 22.47 seconds
Time taken to process 5J8a2CaXWvk.webm of size 82.17 MB is 60.20 seconds
Time taken to process 8VTAvocbMpI.mkv of size 9.07 MB is 3.33 seconds
Time taken to process Dmvi3_q3gMc.mkv of size 145.80 MB is 36.41 seconds
Time taken to process UEryLHbfb0M.webm of size 17.73 MB is 10.61 seconds
Time taken to process aoLQ0VchSec.mkv of size 10.50 MB is 5.06 seconds
Time taken to process dzWgVm2oY44.webm of size 204.51 MB is 105.13 seconds
Time taken to process esx9dGfUuto.webm of size 6.80 MB is 8.96 seconds
Time taken to process oOkSSJbNv68.webm of size 280.17 MB is 114.41 seconds


In [23]:
time_download = sum(download_time)/len(download_time)
print(f"Average processing time per video is {time_download:.2f} seconds")

Average processing time per video is 37.56 seconds
