### Purpose of this notebook
The purpose of this notebook is to convert 10 videos and their caption parquet files into video frames, in order to determine the time taken for loading.

In [1]:
#Install missing packages
#!pip install boto3
#!pip install pyarrow
#!pip install fastparquet
#!pip install s3fs
#!pip install mediapipe

In [2]:
# Import required libraries
import boto3 #Video files get read through this
import cv2
import os
import pandas as pd
import math
import numpy as np
import random
import io
import s3fs # Parquet files get read through this
import zlib # For compression
import time # To calculate download time
import configparser
import requests
import psutil # Checks memory usage
import tempfile
#import mediapipe as mp


In [3]:
# Parse the local 'credentials' file so the AWS keys are not written in the notebook.
# ConfigParser.read returns the list of files it parsed, which this cell displays.
credentials_file = 'credentials'
config = configparser.ConfigParser()
config.read(credentials_file)

['credentials']

In [4]:
# AWS keys are read from the [default] section of the parsed credentials file
default_profile = config["default"]
aws_access_key_id = default_profile['aws_access_key_id']
aws_secret_access_key = default_profile['aws_secret_access_key']

# Bucket, key prefix and path settings used by the cells below
bucket_name = 'asl-capstone'
prefix = 'youtube-asl/test_sample/'
save_path = '/content/temp_folder'
s3_URI = 's3://asl-capstone/'

In [5]:
# Build the boto3 S3 client used for listing objects and downloading videos
s3 = boto3.client(
    's3',
    region_name='us-west-2',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

In [6]:
# List every object under the prefix.
# A single list_objects_v2 call returns at most 1000 keys. The .npy outputs this
# notebook writes live under the same prefix, so once enough of them exist an
# unpaginated listing would silently drop some of the source videos/parquets.
paginator = s3.get_paginator('list_objects_v2')
all_contents = []
for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
    all_contents.extend(page.get('Contents', []))

# Keep `response` in the single-page shape so code reading
# response.get('Contents', []) sees the complete listing.
response = {'Contents': all_contents}

# Video files are recognised by their extension
video_files = [
    content['Key']
    for content in all_contents
    if content['Key'].endswith(('.mkv', '.webm', '.mp4'))
]

In [7]:
# Show the S3 keys of the videos found under the prefix
print(video_files)

['youtube-asl/test_sample/06kKvQp4SfM.webm', 'youtube-asl/test_sample/0XGfpv6PUq4.mkv', 'youtube-asl/test_sample/5J8a2CaXWvk.webm', 'youtube-asl/test_sample/8VTAvocbMpI.mkv', 'youtube-asl/test_sample/Dmvi3_q3gMc.mkv', 'youtube-asl/test_sample/UEryLHbfb0M.webm', 'youtube-asl/test_sample/aoLQ0VchSec.mkv', 'youtube-asl/test_sample/dzWgVm2oY44.webm', 'youtube-asl/test_sample/esx9dGfUuto.webm', 'youtube-asl/test_sample/oOkSSJbNv68.webm']


In [8]:
# Collect the keys of the caption parquet files from the same listing
caption_files = [
    entry['Key']
    for entry in response.get('Contents', [])
    if entry['Key'].endswith('.parquet')
]

In [9]:
# Show the S3 keys of the caption parquet files found under the prefix
print(caption_files)

['youtube-asl/test_sample/06kKvQp4SfM.en-0hllRZe4s5s.parquet', 'youtube-asl/test_sample/0XGfpv6PUq4.en.parquet', 'youtube-asl/test_sample/5J8a2CaXWvk.en.parquet', 'youtube-asl/test_sample/8VTAvocbMpI.en.parquet', 'youtube-asl/test_sample/Dmvi3_q3gMc.en-CA.parquet', 'youtube-asl/test_sample/UEryLHbfb0M.en-eEY6OEpapPo.parquet', 'youtube-asl/test_sample/aoLQ0VchSec.en.parquet', 'youtube-asl/test_sample/dzWgVm2oY44.en.parquet', 'youtube-asl/test_sample/esx9dGfUuto.en-xDMNrYfabLQ.parquet', 'youtube-asl/test_sample/oOkSSJbNv68.en.parquet']


In [10]:
# Open an s3fs filesystem (also used by later cells) and preview the first caption parquet
fs = s3fs.S3FileSystem(key=aws_access_key_id, secret=aws_secret_access_key)
sample_caption_path = f"{bucket_name}/{caption_files[0]}"
with fs.open(sample_caption_path, 'rb') as parquet_handle:
    df = pd.read_parquet(parquet_handle)
df.head()

Unnamed: 0,File Name,Start Timestamp,End Timestamp,Caption,frame_rate,start_time_seconds,end_time_seconds,start_frame,end_frame,start_frame_float,end_frame_float
0,06kKvQp4SfM.en-0hllRZe4s5s.vtt,00:00:00.000,00:00:02.236,Welcome to the third round,30.0,0.0,2.236,0.0,67.0,0.0,67.08
1,06kKvQp4SfM.en-0hllRZe4s5s.vtt,00:00:02.236,00:00:04.304,of the Pearls announcement.,30.0,2.236,4.304,67.0,129.0,67.08,129.12
2,06kKvQp4SfM.en-0hllRZe4s5s.vtt,00:00:04.304,00:00:07.207,Today's category is called the,30.0,4.304,7.207,129.0,216.0,129.12,216.21
3,06kKvQp4SfM.en-0hllRZe4s5s.vtt,00:00:07.207,00:00:09.610,Hidden Pearls. What is it?,30.0,7.207,9.61,216.0,288.0,216.21,288.3
4,06kKvQp4SfM.en-0hllRZe4s5s.vtt,00:00:09.610,00:00:12.379,"Hidden Pearls...in other words,",30.0,9.61,12.379,288.0,371.0,288.3,371.37


In [11]:
def crop_center_square(frame):
    """Crop a frame to its largest centered square.

    Wide frames lose equal margins on the left and right; tall frames lose
    equal margins on the top and bottom. Previously tall frames were returned
    uncropped, so a later resize to a square would distort them.

    Args:
        frame: Image array indexed as (height, width, ...). Only the first
            two dimensions are inspected.

    Returns:
        The slice of ``frame`` whose height and width both equal
        ``min(height, width)``.
    """
    height, width = frame.shape[0:2]
    side = min(height, width)
    # Floor division gives the same offsets as the int() truncation
    # previously applied to wide frames.
    start_x = (width - side) // 2
    start_y = (height - side) // 2
    return frame[start_y:start_y + side, start_x:start_x + side]

In [12]:
def load_video(path, max_frames=0, resize=(256,256)):
    """Download a video from S3 and decode it into an array of frames.

    The object is downloaded into a local ``temp_folder`` directory, decoded
    with OpenCV, and the local copy is deleted again before returning.

    Args:
        path: S3 object key of the video inside ``bucket_name``.
        max_frames: Stop after this many frames. The default of 0 never
            matches the frame count, so the whole video is read.
        resize: Target size passed to ``cv2.resize`` for every frame.

    Returns:
        ``np.array(frames) / 255.0``: the decoded frames, center-cropped,
        resized, channel-reversed and scaled by 1/255.
        NOTE(review): this holds the entire video in memory as floats, which
        is presumably large for long videos.
    """
    local_dir = "temp_folder"
    # download_file needs the target directory to exist already
    os.makedirs(local_dir, exist_ok=True)
    local_video_path = os.path.join(local_dir, os.path.basename(path))

    frames = []
    video_capture = None
    try:
        # Download inside the try so a failed or partial download is cleaned up too
        s3.download_file(bucket_name, path, local_video_path)
        video_capture = cv2.VideoCapture(local_video_path)
        while video_capture.isOpened():
            ret, frame = video_capture.read()
            if not ret:
                # End of stream, or the frame could not be decoded
                break
            frame = crop_center_square(frame)
            frame = cv2.resize(frame, resize)
            # Reverse the channel order (OpenCV decodes as BGR, so this yields RGB)
            frame = frame[:, :, [2, 1, 0]]
            frames.append(frame)

            if len(frames) == max_frames:
                break
    finally:
        if video_capture is not None:
            video_capture.release()
        # Guarded so a missing file cannot mask the original exception
        if os.path.exists(local_video_path):
            os.remove(local_video_path)
    return np.array(frames) / 255.0

In [13]:
def read_caption(caption_file):
    """Read one caption parquet file from the S3 bucket.

    Args:
        caption_file: S3 object key of the parquet file inside ``bucket_name``.

    Returns:
        The DataFrame produced by ``pd.read_parquet`` for that file.
    """
    caption_location = f"{bucket_name}/{caption_file}"
    with fs.open(caption_location, 'rb') as parquet_handle:
        return pd.read_parquet(parquet_handle)

In [14]:
def extract_number(text):
    """Return the text that follows the first '_cap_' marker, up to the next '.'.

    Args:
        text: File name (or stem) such as ``"abc_cap_3.npy"``.

    Returns:
        The suffix as a string (``"3"`` for the example above), or ``None``
        when ``text`` contains no ``'_cap_'`` marker.
    """
    pieces = text.split('_cap_')
    if len(pieces) <= 1:
        return None
    after_marker = pieces[1]
    return after_marker.split('.')[0]

In [15]:
def extract_video_number(text):
    """Return the numeric suffix that follows the last '_' in a file name.

    Video IDs can themselves contain underscores (for example
    ``Dmvi3_q3gMc``), so the suffix is taken after the LAST underscore and is
    only returned when it is a number. Splitting on the first underscore
    returned part of the ID, which the caller then failed to convert to int.

    Args:
        text: File name or stem such as ``"Dmvi3_q3gMc_2"`` or
            ``"06kKvQp4SfM_13.npy"``.

    Returns:
        The suffix digits as a string (``"2"`` / ``"13"`` above), or ``None``
        when there is no underscore or the text after it is not a number.
    """
    _, separator, tail = text.rpartition('_')
    if not separator:
        return None
    # Drop any extension that follows the number
    candidate = tail.split('.')[0]
    # isdecimal() guarantees the caller's int() conversion succeeds
    return candidate if candidate.isdecimal() else None

In [16]:
def file_name_generator(filename):
    """Return the next unused caption-array file name for a video.

    Caption arrays are named ``<stem>_cap.npy`` first and then
    ``<stem>_cap_<n>.npy`` with n = 1, 2, ... This looks up the names already
    stored under ``youtube-asl/test_sample/numpy_files/RGB/`` and returns the
    name one past the highest index in use.

    Args:
        filename: Video file name or key. The directory part and the last
            extension are dropped to obtain the stem.

    Returns:
        ``"<stem>_cap.npy"`` when no caption array exists for the stem yet,
        otherwise ``"<stem>_cap_<n>.npy"`` where n is the highest existing
        index plus one (the unnumbered file counts as index 0).
    """
    stem = os.path.basename(filename).rsplit('.', 1)[0]
    cap_stem = stem + "_cap"

    # List only this video's caption arrays. Listing the whole RGB/ prefix on
    # every call makes a full run quadratic in the number of stored files.
    paginator = s3.get_paginator('list_objects_v2')
    pages = paginator.paginate(
        Bucket=bucket_name,
        Prefix='youtube-asl/test_sample/numpy_files/RGB/' + cap_stem,
    )

    used_indices = []
    for page in pages:
        for content in page.get('Contents', []):
            name = os.path.basename(content['Key'])
            if name.endswith('.npy'):
                name = name[:-len('.npy')]
            if name == cap_stem:
                # The unnumbered first file
                used_indices.append(0)
            elif name.startswith(cap_stem + "_"):
                suffix = name[len(cap_stem) + 1:]
                # Ignore anything that is not exactly "<stem>_cap_<digits>"
                if suffix.isdecimal():
                    used_indices.append(int(suffix))

    if not used_indices:
        return cap_stem + ".npy"
    return cap_stem + "_" + str(max(used_indices) + 1) + ".npy"
            

In [17]:
def video_name_generator(filename):
    """Return the next unused frame-array file name for a video.

    Frame arrays are named ``<stem>.npy`` first and then ``<stem>_<n>.npy``
    with n = 1, 2, ... This looks up the names already stored under
    ``youtube-asl/test_sample/numpy_files/RGB/`` and returns the name one past
    the highest index in use.

    The index is parsed from what follows the full stem, not from the first
    underscore in the name, so video IDs that contain underscores (for
    example ``Dmvi3_q3gMc``) are handled. Caption arrays
    (``<stem>_cap...``) never match because their suffix is not a number.

    Args:
        filename: Video file name or key. The directory part and the last
            extension are dropped to obtain the stem.

    Returns:
        ``"<stem>.npy"`` when no frame array exists for the stem yet,
        otherwise ``"<stem>_<n>.npy"`` where n is the highest existing index
        plus one (the unnumbered file counts as index 0).
    """
    stem = os.path.basename(filename).rsplit('.', 1)[0]

    # List only keys that start with this stem. Listing the whole RGB/ prefix
    # on every call makes a full run quadratic in the number of stored files.
    paginator = s3.get_paginator('list_objects_v2')
    pages = paginator.paginate(
        Bucket=bucket_name,
        Prefix='youtube-asl/test_sample/numpy_files/RGB/' + stem,
    )

    used_indices = []
    for page in pages:
        for content in page.get('Contents', []):
            name = os.path.basename(content['Key'])
            if name.endswith('.npy'):
                name = name[:-len('.npy')]
            if name == stem:
                # The unnumbered first file
                used_indices.append(0)
            elif name.startswith(stem + "_"):
                suffix = name[len(stem) + 1:]
                # Only "<stem>_<digits>" counts; this skips caption files
                # and other stems that merely share this prefix.
                if suffix.isdecimal():
                    used_indices.append(int(suffix))

    if not used_indices:
        return stem + ".npy"
    return stem + "_" + str(max(used_indices) + 1) + ".npy"
            

In [18]:
# Scratch check: call the name generator for a numbered stem and a plain stem.
# The result is not displayed; every call lists the bucket.
file_name = ["06kKvQp4SfM_1"] + ["06kKvQp4SfM"] * 4
for sample_name in file_name:
    output = video_name_generator(sample_name)

In [19]:
def save_frame_file(video_array, filename, caption, start_frame, end_frame):
    """Save one caption's frame slice and its text to S3 as two .npy files.

    Args:
        video_array: Array of frames for the whole video, indexed by frame.
        filename: Video file name; everything from the first '.' is dropped.
        caption: Caption text stored alongside the frames.
        start_frame: First frame of the slice (inclusive).
        end_frame: End of the slice (exclusive).

    Side effects:
        Prints both generated file names and writes both arrays under
        ``s3://asl-capstone/youtube-asl/test_sample/numpy_files/RGB/``.
    """
    stem = filename.split('.')[0]
    caption_text = np.array(caption, dtype=str)
    destination = "s3://asl-capstone/youtube-asl/test_sample/numpy_files/RGB/"

    # Frames that belong to this caption
    clip = video_array[int(start_frame):int(end_frame)]
    clip_name = video_name_generator(stem)
    print(clip_name)
    with fs.open(destination + clip_name, "wb") as clip_out:
        np.save(clip_out, clip)

    # The caption text is stored in its own file
    caption_name = file_name_generator(stem)
    print(caption_name)
    with fs.open(destination + caption_name, "wb") as caption_out:
        np.save(caption_out, caption_text)

In [None]:
# Pair each video with its caption parquet by video ID (the part of the file
# name before the first '.') rather than by list position. With positional
# pairing, one missing or extra caption file would silently attach the wrong
# captions to every following video.
captions_by_video = {}
for caption_key in caption_files:
    caption_video_id = os.path.basename(caption_key).split('.')[0]
    # If a video has several caption files, keep the first one listed
    captions_by_video.setdefault(caption_video_id, caption_key)

for video in video_files:
    filename = os.path.basename(video)
    video_path = filename
    caption = captions_by_video.get(filename.split('.')[0])
    if caption is None:
        print(f"Skipping {filename}: no caption parquet found")
        continue
    video_array = load_video(video)
    temp_df = read_caption(caption)
    print(f"Reading {filename}")
    # One pair of .npy files (frames + caption text) is written per caption row
    for _, row in temp_df.iterrows():
        save_frame_file(video_array, filename, row['Caption'],int(row['start_frame']), int(row['end_frame']))
        

Reading 06kKvQp4SfM.webm
06kKvQp4SfM.npy
06kKvQp4SfM_cap.npy
06kKvQp4SfM_1.npy
06kKvQp4SfM_cap_1.npy
06kKvQp4SfM_2.npy
06kKvQp4SfM_cap_2.npy
06kKvQp4SfM_3.npy
06kKvQp4SfM_cap_3.npy
06kKvQp4SfM_4.npy
06kKvQp4SfM_cap_4.npy
06kKvQp4SfM_5.npy
06kKvQp4SfM_cap_5.npy
06kKvQp4SfM_6.npy
06kKvQp4SfM_cap_6.npy
06kKvQp4SfM_7.npy
06kKvQp4SfM_cap_7.npy
06kKvQp4SfM_8.npy
06kKvQp4SfM_cap_8.npy
06kKvQp4SfM_9.npy
06kKvQp4SfM_cap_9.npy
06kKvQp4SfM_10.npy
06kKvQp4SfM_cap_10.npy
06kKvQp4SfM_11.npy
06kKvQp4SfM_cap_11.npy
06kKvQp4SfM_12.npy
06kKvQp4SfM_cap_12.npy
06kKvQp4SfM_13.npy
