### Purpose of this notebook
The purpose of this notebook is to convert 10 videos and its parquet into video frames to determine time taken for loading

In [1]:
#Install missing packages
#!pip install boto3
#!pip install pyarrow
#!pip install fastparquet
#!pip install s3fs
#!pip install mediapipe

In [2]:
# Import required libraries
import boto3 #Video files get read through this
import cv2
import os
import pandas as pd
import math
import numpy as np
import random
import io
import s3fs # Parquet files get read through this
import zlib # For compression
import time # To calculate download time
import configparser
import requests
import psutil # Checks memory usage
import tempfile
#import mediapipe as mp


In [3]:
# Let's read in the credentials file
config = configparser.ConfigParser()
config.read('credentials')

['credentials']

In [4]:
aws_access_key_id = config["default"]['aws_access_key_id']
aws_secret_access_key = config["default"]['aws_secret_access_key']
bucket_name = 'asl-capstone'
prefix = 'youtube-asl/1000-samples/'
save_path = '/content/temp_folder'
s3_URI = 's3://asl-capstone/'

In [5]:
# Create an s3 object
s3 = boto3.client('s3',aws_access_key_id = aws_access_key_id, aws_secret_access_key = aws_secret_access_key,
                  region_name = 'us-west-2')

In [6]:
# Let's get a list of all the videos
paginator = s3.get_paginator('list_objects_v2')
video_files = []
for page in paginator.paginate(Bucket = bucket_name, Prefix = 'youtube-asl/1000-samples/'):
    video_files.extend(content['Key'] for content in page.get('Contents',[]) if content['Key'].endswith(('.mkv','.webm','.mp4')))
print(len(video_files))

1152


In [7]:
# Let's get a list of all the parquet files in the S3 bucket
paginator = s3.get_paginator('list_objects_v2')
caption_files = []
for page in paginator.paginate(Bucket = bucket_name, Prefix = prefix+'parsed/'):
    caption_files.extend(content['Key'] for content in page.get('Contents',[]))
print(len(caption_files))

1155


In [8]:
video_names = [os.path.splitext(os.path.basename(x))[0] for x in video_files]
caption_names = [os.path.splitext(os.path.splitext(os.path.basename(x))[0])[0] for x in caption_files]

In [9]:
print(video_names[0:4])
print(caption_names[0:4])

['--6bmFM9wT4', '-9aGqJpaN7c', '-FSlHH2ReLA', '-GtDaiSJkSQ']
['--6bmFM9wT4', '-9aGqJpaN7c', '-FSlHH2ReLA', '-GtDaiSJkSQ']


In [10]:
missing_names  = [x for x in caption_names if x not in video_names]
print(missing_names)

['combined_data', 'master_parquet', 'rncXf-_rDjg']


In [11]:
caption_files = [x for x in caption_files if os.path.splitext(os.path.splitext(os.path.basename(x))[0])[0] not in ['combined_data', 'master_parquet', 'rncXf-_rDjg']]
print(len(caption_files))


1152


In [12]:
# Let's read in one parquet file using S3FS
fs = s3fs.S3FileSystem(key=aws_access_key_id, secret=aws_secret_access_key)
with fs.open(f"{bucket_name}/{caption_files[0]}", 'rb') as f:
    df = pd.read_parquet(f)
df.head()

Unnamed: 0,File Name,Start Timestamp,End Timestamp,Caption,frame_rate,start_time_seconds,end_time_seconds,start_frame,end_frame,start_frame_float,end_frame_float,UID
0,youtube-asl/1000-samples/--6bmFM9wT4.ase.vtt,00:00:06.320,00:00:07.440,Hello everyone.,29.97003,6.32,7.44,189,223,189.410589,222.977023,0_youtube-asl/1000-samples/--6bmFM9wT4.ase.vtt
1,youtube-asl/1000-samples/--6bmFM9wT4.ase.vtt,00:00:07.440,00:00:10.020,Welcome to Sign1News.,29.97003,7.44,10.02,223,300,222.977023,300.2997,1_youtube-asl/1000-samples/--6bmFM9wT4.ase.vtt
2,youtube-asl/1000-samples/--6bmFM9wT4.ase.vtt,00:00:10.020,00:00:11.220,I'm Candace Jones.,29.97003,10.02,11.22,300,336,300.2997,336.263736,2_youtube-asl/1000-samples/--6bmFM9wT4.ase.vtt
3,youtube-asl/1000-samples/--6bmFM9wT4.ase.vtt,00:00:11.220,00:00:14.500,Here are your top stories for today.,29.97003,11.22,14.5,336,435,336.263736,434.565435,3_youtube-asl/1000-samples/--6bmFM9wT4.ase.vtt
4,youtube-asl/1000-samples/--6bmFM9wT4.ase.vtt,00:00:16.840,00:00:25.420,We are about a week away from the start of\nth...,29.97003,16.84,25.42,505,762,504.695305,761.838162,4_youtube-asl/1000-samples/--6bmFM9wT4.ase.vtt


In [13]:
def crop_center_square(frame):
    y, x = frame.shape[0:2]
    if x > y:
        start_x = (x-y)/2
        end_x = start_x + y
        start_x = int(start_x)
        end_x = int(end_x)
        return frame[:, int(start_x):int(end_x)]
    else:
        return frame

In [14]:
def load_video(path, max_frames=0, resize=(256,256)):
    local_video_path = "temp_folder/"+os.path.basename(path)
    s3.download_file(bucket_name,path,local_video_path)
    #print(local_video_path)
    video_capture = cv2.VideoCapture(local_video_path)
    frames = []
    try:
      while video_capture.isOpened():
        ret, frame = video_capture.read()
        if not ret:
          #print("Did not read the video")
          break
        frame = crop_center_square(frame)
        frame = cv2.resize(frame, resize)
        frame = frame[:, :, [2, 1, 0]]
        frames.append(frame)

        if len(frames) == max_frames:
          break
    finally:
      video_capture.release()
      os.remove(local_video_path)
      #temp_file.close()
    return np.array(frames) / 255.0

In [15]:
def read_caption(caption_file):
    try:
        with fs.open(f"{bucket_name}/{caption_file}", 'rb') as f:
            df = pd.read_parquet(f)
        return df
    except:
        print("error")        

In [16]:
# def extract_number(text):
#     try:
#         parts = text.split('_cap_')
#         if len(parts) > 1:
#             return parts[1].split('.')[0]
#         else:
#             return None
#     except:
#         return None

In [17]:
# def extract_video_number(text):
#     try:
#         parts = text.split('_')
#        # print(parts)
#         if len(parts)>1:
#             return parts[1].split('.')[0]
#         else:
#             return None
#     except:
#         return None

In [18]:
# def file_name_generator(filename):
#     try:
#         filename = os.path.basename(filename).rsplit('.',1)[0]
#         # Let's get a list of all the Numpy Caption Files
#         paginator = s3.get_paginator('list_objects_v2')
#         caption_files = []
#         for page in paginator.paginate(Bucket = bucket_name, Prefix = 'youtube-asl/test_sample/numpy_files/RGB/'):
#             caption_files.extend(content['Key'] for content in page.get('Contents',[]) if '_cap' in content['Key'])
#         file_of_interest = [x for x in caption_files if filename in x]
#         #print(len(file_of_interest))
#         #print(file_of_interest)
#         if len(file_of_interest) == 1 and "_cap.npy" in file_of_interest[0]:
#             return (filename+"_cap_1.npy")
#         else:
#             file_of_interest = [os.path.basename(x) for x in file_of_interest]
#             file_of_interest = [x.replace('.npy','') for x in file_of_interest]
#             file_of_interest = [extract_number(x) for x in file_of_interest]
#             file_of_interest = [int(x) for x in file_of_interest if x !=None]
#             file_of_interest.sort(reverse=True)
#             #print(file_of_interest)
#             if len(file_of_interest) == 0:
#                 return (filename+"_cap.npy")
#             else:
#                 suffix_val = int(file_of_interest[0])+1
#                 return (filename+"_cap_"+str(suffix_val)+".npy")
#     except:
#         suffix_val = random.randint(1245093,12456893)
#         return (filename+"_cap_"+str(suffix_val)+".npy")
        
        

In [19]:
# def video_name_generator(filename):
#     try:
#         filename = os.path.basename(filename).rsplit('.',1)[0]
#         # Let's get a list of all the Numpy Caption Files
#         paginator = s3.get_paginator('list_objects_v2')
#         caption_files = []
#         for page in paginator.paginate(Bucket = bucket_name, Prefix = 'youtube-asl/test_sample/numpy_files/RGB/'):
#             caption_files.extend(content['Key'] for content in page.get('Contents',[]) if '_cap' not in content['Key'])
#         file_of_interest = [x for x in caption_files if filename in x]
#         #print((file_of_interest))
#         if len(file_of_interest) == 1:
#             return (filename+"_1.npy")
#         else:
#             file_of_interest = [os.path.basename(x) for x in file_of_interest]
#             file_of_interest = [x.replace('.npy','') for x in file_of_interest]
#             file_of_interest = [extract_video_number(x) for x in file_of_interest]
#             file_of_interest = [int(x) for x in file_of_interest if x !=None]
#             file_of_interest.sort(reverse=True)
#             if len(file_of_interest) == 0:
#                 return (filename+".npy")
#             else:
#                 suffix_val = int(file_of_interest[0])+1
#                 return (filename+"_"+str(suffix_val)+".npy")
#     except:
#         return None

In [20]:
def save_frame_file(video_array, filename, caption, start_frame, end_frame, counter, s3_uri):
    caption_array = np.array(caption, dtype = str)
    numpy_array = video_array[int(start_frame):int(end_frame)]
    print(filename)
    numpy_array = numpy_array.astype('float32')
    #vid_filename = video_name_generator(filename)
    #if vid_filename == None:
    #    new_filename = file_name_generator(filename)
    #print(vid_filename)
    with fs.open(f"{s3_uri+filename}.npy","wb") as f:
        np.save(f,numpy_array)#.astype('float32')
    #new_filename = file_name_generator(filename)
    #print(new_filename)
    #with fs.open(f"{s3_uri+filename}_cap.npy","wb") as f:
    #    np.save(f,caption_array)

In [21]:
# with fs.open(f"s3://asl-capstone/youtube-asl/1000-samples/numpy_files/masterfile.parquet","rb") as f:
#     parquet_df = pd.read_parquet(f)

In [22]:
# display(parquet_df.head())

In [23]:
temp_video = video_files[28:100]
temp_caption = caption_files[28:100]

In [24]:
parquet_df = pd.DataFrame()

In [25]:
num_vid_counter = 0

In [None]:
for video, caption in zip(temp_video, temp_caption):
    num_vid_counter+=1
    with open('completed_vid_counter.txt','w') as num_vid:
        num_vid.write(str(num_vid_counter))
    try:
        master_caption = []
        master_path = []
        filename = os.path.basename(video)
        video_path = filename
        video_array = load_video(video)
        temp_df = read_caption(caption)
        print(f"Reading {filename}")
        filename = filename.split('.')[0]
        s3_uri = "s3://asl-capstone/youtube-asl/1000-samples/numpy_files/"
        counter = 0
        for _, row in temp_df.iterrows():
            new_filename = filename+"_"+str(counter)
            master_caption.append(row['Caption'])
            master_path.append((s3_uri+new_filename+".npy"))
            save_frame_file(video_array, new_filename, row['Caption'],int(row['start_frame']), int(row['end_frame']), counter, s3_uri)
            counter+=1
            temp_df = pd.DataFrame({'caption':master_caption, 'path':master_path})
            parquet_df = pd.concat([parquet_df,temp_df], ignore_index=True)
            with fs.open(f"s3://asl-capstone/youtube-asl/1000-samples/numpy_files/masterfile.parquet","wb") as f:
                parquet_df.to_parquet(f)
            print("Clearing lists")
            master_caption.clear()
            master_path.clear()
    except Exception as e:
        print(e)
        print("Errored on video. Moving to next file")
        with open('error_files.txt','a') as file:
            file.write(video)

Reading 0L6S0b14ZmI.webm
0L6S0b14ZmI_0
Clearing lists
0L6S0b14ZmI_1
Clearing lists
0L6S0b14ZmI_2
Clearing lists
0L6S0b14ZmI_3
Clearing lists
0L6S0b14ZmI_4
Clearing lists
0L6S0b14ZmI_5
Clearing lists
0L6S0b14ZmI_6
Clearing lists
0L6S0b14ZmI_7
Clearing lists
0L6S0b14ZmI_8
Clearing lists
0L6S0b14ZmI_9
Clearing lists


In [None]:
def get_instance_id():
    """Get current instance ID from metadata"""
    url = "http://169.254.169.254/latest/meta-data/instance-id"
    response = requests.get(url)
    return response.text

In [None]:
def stop_instance(instance_id, region_name='us-west-2'):
    """Stop the EC2 instance"""
    ec2 = boto3.client('ec2', aws_access_key_id = aws_access_key_id, aws_secret_access_key = aws_secret_access_key, region_name=region_name)
    ec2.stop_instances(InstanceIds=[instance_id])

In [None]:
# Get the current instance ID
instance_id = get_instance_id()
print(instance_id)
# Stop the instance
stop_instance(instance_id)