### Purpose of this notebook
The purpose of this notebook is to download WLASL dataset and upload to S3 bucket after processing as Numpy

The videos will be labeled by the 'gloss' name from the Json.
Some videos only have the required sign between Frame-A and Frame-B. These will be cropped accordingly

In [1]:
#Install missing packages
#!pip install boto3
#!pip install pyarrow
#!pip install fastparquet
#!pip install s3fs
#!pip install mediapipe
#!pip install kaggle

In [30]:
# Import required libraries
import boto3 #Video files get read through this
import cv2
import os
import pandas as pd
import math
import numpy as np
import random
import io
import s3fs # Parquet files get read through this
import zlib # For compression
import time # To calculate download time
import configparser
import requests
import psutil # Checks memory usage
import tempfile
import json
import yt_dlp
import subprocess
from tqdm import tqdm
#import mediapipe as mp


In [3]:
import configparser

config = configparser.ConfigParser()
config.read('/home/ec2-user/.aws/credentials')

['/home/ec2-user/.aws/credentials']

In [4]:
aws_access_key_id = config["default"]['aws_access_key_id']
aws_secret_access_key = config["default"]['aws_secret_access_key']
bucket_name = 'asl-capstone'
prefix = '/msasl/RGB/'
save_path = '/content/temp_folder'
s3_URI = 's3://asl-capstone/msasl/RGB/'

In [5]:
# Create an s3 object
s3 = boto3.client('s3',aws_access_key_id = aws_access_key_id, aws_secret_access_key = aws_secret_access_key,
                  region_name = 'us-west-2')

In [6]:
# Create an S3FS object
fs = s3fs.S3FileSystem(key=aws_access_key_id, secret=aws_secret_access_key) # Define an S3FS object

In [7]:
# Let's read the WLASL json file to map the number to the word
filename = "wlasl-dataset/WLASL_v0.3.json"
wlasl_df = pd.read_json(filename)

In [8]:
display(wlasl_df.head())
print(wlasl_df.shape)
#print(wlasl_df['instances'][0])

Unnamed: 0,gloss,instances
0,book,"[{'bbox': [385, 37, 885, 720], 'fps': 25, 'fra..."
1,drink,"[{'bbox': [551, 68, 1350, 1080], 'fps': 25, 'f..."
2,computer,"[{'bbox': [0, 0, 360, 240], 'fps': 25, 'frame_..."
3,before,"[{'bbox': [0, 0, 360, 240], 'fps': 25, 'frame_..."
4,chair,"[{'bbox': [0, 0, 360, 240], 'fps': 25, 'frame_..."


(2000, 2)


In [9]:
# Not all videos from the JSON exist. Let's create a function to check which ones exist
def video_features(json_data):
    video_list = []
    frame_list = []
    gloss = json_data['gloss']
    json_list = json_data['instances']
    for id in json_list:
        vid = id['video_id']
        frame_start = id['frame_start']
        frame_end = id['frame_end']
        bbox = id['bbox']
        if os.path.exists(f"wlasl-dataset/videos/{vid}.mp4"):
            features = {'word':gloss,'id':vid,'start':frame_start, 'end':frame_end,'box':bbox}
            #yield features
            video_list.append(features)
    return video_list

In [10]:
# Load the data and get the features
with open("wlasl-dataset/WLASL_v0.3.json",'r') as file:
    json_data = file.read()
json_data = json.loads(json_data)

In [11]:
# Let's extract all relevant features from the JSON
video_list = []
for data in json_data:
    video_list.append(video_features(data))

In [12]:
# Let's check how many unique words we have
word_list = []
for video in video_list:
    for vid in video:
        word_list.append(vid['word'])

In [13]:
print(len(word_list))
word_list = list(set(word_list))
print(len(word_list))
print(word_list[:10])

11980
2000
['shower', 'left', 'hippopotamus', 'manage', 'event', 'normal', 'lend', 'flirt', 'u', 'snack']


In [14]:
# Let's now work on the MS-ASL Dataset
filenames = ['MSASL/MSASL_train.json','MSASL/MSASL_val.json','MSASL/MSASL_test.json']
msasl_words = []
for file in filenames:
    with open("MSASL/MSASL_train.json",'r') as file:
        json_data = file.read()
    json_data = json.loads(json_data)
    for word in json_data:
        msasl_words.append(word)

In [15]:
print(msasl_words[0])

{'org_text': 'match [light-a-MATCH]', 'clean_text': 'match', 'start_time': 0.0, 'signer_id': 0, 'signer': 0, 'start': 0, 'end': 83, 'file': 'match light-a-MATCH', 'label': 830, 'height': 360.0, 'fps': 30.0, 'end_time': 2.767, 'url': 'https://www.youtube.com/watch?v=C37R_Ix8-qs', 'text': 'match', 'box': [0.05754461884498596, 0.21637457609176636, 1.0, 0.7300844192504883], 'width': 640.0}


In [16]:
def msasl_features(msasl_word):
    gloss = msasl_word['clean_text']
    url = msasl_word['url']
    frame_start = msasl_word['start_time']*msasl_word['fps']
    frame_end = msasl_word['end_time']*msasl_word['fps']
    bbox = msasl_word['box']
    features = {'word':gloss,'url':url,'start':frame_start, 'end':frame_end,'box':bbox}
    return features
    

In [17]:
msasl_data = [] # Store all relevant information about the MSASL dataset
for word in msasl_words:
    msasl_data.append(msasl_features(word))    

In [18]:
print(msasl_data[:4])

[{'word': 'match', 'url': 'https://www.youtube.com/watch?v=C37R_Ix8-qs', 'start': 0.0, 'end': 83.00999999999999, 'box': [0.05754461884498596, 0.21637457609176636, 1.0, 0.7300844192504883]}, {'word': 'fail', 'url': 'https://www.youtube.com/watch?v=PIsUJl8BN_I', 'start': 0.0, 'end': 74.0, 'box': [0.0657794177532196, 0.16717177629470825, 0.9392627477645874, 0.9187960028648376]}, {'word': 'laugh', 'url': 'www.youtube.com/watch?v=9FdHlMOnVjg', 'start': 0.0, 'end': 30.988979999999998, 'box': [0.13188594579696655, 0.32334136962890625, 1.0, 0.831863522529602]}, {'word': 'book', 'url': 'https://www.youtube.com/watch?v=J7tP98oDxqE', 'start': 0.0, 'end': 66.0, 'box': [0.05569887161254883, 0.25173279643058777, 0.9968159794807434, 0.9009996652603149]}]


In [19]:
matching_data = [x for x in msasl_data if x['word'] in word_list]
print(matching_data[:4])

[{'word': 'match', 'url': 'https://www.youtube.com/watch?v=C37R_Ix8-qs', 'start': 0.0, 'end': 83.00999999999999, 'box': [0.05754461884498596, 0.21637457609176636, 1.0, 0.7300844192504883]}, {'word': 'fail', 'url': 'https://www.youtube.com/watch?v=PIsUJl8BN_I', 'start': 0.0, 'end': 74.0, 'box': [0.0657794177532196, 0.16717177629470825, 0.9392627477645874, 0.9187960028648376]}, {'word': 'laugh', 'url': 'www.youtube.com/watch?v=9FdHlMOnVjg', 'start': 0.0, 'end': 30.988979999999998, 'box': [0.13188594579696655, 0.32334136962890625, 1.0, 0.831863522529602]}, {'word': 'book', 'url': 'https://www.youtube.com/watch?v=J7tP98oDxqE', 'start': 0.0, 'end': 66.0, 'box': [0.05569887161254883, 0.25173279643058777, 0.9968159794807434, 0.9009996652603149]}]


In [20]:
print(len(matching_data))

42135


In [21]:
# Sort all the elements by the word so that they come sequentially
matching_data = sorted(matching_data, key = lambda x: x['word'])

In [22]:
print(matching_data[:10])

[{'word': 'about', 'url': 'https://www.youtube.com/watch?v=mCjHYreiZ24', 'start': 5045.98896, 'end': 5194.9998, 'box': [0.13303735852241516, 0.2521272599697113, 0.89015132188797, 0.7639555335044861]}, {'word': 'about', 'url': 'https://www.youtube.com/watch?v=Dax964vUumQ', 'start': 23602.96341, 'end': 23777.98821, 'box': [0.057298243045806885, 0.0033825039863586426, 1.0, 0.8730295896530151]}, {'word': 'about', 'url': 'https://www.youtube.com/watch?v=Gs9zBeSIVIE', 'start': 9801.9882, 'end': 9900.97911, 'box': [0.05145266652107239, 0.36207592487335205, 1.0, 0.6267668604850769]}, {'word': 'about', 'url': 'www.youtube.com/watch?v=Tfj63Ai9tfw', 'start': 0.0, 'end': 36.994968, 'box': [0.017124980688095093, 0.3652523458003998, 1.0, 0.912168025970459]}, {'word': 'about', 'url': 'https://www.youtube.com/watch?v=xaHll2DRUkg', 'start': 12.010554, 'end': 57.991257, 'box': [0.11106356978416443, 0.3129425644874573, 1.0, 0.70534348487854]}, {'word': 'about', 'url': 'https://www.youtube.com/watch?v=xaH

In [23]:
def check_video(url): #Checks if a YouTube video is available and returns True if available and False if not
    try:
        result = subprocess.run(["yt-dlp", "--simulate", "--get-url", url], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        return True
    except subprocess.CalledProcessError:
        return False # Video is not available in YouTube

In [24]:
def download_vid(url, filename):
    try:
        filename = "temp_folder/"+filename
        if os.path.exists(filename):
            os.remove(filename)
            print("Deleted file")
        if not os.path.exists(filename):
            command = ["yt-dlp","-o",filename,url]
            subprocess.run(command)
            return True
    except:
        return False
        

In [25]:
def crop_center_square(frame):
    y, x = frame.shape[0:2]
    if x > y:
        start_x = (x-y)/2
        end_x = start_x + y
        start_x = int(start_x)
        end_x = int(end_x)
        return frame[:, int(start_x):int(end_x)]
    else:
        return frame

In [26]:
def load_video(filename, max_frames=0, resize=(256,256)):
    filename = "temp_folder/"+filename
    video_capture = cv2.VideoCapture(filename)
    frames = []
    try:
      while video_capture.isOpened():
        ret, frame = video_capture.read()
        if not ret:
          break
        frame = crop_center_square(frame)
        frame = cv2.resize(frame, resize)
        frame = frame[:, :, [2, 1, 0]]
        frames.append(frame)

        if len(frames) == max_frames:
          break
    finally:
      video_capture.release()
    return np.array(frames) / 255.0

In [27]:
parquet_df = pd.DataFrame()
# with fs.open(f"s3://asl-capstone/msasl/videos/master_file.parquet","rb") as f:
     # parquet_df = pd.read_parquet(f)

In [28]:
def save_frame_file(video_array, filename, start_frame, end_frame, word):
    s3_uri = "s3://asl-capstone/msasl/RGB/"
    filename = filename+".npy"
    if end_frame!=-1:
        video_array = video_array[int(start_frame):int(end_frame)].astype(np.float32)
    else:
        video_array = video_array.astype(np.float32)
        
    with fs.open(f"{s3_uri+filename}","wb") as f:
        np.save(f,video_array)
    try:
        with fs.open(f"s3://asl-capstone/msasl/master_file_Deepak.parquet","rb") as f:
            parquet_df = pd.read_parquet(f)
    except:
        parquet_df = pd.DataFrame()
        print("Parquet does not exist yet")
    temp_df = pd.DataFrame({'caption':[word], 'path':[s3_uri+filename]})
    parquet_df = pd.concat([parquet_df,temp_df], ignore_index=True)
    with fs.open(f"s3://asl-capstone/msasl/master_file_Deepak.parquet","wb") as f:
        parquet_df.to_parquet(f)

    

In [29]:
%%time
filename_counter = {}
for video in tqdm(matching_data[:4], desc="Processing videos"):
    try:
        word = video['word']
        url = video['url']
        start_frame = video['start']
        end_frame = video['end']
        if word in filename_counter:
            filename_counter[word]+=1
            filename = f"{word}_{filename_counter[word]}"
        else:
            filename_counter[word] = 1
            filename = f"{word}"
        #print(filename, url)
        downloaded_filename = filename+".mp4"
        download_vid(url, downloaded_filename)
        video_array = load_video(downloaded_filename)
        save_frame_file(video_array, filename, start_frame, end_frame, word)
        local_filename = "temp_folder/"+downloaded_filename
        os.remove(local_filename)
    except Exception as e:
        # print(f"Error with video download. Skipping it")
        print(e)

[youtube] Extracting URL: https://www.youtube.com/watch?v=mCjHYreiZ24
[youtube] mCjHYreiZ24: Downloading webpage
[youtube] mCjHYreiZ24: Downloading ios player API JSON
[youtube] mCjHYreiZ24: Downloading android player API JSON
[youtube] mCjHYreiZ24: Downloading m3u8 information
[info] mCjHYreiZ24: Downloading 1 format(s): 18
[download] Destination: temp_folder/about.mp4
[download] 100% of   27.23MiB in 00:00:00 at 37.40MiB/s    
Parquet does not exist yet
[youtube] Extracting URL: https://www.youtube.com/watch?v=Dax964vUumQ
[youtube] Dax964vUumQ: Downloading webpage
[youtube] Dax964vUumQ: Downloading ios player API JSON
[youtube] Dax964vUumQ: Downloading android player API JSON


ERROR: [youtube] Dax964vUumQ: Private video. Sign in if you've been granted access to this video


[Errno 2] No such file or directory: 'temp_folder/about_2.mp4'
[youtube] Extracting URL: https://www.youtube.com/watch?v=Gs9zBeSIVIE
[youtube] Gs9zBeSIVIE: Downloading webpage
[youtube] Gs9zBeSIVIE: Downloading ios player API JSON
[youtube] Gs9zBeSIVIE: Downloading android player API JSON
[youtube] Gs9zBeSIVIE: Downloading m3u8 information
[info] Gs9zBeSIVIE: Downloading 1 format(s): 18
[download] Destination: temp_folder/about_3.mp4
[download] 100% of   16.71MiB in 00:00:00 at 19.05MiB/s    
[generic] Extracting URL: www.youtube.com/watch?v=Tfj63Ai9tfw
[youtube] Extracting URL: http://www.youtube.com/watch?v=Tfj63Ai9tfw
[youtube] Tfj63Ai9tfw: Downloading webpage
[youtube] Tfj63Ai9tfw: Downloading ios player API JSON




[youtube] Tfj63Ai9tfw: Downloading android player API JSON
[Errno 2] No such file or directory: 'temp_folder/about_4.mp4'
CPU times: user 36.7 s, sys: 5.59 s, total: 42.2 s
Wall time: 40.5 s


ERROR: [youtube] Tfj63Ai9tfw: Private video. Sign in if you've been granted access to this video


In [22]:
def get_instance_id():
    """Get current instance ID from metadata"""
    url = "http://169.254.169.254/latest/meta-data/instance-id"
    response = requests.get(url)
    return response.text

In [23]:
def stop_instance(instance_id, region_name='us-west-2'):
    """Stop the EC2 instance"""
    ec2 = boto3.client('ec2', aws_access_key_id = aws_access_key_id, aws_secret_access_key = aws_secret_access_key, region_name=region_name)
    ec2.stop_instances(InstanceIds=[instance_id])

In [24]:
# Get the current instance ID
instance_id = get_instance_id()
print(instance_id)
# Stop the instance
stop_instance(instance_id)

i-0e01b9bb0332fcb0b
