### Purpose of notebook:
The purpose of this notebook is to download 1K videos randomly sampled from the YouTube-ASL dataset. The videos are downloaded into the "youtube-asl" S3 bucket located at https://s3.console.aws.amazon.com/s3/buckets/youtube-asl

In [1]:
# Install boto3 to access S3 and yt-dlp for Downloading YouTube videos
!pip install boto3 yt-dlp

Collecting boto3
  Downloading boto3-1.28.62-py3-none-any.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.8/135.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting yt-dlp
  Downloading yt_dlp-2023.9.24-py2.py3-none-any.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting botocore<1.32.0,>=1.31.62 (from boto3)
  Downloading botocore-1.31.62-py3-none-any.whl (11.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.2/11.2 MB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting s3transfer<0.8.0,>=0.7.0 (from boto3)
  Downloading s3transfer-0.7.0-py3-none-any.whl (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.8/79.8 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting mutagen (from yt-dl

In [2]:
# Mount the Google Drive at /content/drive.
# Later cells read the video-ID lists, videos and caption files from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
# Import required libraries
import os
import random
from getpass import getpass

import boto3
import yt_dlp

In [None]:
# Paths to the full list of YouTube-ASL video IDs and to the log of IDs that were already processed
file_path = '/content/drive/My Drive/MIDS Capstone/Data/YoutubeASL/youtube_asl_video_ids.txt'
processed_ids_file_path = '/content/drive/My Drive/MIDS Capstone/Data/YoutubeASL/processed_video_ids.txt'

# Number of videos to sample in this run.
# NOTE(review): the notebook's stated goal is 1000 videos, but the code sampled only 1; the value is
# kept at 1 here so behavior is unchanged. Raise it for a full run.
num_videos_to_sample = 1

# The processed-IDs log is only created when the download loop appends to it,
# so on a first run it may not exist yet; treat that as "nothing processed".
if os.path.exists(processed_ids_file_path):
    with open(processed_ids_file_path, 'r') as file:
        processed_videos = [line.strip() for line in file.read().splitlines() if line.strip()]
else:
    processed_videos = []

with open(file_path, 'r') as file:
    # Skip blank lines so an empty string is never treated as a video ID
    youtube_ids = [line.strip() for line in file.read().splitlines() if line.strip()]

# Set lookups keep the filtering linear instead of quadratic in the number of IDs
processed_lookup = set(processed_videos)
video_ids = [video_id for video_id in youtube_ids if video_id not in processed_lookup]

random.shuffle(video_ids)  # Shuffle the IDs so the leading slice is a random sample
lucky_ids = video_ids[:num_videos_to_sample]

# IDs that were not sampled; used as replacements when a sampled video cannot be downloaded
lucky_lookup = set(lucky_ids)
backup_ids = [video_id for video_id in video_ids if video_id not in lucky_lookup]

In [4]:
# Collect the names of all video files (.mkv / .webm / .mp4) stored in the transcript folder
video_path = '/content/drive/My Drive/MIDS Capstone/100_videos/video_with_transcript/'
file_names = list(
    filter(
        lambda entry: entry.endswith(('.mkv', '.webm', '.mp4')),
        os.listdir(video_path),
    )
)

In [5]:
# Randomly pick (up to) 10 video files: shuffle the list in place, then keep the leading entries.
# NOTE(review): this rebinds `lucky_ids` to file names (with extension); the download loop further
# down iterates `lucky_ids` as bare YouTube IDs, so the two workflows should not share a kernel run.
random.shuffle(file_names)
lucky_ids = [file_names[position] for position in range(min(10, len(file_names)))]
print(lucky_ids)

['aoLQ0VchSec.mkv', 'dzWgVm2oY44.webm', 'UEryLHbfb0M.webm', 'Dmvi3_q3gMc.mkv', 'oOkSSJbNv68.webm', '06kKvQp4SfM.webm', '5J8a2CaXWvk.webm', '8VTAvocbMpI.mkv', '0XGfpv6PUq4.mkv', 'esx9dGfUuto.webm']


In [6]:
# Find the parsed caption (.parquet) file(s) that belong to each sampled video.
# A caption belongs to a video when its file name starts with the video's name without extension.
parquet_dir = '/content/drive/My Drive/MIDS Capstone/100_videos/video_with_transcript/Parsed_Parquet/'

# List the directory once instead of once per video, and report a failure instead of hiding it
try:
    parquet_files = [entry for entry in os.listdir(parquet_dir) if entry.endswith('.parquet')]
except OSError as err:
    print(f"Could not list caption directory {parquet_dir}: {err}")
    parquet_files = []

caption_file_name = []
for vid_id in lucky_ids:
    video_file_name = os.path.splitext(vid_id)[0]
    matches = [entry for entry in parquet_files if entry.startswith(video_file_name)]
    if not matches:
        # Make the gap visible: code that pairs videos and captions by position would otherwise go out of step
        print(f"No caption parquet found for {vid_id}")
    caption_file_name.extend(matches)
print(caption_file_name)

['aoLQ0VchSec.en.parquet', 'dzWgVm2oY44.en.parquet', 'UEryLHbfb0M.en-eEY6OEpapPo.parquet', 'Dmvi3_q3gMc.en-CA.parquet', 'oOkSSJbNv68.en.parquet', '06kKvQp4SfM.en-0hllRZe4s5s.parquet', '5J8a2CaXWvk.en.parquet', '8VTAvocbMpI.en.parquet', '0XGfpv6PUq4.en.parquet', 'esx9dGfUuto.en-xDMNrYfabLQ.parquet']


In [12]:
# AWS credentials and S3 settings.
# Credentials are never stored in the notebook: they are taken from the standard AWS environment
# variables when set, otherwise the user is prompted (input is not echoed and not saved in the output).
# SECURITY: a key pair was previously committed in this cell; it must be treated as leaked and rotated.
aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID') or getpass('AWS access key ID: ')
aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY') or getpass('AWS secret access key: ')
bucket_name = 'asl-capstone'              # Target S3 bucket
prefix = 'youtube-asl/test_sample/'       # Key prefix ("folder") inside the bucket
save_path = '/content/temp_folder'        # Local scratch directory for downloads

In [8]:
# Create the S3 client used by every upload/list cell below, authenticated with the
# credentials defined in the settings cell and bound to the us-west-2 region
s3 = boto3.client(
    service_name='s3',
    region_name='us-west-2',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

In [9]:
# Folder holding the downloaded videos; the parsed caption parquet files live in a sub-folder of it
video_path = '/content/drive/My Drive/MIDS Capstone/100_videos/video_with_transcript/'
parquet_path = video_path + 'Parsed_Parquet/'

In [14]:
# Upload every sampled video together with its caption parquet file(s).
# Captions are matched to a video by file-name stem rather than by list position: pairing the two
# lists positionally attaches the wrong transcript to every later video as soon as one video has
# zero or several caption files, and silently drops the tail when the lists differ in length.
for video in lucky_ids:
  video_stem = os.path.splitext(video)[0]
  matching_captions = [name for name in caption_file_name if name.startswith(video_stem)]
  if not matching_captions:
    # Keep the sample consistent: every uploaded video must come with a transcript
    print(f'Skipping {video}: no caption file found')
    continue

  s3_video_path = f'{prefix}{video}'
  temp_video = video_path+video
  print(f'Uploading {temp_video} to {s3_video_path}')
  s3.upload_file(temp_video,bucket_name,s3_video_path)

  for file in matching_captions:
    temp_file = parquet_path+file
    s3_file_path = f'{prefix}{file}'
    print(f'Uploading {temp_file} to {s3_file_path}')
    s3.upload_file(temp_file,bucket_name,s3_file_path)

Uploading /content/drive/My Drive/MIDS Capstone/100_videos/video_with_transcript/aoLQ0VchSec.mkv to youtube-asl/test_sample/aoLQ0VchSec.mkv
Uploading /content/drive/My Drive/MIDS Capstone/100_videos/video_with_transcript/Parsed_Parquet/aoLQ0VchSec.en.parquet to youtube-asl/test_sample/aoLQ0VchSec.en.parquet
Uploading /content/drive/My Drive/MIDS Capstone/100_videos/video_with_transcript/dzWgVm2oY44.webm to youtube-asl/test_sample/dzWgVm2oY44.webm
Uploading /content/drive/My Drive/MIDS Capstone/100_videos/video_with_transcript/Parsed_Parquet/dzWgVm2oY44.en.parquet to youtube-asl/test_sample/dzWgVm2oY44.en.parquet
Uploading /content/drive/My Drive/MIDS Capstone/100_videos/video_with_transcript/UEryLHbfb0M.webm to youtube-asl/test_sample/UEryLHbfb0M.webm
Uploading /content/drive/My Drive/MIDS Capstone/100_videos/video_with_transcript/Parsed_Parquet/UEryLHbfb0M.en-eEY6OEpapPo.parquet to youtube-asl/test_sample/UEryLHbfb0M.en-eEY6OEpapPo.parquet
Uploading /content/drive/My Drive/MIDS Capsto

In [None]:
def _remove_local_files(directory):
    """Best-effort removal of every regular file directly inside `directory`.

    A missing directory is not an error (there is nothing to clean up). Files that
    cannot be removed are reported and skipped so the remaining files are still handled.
    """
    if not os.path.isdir(directory):
        return
    for file_name in os.listdir(directory):
        file_path = os.path.join(directory, file_name)
        try:
            if os.path.isfile(file_path):
                os.remove(file_path)
        except OSError as err:
            print(f"Error removing file {file_path}: {err}")


def download_and_upload(video_id, save_path, s3, bucket_name, prefix):
    """Download one YouTube video (plus subtitles) and move the resulting files to S3.

    Every regular file found in `save_path` after the download is uploaded to
    `bucket_name` under the key `prefix + file name` and then deleted locally.

    Args:
        video_id: YouTube video ID (the `v=` value of the watch URL).
        save_path: Local scratch directory used for the download; created if missing.
        s3: Client object exposing `upload_file(local_path, bucket, key)`.
        bucket_name: Name of the destination S3 bucket.
        prefix: Key prefix prepended to each uploaded file name.

    Returns:
        True when the download and all uploads succeeded. False when any step failed;
        in that case the error is printed and leftover local files are removed.
    """
    # Download preferences.
    # NOTE(review): 'write_auto_sub' and 'sub_langs' look like CLI-style names; the YoutubeDL
    # Python API documents 'writeautomaticsub' and 'subtitleslangs' instead, so these two keys
    # are presumably ignored. They are kept unchanged here because switching them would change
    # which subtitle files get downloaded. Confirm the intended behavior before renaming.
    ydl_opts = {
        'outtmpl': f"{save_path}/{video_id}",
        'write_auto_sub': True,   # Download automatically generated subtitles
        'sub_langs': 'en',        # Set language to English
        'writesubtitles': True,
    }

    try:
        # Make sure the scratch directory exists so the listing below cannot fail on a fresh runtime
        os.makedirs(save_path, exist_ok=True)

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([f'https://www.youtube.com/watch?v={video_id}'])

        # Upload every file in the scratch directory to S3, then remove the local copy
        for file_name in os.listdir(save_path):
            file_path = os.path.join(save_path, file_name)
            if os.path.isfile(file_path):
                s3_file_path = f'{prefix}{file_name}'
                print(f"Uploading {file_path} to {s3_file_path}")
                s3.upload_file(file_path, bucket_name, s3_file_path)
                os.remove(file_path)  # Remove the local file after upload

    except Exception as e:
        print(f"Error with video_id {video_id}: {e}")
        # Drop partial downloads so they are not uploaded together with the next video
        _remove_local_files(save_path)
        return False

    return True

In [None]:
# Running total of videos that were downloaded and uploaded successfully.
# Kept in its own cell so re-running the download loop does not reset the count.
counter = 0

In [None]:
# Download and upload every sampled video. When a video fails, replacement IDs are drawn from
# backup_ids until one succeeds (bounded per slot so a network outage cannot burn through the
# whole backup list). Whichever ID actually succeeded is counted and logged as processed, so a
# successful replacement is not sampled again on a later run.
max_backup_attempts = 3

for video_id in lucky_ids:
    candidate_id = video_id
    backups_tried = 0
    while True:
        if download_and_upload(candidate_id, save_path, s3, bucket_name, prefix):
            counter += 1
            print("Number of videos downloaded and uploaded: ", counter)
            with open(processed_ids_file_path, 'a') as file:
                file.write(candidate_id + '\n')
            break

        if not backup_ids or backups_tried >= max_backup_attempts:
            print(f"Giving up on {video_id}: no replacement video could be downloaded")
            break

        # Draw a random replacement and remove it from the pool so it is never tried twice
        candidate_id = backup_ids.pop(random.randrange(len(backup_ids)))
        backups_tried += 1


[youtube] Extracting URL: https://www.youtube.com/watch?v=eNEGtLtHksQ
[youtube] eNEGtLtHksQ: Downloading webpage
[youtube] eNEGtLtHksQ: Downloading ios player API JSON
[youtube] eNEGtLtHksQ: Downloading android player API JSON
[youtube] eNEGtLtHksQ: Downloading m3u8 information
[info] eNEGtLtHksQ: Downloading subtitles: en
[info] eNEGtLtHksQ: Downloading 1 format(s): 248+251
[info] Writing video subtitles to: /content/temp_folder/eNEGtLtHksQ.en.vtt
[download] Destination: /content/temp_folder/eNEGtLtHksQ.en.vtt
[download] 100% of    6.27KiB in 00:00:00 at 174.03KiB/s
[download] Destination: /content/temp_folder/eNEGtLtHksQ.f248.webm
[download] 100% of   37.07MiB in 00:00:06 at 5.84MiB/s   
[download] Destination: /content/temp_folder/eNEGtLtHksQ.f251.webm
[download] 100% of  161.97KiB in 00:00:00 at 731.12KiB/s 
[Merger] Merging formats into "/content/temp_folder/eNEGtLtHksQ.webm"
Deleting original file /content/temp_folder/eNEGtLtHksQ.f251.webm (pass -k to keep)
Deleting original file

In [None]:
# List every object stored under the prefix in the S3 bucket (videos and subtitle files alike).
# A single list_objects_v2 call returns at most 1000 keys, so the results are collected page by
# page. `objects` keeps the response-like shape ({'Contents': [...]}) that the next cell reads.
paginator = s3.get_paginator('list_objects_v2')
objects = {'Contents': []}
for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
    objects['Contents'].extend(page.get('Contents', []))
objects['KeyCount'] = len(objects['Contents'])


In [None]:
# Print the key of each listed object; nothing is printed when the listing has no 'Contents'
for key in (entry['Key'] for entry in objects.get('Contents', [])):
  print(key)

youtube-asl/1000-samples/
youtube-asl/1000-samples/--6bmFM9wT4.ase.vtt
youtube-asl/1000-samples/--6bmFM9wT4.webm
youtube-asl/1000-samples/-9aGqJpaN7c.ase.vtt
youtube-asl/1000-samples/-9aGqJpaN7c.mkv
youtube-asl/1000-samples/-FSlHH2ReLA.ase.vtt
youtube-asl/1000-samples/-FSlHH2ReLA.webm
youtube-asl/1000-samples/-GtDaiSJkSQ.en.vtt
youtube-asl/1000-samples/-GtDaiSJkSQ.mkv
youtube-asl/1000-samples/-HkeOGWJWLI.en.vtt
youtube-asl/1000-samples/-HkeOGWJWLI.mp4
youtube-asl/1000-samples/-JGpOd2AlVY.en-xgJ5vofi3O8.vtt
youtube-asl/1000-samples/-JGpOd2AlVY.mkv
youtube-asl/1000-samples/-QHnZBBE8Ho.ase.vtt
youtube-asl/1000-samples/-QHnZBBE8Ho.webm
youtube-asl/1000-samples/-ZOyG_dW_1M.en.vtt
youtube-asl/1000-samples/-ZOyG_dW_1M.webm
youtube-asl/1000-samples/-Zrf6jWiFZs.en.vtt
youtube-asl/1000-samples/-Zrf6jWiFZs.webm
youtube-asl/1000-samples/-_oUXqM2Zjc.ase.vtt
youtube-asl/1000-samples/-_oUXqM2Zjc.mkv
youtube-asl/1000-samples/-aLiyA30EQI.en-Ct-zS48ckYo.vtt
youtube-asl/1000-samples/-aLiyA30EQI.webm
yout