In [18]:
# Import required libraries
import boto3
import yt_dlp
import random
import os

In [19]:
import configparser

# Location of the AWS shared-credentials file that is parsed as an INI file.
credentials_path = '/home/ec2-user/.aws/credentials'

config = configparser.ConfigParser()

# ConfigParser.read() does not raise when the file is missing or unreadable;
# it just returns the list of files it managed to parse. Check that list so a
# bad path fails here with a clear message instead of surfacing later as a
# KeyError when a profile section is looked up.
loaded_files = config.read(credentials_path)
if not loaded_files:
    raise FileNotFoundError(
        f"AWS credentials file not found or unreadable: {credentials_path}"
    )

# Last expression of the cell: display which files were parsed.
loaded_files

['/home/ec2-user/.aws/credentials']

In [20]:
# --- S3 locations ---------------------------------------------------------
# Bucket URI, bare bucket name, and the key prefix of the sample set.
s3_URI = 's3://asl-capstone/'
bucket_name = 'asl-capstone'
prefix = 'youtube-asl/1000-samples/'

# Local scratch directory.
# NOTE(review): not referenced by any of the visible cells -- confirm it is still needed.
save_path = '/content/temp_folder'

# --- Credentials ----------------------------------------------------------
# Key pair read from the "root" section of the parsed credentials file.
aws_secret_access_key = config["root"]['aws_secret_access_key']
aws_access_key_id = config["root"]['aws_access_key_id']

In [21]:
# S3 client authenticated with the explicit key pair read from the
# credentials file, pinned to the us-west-2 region.
s3 = boto3.client(
    service_name='s3',
    region_name='us-west-2',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

In [22]:
import s3fs
# s3fs filesystem handle created with anon=True, i.e. without credentials.
# NOTE(review): presumably this only works for publicly readable buckets --
# confirm. `fs` is not referenced by any other cell visible in this notebook.
fs = s3fs.S3FileSystem(anon=True)

In [23]:
import os
import re
import pandas as pd

# S3 folder (URI) containing the VTT caption files
folder_path = 's3://asl-capstone/youtube-asl/1000-samples/'

# S3 folder (URI) where the parsed Parquet files are written
parsed_pq_folder = 's3://asl-capstone/youtube-asl/1000-samples/'

# The os module and the built-in open() only understand local paths: given an
# "s3://" URI, os.makedirs() creates a local directory literally named "s3:"
# and os.listdir() then lists that empty directory, so no caption file was
# ever parsed. List and read the objects through s3fs instead (imported in an
# earlier cell). A credentialed handle is created here rather than reusing the
# anonymous `fs`. No "create folder" step is needed: the Parquet objects are
# written directly under the target prefix.
s3_files = s3fs.S3FileSystem()

# One WebVTT cue: "<start> --> <end>" line followed by the caption text.
# The caption ends at the first blank line OR at the end of the file, so the
# final cue is kept even when the file does not end with a blank line.
pattern = r'(\d{2}:\d{2}:\d{2}\.\d{3}) --> (\d{2}:\d{2}:\d{2}\.\d{3})\n(.*?)(?:\n\n|\n?\Z)'

# Empty frame with the output schema; kept as the result when no VTT file is found
combined_df = pd.DataFrame(columns=['File Name', 'Start Timestamp', 'End Timestamp', 'Caption'])

# Per-file frames are collected and concatenated once after the loop;
# concatenating inside the loop re-copies all earlier rows on every iteration.
file_frames = []

# "*" does not cross "/" so, like os.listdir(), only the folder itself is scanned.
# sorted() makes the processing order deterministic.
for vtt_path in sorted(s3_files.glob(folder_path.rstrip('/') + '/*.vtt')):
    # glob() returns full "bucket/key" paths; keep only the object's base name
    filename = vtt_path.rsplit('/', 1)[-1]

    # Read the VTT file
    with s3_files.open(vtt_path, 'r', encoding='utf-8') as file:
        vtt_data = file.read()

    # Each match is a (start, end, caption) tuple; DOTALL lets a caption span lines
    matches = re.findall(pattern, vtt_data, re.DOTALL)

    # One row per cue, tagged with the file it came from
    file_df = pd.DataFrame({
        'File Name': [filename] * len(matches),
        'Start Timestamp': [start for start, _, _ in matches],
        'End Timestamp': [end for _, end, _ in matches],
        'Caption': [text.strip() for _, _, text in matches],
    })

    # Save the per-file frame next to the other parsed outputs
    parquet_filename = os.path.splitext(filename)[0] + '.parquet'
    parquet_path = parsed_pq_folder.rstrip('/') + '/' + parquet_filename
    file_df.to_parquet(parquet_path, index=False)

    file_frames.append(file_df)

if file_frames:
    combined_df = pd.concat(file_frames, ignore_index=True)

# Combined file in Parquet format
combined_parquet_path = parsed_pq_folder.rstrip('/') + '/combined_data.parquet'
combined_df.to_parquet(combined_parquet_path, index=False)

# Confirm
print("Processing complete.")



Processing complete.


In [42]:
import os
import re
import pandas as pd
import boto3
import pyarrow as pa
import pyarrow.parquet as pq
from io import BytesIO

# --- Where the data lives ---------------------------------------------------
# Bucket holding both the caption files and the parsed output.
bucket_name = 'asl-capstone'

# Key prefix of the VTT caption files inside the bucket.
folder_path = 'youtube-asl/1000-samples/'

# Key prefix under which the parsed Parquet files are written.
parsed_pq_folder = 'youtube-asl/1000-samples/parsed/'

# --- How it is accessed -----------------------------------------------------
# S3 client built without explicit credentials or region.
s3 = boto3.client(service_name='s3')

# Regular expression for one WebVTT cue, meant to be used with re.DOTALL:
#   group 1 - start timestamp (HH:MM:SS.mmm)
#   group 2 - end timestamp   (HH:MM:SS.mmm)
#   group 3 - caption text (may span several lines)
# The caption ends at the first blank line OR at the end of the input. Without
# the end-of-input alternative, the last cue of a file that ends with a single
# newline (or none) is silently dropped.
pattern = r'(\d{2}:\d{2}:\d{2}\.\d{3}) --> (\d{2}:\d{2}:\d{2}\.\d{3})\n(.*?)(?:\n\n|\n?\Z)'

# Empty frame carrying the output schema for the combined data from all VTT files
combined_df = pd.DataFrame(columns=['File Name', 'Start Timestamp', 'End Timestamp', 'Caption'])

# Function to list all objects in an S3 bucket with pagination
def list_all_objects(bucket, prefix, client=None):
    """Yield every object entry found under ``prefix`` in ``bucket``.

    Pages through ``list_objects_v2`` results so the listing is not cut off
    at the size of a single response.

    Args:
        bucket: Name of the S3 bucket to list.
        prefix: Key prefix that restricts the listing.
        client: Optional S3 client to use. When omitted, the notebook-global
            ``s3`` client is used. Because several cells rebind ``s3``,
            passing a client explicitly avoids depending on which cell ran last.

    Yields:
        The entries of each page's ``'Contents'`` list, in the order returned
        (callers in this notebook read the ``'Key'`` field). Pages without a
        ``'Contents'`` entry contribute nothing.
    """
    # Resolved when iteration starts, not at definition time, so the global
    # fallback sees the client that is current when the listing actually runs.
    if client is None:
        client = s3

    paginator = client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        yield from page.get('Contents', [])

# Lazily list every object under the caption prefix
all_objects = list_all_objects(bucket_name, folder_path)

# Per-file frames are collected and concatenated once after the loop;
# concatenating inside the loop re-copies all earlier rows on every iteration.
file_frames = []

for item in all_objects:
    file_path = item['Key']

    # The prefix also holds non-caption objects; only VTT files are parsed
    if not file_path.endswith(".vtt"):
        continue

    # Read the VTT file from S3
    vtt_object = s3.get_object(Bucket=bucket_name, Key=file_path)
    vtt_data = vtt_object['Body'].read().decode('utf-8')

    # Each match is a (start, end, caption) tuple; DOTALL lets a caption span lines
    matches = re.findall(pattern, vtt_data, re.DOTALL)

    # One row per cue, tagged with the full S3 key of the source file
    file_df = pd.DataFrame({
        'File Name': [file_path] * len(matches),
        'Start Timestamp': [start for start, _, _ in matches],
        'End Timestamp': [end for _, end, _ in matches],
        'Caption': [text.strip() for _, _, text in matches],
    })

    # Output key: same base name as the VTT file, under the parsed prefix
    parquet_filename = os.path.splitext(os.path.basename(file_path))[0] + '.parquet'
    parquet_path = os.path.join(parsed_pq_folder, parquet_filename)

    # Write the per-file table straight to its S3 URI. write_table() opens and
    # closes the Parquet writer itself, so neither a manual ParquetWriter nor
    # an in-memory buffer is needed (the old BytesIO buffer was never used).
    table = pa.Table.from_pandas(file_df)
    pq.write_table(
        table,
        f's3://{bucket_name}/{parquet_path}',
        use_deprecated_int96_timestamps=True,
    )

    file_frames.append(file_df)

# Keep the pre-initialised empty frame (schema only) when no VTT file was found
if file_frames:
    combined_df = pd.concat(file_frames, ignore_index=True)

# Combined file in Parquet format
combined_parquet_path = 'youtube-asl/1000-samples/parsed/combined_data.parquet'

# Write the combined table straight to its S3 URI
combined_table = pa.Table.from_pandas(combined_df)
pq.write_table(
    combined_table,
    f's3://{bucket_name}/{combined_parquet_path}',
    use_deprecated_int96_timestamps=True,
)

# Confirm
print("Processing complete.")


Processing complete.


In [28]:
import os
import re
import pandas as pd
import boto3

# Initialize the S3 client
s3 = boto3.client('s3')

# Specify the S3 bucket name
bucket_name = 'asl-capstone'

# Key prefix of the folder whose VTT files are counted
folder_path = 'youtube-asl/1000-samples/'

# Regular expression for one WebVTT cue (start, end, caption).
# Not used by the count below; it is (re)defined here as a notebook-global.
# The caption ends at a blank line or at the end of the input, so a final cue
# without a trailing blank line is matched too.
pattern = r'(\d{2}:\d{2}:\d{2}\.\d{3}) --> (\d{2}:\d{2}:\d{2}\.\d{3})\n(.*?)(?:\n\n|\n?\Z)'

# Count
vtt_file_count = 0

# A single list_objects call returns only one page of keys, so a large prefix
# would be under-counted; walk every page instead. page.get('Contents', [])
# also covers an empty prefix, where the response has no 'Contents' entry and
# indexing it would raise KeyError.
paginator = s3.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket=bucket_name, Prefix=folder_path):
    vtt_file_count += sum(
        1 for entry in page.get('Contents', []) if entry['Key'].endswith(".vtt")
    )

# Print the count of VTT files
print(f"Number of VTT files in {folder_path}: {vtt_file_count}")


Number of VTT files in youtube-asl/1000-samples/: 500


In [32]:
import pandas as pd
import re

# S3 URI of the combined Parquet file: same bucket and key that the parsing
# cell writes as combined_data.parquet under the parsed/ prefix.
parquet_file_path = 's3://asl-capstone/youtube-asl/1000-samples/parsed/combined_data.parquet'

# Load the whole Parquet file into a DataFrame.
# NOTE(review): reading an s3:// path relies on pandas' S3 support and on
# credentials being available in the environment -- confirm for new setups.
df = pd.read_parquet(parquet_file_path)

# Last expression of the cell: render the frame (one row per caption cue with
# File Name, Start Timestamp, End Timestamp and Caption columns).
df

Unnamed: 0,File Name,Start Timestamp,End Timestamp,Caption
0,youtube-asl/1000-samples/--6bmFM9wT4.ase.vtt,00:00:06.320,00:00:07.440,Hello everyone.
1,youtube-asl/1000-samples/--6bmFM9wT4.ase.vtt,00:00:07.440,00:00:10.020,Welcome to Sign1News.
2,youtube-asl/1000-samples/--6bmFM9wT4.ase.vtt,00:00:10.020,00:00:11.220,I'm Candace Jones.
3,youtube-asl/1000-samples/--6bmFM9wT4.ase.vtt,00:00:11.220,00:00:14.500,Here are your top stories for today.
4,youtube-asl/1000-samples/--6bmFM9wT4.ase.vtt,00:00:16.840,00:00:25.420,We are about a week away from the start of\nth...
...,...,...,...,...
25121,youtube-asl/1000-samples/QgTPL4L5iMQ.en.vtt,00:01:04.560,00:01:08.240,Five separate movies produced by the Deaf...
25122,youtube-asl/1000-samples/QgTPL4L5iMQ.en.vtt,00:01:08.240,00:01:10.500,It's fantastic!
25123,youtube-asl/1000-samples/QgTPL4L5iMQ.en.vtt,00:01:10.520,00:01:13.220,"Also, we're providing the popcorn."
25124,youtube-asl/1000-samples/QgTPL4L5iMQ.en.vtt,00:01:13.220,00:01:14.640,Join us...
