In this notebook, the existing .vtt files are converted to .txt files.

In [6]:
import os
import webvtt
import csv

# Raise the csv module's per-field size limit so that rows containing very
# large fields can be parsed (presumably the long 'transcript' column of the
# metadata CSVs -- verify against the data).
csv.field_size_limit(10000000)  # Maximum size of a single field; adjust as needed

def list_vtt_files(directory):
    """Collect the paths of all ``.vtt`` files below ``directory``.

    The tree is traversed with ``os.walk``, so files in nested
    subdirectories are included. Each entry of the returned list is the
    walked folder joined with the file name. Only names ending in the
    lowercase suffix ``.vtt`` are selected.
    """
    return [
        os.path.join(folder, name)
        for folder, _, names in os.walk(directory)
        for name in names
        if name.endswith('.vtt')
    ]

def read_metadata(csv_file):
    """Load per-video metadata from a CSV file, keyed by thumbnail base name.

    Each row must provide a ``thumbnail_path`` column. The key for a row is
    the file name of that path without its extension, stripped of
    surrounding whitespace and lowercased. The stored value is the row as a
    dict with the ``transcript`` column left out.

    Args:
        csv_file: Path to a UTF-8 encoded CSV file with a header row.

    Returns:
        dict mapping the normalized thumbnail base name to the row dict.
        If several rows share a base name, the last one wins.

    Raises:
        KeyError: If the CSV has no ``thumbnail_path`` column.
    """
    metadata = {}
    # 'utf-8-sig' drops a leading byte-order mark if present (otherwise it
    # would stick to the first header name and break the column lookup) and
    # reads BOM-less UTF-8 unchanged. newline='' is what the csv module
    # expects, so that line breaks inside quoted fields reach the parser
    # untouched.
    with open(csv_file, mode='r', encoding='utf-8-sig', newline='') as file:
        for row in csv.DictReader(file):
            thumbnail_path = row['thumbnail_path']
            if thumbnail_path is None:
                # Short row: DictReader fills absent trailing columns with
                # None, so there is nothing to build a key from.
                continue
            file_name = os.path.basename(thumbnail_path.strip())
            base_name = os.path.splitext(file_name)[0].strip().lower()
            # Keep every column except the 'transcript' column.
            metadata[base_name] = {
                key: value for key, value in row.items() if key != 'transcript'
            }
    return metadata

def format_metadata(metadata_row):
    """Render a metadata mapping as XML-like lines.

    Every ``key``/``value`` pair becomes ``<key>value</key>`` followed by a
    newline, in the mapping's iteration order. Values are inserted as-is
    (no escaping). An empty mapping yields an empty string.
    """
    return "".join(
        f"<{tag}>{content}</{tag}>\n" for tag, content in metadata_row.items()
    )

def _is_language_suffix(suffix):
    """Return True if ``suffix`` (e.g. ``'.de'``, ``'.en-US'``) looks like a language tag."""
    if not suffix.startswith('.'):
        return False
    primary, *subtags = suffix[1:].split('-')
    # Primary subtag: two or three ASCII letters ("de", "en", "fil").
    if not (primary.isascii() and primary.isalpha() and 2 <= len(primary) <= 3):
        return False
    # Optional subtags such as region or variant ("US", "orig").
    return all(
        part.isascii() and part.isalnum() and len(part) <= 8 for part in subtags
    )


def normalize_filename(filename):
    """Return the lowercase base name of a subtitle file without its suffixes.

    The directory part and the final extension are removed first. If the
    remaining name ends in a language tag (``.de``, ``.en``, ``.en-US`` ...),
    that tag is removed as well. Dots elsewhere in the name are kept, so
    ``"Dr. Who.en.vtt"`` becomes ``"dr. who"`` instead of being cut at the
    first dot.

    Args:
        filename: File name or path of the subtitle file.

    Returns:
        The normalized name: stripped of surrounding whitespace, lowercased.
    """
    stem = os.path.splitext(os.path.basename(filename))[0]
    # Only the trailing component is a candidate for the language tag.
    title, suffix = os.path.splitext(stem)
    if title and _is_language_suffix(suffix):
        stem = title
    return stem.strip().lower()

def add_metadata(file, metadata):
    """Return the formatted metadata block for ``file``, or ``""`` if none exists.

    The lookup key is ``normalize_filename(file)``. When ``metadata`` has an
    entry under that key, the entry is rendered with ``format_metadata`` and
    returned; otherwise an empty string is returned. Progress is reported
    on stdout in both cases.
    """
    lookup_key = normalize_filename(file)

    # Debug output
    print(f"Processing file: {file}")
    print(f"Base name for metadata lookup: {lookup_key}")

    # Guard clause: nothing to attach when the key is unknown.
    if lookup_key not in metadata:
        print(f"No metadata found for: {lookup_key}")  # Debug output
        return ""

    print(f"Metadata found for: {lookup_key}")  # Debug output
    return format_metadata(metadata[lookup_key])

def extract_txt(file, metadata):
    """Convert one ``.vtt`` file into a ``.txt`` file next to it.

    The caption text is read with ``webvtt.read`` and split into lines.
    Each distinct line is kept once, at the position of its first
    occurrence anywhere in the file, and the kept lines are joined with
    single spaces. The output file is named after ``normalize_filename(file)``
    and contains the metadata block from ``add_metadata`` followed by the
    text wrapped in ``<doc>`` tags.
    """
    # dict.fromkeys keeps first-occurrence order while dropping repeated lines.
    unique_lines = dict.fromkeys(
        line
        for caption in webvtt.read(file)
        for line in caption.text.splitlines()
    )
    text = ' '.join(unique_lines)

    # The .txt file goes into the same folder as the .vtt file.
    target_name = f"{normalize_filename(file)}.txt"
    txt_filename = os.path.join(os.path.dirname(file), target_name)

    # Metadata block for this file (empty string if there is none).
    metadata_info = add_metadata(file, metadata)

    # Metadata first, then the transcript enclosed in <doc> tags.
    with open(txt_filename, 'w', encoding='utf-8') as out:
        out.writelines([metadata_info, f"\n<doc>\n{text}\n</doc>"])

    print(f"Extracted text saved to: {txt_filename}")

# Example usage
# Convert every .vtt file below `directory_path`. A subfolder's metadata comes
# from a CSV in `directory_path` whose name starts with "<subfolder>."
# (e.g. data/de_DW/*.vtt  <->  data/de_DW.csv).
directory_path = 'data/'  # Root directory containing subfolders with .vtt files
vtt_files = list_vtt_files(directory_path)

# Metadata CSVs live directly in the root directory.
metadata_files = [f for f in os.listdir(directory_path) if f.endswith('.csv')]

# Subfolders for which at least one CSV matched; used for the final report.
matched_subfolders = set()

for metadata_file in metadata_files:
    # Select the .vtt files whose parent folder name matches this CSV.
    matching_vtt_files = [
        vtt_file for vtt_file in vtt_files
        if metadata_file.startswith(f'{os.path.basename(os.path.dirname(vtt_file))}.')
    ]
    if not matching_vtt_files:
        # No folder uses this CSV, so do not parse it.
        continue

    csv_file = os.path.join(directory_path, metadata_file)
    metadata = read_metadata(csv_file)  # Load the metadata once per CSV

    # Process each matching .vtt file
    for vtt_file in matching_vtt_files:
        extract_txt(vtt_file, metadata)  # Extract text and save with metadata
        matched_subfolders.add(os.path.basename(os.path.dirname(vtt_file)))

# Report each folder without a metadata CSV once, instead of once per
# (CSV, file) pair that happens not to match.
all_subfolders = {os.path.basename(os.path.dirname(vtt_file)) for vtt_file in vtt_files}
for subfolder in sorted(all_subfolders - matched_subfolders):
    print(f"No matching metadata CSV for: {subfolder}")

Processing file: data/de_DW\54 Objekte in sieben deutschen Bundesländern durchsucht  DW Nachrichten.de.vtt
Base name for metadata lookup: 54 objekte in sieben deutschen bundesländern durchsucht  dw nachrichten
Metadata found for: 54 objekte in sieben deutschen bundesländern durchsucht  dw nachrichten
Extracted text saved to: data/de_DW\54 objekte in sieben deutschen bundesländern durchsucht  dw nachrichten.txt
Processing file: data/de_DW\Angespannte Proteste um den Gaza-Krieg an US-Unis  DW Nachrichten.de.vtt
Base name for metadata lookup: angespannte proteste um den gaza-krieg an us-unis  dw nachrichten
Metadata found for: angespannte proteste um den gaza-krieg an us-unis  dw nachrichten
Extracted text saved to: data/de_DW\angespannte proteste um den gaza-krieg an us-unis  dw nachrichten.txt
Processing file: data/de_DW\Angriff auf dem Golan - Sorge vor einem größeren Krieg  DW Nachrichten.de.vtt
Base name for metadata lookup: angriff auf dem golan - sorge vor einem größeren krieg  dw 