<a href="https://colab.research.google.com/github/fofsinx/echo.dataset/blob/data/prepare_transcriptions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/fofsinx/echo.dataset

Cloning into 'echo.dataset'...
remote: Enumerating objects: 4647, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 4647 (delta 0), reused 0 (delta 0), pack-reused 4644 (from 1)[K
Receiving objects: 100% (4647/4647), 457.88 MiB | 17.25 MiB/s, done.
Resolving deltas: 100% (3/3), done.
Updating files: 100% (4639/4639), done.


In [2]:
!pip install faster_whisper pandas

Collecting faster_whisper
  Downloading faster_whisper-1.1.0-py3-none-any.whl.metadata (16 kB)
Collecting ctranslate2<5,>=4.0 (from faster_whisper)
  Downloading ctranslate2-4.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting onnxruntime<2,>=1.14 (from faster_whisper)
  Downloading onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting av>=11 (from faster_whisper)
  Downloading av-13.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting coloredlogs (from onnxruntime<2,>=1.14->faster_whisper)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime<2,>=1.14->faster_whisper)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading faster_whisper-1.1.0-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m

In [1]:
import os
import uuid
import csv
import concurrent.futures
from faster_whisper import WhisperModel
import pandas as pd
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize Whisper model with GPU.
# faster-whisper runs on CTranslate2, whose supported devices are "cpu",
# "cuda" and "auto"; "mps" is not one of them. On a CPU-only runtime use
# device="cpu" together with compute_type="int8" instead.
model = WhisperModel("large-v3", device="cuda", compute_type="float16")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def transcribe_audio(row):
    """Transcribe a single audio file using Whisper model.

    Args:
        row: Mapping with at least the keys 'id' and 'path'. 'path' is the
            audio file passed to the module-level `model`.

    Returns:
        dict with the keys 'id', 'path' and 'transcription'. If anything
        fails, the error is logged and 'transcription' is the empty string,
        so a single bad file does not abort a whole batch.
    """
    try:
        logger.info(f"🎯 File: prepare.py, Line: 16, Function: transcribe_audio, Processing file: {row['path']}")
        segments, _ = model.transcribe(row['path'])
        # Whisper segment texts typically carry their own leading space, so a
        # plain " ".join() yields a leading space and doubled spaces between
        # segments. Strip each piece and drop empty ones before joining.
        pieces = (segment.text.strip() for segment in segments)
        transcription = " ".join(piece for piece in pieces if piece)
        return {
            'id': row['id'],
            'path': row['path'],
            'transcription': transcription
        }
    except Exception as e:
        # Deliberate best-effort: report the failure and return an empty
        # transcription instead of propagating the exception.
        logger.error(f"❌ Error processing {row['path']}: {str(e)}")
        return {
            'id': row['id'],
            'path': row['path'],
            'transcription': ''
        }

def process_csv_file(csv_path, base_path):
    """Process a CSV file and add transcriptions.

    Reads `csv_path`, transcribes every row with `transcribe_audio` on a
    thread pool, and overwrites `csv_path` with the columns 'id', 'path'
    and 'transcription'.

    Args:
        csv_path: CSV file to read and then overwrite. Its rows must provide
            the 'id' and 'path' values that `transcribe_audio` reads.
        base_path: Directory prepended to each 'path' value that does not
            already start with it.

    Returns:
        None. Any exception is logged and swallowed.
    """
    try:
        # Read CSV file
        df = pd.read_csv(csv_path)
        logger.info(f"📊 File: prepare.py, Line: 35, Function: process_csv_file, Processing CSV: {csv_path}")

        # Add base path to each record.
        # The resolved paths are written back to csv_path below, so a second
        # run reads values that already carry the prefix. Prefixing them again
        # would point at files that do not exist and replace every stored
        # transcription with ''. Such values are therefore left untouched.
        records = df.to_dict('records')
        prefix = f"{base_path}/"
        for record in records:
            path = str(record['path'])
            if not path.startswith(prefix):
                path = f"{prefix}{path}"
            record['path'] = path

        # Process each row in parallel
        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
            results = list(executor.map(transcribe_audio, records))

        # Create new DataFrame with results. The explicit column list keeps
        # the header row even when the input CSV has no data rows.
        new_df = pd.DataFrame(results, columns=['id', 'path', 'transcription'])

        # Save back to CSV
        new_df.to_csv(csv_path, index=False)
        logger.info(f"✅ Successfully processed {csv_path}")

    except Exception as e:
        logger.error(f"❌ Error processing CSV {csv_path}: {str(e)}")

def get_folders(parent_path):
    """Return the joined paths of the immediate subdirectories of `parent_path`.

    Entries appear in the order `os.listdir` yields them; non-directories
    are skipped.
    """
    subfolders = []
    for entry in os.listdir(parent_path):
        candidate = os.path.join(parent_path, entry)
        if os.path.isdir(candidate):
            subfolders.append(candidate)
    return subfolders

def create_csv_for_folder(folder_path):
    """Create `<folder name>_dataset.csv` for a folder and transcribe its files.

    Writes one row per regular file found directly in `folder_path`, with a
    fresh UUID as 'id' and the file name as 'path', into a CSV in the current
    working directory. The CSV is then handed to `process_csv_file`.

    Args:
        folder_path: Directory whose files are listed.

    Returns:
        None.
    """
    folder_name = os.path.basename(folder_path.rstrip("/"))
    csv_filename = f"{folder_name}_dataset.csv"

    # Store file names relative to folder_path: process_csv_file prepends its
    # base_path to every 'path' value, so storing the already-joined path
    # would apply the folder twice.
    data = [
        {"id": str(uuid.uuid4()), "path": file}
        for file in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, file))
    ]

    # Explicit encoding so non-ASCII file names round-trip through
    # pandas.read_csv, which reads UTF-8 by default.
    with open(csv_filename, mode="w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=["id", "path"])
        writer.writeheader()
        writer.writerows(data)

    logger.info(f"📝 Created CSV for folder '{folder_name}': {csv_filename}")

    # Process the newly created CSV to add transcriptions.
    # process_csv_file takes the base directory as a required second
    # argument; calling it with the CSV path alone raises TypeError.
    process_csv_file(csv_filename, folder_path)

# Main execution: transcribe the dataset CSV of every voice folder in the clone.
if __name__ == "__main__":
    base_path = "echo.dataset"
    folders = ['vale', 'maple', 'glimmer', 'juniper']
    for folder in folders:
        # Each folder holds its own "<folder>_dataset.csv"; the folder itself
        # is passed as the base path for that CSV's 'path' values.
        folder_dir = os.path.join(base_path, folder)
        dataset_csv = os.path.join(folder_dir, f"{folder}_dataset.csv")
        process_csv_file(dataset_csv, folder_dir)