<a href="https://colab.research.google.com/github/fofsinx/echo.dataset/blob/data/prepare_transcriptions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!git clone https://github.com/fofsinx/echo.dataset

In [1]:
!pip install faster_whisper pandas

Collecting faster_whisper
  Downloading faster_whisper-1.1.0-py3-none-any.whl.metadata (16 kB)
Collecting ctranslate2<5,>=4.0 (from faster_whisper)
  Downloading ctranslate2-4.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting onnxruntime<2,>=1.14 (from faster_whisper)
  Downloading onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting av>=11 (from faster_whisper)
  Downloading av-13.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting coloredlogs (from onnxruntime<2,>=1.14->faster_whisper)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime<2,>=1.14->faster_whisper)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading faster_whisper-1.1.0-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m

In [2]:
import os

import uuid

import csv

import concurrent.futures

from faster_whisper import WhisperModel

import pandas as pd

import logging



# Set up logging

logging.basicConfig(level=logging.INFO)

logger = logging.getLogger(__name__)



# Initialize Whisper model with GPU

model = WhisperModel("large-v3", device="cuda", compute_type="float16")

model.bin:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

vocabulary.json:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.39k [00:00<?, ?B/s]

In [3]:
def transcribe_audio(row):

    """Transcribe a single audio file using Whisper model"""

    try:

        print(f"🎯 File: prepare.py, Line: 16, Function: transcribe_audio, Processing file: {row['path']}")

        segments, _ = model.transcribe(row['path'])

        transcription = " ".join([segment.text for segment in segments])

        return {

            'id': row['id'],

            'path': row['path'],

            'transcription': transcription

        }

    except Exception as e:

        print(f"❌ Error processing {row['path']}: {str(e)}")

        return {

            'id': row['id'],

            'path': row['path'],

            'transcription': ''

        }



def process_csv_file(csv_path, base_path):

    """Process a CSV file and add transcriptions"""

    try:

        # Read CSV file

        df = pd.read_csv(csv_path)

        print(f"📊 File: prepare.py, Line: 35, Function: process_csv_file, Processing CSV: {csv_path}")



        # Create ThreadPoolExecutor for parallel processing

        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:

            # Process each row in parallel

            # Add base path to each record

            records = df.to_dict('records')

            for record in records:

                record['path'] = f"{base_path}/{record['path']}"

            results = list(executor.map(transcribe_audio, records))



        # Create new DataFrame with results

        new_df = pd.DataFrame(results)



        # Save back to CSV

        new_df.to_csv(csv_path, index=False)

        print(f"✅ Successfully processed {csv_path}")



    except Exception as e:

        print(f"❌ Error processing CSV {csv_path}: {str(e)}")



def get_folders(parent_path):

    return [os.path.join(parent_path, f) for f in os.listdir(parent_path) if os.path.isdir(os.path.join(parent_path, f))]


# Main execution

if __name__ == "__main__":

    folders = [

        'vale',

        'maple',

        'glimmer',

        'juniper'

    ]

    base_path = "echo.dataset"

    for folder in folders:

        process_csv_file(os.path.join(base_path, folder, f"{folder}_dataset.csv"), os.path.join(base_path, folder))

📊 File: prepare.py, Line: 35, Function: process_csv_file, Processing CSV: echo.dataset/vale/vale_dataset.csv
🎯 File: prepare.py, Line: 16, Function: transcribe_audio, Processing file: echo.dataset/vale/echo.dataset/vale/13c1892d-14d0-46a8-9a8d-0cc48bc645f2_002.wav
❌ Error processing echo.dataset/vale/echo.dataset/vale/13c1892d-14d0-46a8-9a8d-0cc48bc645f2_002.wav: [Errno 2] No such file or directory: 'echo.dataset/vale/echo.dataset/vale/13c1892d-14d0-46a8-9a8d-0cc48bc645f2_002.wav'
🎯 File: prepare.py, Line: 16, Function: transcribe_audio, Processing file: echo.dataset/vale/echo.dataset/vale/8bbfa9a3-bc15-4cf9-b362-03ef2e8ad5ad_016.wav
❌ Error processing echo.dataset/vale/echo.dataset/vale/8bbfa9a3-bc15-4cf9-b362-03ef2e8ad5ad_016.wav: [Errno 2] No such file or directory: 'echo.dataset/vale/echo.dataset/vale/8bbfa9a3-bc15-4cf9-b362-03ef2e8ad5ad_016.wav'
🎯 File: prepare.py, Line: 16, Function: transcribe_audio, Processing file: echo.dataset/vale/echo.dataset/vale/8bbfa9a3-bc15-4cf9-b362-03

KeyboardInterrupt: 