In [None]:
# 🎯 Audio Transcription with Whisper on Google Colab

This notebook transcribes audio files from your Google Drive using OpenAI's Whisper model (through the `faster-whisper` library), with GPU acceleration when a GPU runtime is available.

## Features:
- 🚀 **GPU Acceleration** - Automatically uses Colab's free GPU for faster processing
- 📁 **Google Drive Integration** - Read audio files directly from your Drive
- 🌐 **Multi-language Support** - English, Portuguese, and auto-detection
- 📄 **Multiple Output Formats** - TXT, SRT, VTT for subtitles
- 💾 **Auto-save to Drive** - Results saved back to your Google Drive

## Instructions:
1. Run all cells in order
2. Authorize Google Drive access when prompted
3. Set `AUDIO_PATH` in the Configuration cell to your audio file or folder
4. Get your transcription!


In [None]:
# Check GPU availability
import subprocess
import sys

def check_gpu():
    """Report whether an NVIDIA GPU is visible to this runtime.

    Runs ``nvidia-smi`` in query mode and prints the name and total memory of
    every GPU it reports, whatever the product family (T4, A100, L4, ...).

    Returns:
        bool: True when ``nvidia-smi`` exits successfully; False when the tool
        is missing, exits with an error, or cannot be executed.
    """
    no_gpu_message = "❌ No GPU detected. Please enable GPU in Runtime > Change runtime type > GPU"
    query = ['nvidia-smi', '--query-gpu=name,memory.total', '--format=csv,noheader']

    try:
        result = subprocess.run(query, capture_output=True, text=True)
    except FileNotFoundError:
        # nvidia-smi is not installed at all: treat it as "no GPU", not as an error.
        print(no_gpu_message)
        return False
    except Exception as e:
        print(f"❌ Error checking GPU: {e}")
        return False

    if result.returncode != 0:
        print(no_gpu_message)
        return False

    print("✅ GPU is available!")
    print("\nGPU Details:")
    # One CSV line per GPU, e.g. "Tesla T4, 15360 MiB"
    for line in result.stdout.splitlines():
        if line.strip():
            print(f"  {line.strip()}")
    return True

# Check if we have GPU
has_gpu = check_gpu()

# Also check PyTorch CUDA availability. The import is the only statement that
# can raise ImportError, so it is the only thing kept inside the try.
try:
    import torch
except ImportError:
    print("\n⚠️  PyTorch not installed yet - will be installed in next step")
else:
    if torch.cuda.is_available():
        print("\n✅ PyTorch CUDA is available")
        print(f"  Device: {torch.cuda.get_device_name(0)}")
        print(f"  Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
    else:
        # Previously this case printed nothing, leaving the user without feedback.
        print("\n⚠️  PyTorch is installed but cannot see a CUDA device - transcription will run on CPU")


In [None]:
# Install dependencies
print("📦 Installing dependencies...")
print("This may take a few minutes...\n")

# Install faster-whisper and other requirements
!pip install -q faster-whisper
!pip install -q rich

print("\n✅ Dependencies installed successfully!")

# Verify installation
try:
    from faster_whisper import WhisperModel
    import rich
    print("✅ All imports working correctly")
except ImportError as e:
    print(f"❌ Import error: {e}")
    print("Please restart the runtime and try again")


In [None]:
# Mount Google Drive
from google.colab import drive
import os
from pathlib import Path

print("🔗 Mounting Google Drive...")
drive.mount('/content/drive')

print("\n✅ Google Drive mounted successfully!")
print("\nYour Drive structure:")
print("  📁 /content/drive/MyDrive/")

# Show the first few entries of the Drive root. The directory is read once
# and sorted so the preview is the same on every run.
try:
    drive_entries = sorted(os.listdir('/content/drive/MyDrive/'))
    for entry in drive_entries[:5]:
        print(f"     └── {entry}")
    if len(drive_entries) > 5:
        print("     └── ...")
except Exception as e:
    print(f"Could not list folders: {e}")

# Create audio folders if they don't exist
audio_dir = Path("/content/drive/MyDrive/audio")
transcriptions_dir = Path("/content/drive/MyDrive/transcriptions")

if not audio_dir.exists():
    # exist_ok guards against the folder appearing between the check and the mkdir
    audio_dir.mkdir(parents=True, exist_ok=True)
    print(f"\n📁 Created folder: {audio_dir}")
    print("   → Place your audio files here!")

if not transcriptions_dir.exists():
    transcriptions_dir.mkdir(parents=True, exist_ok=True)
    print(f"📁 Created folder: {transcriptions_dir}")
    print("   → Transcriptions will be saved here (optional)")


In [None]:
# Core transcription functions
import time
from pathlib import Path
from typing import Optional, Tuple
import torch
from faster_whisper import WhisperModel
from rich.console import Console
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn, TimeElapsedColumn
from rich.panel import Panel
from rich.table import Table

# Module-level rich Console that the functions defined in this cell print through.
console = Console()

def detect_device_and_compute_type(force_cpu: bool = False) -> Tuple[str, str]:
    """Pick the device and compute type to load the Whisper model with.

    Args:
        force_cpu: When True, skip GPU detection and select the CPU.

    Returns:
        A ``(device, compute_type)`` pair: ``("cuda", "float16")`` for a CUDA
        GPU whose major compute capability is at least 7, ``("cuda", "int8")``
        for other CUDA GPUs, and ``("cpu", "int8")`` when the CPU is forced,
        no GPU is available, or detection raises.
    """
    cpu_choice = ("cpu", "int8")

    if force_cpu:
        console.print("[yellow]ℹ[/yellow] Using CPU (forced)")
        return cpu_choice

    try:
        import torch

        # Guard clause: nothing more to inspect without CUDA.
        if not torch.cuda.is_available():
            console.print("[yellow]ℹ[/yellow] No GPU detected, using CPU")
            return cpu_choice

        name = torch.cuda.get_device_name(0)
        total_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
        console.print(f"[green]✓[/green] GPU detected: {name} ({total_gb:.1f}GB)")

        # float16 is chosen only for major compute capability >= 7
        major = torch.cuda.get_device_capability(0)[0]
        return ("cuda", "float16") if major >= 7 else ("cuda", "int8")

    except Exception as e:
        console.print(f"[yellow]⚠[/yellow] Error detecting GPU: {e}")
        return cpu_choice

def load_whisper_model(model_size: str = "base", force_cpu: bool = False) -> WhisperModel:
    """Create a ``WhisperModel`` while showing a console spinner.

    Args:
        model_size: Model name passed straight to ``WhisperModel``.
        force_cpu: Forwarded to ``detect_device_and_compute_type``.

    Returns:
        The constructed ``WhisperModel``.
    """
    device, compute_type = detect_device_and_compute_type(force_cpu)
    target = device.upper()
    status_text = f"[bold cyan]Loading Whisper {model_size} model on {target}...[/bold cyan]"

    with console.status(status_text, spinner="dots12"):
        whisper = WhisperModel(model_size, device=device, compute_type=compute_type)
        # Printed while the status context is still open, as before.
        console.print(f"[green]✓[/green] Model loaded successfully on {target}!")

    return whisper

def format_timestamp(seconds):
    """Convert a duration in seconds to an ``HH:MM:SS.mmm`` timestamp.

    The value is rounded to whole milliseconds *before* it is split into
    fields, so a carry propagates correctly (59.9996 becomes ``00:01:00.000``
    rather than the invalid ``00:00:60.000``). Negative input is clamped to
    zero.

    Args:
        seconds: Duration in seconds (int or float).

    Returns:
        str: Zero-padded timestamp with millisecond precision.
    """
    total_ms = max(0, int(round(float(seconds) * 1000)))
    hours, remainder_ms = divmod(total_ms, 3_600_000)
    minutes, remainder_ms = divmod(remainder_ms, 60_000)
    whole_seconds, millis = divmod(remainder_ms, 1000)
    return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d}.{millis:03d}"

def transcribe_file(
    audio_file: Path,
    model: WhisperModel,
    language: Optional[str] = None,
    output_format: str = "txt",
    output_dir: Optional[Path] = None,
    multilingual: bool = False
) -> dict:
    """Transcribe a single audio file and write the transcript to disk.

    Args:
        audio_file: Path of the audio file to transcribe.
        model: Loaded model whose ``transcribe`` method is called.
        language: Language code forwarded to ``model.transcribe``; ``None``
            leaves the choice to the model.
        output_format: ``"txt"``, ``"srt"``, ``"vtt"`` or ``"all"`` (writes
            all three).
        output_dir: Directory for the output files; defaults to the folder
            containing ``audio_file``. Created if missing.
        multilingual: Forwarded to ``model.transcribe``. When True the txt
            output is headed ``[Multilingual transcription]`` and each segment
            is prefixed with its language if the segment exposes one.

    Returns:
        dict with keys ``audio_duration``, ``processing_time``, ``segments``
        (segment count), ``language`` and ``output_files`` (format -> Path).

    Raises:
        FileNotFoundError: ``audio_file`` does not exist.
        ValueError: ``output_format`` is not one of the supported values.
    """
    supported_formats = ("txt", "srt", "vtt")

    if not audio_file.exists():
        raise FileNotFoundError(f"Audio file not found: {audio_file}")

    # Reject unknown formats up front; previously they produced an empty file.
    if output_format == "all":
        formats = list(supported_formats)
    elif output_format in supported_formats:
        formats = [output_format]
    else:
        raise ValueError(
            f"Unsupported output format: {output_format!r} "
            f"(expected one of {', '.join(supported_formats)}, all)"
        )

    # Set output directory
    if output_dir is None:
        output_dir = audio_file.parent
    output_dir.mkdir(parents=True, exist_ok=True)

    # One output path per requested format, named after the audio file
    output_files = {fmt: output_dir / f"{audio_file.stem}.{fmt}" for fmt in formats}

    def segment_text(segment) -> str:
        """Return the segment text, prefixed with its language in multilingual mode."""
        text = segment.text.strip()
        if multilingual and hasattr(segment, "language"):
            return f"[{segment.language}] {text}"
        return text

    # Start transcription
    console.print(f"\n[cyan]Transcribing:[/cyan] {audio_file.name}")

    start_time = time.time()

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TaskProgressColumn(),
        TimeElapsedColumn(),
        console=console,
    ) as progress:

        task = progress.add_task("[cyan]Processing...", total=None)

        segments, info = model.transcribe(
            str(audio_file),
            language=language,
            beam_size=5,
            vad_filter=True,
            vad_parameters=dict(min_silence_duration_ms=500),
            multilingual=multilingual,
        )

        # Track progress as audio seconds covered so far out of the total
        # duration, so the bar is meaningful for any number of segments.
        total_seconds = info.duration if info.duration and info.duration > 0 else None
        progress.update(
            task,
            total=total_seconds,
            completed=0,
            description="[yellow]Writing segments...",
        )

        # faster-whisper documents `segments` as a lazy generator, so the
        # decoding work happens while this loop consumes it.
        all_segments = []
        for segment in segments:
            all_segments.append(segment)
            if total_seconds is not None:
                progress.update(task, completed=min(segment.end, total_seconds))

        segment_count = len(all_segments)

        # If the duration was unknown, give the bar a nominal total so it shows as finished.
        finished = total_seconds if total_seconds is not None else 1
        progress.update(task, total=finished, completed=finished, description="[green]Complete!")

    # Write output files
    for fmt, output_file in output_files.items():
        with open(output_file, "w", encoding="utf-8") as f:
            if fmt == "txt":
                if info.language and not multilingual:
                    f.write(f"[Detected language: {info.language}]\n\n")
                elif multilingual:
                    f.write("[Multilingual transcription]\n\n")

                for segment in all_segments:
                    f.write(f"{segment_text(segment)}\n")

            elif fmt == "srt":
                for i, segment in enumerate(all_segments, 1):
                    # SRT uses a comma as the decimal separator
                    start = format_timestamp(segment.start).replace(".", ",")
                    end = format_timestamp(segment.end).replace(".", ",")
                    f.write(f"{i}\n")
                    f.write(f"{start} --> {end}\n")
                    f.write(f"{segment_text(segment)}\n\n")

            elif fmt == "vtt":
                f.write("WEBVTT\n\n")
                for segment in all_segments:
                    start = format_timestamp(segment.start)
                    end = format_timestamp(segment.end)
                    f.write(f"{start} --> {end}\n")
                    f.write(f"{segment_text(segment)}\n\n")

    # Calculate statistics (processing time includes writing the files)
    duration = time.time() - start_time
    audio_duration = info.duration
    speed = audio_duration / duration if duration > 0 else 0

    # Show results
    console.print("\n[green]✓[/green] Transcription complete!")
    console.print(f"  Audio duration: {audio_duration:.1f}s")
    console.print(f"  Processing time: {duration:.1f}s")
    console.print(f"  Speed: {speed:.1f}x realtime")
    console.print(f"  Segments: {segment_count}")
    if info.language:
        console.print(f"  Language: {info.language}")

    console.print("\n[green]Output files:[/green]")
    for output_file in output_files.values():
        console.print(f"  📄 {output_file}")

    return {
        "audio_duration": audio_duration,
        "processing_time": duration,
        "segments": segment_count,
        "language": info.language,
        "output_files": output_files
    }

# Printed once the cell has executed, i.e. the helper functions above are defined.
print("✅ Transcription functions loaded")


In [None]:
# Configuration - edit the values in this cell, then run the cells below

# Location of the audio on Google Drive: either one file or a whole folder, e.g.
#   "/content/drive/MyDrive/audio/interview.mp3"   (single file)
#   "/content/drive/MyDrive/audio_files/"          (folder)
AUDIO_PATH = "/content/drive/MyDrive/audio/sample.mp3"  # Change this to your file/folder

# Destination folder for the transcriptions.
# None keeps each transcription next to its audio file.
OUTPUT_DIR = None  # or "/content/drive/MyDrive/transcriptions/"

# Whisper model size: "tiny", "base", "small", "medium", "large"
# (bigger models are more accurate but slower)
MODEL_SIZE = "base"

# Spoken language: "en" (English), "pt" (Portuguese), None (auto-detect)
LANGUAGE = None  # Auto-detect

# Files to write: "txt", "srt", "vtt", or "all"
OUTPUT_FORMAT = "txt"

# Multilingual mode (detect language changes within the audio)
MULTILINGUAL = False

# Set to True to run on the CPU even when a GPU is available
FORCE_CPU = False

# Echo the effective settings, one per line
print("📋 Configuration:")
print("\n".join([
    f"  Audio path: {AUDIO_PATH}",
    f"  Output directory: {OUTPUT_DIR or 'Same as audio files'}",
    f"  Model size: {MODEL_SIZE}",
    f"  Language: {LANGUAGE or 'Auto-detect'}",
    f"  Output format: {OUTPUT_FORMAT}",
    f"  Multilingual: {MULTILINGUAL}",
    f"  Force CPU: {FORCE_CPU}",
]))


In [None]:
# Load model and run transcription
from pathlib import Path
import os

# Supported audio formats (lower-case suffixes, including the leading dot)
AUDIO_EXTENSIONS = {
    ".mp3", ".wav", ".flac", ".ogg", ".m4a", ".mp4",
    ".aac", ".wma", ".opus", ".webm", ".mkv", ".avi", ".mov", ".m4v",
}

def get_audio_files(path_str: str):
    """Resolve a path to the list of audio files it designates.

    Extensions are compared case-insensitively in both branches, so
    ``talk.Mp3`` is accepted whether it is given directly or found in a folder.

    Args:
        path_str: Path to a single audio file, or to a directory whose direct
            children are searched (not recursive).

    Returns:
        A sorted list of ``Path`` objects. For a single file the list has one
        element.

    Raises:
        FileNotFoundError: The path does not exist.
        ValueError: The file is not a supported audio format, or the directory
            contains no supported audio files.
    """
    path = Path(path_str)

    if not path.exists():
        raise FileNotFoundError(f"Path not found: {path}")

    if path.is_file():
        if path.suffix.lower() in AUDIO_EXTENSIONS:
            return [path]
        raise ValueError(f"Not a supported audio format: {path.suffix}")

    # Directory: keep regular files only, so a sub-folder named "x.mp3" is skipped.
    audio_files = sorted(
        entry
        for entry in path.iterdir()
        if entry.is_file() and entry.suffix.lower() in AUDIO_EXTENSIONS
    )

    if not audio_files:
        raise ValueError(f"No audio files found in: {path}")

    return audio_files

# Get audio files
try:
    audio_files = get_audio_files(AUDIO_PATH)
    console.print(f"\n[green]Found {len(audio_files)} audio file(s):[/green]")
    for f in audio_files[:5]:  # Show first 5
        console.print(f"  🎵 {f.name}")
    if len(audio_files) > 5:
        console.print(f"  ... and {len(audio_files) - 5} more")
except Exception as e:
    console.print(f"[red]Error: {e}[/red]")
    raise

# Load the model
console.print("\n[cyan]Loading Whisper model...[/cyan]")
model = load_whisper_model(MODEL_SIZE, force_cpu=FORCE_CPU)

# Set output directory
if OUTPUT_DIR:
    output_path = Path(OUTPUT_DIR)
    output_path.mkdir(parents=True, exist_ok=True)
else:
    output_path = None

# Process each file
console.print(f"\n[bold cyan]Starting transcription of {len(audio_files)} file(s)...[/bold cyan]")

total_start = time.time()
results = []
failed_files = []  # files whose transcription raised; reported in the summary

for i, audio_file in enumerate(audio_files, start=1):
    console.print(f"\n[cyan]File {i}/{len(audio_files)}[/cyan]")

    try:
        results.append(
            transcribe_file(
                audio_file=audio_file,
                model=model,
                language=LANGUAGE,
                output_format=OUTPUT_FORMAT,
                output_dir=output_path,
                multilingual=MULTILINGUAL,
            )
        )
    except Exception as e:
        # Best effort: one bad file must not stop the rest of the batch.
        console.print(f"[red]Error transcribing {audio_file.name}: {e}[/red]")
        failed_files.append(audio_file)

total_duration = time.time() - total_start

# Summary: the headline and border colour reflect whether anything failed
if not failed_files:
    headline, border_style = "[bold green]✅ Transcription Complete![/bold green]", "green"
elif results:
    headline, border_style = "[bold yellow]⚠ Transcription finished with errors[/bold yellow]", "yellow"
else:
    headline, border_style = "[bold red]❌ Transcription failed[/bold red]", "red"

summary_lines = [
    headline,
    "",
    f"Files processed: {len(results)}/{len(audio_files)}",
    f"Total time: {total_duration:.1f}s ({total_duration/60:.1f} min)",
]
if failed_files:
    summary_lines.append("Failed: " + ", ".join(f.name for f in failed_files))

console.print("\n" + "="*60)
console.print(Panel.fit(
    "\n".join(summary_lines),
    title="Session Summary",
    border_style=border_style
))

# Show performance stats
if results:
    total_audio = sum(r["audio_duration"] for r in results)
    total_process = sum(r["processing_time"] for r in results)
    avg_speed = total_audio / total_process if total_process > 0 else 0

    stats_table = Table(title="Performance Statistics")
    stats_table.add_column("Metric", style="cyan")
    stats_table.add_column("Value", style="yellow")

    stats_table.add_row("Total Audio Duration", f"{total_audio:.1f}s ({total_audio/60:.1f} min)")
    stats_table.add_row("Total Processing Time", f"{total_process:.1f}s ({total_process/60:.1f} min)")
    stats_table.add_row("Average Speed", f"{avg_speed:.1f}x realtime")
    stats_table.add_row("Total Segments", str(sum(r["segments"] for r in results)))

    console.print("\n")
    console.print(stats_table)

if results:
    print("\n✨ All done! Check your output files in Google Drive.")
else:
    print("\n⚠️  No files were transcribed - see the errors above.")


In [None]:
## 📚 Usage Examples

### Example 1: Transcribe a single file
```python
AUDIO_PATH = "/content/drive/MyDrive/recordings/interview.mp3"
OUTPUT_FORMAT = "txt"
```

### Example 2: Transcribe all files in a folder with subtitles
```python
AUDIO_PATH = "/content/drive/MyDrive/podcasts/"
OUTPUT_FORMAT = "srt"  # or "all" for all formats
```

### Example 3: Portuguese transcription with large model
```python
AUDIO_PATH = "/content/drive/MyDrive/aula.wav"
MODEL_SIZE = "large"
LANGUAGE = "pt"
```

### Example 4: Multilingual transcription (detect language changes)
```python
AUDIO_PATH = "/content/drive/MyDrive/bilingual_meeting.mp4"
MULTILINGUAL = True
LANGUAGE = None  # Auto-detect
OUTPUT_FORMAT = "all"
```

## 💡 Tips

1. **Model Selection**:
   - `tiny`: Fastest but least accurate (good for quick drafts)
   - `base`: Good balance of speed and accuracy (recommended)
   - `small` / `medium`: Steps between `base` and `large` - more accurate than `base`, but slower
   - `large`: Best accuracy but slower (use for important content)

2. **GPU Usage**:
   - Colab's free GPU makes transcription 3-5x faster
   - Check GPU availability in Runtime > Change runtime type

3. **File Organization**:
   - Keep audio files organized in folders by project
   - Use OUTPUT_DIR to separate transcriptions from audio files

4. **Language Detection**:
   - Use `LANGUAGE = None` for auto-detection
   - Use `MULTILINGUAL = True` for mixed-language content

5. **Output Formats**:
   - `txt`: Plain text (best for reading)
   - `srt`: Subtitles with timestamps (for video editing)
   - `vtt`: Web subtitles (for HTML5 video)
   - `all`: Generate all formats at once


In [None]:
# Optional: Interactive file browser for Google Drive
# Run this cell if you want to browse and select files interactively

from google.colab import widgets
import ipywidgets as ipw
from IPython.display import display

def list_audio_files_in_drive(root_path="/content/drive/MyDrive", max_files=100, extensions=None):
    """Recursively list audio files under ``root_path``.

    The tree is walked once and the walk stops as soon as ``max_files``
    matches have been collected. Extensions are compared case-insensitively
    and only regular files are returned.

    Args:
        root_path: Directory to search.
        max_files: Maximum number of files to return.
        extensions: Iterable of suffixes (with leading dot) to accept;
            defaults to ``AUDIO_EXTENSIONS``.

    Returns:
        A sorted list of at most ``max_files`` ``Path`` objects; an empty list
        (after printing a message) when ``root_path`` does not exist.
    """
    root = Path(root_path)

    if not root.exists():
        print(f"Path not found: {root}")
        return []

    if max_files <= 0:
        return []

    wanted = {ext.lower() for ext in (AUDIO_EXTENSIONS if extensions is None else extensions)}

    matches = []
    for candidate in root.rglob("*"):
        # Suffix test first: it is a pure string check, is_file() touches the filesystem.
        if candidate.suffix.lower() in wanted and candidate.is_file():
            matches.append(candidate)
            if len(matches) >= max_files:
                break

    return sorted(matches)

# Find audio files
print("🔍 Searching for audio files in your Google Drive...")
print("(This may take a moment for large drives)\n")

audio_files_found = list_audio_files_in_drive()

if audio_files_found:
    print(f"Found {len(audio_files_found)} audio files:\n")

    # Create dropdown widget
    file_dropdown = ipw.Dropdown(
        options=[(f.name, str(f)) for f in audio_files_found],
        description='Select file:',
        style={'description_width': 'initial'},
        layout=ipw.Layout(width='80%')
    )

    # Model dropdown
    model_dropdown = ipw.Dropdown(
        options=['tiny', 'base', 'small', 'medium', 'large'],
        value='base',
        description='Model:',
    )

    # Language dropdown
    lang_dropdown = ipw.Dropdown(
        options=[('Auto-detect', None), ('English', 'en'), ('Portuguese', 'pt')],
        value=None,
        description='Language:',
    )

    # Format dropdown
    format_dropdown = ipw.Dropdown(
        options=['txt', 'srt', 'vtt', 'all'],
        value='txt',
        description='Format:',
    )

    # Transcribe button
    button = ipw.Button(
        description='Transcribe Selected File',
        button_style='success',
        icon='play'
    )

    output = ipw.Output()

    # Holds the most recently loaded model and its size, so repeated clicks
    # with the same size reuse it instead of loading it again.
    _model_cache = {}

    def on_button_click(b):
        """Transcribe the selected file with the selected options."""
        with output:
            output.clear_output()
            button.disabled = True  # ignore further clicks while a run is in progress
            try:
                selected_file = Path(file_dropdown.value)
                print(f"Transcribing: {selected_file.name}")

                # Load model if needed (only when the requested size changed)
                model_size = model_dropdown.value
                if _model_cache.get("size") != model_size:
                    # Drop the previous model before loading the new one so
                    # both are not kept alive at the same time.
                    _model_cache.clear()
                    _model_cache["model"] = load_whisper_model(model_size)
                    _model_cache["size"] = model_size

                # Transcribe
                transcribe_file(
                    audio_file=selected_file,
                    model=_model_cache["model"],
                    language=lang_dropdown.value,
                    output_format=format_dropdown.value,
                    multilingual=False
                )
            except Exception as e:
                print(f"❌ Transcription failed: {e}")
            finally:
                button.disabled = False

    button.on_click(on_button_click)

    # Display widgets
    display(file_dropdown)
    display(ipw.HBox([model_dropdown, lang_dropdown, format_dropdown]))
    display(button)
    display(output)

else:
    print("No audio files found in your Google Drive.")
    print("Make sure you have audio files in /content/drive/MyDrive/")


In [None]:
## 🔧 Troubleshooting

### Common Issues:

1. **"No GPU detected"**
   - Go to Runtime → Change runtime type → GPU
   - Select T4 GPU (free tier)
   - Restart runtime

2. **"Audio file not found"**
   - Check the file path is correct
   - Ensure Google Drive is mounted
   - Use forward slashes (/) not backslashes

3. **"Out of memory"**
   - Use a smaller model (tiny or base)
   - Process files one at a time
   - Restart runtime to clear memory

4. **"Import error"**
   - Run the installation cell again
   - Restart runtime after installation
   - Check for any error messages during pip install

5. **Slow transcription**
   - Ensure GPU is enabled (run the GPU check cell near the top of the notebook)
   - Use a smaller model for faster processing
   - Check file size - very long audio files will take time

### 📞 Need Help?

- Check the [GitHub repository](https://github.com/gustavo-meilus/transcriber) for updates
- Report issues with specific error messages
- For Colab-specific issues, check [Colab FAQ](https://research.google.com/colaboratory/faq.html)

---

**Created by**: Gustavo Meilus  
**License**: MIT
