# 🎙️ Google Colab Audio Transcription Notebook

This notebook lets you transcribe audio/video files with OpenAI Whisper models (via the `faster-whisper` library) directly in Google Colab, using GPU acceleration when a GPU is available. Features include:

- 🚀 **GPU acceleration** for faster processing
- 📁 **Batch processing** of multiple files
- 💾 **Direct Google Drive integration**
- 🔄 **Automatic resume** if interrupted
- 📊 **Segment-by-segment progress tracking** with real-time ETA
- ⏰ **Connection keepalive** for long transcriptions
- 🎯 **Multiple output formats** (TXT, SRT, VTT)
- 🌍 **Multilingual support** with language detection
- 📦 **Automatic chunking** for large audio files
- 🔧 **Built-in error recovery and retry logic**

## Getting Started

Follow the steps below to transcribe your audio files:

1. **Mount Google Drive** - Connect your Drive to access files
2. **Install Dependencies** - Set up required packages
3. **Set up the Transcription Module** - Create the transcription helper code
4. **Configure Settings** - Choose model, language, and output format
5. **Run Transcription** - Process your files with progress tracking


## Step 1: Mount Google Drive

First, let's mount your Google Drive to access audio files and save transcriptions.


In [None]:
# Mount Google Drive at /content/drive so audio files can be read from it
# and transcriptions saved to it
from google.colab import drive
drive.mount('/content/drive')

print("✅ Google Drive mounted successfully!")
print("Your files are accessible at: /content/drive/MyDrive/")


## Step 2: Install Dependencies

Now let's install the required packages: `faster-whisper` (transcription), `rich` (console output and progress bars), and `tinytag` (audio duration metadata).


In [None]:
# Install required packages (-q keeps pip output quiet):
# faster-whisper, rich and tinytag are the third-party imports used by the
# transcription module created in the next cell
%pip install -q faster-whisper rich tinytag

print("✅ All dependencies installed successfully!")


## Step 3: Set up the Transcription Module

This cell creates the enhanced transcription module with segment-based progress tracking and connection keepalive.


In [None]:
# Create the enhanced file_transcribe.py module with all features
file_transcribe_code = '''#!/usr/bin/env python3
"""Enhanced file transcription module with segment-based progress and keepalive for Colab."""

import os
import sys
import json
import time
import gc
import threading
from pathlib import Path
from typing import List, Dict, Optional, Tuple
from datetime import datetime

from faster_whisper import WhisperModel
from rich.console import Console
from rich.progress import (
    Progress, BarColumn, TaskProgressColumn, 
    TimeElapsedColumn, SpinnerColumn, TextColumn, ProgressColumn
)
from rich.panel import Panel
from rich.table import Table
from rich import print as rprint
from tinytag import TinyTag

# Shared rich console used for all status output in this module
console = Console()

# Supported audio/video extensions (lowercase, including the leading dot)
AUDIO_EXTENSIONS = {
    ".mp3", ".wav", ".flac", ".m4a", ".aac", ".ogg", ".opus", ".wma",
    ".mp4", ".avi", ".mkv", ".mov", ".webm", ".m4v", ".flv", ".wmv"
}

# Language mapping: user-facing language option -> value passed as the
# `language` argument of model.transcribe(). "auto" maps to None, and options
# not listed here also resolve to None because lookups use LANGUAGE_MAP.get().
LANGUAGE_MAP = {
    "auto": None,
    "en": "en",
    "pt": "pt"
}

class RemainingAudioDurationColumn(ProgressColumn):
    """Progress column that shows an externally supplied ETA for each task.

    The column does not compute anything itself: callers push an estimate in
    seconds through set_eta(), and render() formats the last value stored for
    the task being drawn.
    """

    def __init__(self):
        super().__init__()
        # task id -> most recently reported ETA in seconds
        self.task_etas = {}

    def set_eta(self, task_id, eta_seconds):
        """Store the latest ETA (in seconds) for task_id."""
        self.task_etas[task_id] = eta_seconds

    def render(self, task):
        """Return the ETA text for task, or a placeholder when no positive ETA is stored."""
        remaining = self.task_etas.get(task.id, 0)
        if not remaining > 0:
            return "ETA: --:--"

        whole_minutes, secs = divmod(int(remaining), 60)
        hrs, mins = divmod(whole_minutes, 60)
        if hrs > 0:
            return f"ETA: {hrs:02d}:{mins:02d}:{secs:02d}"
        return f"ETA: {mins:02d}:{secs:02d}"

class ColabKeepalive:
    """Keeps Colab connection alive during long-running operations.

    While running, a daemon thread emits an invisible HTML comment to the
    notebook output every PULSE_INTERVAL seconds. stop() wakes the thread
    immediately, so it never lingers in a sleep and a later start() cannot
    end up with two pulse threads.
    """

    # Seconds between keepalive pulses
    PULSE_INTERVAL = 20

    def __init__(self):
        self.running = False
        self.thread = None
        self.last_status = ""
        self.update_count = 0
        # Set by stop() to interrupt the worker's wait between pulses
        self._stop_event = threading.Event()

    def _keep_alive(self, stop_event: Optional[threading.Event] = None):
        """Background loop: emit a pulse, then wait for the interval or a stop request.

        Args:
            stop_event: Event that ends this particular worker. Defaults to
                the instance's current stop event.
        """
        if stop_event is None:
            stop_event = self._stop_event

        # Imported once per worker instead of once per pulse. Outside a
        # notebook runtime IPython may be missing; there is nothing to pulse
        # in that case, so the worker simply ends.
        try:
            from IPython.display import HTML, display
        except ImportError:
            return

        while self.running and not stop_event.is_set():
            timestamp = datetime.now().strftime("%H:%M:%S")
            self.update_count += 1

            # Use HTML comment to avoid display conflicts
            display(HTML(f"<!-- Keepalive pulse {self.update_count} at {timestamp} -->"))

            # Interruptible replacement for time.sleep(): returns early as
            # soon as stop() sets the event
            stop_event.wait(self.PULSE_INTERVAL)

    def start(self):
        """Start the keepalive thread (no-op when already running)."""
        if self.running:
            return
        self.running = True
        # A fresh event per worker: a previous worker keeps its own (already
        # set) event and can never be revived by this start()
        self._stop_event = threading.Event()
        self.thread = threading.Thread(
            target=self._keep_alive,
            args=(self._stop_event,),
            daemon=True,
        )
        self.thread.start()

    def stop(self):
        """Stop the keepalive thread and wait briefly for it to exit."""
        self.running = False
        self._stop_event.set()
        if self.thread:
            self.thread.join(timeout=1)

    def update_status(self, status: str):
        """Update the current status (for logging purposes)."""
        self.last_status = status

    def pulse(self):
        """Manual pulse (does nothing; the background thread emits all pulses)."""
        pass  # The thread handles everything

class TranscriptionCheckpoint:
    """Manages checkpoint files for resuming interrupted transcriptions.

    State is a JSON object stored in "transcription_checkpoint.json" inside
    checkpoint_dir with the keys: files_completed, current_file,
    current_position, segments and timestamp.
    """

    def __init__(self, checkpoint_dir: Path):
        """Create the checkpoint directory if needed and start from an empty state."""
        self.checkpoint_dir = checkpoint_dir
        self.checkpoint_file = checkpoint_dir / "transcription_checkpoint.json"
        self.checkpoint_dir.mkdir(parents=True, exist_ok=True)
        self.state = self._empty_state()

    @staticmethod
    def _empty_state() -> Dict:
        """Return a fresh state dict with every expected key present."""
        return {
            "files_completed": [],
            "current_file": None,
            "current_position": 0,
            "segments": [],
            "timestamp": None
        }

    def save(self):
        """Save current checkpoint state.

        The state is written to a sibling temporary file and then moved over
        the real checkpoint, so an interruption mid-write cannot leave a
        truncated checkpoint behind.
        """
        self.state["timestamp"] = datetime.now().isoformat()
        tmp_file = self.checkpoint_file.with_name(self.checkpoint_file.name + ".tmp")
        with open(tmp_file, 'w', encoding='utf-8') as f:
            json.dump(self.state, f, indent=2)
        os.replace(tmp_file, self.checkpoint_file)

    def load(self) -> bool:
        """Load checkpoint if exists.

        Returns:
            True when a checkpoint was read and applied; False when the file
            is missing, unreadable, not valid JSON, or not a JSON object. The
            in-memory state is left untouched on False.
        """
        if not self.checkpoint_file.exists():
            return False

        try:
            with open(self.checkpoint_file, 'r', encoding='utf-8') as f:
                loaded = json.load(f)
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and decoding errors
            return False

        if not isinstance(loaded, dict):
            return False

        # Fill in any keys an older or partial checkpoint lacks so the other
        # methods can index the state without KeyError
        state = self._empty_state()
        state.update(loaded)
        self.state = state
        return True

    def set_current_file(self, filename: str):
        """Set the file currently being processed and reset its progress."""
        self.state["current_file"] = filename
        self.state["current_position"] = 0
        self.state["segments"] = []

    def update_progress(self, segment: Dict, position: float):
        """Update progress for current file (in memory only; call save() to persist)."""
        self.state["segments"].append(segment)
        self.state["current_position"] = position

    def mark_file_complete(self, filename: str):
        """Mark a file as completed and persist the checkpoint."""
        if filename not in self.state["files_completed"]:
            self.state["files_completed"].append(filename)
        if self.state["current_file"] == filename:
            self.state["current_file"] = None
            self.state["current_position"] = 0
            self.state["segments"] = []
        self.save()

    def get_resume_info(self) -> Dict:
        """Get information for resuming transcription."""
        return {
            "completed_files": self.state["files_completed"],
            "current_file": self.state["current_file"],
            "current_position": self.state["current_position"]
        }

    def clear(self):
        """Clear checkpoint data, both on disk and in memory."""
        if self.checkpoint_file.exists():
            self.checkpoint_file.unlink()
        self.state = self._empty_state()

def format_duration(seconds: float) -> str:
    """Render a duration as "Xh Ym Zs", dropping leading units that are zero.

    Fractional seconds are truncated before formatting.
    """
    total = int(seconds)
    hrs = total // 3600
    mins = (total % 3600) // 60
    secs = total % 60

    if hrs > 0:
        return f"{hrs}h {mins}m {secs}s"
    if mins > 0:
        return f"{mins}m {secs}s"
    return f"{secs}s"

def format_timestamp(seconds: float) -> str:
    """Format timestamp for subtitle files as "HH:MM:SS.mmm".

    Args:
        seconds: Position in seconds; negative values are clamped to 0.

    Returns:
        The timestamp with millisecond precision, rounded to the nearest
        millisecond (a value that rounds up carries into the seconds field).
    """
    # Work in integer milliseconds so the fractional part survives the
    # hour/minute/second split
    total_ms = max(0, int(round(seconds * 1000)))
    total_seconds, milliseconds = divmod(total_ms, 1000)
    hours, remainder = divmod(total_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}.{milliseconds:03d}"

def get_audio_duration(file_path: Path) -> float:
    """Return the duration tinytag reports for file_path, or 0 when unavailable.

    Any failure while reading the metadata is reported as a warning on the
    console and treated as an unknown (0) duration.
    """
    try:
        metadata = TinyTag.get(str(file_path))
        reported = metadata.duration
        return reported if reported else 0
    except Exception as e:
        console.print(f"[yellow]Warning: Could not get duration for {file_path.name}: {e}[/yellow]")
        return 0

class StreamingTranscriptionWriter:
    """Streams transcription segments to an output file as they arrive.

    Supported format_type values are "txt", "srt" and "vtt"; any other value
    produces no segment output. Every segment is flushed immediately.
    """

    def __init__(self, output_file: Path, format_type: str = "txt", multilingual: bool = False):
        self.output_file = output_file
        self.format_type = format_type
        self.multilingual = multilingual
        self.segment_count = 0

        # The file is opened (and truncated) right away
        self.file_handle = open(output_file, 'w', encoding='utf-8')

        # WebVTT files start with a fixed header line
        if format_type == "vtt":
            self.file_handle.write("WEBVTT\\n\\n")

    def _segment_text(self, segment) -> str:
        """Return the stripped segment text, language-tagged in multilingual mode."""
        if self.multilingual and hasattr(segment, "language"):
            return f"[{segment.language}] {segment.text.strip()}"
        return segment.text.strip()

    def write_segment(self, segment):
        """Write a single segment to the output file and flush it."""
        self.segment_count += 1
        sink = self.file_handle
        kind = self.format_type

        if kind == "txt":
            sink.write(self._segment_text(segment) + "\\n")

        elif kind == "srt":
            # SRT cues are numbered and use a comma before the milliseconds
            sink.write(f"{self.segment_count}\\n")
            begin = format_timestamp(segment.start).replace(".", ",")
            finish = format_timestamp(segment.end).replace(".", ",")
            sink.write(f"{begin} --> {finish}\\n")
            sink.write(self._segment_text(segment) + "\\n\\n")

        elif kind == "vtt":
            begin = format_timestamp(segment.start)
            finish = format_timestamp(segment.end)
            sink.write(f"{begin} --> {finish}\\n")
            sink.write(self._segment_text(segment) + "\\n\\n")

        sink.flush()

    def close(self):
        """Close the output file."""
        handle = self.file_handle
        if handle:
            handle.close()

def detect_device_and_compute_type() -> Tuple[str, str]:
    """Choose the device and compute type to load the model with.

    Returns:
        ("cuda", "float16") when torch is importable and reports a CUDA
        device, otherwise ("cpu", "int8").
    """
    cpu_choice = ("cpu", "int8")

    try:
        import torch
        if torch.cuda.is_available():
            console.print(f"[green]✓[/green] GPU detected: {torch.cuda.get_device_name(0)}")
            return "cuda", "float16"
    except ImportError:
        # torch is not installed: fall through to the CPU configuration
        pass

    console.print("[yellow]ℹ[/yellow] Using CPU for transcription")
    return cpu_choice

def load_whisper_model_with_retry(model_size: str, device: str, compute_type: str, max_retries: int = 3):
    """Load Whisper model with retry logic for handling Hugging Face connectivity issues.

    Each attempt tries two strategies in order:
      1. a standard load with local_files_only=False;
      2. a load that downloads into /content/whisper_models.
    When both fail, the function waits (attempt number x 5 seconds) and tries
    again, up to max_retries attempts.

    Args:
        model_size: Model name or path handed to WhisperModel.
        device: Device string handed to WhisperModel (e.g. "cuda" or "cpu").
        compute_type: Compute type handed to WhisperModel.
        max_retries: Maximum number of attempts.

    Returns:
        The loaded WhisperModel, or None when max_retries is less than 1.

    Raises:
        RuntimeError: When every attempt fails. The last underlying error is
            attached as the cause so the real failure is not lost.
    """
    download_root = "/content/whisper_models"
    last_error = None

    for attempt in range(max_retries):
        console.print(f"[cyan]Loading model (attempt {attempt + 1}/{max_retries})...[/cyan]")

        # First try: Normal loading with local_files_only=False
        try:
            return WhisperModel(
                model_size,
                device=device,
                compute_type=compute_type,
                local_files_only=False
            )
        except Exception as e1:
            last_error = e1
            console.print(f"[yellow]Standard loading failed: {str(e1)[:100]}...[/yellow]")

        # Second try: explicit download directory
        try:
            # Turns off Hugging Face Hub telemetry for the fallback download
            # NOTE(review): this may have no effect if the hub library has
            # already read its environment — confirm
            os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
            os.makedirs(download_root, exist_ok=True)

            return WhisperModel(
                model_size,
                device=device,
                compute_type=compute_type,
                download_root=download_root
            )
        except Exception as e2:
            last_error = e2
            console.print(f"[yellow]Download root method failed: {str(e2)[:100]}...[/yellow]")

        # Linear back-off before the next attempt (none after the last one)
        if attempt < max_retries - 1:
            wait_time = (attempt + 1) * 5
            console.print(f"[yellow]Waiting {wait_time} seconds before retry...[/yellow]")
            time.sleep(wait_time)

    if last_error is None:
        # No attempt was made (max_retries < 1)
        return None

    error = RuntimeError(f"Failed to load model after {max_retries} attempts")
    console.print(f"[red]Failed to load model: {error}[/red]")
    console.print("[yellow]Troubleshooting tips:[/yellow]")
    console.print("1. Try restarting the Colab runtime")
    console.print("2. Check your internet connection")
    console.print("3. Try a different model size (e.g., 'tiny' or 'small')")
    console.print("4. Clear Colab cache: !rm -rf /root/.cache/huggingface")
    raise error from last_error

def transcribe_file_with_progress(
    audio_file: Path,
    model: WhisperModel,
    output_dir: Path,
    language: str = "auto",
    output_format: str = "txt",
    multilingual: bool = False,
    keepalive: Optional[ColabKeepalive] = None,
    progress: Optional[Progress] = None,
    task_id: Optional[int] = None,
    eta_column: Optional[RemainingAudioDurationColumn] = None,
    checkpoint: Optional[TranscriptionCheckpoint] = None,
    resume_position: float = 0,
    chunk_large_files: bool = True,
    chunk_duration: int = 600,  # 10 minutes
    chunk_threshold: int = 900  # 15 minutes
) -> Dict:
    """Transcribe a single audio file with segment-based progress tracking and chunking support.

    When chunk_large_files is enabled and the file is longer than
    chunk_threshold seconds, the work is delegated to
    transcribe_chunked_file(). Otherwise the file is transcribed in one pass
    and every segment is streamed to "<output_dir>/<stem>.<output_format>".

    Args:
        audio_file: File to transcribe.
        model: Loaded WhisperModel used for transcription.
        output_dir: Directory that receives the output file.
        language: Key of LANGUAGE_MAP; "auto" reports the language from the
            transcription info in the result.
        output_format: "txt", "srt" or "vtt".
        multilingual: Passed to model.transcribe() and to the writer, which
            then tags lines with the segment language when available.
        keepalive: Optional keepalive whose status text is updated.
        progress: Optional rich Progress updated together with task_id.
        task_id: Task within progress to update.
        eta_column: Optional column that receives ETA estimates.
        checkpoint: Optional checkpoint that records each written segment.
        resume_position: Segments ending at or before this position (seconds)
            are skipped.
        chunk_large_files: Enable delegation to chunked transcription.
        chunk_duration: Chunk length in seconds for chunked transcription.
        chunk_threshold: Minimum duration in seconds that triggers chunking.

    Returns:
        On success a dict with success=True, file, duration, segments,
        process_time, speed, output_file and detected_language. On failure a
        dict with success=False, file and error.
    """
    import math

    # Get audio duration
    audio_duration = get_audio_duration(audio_file)

    # Check if file needs chunking
    needs_chunking = chunk_large_files and chunk_duration > 0 and audio_duration > chunk_threshold

    if needs_chunking:
        # Ceiling division: an exact multiple of chunk_duration must not be
        # announced as one chunk more than will actually be produced
        num_chunks = math.ceil(audio_duration / chunk_duration)
        console.print(Panel(
            f"[yellow]Large file detected: {audio_file.name}[/yellow]\\n"
            f"Duration: {format_duration(audio_duration)}\\n"
            f"Will split into {num_chunks} chunks of {chunk_duration/60:.0f} minutes each\\n\\n"
            "[cyan]Benefits:[/cyan]\\n"
            "• Prevents memory issues\\n"
            "• Allows better progress tracking\\n"
            "• Enables partial recovery if interrupted",
            title="[bold yellow]Chunking Large File[/bold yellow]",
            border_style="yellow"
        ))
        return transcribe_chunked_file(
            audio_file, model, output_dir, language, output_format,
            multilingual, keepalive, progress, task_id, eta_column,
            checkpoint, chunk_duration
        )

    # Regular transcription for smaller files
    output_file = output_dir / f"{audio_file.stem}.{output_format}"

    # NOTE(review): the writer opens the output in 'w' mode, so with
    # resume_position > 0 the segments skipped below are absent from the
    # rewritten file — confirm whether resuming should append instead.
    writer = StreamingTranscriptionWriter(output_file, output_format, multilingual)

    console.print(f"[cyan]Processing:[/cyan] {audio_file.name} ({format_duration(audio_duration)})")
    start_time = time.time()

    # Update keepalive status
    if keepalive:
        keepalive.update_status(f"Transcribing {audio_file.name}")

    try:
        # Transcribe
        segments_iterator, info = model.transcribe(
            str(audio_file),
            language=LANGUAGE_MAP.get(language),
            beam_size=5,
            vad_filter=True,
            vad_parameters=dict(min_silence_duration_ms=500),
            multilingual=multilingual
        )

        # Process segments with progress tracking
        segment_count = 0
        audio_position = 0
        session_start_time = time.time()

        # Update progress bar total if available
        if progress and task_id is not None:
            progress.update(task_id, total=info.duration)

        for segment in segments_iterator:
            # Skip if resuming and segment is before resume position
            if resume_position > 0 and segment.end <= resume_position:
                continue

            # Write segment
            writer.write_segment(segment)
            segment_count += 1
            audio_position = segment.end

            # Update checkpoint
            if checkpoint:
                segment_data = {
                    "text": segment.text,
                    "start": segment.start,
                    "end": segment.end,
                    "language": getattr(segment, "language", None)
                }
                checkpoint.update_progress(segment_data, audio_position)

            # Update progress
            if progress and task_id is not None:
                progress.update(task_id, completed=audio_position)

                # Calculate and update ETA
                if eta_column and audio_position > 5:  # Wait for 5 seconds before calculating ETA
                    elapsed_time = time.time() - session_start_time
                    # Guard against a zero elapsed time on coarse clocks
                    rate = audio_position / elapsed_time if elapsed_time > 0 else 0
                    if rate > 0:
                        remaining_audio = info.duration - audio_position
                        eta = remaining_audio / rate
                        eta_column.set_eta(task_id, eta)

            # Update keepalive with segment info
            if keepalive and segment_count % 5 == 0:  # Update every 5 segments
                # A zero reported duration must not abort the transcription
                percent = (audio_position / info.duration * 100) if info.duration else 0.0
                keepalive.update_status(
                    f"Transcribing {audio_file.name} - {segment_count} segments, "
                    f"{audio_position:.1f}s/{info.duration:.1f}s ({percent:.1f}%)"
                )

            # Periodic memory cleanup to prevent kernel crashes
            if segment_count % 100 == 0:
                gc.collect()
                try:
                    import torch
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()
                except Exception:
                    # Best-effort only: torch may be missing or CUDA unhappy
                    pass

        # Close writer before reporting so the output is complete on disk
        writer.close()

        # Calculate stats
        process_time = time.time() - start_time
        speed = info.duration / process_time if process_time > 0 else 0

        # Final progress update
        if progress and task_id is not None:
            progress.update(task_id, completed=info.duration)

        console.print(
            f"[green]✓[/green] {audio_file.name} "
            f"[dim]({info.duration:.1f}s @ {speed:.1f}x speed, {segment_count} segments)[/dim]"
        )

        return {
            "success": True,
            "file": audio_file.name,
            "duration": info.duration,
            "segments": segment_count,
            "process_time": process_time,
            "speed": speed,
            "output_file": str(output_file),
            "detected_language": info.language if language == "auto" else language
        }

    except Exception as e:
        console.print(f"[red]✗[/red] Error processing {audio_file.name}: {e}")
        return {
            "success": False,
            "file": audio_file.name,
            "error": str(e)
        }

    finally:
        # Runs on every exit path, including KeyboardInterrupt; closing an
        # already closed file is a no-op
        writer.close()

def split_large_audio_file(audio_file: Path, chunk_duration: int = 600, temp_dir: Optional[Path] = None, show_progress: bool = True) -> List[Path]:
    """Split large audio file into chunks using ffmpeg.

    Each chunk is written as a 16 kHz mono 16-bit PCM WAV file named
    "<stem>_chunk_NNN.wav" (NNN is 1-based). A chunk that ffmpeg fails to
    produce is reported on the console and left out of the result.

    Args:
        audio_file: Path to the audio file
        chunk_duration: Duration of each chunk in seconds (default: 600 = 10 minutes)
        temp_dir: Directory for temporary chunk files (default: a
            ".chunks_<stem>" directory next to the audio file)
        show_progress: Accepted for backward compatibility; currently unused,
            status lines are always printed to the console

    Returns:
        List of paths to chunk files, or [audio_file] when no split is needed
        (duration unknown or not longer than chunk_duration, or a
        non-positive chunk_duration)

    Raises:
        OSError: Propagated from subprocess.run when ffmpeg cannot be started.
    """
    import math
    import subprocess

    # Get audio duration
    duration = get_audio_duration(audio_file)

    # Decide before touching the filesystem, so no empty chunk directory is
    # left behind for files that are not split
    if chunk_duration <= 0 or duration <= chunk_duration:
        return [audio_file]

    if temp_dir is None:
        temp_dir = audio_file.parent / f".chunks_{audio_file.stem}"

    temp_dir.mkdir(parents=True, exist_ok=True)

    console.print(f"[yellow]Splitting file into {chunk_duration/60:.0f}-minute chunks...[/yellow]")

    chunk_files = []
    num_chunks = math.ceil(duration / chunk_duration)

    console.print(f"[cyan]Creating {num_chunks} chunks from {audio_file.name}[/cyan]")

    for i in range(num_chunks):
        start_time = i * chunk_duration
        chunk_file = temp_dir / f"{audio_file.stem}_chunk_{i+1:03d}.wav"

        # Show progress without using Progress bar
        console.print(f"[dim]Creating chunk {i+1}/{num_chunks}...[/dim]")

        # Use ffmpeg to extract chunk
        cmd = [
            "ffmpeg", "-y",  # Overwrite output files
            "-i", str(audio_file),  # Input file
            "-ss", str(start_time),  # Start time
            "-t", str(chunk_duration),  # Duration
            "-acodec", "pcm_s16le",  # Use WAV codec for compatibility
            "-ar", "16000",  # Resample to 16kHz for faster processing
            "-ac", "1",  # Convert to mono
            str(chunk_file)  # Output file
        ]

        # Run ffmpeg quietly (output captured, shown only on failure)
        result = subprocess.run(cmd, capture_output=True, text=True)

        if result.returncode == 0 and chunk_file.exists():
            chunk_files.append(chunk_file)
            console.print(f"[green]✓[/green] Chunk {i+1} created")
        else:
            console.print(f"[red]Error creating chunk {i+1}: {result.stderr}[/red]")

    console.print(f"[green]✓ Split into {len(chunk_files)} chunks[/green]")
    return chunk_files

def cleanup_chunk_files(chunk_dir: Path):
    """Delete a temporary chunk directory and everything in it.

    Only an existing directory whose name starts with ".chunks_" is removed;
    anything else is left alone. Removal errors are reported as a warning.
    """
    is_chunk_dir = chunk_dir.exists() and chunk_dir.name.startswith(".chunks_")
    if not is_chunk_dir:
        return

    try:
        import shutil
        shutil.rmtree(chunk_dir)
        console.print("[green]✓ Cleaned up temporary chunk files[/green]")
    except Exception as e:
        console.print(f"[yellow]Warning: Could not clean up chunks: {e}[/yellow]")

def _chunk_start_offset(chunk_file: Path, position: int, chunk_duration: int) -> float:
    """Return where chunk_file starts on the original file's timeline, in seconds.

    Chunk files are named "<stem>_chunk_NNN.wav" with a 1-based index. That
    index is preferred over the list position so that a chunk which could not
    be created does not shift every chunk after it. Falls back to the list
    position when the name does not match.
    """
    try:
        index = int(chunk_file.stem.rsplit("_chunk_", 1)[1]) - 1
    except (IndexError, ValueError):
        index = position
    return max(index, 0) * chunk_duration


def _offset_segment(segment, offset: float):
    """Return a lightweight copy of segment with start/end shifted by offset seconds."""
    from types import SimpleNamespace

    fields = {
        "text": segment.text,
        "start": segment.start + offset,
        "end": segment.end + offset,
    }
    # Carry "language" only when the source has it: the writer checks for the
    # attribute with hasattr()
    if hasattr(segment, "language"):
        fields["language"] = segment.language
    return SimpleNamespace(**fields)


def transcribe_chunked_file(
    audio_file: Path,
    model: WhisperModel,
    output_dir: Path,
    language: str = "auto",
    output_format: str = "txt",
    multilingual: bool = False,
    keepalive: Optional[ColabKeepalive] = None,
    progress: Optional[Progress] = None,
    task_id: Optional[int] = None,
    eta_column: Optional[RemainingAudioDurationColumn] = None,
    checkpoint: Optional[TranscriptionCheckpoint] = None,
    chunk_duration: int = 600
) -> Dict:
    """Transcribe a large file by splitting it into chunks.

    The file is split with split_large_audio_file(), each chunk is
    transcribed into a temporary per-chunk output, and the outputs are merged
    with combine_chunk_outputs(). Segment timestamps are shifted by the
    chunk's start so SRT/VTT cues refer to the original file's timeline
    rather than restarting at zero in every chunk.

    Args:
        audio_file: File to transcribe.
        model: Loaded WhisperModel used for transcription.
        output_dir: Directory that receives the combined output file.
        language: Key of LANGUAGE_MAP.
        output_format: "txt", "srt" or "vtt".
        multilingual: Passed to model.transcribe() and to the writer.
        keepalive: Optional keepalive whose status text is updated.
        progress: Optional rich Progress updated together with task_id.
        task_id: Task within progress to update.
        eta_column: Accepted for signature parity with
            transcribe_file_with_progress(); only forwarded to it.
        checkpoint: Accepted for signature parity; only forwarded.
        chunk_duration: Chunk length in seconds.

    Returns:
        On success a dict with success=True, file, duration, segments,
        process_time, speed, output_file, detected_language,
        chunks_processed and chunks_failed (list of 1-based positions of
        chunks whose transcription raised). On failure a dict with
        success=False, file and error.
    """
    import shutil

    # Get total audio duration first
    audio_duration = get_audio_duration(audio_file)

    # Split the file into chunks (without showing progress bar since we're already in a progress context)
    chunk_dir = audio_file.parent / f".chunks_{audio_file.stem}"
    chunk_files = split_large_audio_file(audio_file, chunk_duration, chunk_dir, show_progress=False)

    if not chunk_files:
        # Every chunk failed to be created: report it instead of producing an
        # empty "successful" result
        cleanup_chunk_files(chunk_dir)
        message = "No audio chunks could be created"
        console.print(f"[red]✗[/red] Error processing chunked file {audio_file.name}: {message}")
        return {
            "success": False,
            "file": audio_file.name,
            "error": message
        }

    if len(chunk_files) == 1:
        # File didn't need splitting after all; drop the (possibly empty)
        # chunk directory before transcribing the original file directly
        cleanup_chunk_files(chunk_dir)
        return transcribe_file_with_progress(
            audio_file, model, output_dir, language, output_format,
            multilingual, keepalive, progress, task_id, eta_column,
            checkpoint, 0, False  # Disable chunking to avoid recursion
        )

    # Prepare output file
    output_file = output_dir / f"{audio_file.stem}.{output_format}"
    temp_output_dir = output_dir / f".temp_{audio_file.stem}"
    temp_output_dir.mkdir(parents=True, exist_ok=True)

    console.print(f"[cyan]Processing {len(chunk_files)} chunks for {audio_file.name}[/cyan]")

    # Show chunk summary
    console.print(Panel(
        f"File: {audio_file.name}\\n"
        f"Total duration: {format_duration(audio_duration)}\\n"
        f"Chunks: {len(chunk_files)} × {chunk_duration/60:.0f} minutes\\n\\n"
        "[dim]Progress will be shown for each chunk...[/dim]",
        title="[bold cyan]Chunk Processing[/bold cyan]",
        border_style="cyan"
    ))

    start_time = time.time()
    total_segments = 0
    chunk_results = []
    failed_chunks = []

    # Process chunks without creating a new progress bar
    try:
        for i, chunk_file in enumerate(chunk_files):
            chunk_num = i + 1
            chunk_offset = _chunk_start_offset(chunk_file, i, chunk_duration)
            console.print(f"\\n[yellow]Processing chunk {chunk_num}/{len(chunk_files)}[/yellow]")

            # Update keepalive
            if keepalive:
                keepalive.update_status(
                    f"Transcribing {audio_file.name} - Chunk {chunk_num}/{len(chunk_files)}"
                )

            # Update main progress description if available
            if progress and task_id is not None:
                try:
                    progress.update(
                        task_id,
                        description=f"[yellow]{audio_file.name} - Chunk {chunk_num}/{len(chunk_files)}[/yellow]"
                    )
                except Exception:
                    # Progress display is cosmetic; never fail a chunk over it
                    pass

            # Transcribe chunk
            chunk_output = temp_output_dir / f"chunk_{chunk_num:03d}.{output_format}"
            writer = StreamingTranscriptionWriter(chunk_output, output_format, multilingual)

            try:
                segments_iterator, info = model.transcribe(
                    str(chunk_file),
                    language=LANGUAGE_MAP.get(language),
                    beam_size=5,
                    vad_filter=True,
                    vad_parameters=dict(min_silence_duration_ms=500),
                    multilingual=multilingual
                )

                chunk_segments = 0

                for segment in segments_iterator:
                    # Shift onto the original file's timeline before writing
                    writer.write_segment(_offset_segment(segment, chunk_offset))
                    chunk_segments += 1
                    total_segments += 1

                    # Update progress for main file
                    if progress and task_id is not None and not getattr(progress, '_closed', False):
                        # Estimate position based on chunk progress
                        chunk_position = chunk_offset + segment.end
                        try:
                            progress.update(task_id, completed=min(chunk_position, audio_duration))
                        except Exception:
                            # Ignore if progress is closed or has issues
                            pass

                    # Memory cleanup every 50 segments
                    if chunk_segments % 50 == 0:
                        gc.collect()

                # The language is recorded only for chunks that produced segments
                detected_language = getattr(info, 'language', None) if chunk_segments else None

                chunk_results.append({
                    "chunk": chunk_num,
                    "segments": chunk_segments,
                    "duration": info.duration,
                    "language": detected_language
                })

                console.print(f"[green]✓ Chunk {chunk_num}: {chunk_segments} segments[/green]")

            except Exception as e:
                failed_chunks.append(chunk_num)
                console.print(f"[red]Error in chunk {chunk_num}: {e}[/red]")

            finally:
                writer.close()

            # Show chunk completion with progress
            progress_pct = ((i + 1) / len(chunk_files)) * 100
            console.print(f"[bold green]Chunk {chunk_num}/{len(chunk_files)} complete - Overall progress: {progress_pct:.0f}%[/bold green]\\n")

            # Clean up chunk file immediately to save space
            try:
                chunk_file.unlink()
            except OSError:
                pass

        # Combine all chunk outputs into final file
        console.print(f"\\n[cyan]Combining {len(chunk_files)} chunks into final output...[/cyan]")
        combine_chunk_outputs(temp_output_dir, output_file, output_format)

        # Clean up now so the confirmation appears before the summary line
        # (the finally block below is then a no-op)
        cleanup_chunk_files(chunk_dir)
        if temp_output_dir.exists():
            shutil.rmtree(temp_output_dir, ignore_errors=True)

        # Restore progress bar description if needed
        if progress and task_id is not None:
            progress.update(task_id, description=f"[yellow]{audio_file.name}[/yellow]")

        # Calculate final stats
        process_time = time.time() - start_time
        speed = audio_duration / process_time if process_time > 0 else 0

        # Primary language: first chunk that reported one
        detected_language = next(
            (result["language"] for result in chunk_results if result.get("language")),
            None
        )

        if failed_chunks:
            console.print(
                f"[yellow]Warning: {len(failed_chunks)} of {len(chunk_files)} chunks failed "
                f"and are missing from the output: {failed_chunks}[/yellow]"
            )

        console.print(
            f"[green]✓[/green] {audio_file.name} "
            f"[dim]({audio_duration:.1f}s @ {speed:.1f}x speed, {total_segments} segments across {len(chunk_files)} chunks)[/dim]"
        )

        return {
            "success": True,
            "file": audio_file.name,
            "duration": audio_duration,
            "segments": total_segments,
            "process_time": process_time,
            "speed": speed,
            "output_file": str(output_file),
            "detected_language": detected_language,
            "chunks_processed": len(chunk_files),
            "chunks_failed": failed_chunks
        }

    except Exception as e:
        console.print(f"[red]✗[/red] Error processing chunked file {audio_file.name}: {e}")
        return {
            "success": False,
            "file": audio_file.name,
            "error": str(e)
        }

    finally:
        # Single cleanup point for error and interrupt paths; both steps are
        # safe to repeat after the success-path cleanup above
        cleanup_chunk_files(chunk_dir)
        if temp_output_dir.exists():
            shutil.rmtree(temp_output_dir, ignore_errors=True)

def combine_chunk_outputs(temp_dir: Path, output_file: Path, format_type: str):
    """Merge the per-chunk output files found in temp_dir into output_file.

    Args:
        temp_dir: Directory holding files named chunk_*.<format_type>.
        output_file: Destination file; it is overwritten.
        format_type: "txt", "srt" or "vtt". Selects which files are merged
            and how they are merged.

    Chunk files are merged in natural (numeric-aware) filename order. If no
    chunk file matches, an error is printed and output_file is not written.
    """
    import re

    def natural_key(path):
        # Compare digit runs as integers so chunk_10 sorts after chunk_2 even
        # when the chunk index is not zero-padded. re.split with a capturing
        # group puts the digit runs at the odd positions.
        parts = re.split(r"([0-9]+)", path.name)
        return [int(part) if pos % 2 else part for pos, part in enumerate(parts)]

    chunk_files = sorted(temp_dir.glob(f"chunk_*.{format_type}"), key=natural_key)

    if not chunk_files:
        console.print("[red]No chunk outputs found to combine![/red]")
        return

    with open(output_file, 'w', encoding='utf-8') as outfile:
        if format_type == "vtt":
            # Single header for the merged file; per-chunk headers are dropped below.
            outfile.write("WEBVTT\\n\\n")

        # Running SRT cue index, continued across chunk boundaries.
        segment_number = 1

        for chunk_file in chunk_files:
            with open(chunk_file, 'r', encoding='utf-8') as infile:
                content = infile.read()

            if format_type == "txt":
                # Simple concatenation for text files
                outfile.write(content)
                if not content.endswith('\\n'):
                    outfile.write('\\n')

            elif format_type == "srt":
                # Renumber cues: a digits-only line starts a cue, and the
                # non-blank lines after it (timestamp and text) belong to it.
                lines = content.strip().split('\\n')
                i = 0
                while i < len(lines):
                    if lines[i].strip().isdigit():
                        outfile.write(f"{segment_number}\\n")
                        segment_number += 1
                        i += 1
                        while i < len(lines) and lines[i].strip():
                            outfile.write(lines[i] + '\\n')
                            i += 1
                        outfile.write('\\n')
                    else:
                        i += 1

            elif format_type == "vtt":
                lines = content.strip().split('\\n')
                # Drop the chunk's own header line.
                if lines and lines[0].strip() == "WEBVTT":
                    lines = lines[1:]

                # Keep one blank line between cue blocks: WebVTT cues must be
                # separated by a blank line, so they cannot be run together.
                wrote_line = False
                pending_blank = False
                for line in lines:
                    if not line.strip():
                        pending_blank = wrote_line
                        continue
                    if pending_blank:
                        outfile.write('\\n')
                        pending_blank = False
                    outfile.write(line + '\\n')
                    wrote_line = True
                # Separate this chunk's last cue from the next chunk's first.
                outfile.write('\\n')

    console.print(f"[green]✓ Combined {len(chunk_files)} chunks into {output_file.name}[/green]")

def find_audio_files(input_path: Path) -> List[Path]:
    """Return the audio files at input_path as a sorted list.

    Args:
        input_path: A single file or a directory. Directories are scanned
            non-recursively.

    Returns:
        A sorted list of paths whose lower-cased suffix is in AUDIO_EXTENSIONS.
        The list is empty when input_path is a non-audio file, does not
        exist, or contains no matching files.
    """
    # NOTE(review): assumes AUDIO_EXTENSIONS holds lower-case, dot-prefixed
    # suffixes, which is what the single-file check below already relies on.
    if input_path.is_file():
        if input_path.suffix.lower() in AUDIO_EXTENSIONS:
            return [input_path]
        return []

    if not input_path.is_dir():
        return []

    # One pass with a case-insensitive suffix test: this also matches
    # mixed-case extensions such as ".Mp3", yields each file once even on
    # case-insensitive filesystems, and ignores sub-directories.
    return sorted(
        entry
        for entry in input_path.iterdir()
        if entry.is_file() and entry.suffix.lower() in AUDIO_EXTENSIONS
    )

def transcribe_folder(
    input_path: Path,
    output_path: Path,
    model_size: str = "base",
    language: str = "auto",
    output_format: str = "txt",
    multilingual: bool = False,
    checkpoint_dir: Optional[Path] = None,
    chunk_duration: int = 600,
    chunk_threshold: int = 900
) -> List[Dict]:
    """Transcribe all audio files under input_path with checkpoint support.

    Args:
        input_path: Audio file or folder handed to find_audio_files.
        output_path: Directory for the transcripts; created if missing.
        model_size: Whisper model size passed to the model loader.
        language: Language setting forwarded to the per-file transcription.
        output_format: Output format forwarded to the per-file transcription.
        multilingual: Multilingual flag forwarded to the per-file transcription.
        checkpoint_dir: Checkpoint directory; defaults to
            output_path / ".checkpoints".
        chunk_duration: Chunk length forwarded to the per-file transcription.
            A value of 0 or less disables chunking of large files.
        chunk_threshold: Chunking threshold forwarded to the per-file
            transcription.

    Returns:
        One result dict per processed file. The list is empty when no audio
        file is found, when every file is already marked complete in the
        checkpoint, or when the model cannot be loaded.
    """

    # Set checkpoint directory
    if checkpoint_dir is None:
        checkpoint_dir = output_path / ".checkpoints"

    # Initialize checkpoint
    checkpoint = TranscriptionCheckpoint(checkpoint_dir)

    # Check for existing checkpoint
    resume_info = None
    if checkpoint.load():
        console.print(Panel(
            f"[yellow]Found previous transcription checkpoint[/yellow]\\n"
            f"Completed files: {len(checkpoint.state['files_completed'])}\\n"
            f"Timestamp: {checkpoint.state['timestamp']}",
            title="[bold cyan]Resume Transcription?[/bold cyan]",
            border_style="cyan"
        ))

        # In Colab, always resume to avoid losing progress.
        console.print("[green]Resuming from checkpoint...[/green]")
        resume_info = checkpoint.get_resume_info()

    # Find audio files
    audio_files = find_audio_files(input_path)

    if not audio_files:
        console.print("[red]No audio files found![/red]")
        return []

    # Filter out completed files if resuming
    if resume_info and resume_info["completed_files"]:
        original_count = len(audio_files)
        audio_files = [f for f in audio_files if f.name not in resume_info["completed_files"]]
        console.print(f"[green]Skipping {original_count - len(audio_files)} already completed files[/green]")

        if not audio_files:
            # Nothing left to do, so skip the model load entirely.
            console.print("[green]All files already transcribed; nothing to do.[/green]")
            return []

    console.print(f"[green]Found {len(audio_files)} audio files to process[/green]")

    # Analyze total audio duration
    total_duration = 0
    console.print("[cyan]Analyzing audio files...[/cyan]")
    for audio_file in audio_files:
        duration = get_audio_duration(audio_file)
        total_duration += duration
        console.print(f"  • {audio_file.name}: {format_duration(duration)}")

    console.print(f"[green]Total audio duration: {format_duration(total_duration)}[/green]\\n")

    # Create output directory
    output_path.mkdir(parents=True, exist_ok=True)

    # Load model with retry logic
    device, compute_type = detect_device_and_compute_type()
    console.print(f"[cyan]Loading Whisper {model_size} model...[/cyan]")

    try:
        console.print("[cyan]Loading model, please wait...[/cyan]")
        model = load_whisper_model_with_retry(model_size, device=device, compute_type=compute_type)
        console.print("[green]✓[/green] Model loaded successfully!")
    except Exception as e:
        console.print(f"[red]Error loading model: {e}[/red]")
        console.print("\\n[yellow]Alternative solution:[/yellow]")
        console.print("Try running this command first to clear cache:")
        console.print("[cyan]!rm -rf /root/.cache/huggingface[/cyan]")
        console.print("Then restart the runtime and try again.")
        return []

    # A non-positive chunk duration means the caller disabled chunking.
    chunk_large_files = chunk_duration > 0

    # Initialize keepalive
    keepalive = ColabKeepalive()
    keepalive.start()

    # Process files with enhanced progress
    results = []
    eta_column = RemainingAudioDurationColumn()

    try:
        with Progress(
            SpinnerColumn(spinner_name="dots12", style="cyan"),
            TextColumn("[bold blue]{task.description}"),
            BarColumn(bar_width=40, style="cyan", complete_style="green"),
            TaskProgressColumn(),
            "•",
            TimeElapsedColumn(),
            "•",
            eta_column,
            console=console,
            refresh_per_second=1
        ) as progress:
            # Main task for overall progress
            main_task = progress.add_task(
                f"[cyan]Overall Progress ({len(audio_files)} files)",
                total=total_duration
            )

            # Individual file task
            file_task = progress.add_task("[yellow]Current file", visible=False)

            total_processed = 0

            for idx, audio_file in enumerate(audio_files, 1):
                # Check if this is the file we need to resume
                resume_position = 0
                if resume_info and resume_info["current_file"] == audio_file.name:
                    resume_position = resume_info["current_position"]
                    console.print(f"[yellow]Resuming {audio_file.name} from position {resume_position:.1f}s[/yellow]")

                # Set current file in checkpoint
                checkpoint.set_current_file(audio_file.name)

                # Update file task
                progress.update(
                    file_task,
                    description=f"[yellow]File {idx}/{len(audio_files)}: {audio_file.name}",
                    visible=True,
                    completed=resume_position
                )

                try:
                    # Transcribe with progress
                    result = transcribe_file_with_progress(
                        audio_file, model, output_path,
                        language, output_format, multilingual,
                        keepalive, progress, file_task, eta_column,
                        checkpoint, resume_position,
                        chunk_large_files=chunk_large_files,
                        chunk_duration=chunk_duration,
                        chunk_threshold=chunk_threshold
                    )

                    results.append(result)

                    # Update overall progress
                    if result["success"]:
                        total_processed += result["duration"]
                        progress.update(main_task, completed=total_processed)
                        # Mark file as complete in checkpoint
                        checkpoint.mark_file_complete(audio_file.name)

                    # Hide file task between files
                    progress.update(file_task, visible=False)

                    # Force save checkpoint after each file
                    checkpoint.save()

                    # Memory cleanup between files
                    gc.collect()
                    try:
                        import torch
                        if torch.cuda.is_available():
                            torch.cuda.empty_cache()
                    except Exception:
                        # Best-effort cleanup: torch may be missing or CUDA
                        # unusable. Unlike a bare except, this still lets
                        # KeyboardInterrupt through.
                        pass

                except Exception as e:
                    # Record the failure and carry on with the next file.
                    console.print(f"[red]Error processing {audio_file.name}: {e}[/red]")
                    results.append({
                        "success": False,
                        "file": audio_file.name,
                        "error": str(e)
                    })
                    # Do not leave the failed file's bar on screen.
                    progress.update(file_task, visible=False)

            # Ensure main task shows completion
            progress.update(main_task, completed=total_duration)

    finally:
        # Stop keepalive
        keepalive.stop()
        print()  # Clear the keepalive line

    return results

def display_results_summary(results: List[Dict]):
    """Print a per-file results table followed by aggregate statistics.

    Args:
        results: Result dicts. Successful entries are read for "file",
            "duration", "segments", "speed", "process_time" and the optional
            "chunks_processed"; failed entries for "file" and "error".

    Nothing is printed for an empty list. The aggregate panel is only shown
    when at least one file succeeded.
    """
    if not results:
        return

    def _shorten(text, limit):
        # Clip to `limit` characters and mark the cut with an ellipsis.
        return text[:limit] + "..." if len(text) > limit else text

    # Create summary table
    table = Table(title="📊 Transcription Results", show_header=True, header_style="bold cyan")
    table.add_column("File", style="cyan", width=30)
    table.add_column("Status", style="green")
    for heading in ("Duration", "Segments", "Speed", "Chunks"):
        table.add_column(heading, justify="right")

    ok_count = 0
    fail_count = 0
    audio_seconds = 0
    wall_seconds = 0

    for entry in results:
        if not entry["success"]:
            fail_count += 1
            reason = _shorten(str(entry["error"]), 20)
            table.add_row(
                _shorten(entry["file"], 30),
                f"❌ {reason}",
                "-",
                "-",
                "-",
                "-"
            )
            continue

        ok_count += 1
        audio_seconds += entry["duration"]
        wall_seconds += entry["process_time"]

        # Highlight files that were split into more than one chunk.
        chunk_count = entry.get("chunks_processed", 1)
        chunk_cell = str(chunk_count)
        if chunk_count > 1:
            chunk_cell = f"[yellow]{chunk_cell}[/yellow]"

        table.add_row(
            _shorten(entry["file"], 30),
            "✅ Success",
            format_duration(entry["duration"]),
            str(entry["segments"]),
            f"{entry['speed']:.1f}x",
            chunk_cell
        )

    console.print("\\n")
    console.print(table)

    # Summary statistics are only meaningful with at least one success.
    if not ok_count:
        return

    if wall_seconds > 0:
        avg_speed = audio_seconds / wall_seconds
    else:
        avg_speed = 0

    console.print("\\n")
    console.print(Panel(
        f"[bold]Summary:[/bold]\\n\\n"
        f"✅ Successful: {ok_count} files\\n"
        f"❌ Failed: {fail_count} files\\n"
        f"⏱️  Total audio: {format_duration(audio_seconds)}\\n"
        f"⚡ Processing time: {format_duration(wall_seconds)}\\n"
        f"🚀 Average speed: {avg_speed:.1f}x realtime",
        title="[bold green]Transcription Complete![/bold green]",
        border_style="green"
    ))
'''

import os

# Make sure the package directory exists before writing into it.
os.makedirs('/content/transcript_pkg', exist_ok=True)

# The generated module contains non-ASCII characters (check marks, emoji), so
# write it as UTF-8 explicitly instead of relying on the platform default.
with open('/content/transcript_pkg/file_transcribe.py', 'w', encoding='utf-8') as f:
    f.write(file_transcribe_code)

print("✅ Enhanced file_transcribe.py created with segment progress tracking and keepalive!")


### ⚠️ Troubleshooting: Model Loading Issues

If you encounter a **"502 Bad Gateway"** or similar error when loading the model, this is usually due to Hugging Face connectivity issues. The notebook now includes retry logic to handle this automatically. However, if the issue persists:


In [None]:
# Clear Hugging Face cache if you encounter model loading errors
# Uncomment and run if needed:
# !rm -rf /root/.cache/huggingface
# !rm -rf /content/whisper_models

# After running this, restart the runtime: Runtime → Restart runtime


## Step 4: Configure Transcription Settings

Modify the parameters below according to your needs:


In [None]:
# Configuration parameters - MODIFY THESE AS NEEDED
# The "# @param" annotations render these values as Colab form fields.

# Input path in Google Drive (can be a single audio file or a folder)
INPUT_PATH = "/content/drive/MyDrive/AudioFiles"  # @param {type:"string"}

# Output folder in Google Drive where transcriptions are written
OUTPUT_PATH = "/content/drive/MyDrive/Transcriptions"  # @param {type:"string"}

# Whisper model size: tiny, base, small, medium, large
MODEL_SIZE = "base"  # @param ["tiny", "base", "small", "medium", "large"]

# Language: en (English), pt (Portuguese), auto (auto-detect)
LANGUAGE = "auto"  # @param ["auto", "en", "pt"]

# Output format: txt, srt, vtt
OUTPUT_FORMAT = "txt"  # @param ["txt", "srt", "vtt"]

# Enable multilingual mode (shows language for each segment)
MULTILINGUAL = False  # @param {type:"boolean"}

# Length of each chunk, in minutes, when a large file is split (0 disables chunking)
CHUNK_DURATION_MINUTES = 10  # @param {type:"slider", min:0, max:30, step:5}

# Audio duration threshold, in minutes: files longer than this are split into chunks
CHUNK_THRESHOLD_MINUTES = 15  # @param {type:"slider", min:10, max:60, step:5}

# Echo the effective settings so they can be checked before running Step 5.
print("📋 Configuration:")
print(f"  Input: {INPUT_PATH}")
print(f"  Output: {OUTPUT_PATH}")
print(f"  Model: {MODEL_SIZE}")
print(f"  Language: {LANGUAGE}")
print(f"  Format: {OUTPUT_FORMAT}")
print(f"  Multilingual: {MULTILINGUAL}")
if CHUNK_DURATION_MINUTES > 0:
    print(f"  Chunking: Files > {CHUNK_THRESHOLD_MINUTES}min split into {CHUNK_DURATION_MINUTES}min chunks")
else:
    print(f"  Chunking: Disabled")


### Optional: Clear Previous Checkpoints

If you want to start fresh and clear any previous checkpoints, run this cell:


In [None]:
# Optional: Clear checkpoints to start fresh
# Uncomment the lines below if you want to remove previous checkpoint data

# from pathlib import Path
# import shutil
# checkpoint_path = Path(OUTPUT_PATH) / ".checkpoints"
# if checkpoint_path.exists():
#     shutil.rmtree(checkpoint_path)
#     print("✅ Checkpoints cleared!")
# else:
#     print("ℹ️ No checkpoints found.")


## Step 5: Run Transcription

Execute the cell below to start the transcription process. The notebook will:
- Keep your Colab connection alive during long transcriptions
- Show segment-by-segment progress with ETA
- Automatically handle connection issues
- Save checkpoints for resume capability
- Split large files into chunks automatically


In [None]:
from pathlib import Path
import sys

# Make /content importable so the generated transcript_pkg module is found;
# the guard avoids piling up duplicate entries when the cell is re-run.
if '/content' not in sys.path:
    sys.path.append('/content')

from transcript_pkg.file_transcribe import transcribe_folder, display_results_summary
from rich.console import Console
from rich.panel import Panel

console = Console()

# Convert paths
input_path = Path(INPUT_PATH)
output_path = Path(OUTPUT_PATH)

# Check if input exists
if not input_path.exists():
    console.print(f"[red]Error: Input path does not exist: {INPUT_PATH}[/red]")
    console.print("[yellow]Please check your INPUT_PATH and ensure the folder/file exists in your Google Drive.[/yellow]")
else:
    # Run transcription
    console.print(Panel(
        "[bold green]Starting Transcription Process[/bold green]\n\n"
        "This may take a while depending on the size and number of files.\n"
        "The process will use GPU acceleration if available.\n\n"
        "[cyan]Features:[/cyan]\n"
        "• Segment-by-segment progress tracking\n"
        "• Real-time ETA calculations\n"
        "• Automatic checkpoint saving\n"
        "• Connection keepalive for long runs\n"
        "• Automatic chunking for large files",
        title="[bold cyan]Audio Transcription[/bold cyan]",
        border_style="cyan"
    ))

    # Convert chunk parameters from minutes to seconds (0 keeps chunking disabled)
    chunk_duration = max(CHUNK_DURATION_MINUTES, 0) * 60
    chunk_threshold = CHUNK_THRESHOLD_MINUTES * 60

    # Run transcription with all parameters
    results = transcribe_folder(
        input_path=input_path,
        output_path=output_path,
        model_size=MODEL_SIZE,
        language=LANGUAGE,
        output_format=OUTPUT_FORMAT,
        multilingual=MULTILINGUAL,
        chunk_duration=chunk_duration,
        chunk_threshold=chunk_threshold
    )

    # Display results summary
    display_results_summary(results)

    # Report what actually happened instead of always claiming success.
    succeeded = sum(1 for r in results if r.get("success"))
    if results and succeeded == len(results):
        console.print(f"\n[green]✅ All transcriptions saved to:[/green] {OUTPUT_PATH}")
    elif succeeded:
        console.print(
            f"\n[yellow]⚠️ {succeeded} of {len(results)} transcriptions saved to:[/yellow] {OUTPUT_PATH}"
        )
    else:
        console.print(
            "\n[yellow]⚠️ No new transcriptions were produced. See the messages above for details.[/yellow]"
        )
