In [None]:

# Install/upgrade OpenAI Whisper package (latest version from PyPI)
!pip install -U openai-whisper

# Update system package list and install FFmpeg (required for audio/video processing)
!sudo apt update && sudo apt install -y ffmpeg

In [None]:
import os
from google.colab import drive

# Runtime note from the original author: "Use v2-8TPU".
# NOTE(review): the device check later in this cell only looks for a CUDA GPU,
# so on a TPU runtime Whisper will run on CPU — confirm the intended runtime.

# Mount Google Drive so the files stored in Drive are reachable under /content/drive
drive.mount('/content/drive')

# Path to the directory containing converted audio files
audios_p = '/content/drive/MyDrive/Colab_Notebooks/RAG_AI/converted_audios'

# List every entry in the audio directory.
# os.listdir() returns names in arbitrary order, so sort them to make the
# processing order (and the progress counter) identical on every run.
audios = sorted(os.listdir(audios_p))

import whisper
import json
import torch

# ----------------------------------------------------------
# Select the compute device: CUDA GPU when present, else CPU
# ----------------------------------------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"

if device == "cuda":
    # Report which GPU was found and how much memory it has (in GiB)
    gpu_name = torch.cuda.get_device_name(0)
    print(f"🎯 Using GPU: {gpu_name}")
    gpu_total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"🎯 GPU Memory: {gpu_total_bytes / 1024**3:.2f} GB")
else:
    print("⚠️ Using CPU - performance will be slow")

# Release any cached GPU memory left over from earlier runs before loading the model
torch.cuda.empty_cache()

# ----------------------------------------------------------
# Load the Whisper large-v2 model onto the selected device
# ----------------------------------------------------------
print("Loading Whisper large-v2 model...")
model = whisper.load_model("large-v2", device=device)
print(f"✅ Model loaded on {device.upper()}")

# ----------------------------------------------------------
# Process each audio file in the directory:
# transcribe/translate it with Whisper and save one JSON per file
# ----------------------------------------------------------

# The output directory does not change between files, so create it once up front
output_dir = '/content/drive/MyDrive/Colab_Notebooks/RAG_AI/jsons'
os.makedirs(output_dir, exist_ok=True)  # create directory if missing

file_count = 0

for audio in audios:
    # Skip specific sample file if present
    if audio == "sample1.mp3":
        continue

    # Build the input path from the same directory that was listed
    audio_path = os.path.join(audios_p, audio)

    # os.listdir() also returns sub-directories (e.g. notebook checkpoint
    # folders); only regular files can be transcribed
    if not os.path.isfile(audio_path):
        continue

    # Filename without its final extension. splitext handles extensions of any
    # length and keeps dots that are part of the name itself, so the title and
    # the JSON filename always agree.
    stem = os.path.splitext(audio)[0]

    # Extract the ID (text before the first underscore) and the title
    number = audio.split("_")[0]
    title = stem

    # ------------------------------------------------------
    # Transcribe & translate audio using Whisper
    # - language="hi" → input audio is Hindi
    # - task="translate" → output is English translation
    # - fp16 → half precision, enabled only when running on GPU
    # ------------------------------------------------------
    result = model.transcribe(
        audio=audio_path,
        language="hi",
        task="translate",
        word_timestamps=False,
        fp16=(device == "cuda"),
    )

    # Collect structured transcript segments with metadata
    chunks = [
        {
            "number": number,
            "title": title,
            "start": segment["start"],   # segment start time (sec)
            "end": segment["end"],       # segment end time (sec)
            "text": segment["text"],     # text of this segment
        }
        for segment in result["segments"]
    ]

    # Wrap chunks + full text into one JSON object
    chunks_with_metadata = {
        "text": result["text"],   # full text of the whole file
        "chunks": chunks          # segmented text with timestamps
    }

    # ------------------------------------------------------
    # Save the results as JSON in Google Drive
    # (same name as the audio file, but with a .json extension)
    # ------------------------------------------------------
    output_path = os.path.join(output_dir, f"{stem}.json")

    # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes
    with open(output_path, "w", encoding='utf-8') as f:
        json.dump(chunks_with_metadata, f, ensure_ascii=False, indent=2)

    file_count += 1
    print(f"{file_count} Done...")
