<a href="https://colab.research.google.com/github/gangaaws/openAI-Lab/blob/main/eai_day3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install openai-whisper

Collecting openai-whisper
  Downloading openai_whisper-20250625.tar.gz (803 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/803.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m286.7/803.2 kB[0m [31m8.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: openai-whisper
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
  Created wheel for openai-whisper: filename=openai_whisper-20250625-py3-none-any.whl size=803979 sha256=cd5f2f0aad724309eaf30dec0a173d33512c84f2b2557090d9341c658435fc81
  Stored in directory: /root/.cache/pip/wheels/61/d2/20/09ec9bef734d1

In [2]:
import whisper
import urllib.request

In [21]:
AUDIO_URL = "https://github.com/fenago/whisper/raw/refs/heads/main/test_audio_files/dutch_the_netherlands.mp3"
AUDIO_FILE = "dutch_the_netherlands.mp3"

In [22]:
urllib.request.urlretrieve(AUDIO_URL, AUDIO_FILE)

('dutch_the_netherlands.mp3', <http.client.HTTPMessage at 0x7edc29e45160>)

In [23]:
model = whisper.load_model("medium")

In [28]:
def detect_language_and_transcribe(audio_file: str, confidence_threshold: float = 0.5):
    """
    Detect language and transcribe audio with confidence checking.

    Args:
        audio_file: Path to the audio file
        confidence_threshold: Minimum confidence required (default: 0.5)

    Returns:
        Tuple of (transcribed_text, detected_language, confidence)
    """
    try:
        # Load and prepare audio
        audio = whisper.load_audio(audio_file)
        audio = whisper.pad_or_trim(audio)
        mel = whisper.log_mel_spectrogram(audio).to(model.device)

        # Detect language
        _, language_probs = model.detect_language(mel)
        detected_language: str = max(language_probs, key=language_probs.get)
        confidence: float = language_probs[detected_language]

        # Clear printout of language detection
        print("=" * 50)
        print(f"🎯 LANGUAGE DETECTION RESULT")
        print("=" * 50)
        print(f"Detected Language: {detected_language.upper()}")
        print(f"Confidence Score: {confidence:.2%}")
        print("=" * 50)

        # Check confidence threshold
        if confidence < confidence_threshold:
            error_msg = (
                f"⚠️  Low confidence warning: Language detection confidence "
                f"({confidence:.2%}) is below threshold ({confidence_threshold:.2%}). "
                f"Detected language '{detected_language}' may be incorrect."
            )
            print(error_msg)
            raise ValueError(error_msg)

        # Transcribe with detected language
        print(f"\n📝 Transcribing in {detected_language}...\n")
        options = whisper.DecodingOptions(language=detected_language, task="transcribe")
        result = whisper.decode(model, mel, options)

        return result.text, detected_language, confidence

    except ValueError as e:
        # Re-raise confidence errors
        raise e
    except Exception as e:
        error_msg = f"❌ Error during language detection or transcription: {str(e)}"
        print(error_msg)
        raise RuntimeError(error_msg) from e

print("\n" + "="*50)
print("EXAMPLE 1: Language Detection with Confidence Check")
print("="*50 + "\n")

try:
    text, language, confidence = detect_language_and_transcribe(
        AUDIO_FILE,
        confidence_threshold=0.5
    )
    print(f"✅ Transcription successful!")
    print(f"Text: {text}")
except ValueError as e:
    print(f"⚠️  Continuing despite low confidence: {e}")
except Exception as e:
    print(f"❌ Unexpected error: {e}")

print("\n" + "="*50)
print("EXAMPLE 2: Transcription with Translation to English")
print("="*50 + "\n")

try:
    result = model.transcribe(
        AUDIO_FILE,
        verbose=True,
        language="nl",  # Explicitly set Dutch
        task="translate",  # Translate to English
    )
    print("\n" + "="*50)
    print("📄 TRANSLATION RESULT")
    print("="*50)
    print(f"Original Language: Dutch (nl)")
    print(f"Translated Text: {result['text']}")
    print("="*50)
except Exception as e:
    print(f"❌ Translation error: {e}")


print("\n" + "="*50)
print("EXAMPLE 3: Top Language Predictions")
print("="*50 + "\n")

try:
    audio = whisper.load_audio(AUDIO_FILE)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    _, language_probs = model.detect_language(mel)

    # Sort and display top 3
    sorted_languages = sorted(language_probs.items(), key=lambda x: x[1], reverse=True)
    print("Top 3 Language Predictions:")
    for i, (lang, prob) in enumerate(sorted_languages[:3], 1):
        print(f"  {i}. {lang.upper()}: {prob:.2%}")
    print("="*50)
except Exception as e:
    print(f"❌ Error showing predictions: {e}")


EXAMPLE 1: Language Detection with Confidence Check

🎯 LANGUAGE DETECTION RESULT
Detected Language: NL
Confidence Score: 98.67%

📝 Transcribing in nl...

✅ Transcription successful!
Text: Hoi allemaal, dit is weer een testbestandje. Deze keer om te testen of de Nederlandse taal goed herkend gaat worden. Hierna kunnen we ook proberen deze tekst te laten vertalen naar het Engels om te zien hoe goed dat gaat. Ik ben benieuwd.

EXAMPLE 2: Transcription with Translation to English





[00:00.000 --> 00:03.000]  Hey everyone, this is another test file.
[00:03.000 --> 00:07.000]  This time to test whether the Dutch language will be recognized well.
[00:07.000 --> 00:13.000]  After this we can also try to translate this text into English to see how well that goes.
[00:13.000 --> 00:14.000]  I'm curious.

📄 TRANSLATION RESULT
Original Language: Dutch (nl)
Translated Text:  Hey everyone, this is another test file. This time to test whether the Dutch language will be recognized well. After this we can also try to translate this text into English to see how well that goes. I'm curious.

EXAMPLE 3: Top Language Predictions

Top 3 Language Predictions:
  1. NL: 98.67%
  2. EN: 0.59%
  3. DE: 0.50%
