<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/omnilingual_asr_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Background reading — Meta AI blog post introducing Omnilingual ASR: https://ai.meta.com/blog/omnilingual-asr-advancing-automatic-speech-recognition/?brid=r816qqOfNVjm8AxrSoRtdw

In [None]:
# using pip
!pip install omnilingual-asr -q

In [2]:
# Print the version of the nvcc compiler currently found on PATH.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [None]:
# Remove all NVIDIA and CUDA-related packages
!apt-get --purge remove cuda nvidia* libnvidia-*
!dpkg -l | grep cuda- | awk '{print $2}' | xargs -n1 dpkg --purge
!apt-get remove cuda-*
!apt autoremove -y
!apt-get update

In [None]:
# 1. Download and install the CUDA GPG key and repository metadata
!wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
!dpkg -i cuda-keyring_1.1-1_all.deb

# 2. Update package lists to include the new NVIDIA repository
!apt-get update

# 3. Install the specific CUDA 12.8 toolkit
# NOTE: Specifying 'cuda-toolkit-12-8' ensures the exact version is pulled
!apt-get -y install cuda-toolkit-12-8

In [5]:
# 1. Update the PATH variable for the current session
import os
os.environ['PATH'] += ':/usr/local/cuda-12.8/bin'

# 2. Verify the new version
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2025 NVIDIA Corporation
Built on Fri_Feb_21_20:23:50_PST_2025
Cuda compilation tools, release 12.8, V12.8.93
Build cuda_12.8.r12.8/compiler.35583870_0


In [None]:
# This is a common pattern for installing PyTorch with specific CUDA
# (The exact URL/command may change, check PyTorch website for the current link)
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128

In [7]:
!pip install tensorflow -q

In [8]:
import torch

# Report the CUDA version this PyTorch build was compiled against and whether
# a GPU is visible to the runtime.
cuda_build = torch.version.cuda
gpu_visible = torch.cuda.is_available()
print(
    f"PyTorch built with CUDA: {cuda_build}",
    f"Is GPU available: {gpu_visible}",
    sep="\n",
)

PyTorch built with CUDA: 12.6
Is GPU available: True


In [None]:
!pip uninstall torch torchvision torchaudio -y

In [None]:
!pip uninstall torch torchvision torchaudio -y

# Install PyTorch 2.8.0 only
!pip install torch==2.8.0 --index-url https://download.pytorch.org/whl/cu128 -q

In [None]:
!pip install torchvision torchaudio

In [1]:
import torch
import torchvision
import torchaudio

# One line per library, in a fixed order, showing the version that is imported.
for label, module in (
    ("PyTorch", torch),
    ("Torchvision", torchvision),
    ("Torchaudio", torchaudio),
):
    print(f"{label} version: {module.__version__}")

PyTorch version: 2.9.1+cu128
Torchvision version: 0.24.1+cu128
Torchaudio version: 2.9.1+cu128


In [None]:
!pip uninstall torch torchvision torchaudio -y

In [None]:
!pip install torch==2.8.0 --index-url https://download.pytorch.org/whl/cu128

In [None]:
!pip install torchvision==0.23.0 torchaudio==2.8.0 --index-url https://download.pytorch.org/whl/cu128

In [5]:
import torch
import torchvision
import torchaudio

# Collect the imported versions first, then emit them as one three-line report.
# NOTE: Python caches imported modules, so if these libraries were already
# imported in this kernel session the report shows the previously loaded
# versions; restart the runtime after reinstalling to see the new ones.
versions = {
    "PyTorch": torch.__version__,
    "Torchvision": torchvision.__version__,
    "Torchaudio": torchaudio.__version__,
}
print("\n".join(f"{name} version: {ver}" for name, ver in versions.items()))

# Try running your import code immediately after this check
# from omnilingual_asr.models.inference.pipeline import ASRInferencePipeline

PyTorch version: 2.9.1+cu128
Torchvision version: 0.24.1+cu128
Torchaudio version: 2.9.1+cu128


In [6]:
import torch
# CUDA version recorded in the imported PyTorch build (torch.version.cuda).
print(torch.version.cuda)

12.8


In [None]:
!pip install soundfile

In [2]:
import numpy as np
import soundfile as sf
import os

# Define file paths
flac_path = '/content/eng_audio1.flac'
wav_path = '/content/deu_audio2.wav'

# --- 1. Create a 1-second English FLAC file ---
samplerate = 44100
duration = 1.0  # seconds
frequency = 440  # A4 note
# Generate a simple sine wave
t = np.linspace(0., duration, int(samplerate * duration))
data = 0.5 * np.sin(2. * np.pi * frequency * t)
# Save the data as FLAC
sf.write(flac_path, data, samplerate, format='FLAC')

# --- 2. Create a 1-second German WAV file ---
# Save the same data as WAV
sf.write(wav_path, data, samplerate, format='WAV')

print("‚úÖ Audio files created successfully!")
print("-" * 30)

# Verify the files are now in the correct location and have non-zero size
!ls -lh /content/ | grep -E 'eng_audio1.flac|deu_audio2.wav'

‚úÖ Audio files created successfully!
------------------------------
-rw-r--r-- 1 root root  87K Nov 18 02:44 deu_audio2.wav
-rw-r--r-- 1 root root  14K Nov 18 02:44 eng_audio1.flac


In [3]:
# Run the omnilingual-asr 7B pipeline on the two audio files (to be executed
# once the PyTorch install has been updated).
from omnilingual_asr.models.inference.pipeline import ASRInferencePipeline

pipeline = ASRInferencePipeline(model_card="omniASR_LLM_7B")

# Each entry pairs an audio path with the language code passed for it.
clips = [
    ("/content/eng_audio1.flac", "eng_Latn"),
    ("/content/deu_audio2.wav", "deu_Latn"),
]
audio_files = [path for path, _ in clips]
lang = [code for _, code in clips]

transcriptions = pipeline.transcribe(audio_files, lang=lang, batch_size=2)
print(transcriptions)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 29.1G/29.1G [02:02<00:00, 255MB/s]


Output()

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 85.6k/85.6k [00:00<00:00, 76.3MB/s]


['i was a stranger to the first and the first', 'dann']


In [None]:
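# Optional alternative to Google Drive: upload the audio file from your machine.
# Uncomment the lines below to use it.
#from google.colab import files

#print("Please upload your actual English audio file now.")
#files.upload()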

In [None]:
# Mount Google Drive at /content/gdrive so files stored there can be read from this runtime.
from google.colab import drive
drive.mount('/content/gdrive')

In [6]:
# --- File Check and Generation ---

# 1. Install soundfile (critical for creating audio files from scratch)
!pip install soundfile > /dev/null 2>&1

# 2. Programmatically create valid, non-zero audio files under the CORRECT names.
import numpy as np
import soundfile as sf
import torch
from omnilingual_asr.models.inference.pipeline import ASRInferencePipeline

print("üîÑ Generating guaranteed valid audio files under correct names...")

# --- A. English FLAC: Creating a 15-second valid audio tone ---
flac_path = '/content/eng_audio1.flac'
samplerate = 16000 # Standard ASR sample rate
duration = 15.0 # Well under the 40s limit
frequency = 400
t = np.linspace(0., duration, int(samplerate * duration))
data = 0.5 * np.sin(2. * np.pi * frequency * t)
sf.write(flac_path, data, samplerate, format='FLAC')

# --- B. German WAV: Creating a 1-second valid audio tone ---
wav_path = '/content/deu_audio2.wav'
duration_wav = 1.0
data_wav = 0.5 * np.sin(2. * np.pi * 880 * t[:int(samplerate * duration_wav)])
sf.write(wav_path, data_wav, samplerate, format='WAV')

print("‚úÖ Files /content/eng_audio1.flac and /content/deu_audio2.wav created successfully.")
print("-" * 30)

# Final file verification
!ls -lh /content/ | grep -E 'eng_audio1.flac|deu_audio2.wav'

# --- Pipeline Loading (using the smaller model, omniASR_LLM_300M) ---

print("\nüîÑ Re-loading ASR Pipeline...")
try:
    pipeline = ASRInferencePipeline(
        model_card="omniASR_LLM_300M",
        dtype=torch.bfloat16,
        device=torch.device('cuda')
    )
except Exception as e:
    print(f"‚ùå Failed to load model. Error: {e}")
    raise

print("‚úÖ Model loaded successfully!")

# --- Transcription ---
audio_files = ["/content/eng_audio1.flac", "/content/deu_audio2.wav"]
lang = ["eng_Latn", "deu_Latn"]

print("‚ñ∂Ô∏è Starting Transcription...")
transcriptions = pipeline.transcribe(audio_files, lang=lang, batch_size=2)

print("\nTranscription Results:")
print(transcriptions)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 85.5k/85.5k [00:00<00:00, 91.6MB/s]


‚úÖ Model loaded successfully!
‚ñ∂Ô∏è Starting Transcription...

Transcription Results:
['the protection is the protection of the protection that is the protection of the protection that is the protection of the protection that is the protection of the protection of the protection of the protection of the protection of the protection of t', 'das ist einer schrecken stellt']


In [8]:
# Mount Google Drive at /content/gdrive so the source audio stored there can be read.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Delete every .flac and .wav entry directly under /content, which includes the
# eng_audio1.flac / deu_audio2.wav files written by the earlier cells.
!rm -rf /content/*.flac
!rm -rf /content/*.wav

In [3]:
# Confirm the source MP3 exists on the mounted Drive (long listing with human-readable size).
!ls -ltha /content/gdrive/MyDrive/data/barackobamatransitionaddress1.mp3

-rw-------+ 1 root root 2.1M Nov 18 02:50 /content/gdrive/MyDrive/data/barackobamatransitionaddress1.mp3


In [13]:
import os
import sys

# --- Setup FFmpeg and Libraries ---
# The installs run BEFORE the third-party imports: if pydub is not already
# present in the runtime, importing it first would fail before the install
# step is ever reached.
# pip is invoked through the kernel's own interpreter so the packages land in
# the environment this notebook actually imports from.
_setup_commands = [
    "apt-get update > /dev/null 2>&1",
    "apt-get install -y ffmpeg > /dev/null 2>&1",
    f'"{sys.executable}" -m pip install pydub soundfile > /dev/null 2>&1',
]
# os.system returns a non-zero status on failure; collect the failures instead
# of reporting success unconditionally.
_failed_setup = [cmd for cmd in _setup_commands if os.system(cmd) != 0]
if _failed_setup:
    print("Setup command(s) failed:", *_failed_setup, sep="\n  ", file=sys.stderr)
else:
    print("‚úÖ Tools and libraries are installed.")

import torch
import numpy as np
import soundfile as sf
from pydub import AudioSegment # Note: Requires ffmpeg, which is installed above.
from google.colab import drive
from omnilingual_asr.models.inference.pipeline import ASRInferencePipeline

# --- Define the Conversion Function (Handles MP3 source from Drive) ---

def convert_and_slice_mp3_to_flac(input_filepath, output_filepath, clip_duration_seconds):
    """
    Export the opening ``clip_duration_seconds`` of an MP3 file as a FLAC file.

    Args:
        input_filepath: Path of the MP3 file to read.
        output_filepath: Path the FLAC clip is written to.
        clip_duration_seconds: Length of the clip, measured from 0:00.
            Must be a positive number.

    Returns:
        True when the clip was exported; False when the duration is invalid,
        the input file does not exist, or loading/exporting raised an error.
        Problems are reported on stderr instead of being raised.
    """
    # Reject durations that cannot describe "the first N seconds": zero gives an
    # empty clip, and a negative slice bound would cut from the END of the audio
    # rather than take its beginning. Non-numeric values land here as well.
    try:
        duration_is_valid = clip_duration_seconds > 0
    except TypeError:
        duration_is_valid = False
    if not duration_is_valid:
        print(
            f"Error: clip_duration_seconds must be a positive number, "
            f"got {clip_duration_seconds!r}. Conversion aborted.",
            file=sys.stderr,
        )
        return False

    clip_duration_ms = clip_duration_seconds * 1000

    if not os.path.exists(input_filepath):
        print(f"‚ùå Error: Input file '{input_filepath}' not found. Conversion aborted.", file=sys.stderr)
        return False

    try:
        print(f"Loading {input_filepath}...")
        audio = AudioSegment.from_file(input_filepath, format="mp3")

        # Slice the audio: [start_ms : end_ms]
        clip = audio[:clip_duration_ms]
        print(f"Successfully sliced the first {clip_duration_seconds} seconds.")

        # Export the audio clip to FLAC format
        clip.export(output_filepath, format="flac")
        print(f"‚úÖ Conversion and slicing successful! Output file: {output_filepath}")
        return True

    except Exception as e:
        # Deliberate best-effort: any decode/export failure is reported and
        # signalled through the return value rather than propagated.
        print(f"‚ùå An error occurred during conversion (FFmpeg/Pydub issue): {e}", file=sys.stderr)
        return False

‚úÖ Tools and libraries are installed.


In [None]:
# OmniASR transcription followed by SeamlessM4T v2 text-to-text translation
import os
import torch
from pydub import AudioSegment
from google.colab import drive
from omnilingual_asr.models.inference.pipeline import ASRInferencePipeline

# Hugging Face SeamlessM4T v2
from transformers import AutoProcessor, SeamlessM4Tv2Model

# -------------------------------------------------
# 0. Mount Drive & Prepare 30-second English FLAC
# -------------------------------------------------
drive.mount('/content/gdrive')

INPUT_MP3   = "/content/gdrive/MyDrive/data/barackobamatransitionaddress1.mp3"
OUTPUT_FLAC = "/content/eng_audio1.flac"
TARGET_SEC  = 30
# Upper bound on generated translation tokens.
# NOTE(review): the German text saved with this notebook ends mid-word, which
# suggests the previous cap of 256 truncated it — confirm 512 is enough.
MAX_NEW_TOKENS = 512

if not os.path.exists(INPUT_MP3):
    raise FileNotFoundError(f"Source MP3 not found: {INPUT_MP3}")

# Always rebuild the clip. Earlier cells write a synthetic sine tone to this
# same path, so reusing an existing file could silently transcribe the tone
# instead of the speech recording.
print(f"Converting MP3 ‚Üí FLAC ({TARGET_SEC}s)...")
audio = AudioSegment.from_mp3(INPUT_MP3)
clip = audio[:TARGET_SEC * 1000]  # slice bounds are in milliseconds
clip.export(OUTPUT_FLAC, format="flac")
print("FLAC created")

# -------------------------------------------------
# 1. OmniASR ‚Üí English text
# -------------------------------------------------
print("\nLoading OmniASR (300M)...")
asr = ASRInferencePipeline(model_card="omniASR_LLM_300M", dtype=torch.bfloat16, device="cuda")
print("OmniASR ready")

# transcribe() is given a one-element list; take the single result.
english_text = asr.transcribe([OUTPUT_FLAC], lang=["eng_Latn"])[0]
print("\nEnglish transcription:")
print(english_text)

# -------------------------------------------------
# 2. SeamlessM4T v2 ‚Üí German translation
# -------------------------------------------------
print("\nLoading SeamlessM4T v2 (facebook/seamless-m4t-v2-large)...")
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large").to(device)

# Prepare inputs for text-to-text (src_lang only; tgt_lang in generate)
inputs = processor(
    text=english_text,
    src_lang="eng",
    return_tensors="pt"
).to(device)

print("Translating to German...")
with torch.no_grad():
    # Text-to-text: generate_speech=False skips speech synthesis and returns token ids.
    output_tokens = model.generate(
        **inputs,
        tgt_lang="deu",
        generate_speech=False,
        max_new_tokens=MAX_NEW_TOKENS
    )

# Decode following the Hugging Face SeamlessM4T-v2 usage example:
# output_tokens[0] holds the generated sequences; take the first (only) one.
german_text = processor.decode(
    output_tokens[0].tolist()[0],
    skip_special_tokens=True
)

In [2]:
# -------------------------------------------------
# 3. Final results
# -------------------------------------------------
# Requires `english_text` and `german_text` from the transcription/translation cell.
import torch

print("\n" + "="*80)
# Report the PyTorch version of the running kernel instead of a hard-coded
# label, so the banner cannot drift from what is actually installed.
print(f"FINAL RESULTS (PyTorch {torch.__version__})")
print("="*80)
print("English (OmniASR):")
print(english_text)
print("\nGerman (SeamlessM4T v2):")
print(german_text)
print("="*80)

# Save to files. The encoding is explicit because the German text contains
# non-ASCII characters and the platform default encoding is not guaranteed.
with open("/content/english.txt", "w", encoding="utf-8") as f:
    f.write(english_text)
with open("/content/german.txt", "w", encoding="utf-8") as f:
    f.write(german_text)
print("\nSaved ‚Üí /content/english.txt  &  /content/german.txt")


FINAL RESULTS (PyTorch 2.8.0 + cu128)
English (OmniASR):
on tuesday american stood in line that stretched around schools and churches in numbers this nation has never seen it in matter who they were or where they came from and what they looked like or what party they belonged to they came out and cast their balance because they believed that in this country our destiny is not written for us but by us we should all take pride in the fact that we once again displayed for the world the power of our democracy and reaffirmed the great american ideal

German (SeamlessM4T v2):
am dienstag standen amerikaner in einer schlange, die sich um schulen und kirchen erstreckte, in zahlen, die diese nation noch nie gesehen hat, egal wer sie waren oder woher sie kamen und wie sie aussahen oder welcher partei sie angeh√∂rten sie kamen heraus und warfen ihre waage, weil sie glaubten, dass in diesem land unser schicksal nicht f√ºr uns geschrieben ist, sondern von uns wir sollten alle stolz auf die tatsach

In [16]:
# List the working directory (hidden entries included, newest first) to confirm the output files exist.
!ls -ltha

total 3.3M
drwxr-xr-x 1 root root 4.0K Nov 18 02:54 .
-rw-r--r-- 1 root root  550 Nov 18 02:54 german.txt
-rw-r--r-- 1 root root  493 Nov 18 02:54 english.txt
-rw-r--r-- 1 root root  32K Nov 18 02:52 deu_audio2.wav
-rw-r--r-- 1 root root 1.2M Nov 18 02:52 eng_audio1.flac
drwx------ 6 root root 4.0K Nov 18 02:51 gdrive
-rw-r--r-- 1 root root 2.1M Nov 18 02:50 barackobamatransitionaddress1.mp3
drwxr-xr-x 1 root root 4.0K Nov 18 02:08 ..
drwxr-xr-x 1 root root 4.0K Nov 12 14:30 sample_data
drwxr-xr-x 4 root root 4.0K Nov 12 14:30 .config
-rw-r--r-- 1 root root 4.3K Apr 20  2023 cuda-keyring_1.1-1_all.deb
