### ASR evaluation, part 2

В этом ноутбуке протестируем несколько моделей на датасете русских голосовых команд для редактирования изображений.

Датасет: `arood0/mmm_project_with_audio_ru_final`

Модели:
- Whisper
- GigaAM-v3
- T-one
- NVIDIA Parakeet-TDT-0.6B-v3

In [None]:
import librosa
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration, AutoModel
from pathlib import Path
import soundfile as sf
import sys
from datasets import load_dataset, Audio
from tqdm import tqdm
import pandas as pd
from jiwer import wer, cer
import numpy as np
import re

[NeMo W 2025-12-07 07:03:34 nemo_logging:405] Megatron num_microbatches_calculator not found, using Apex version.
OneLogger: Setting error_handling_strategy to DISABLE_QUIETLY_AND_REPORT_METRIC_ERROR for rank (rank=0) with OneLogger disabled. To override: explicitly set error_handling_strategy parameter.
No exporters were provided. This means that no telemetry data will be collected.


In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Device: {device}")

Device: cuda


In [3]:
# Load the evaluation dataset by its Hugging Face repository id.
dataset = load_dataset("arood0/mmm_project_with_audio_ru_final")

# Bare expression at the end of the cell: the notebook displays the summary
# (features and row count) of the 'train' split.
dataset['train']

Dataset({
    features: ['IMAGE_ID', 'EDITING_TYPE', 'CORE', 'MASK', 'EDITING_INSTRUCTION', 'OUTPUT_DESCRIPTION', 'INPUT_CAPTION_BY_LLAMA', 'OUTPUT_CAPTION_BY_LLAMA', 'INPUT_IMG', 'MASK_IMG', 'OUTPUT_IMG', 'EDITING_INSTRUCTION_RU', 'audio'],
    num_rows: 2000
})

In [4]:
# Take the 'train' split and re-declare its audio column with a 16 kHz
# sampling rate.
eval_dataset = dataset['train'].cast_column("audio", Audio(sampling_rate=16000))
print(f"Dataset size: {len(eval_dataset)}")

Dataset size: 2000


In [None]:
# Directory in which every dataset sample is stored as "sample_<idx>.wav".
AUDIO_CACHE_DIR = Path("/home/sallundina/voice-image-editor/notebooks/audio_cache")
# parents=True also creates missing intermediate directories, so the cell does
# not fail with FileNotFoundError when only part of the path exists yet.
AUDIO_CACHE_DIR.mkdir(parents=True, exist_ok=True)

def download_audio_from_sample(sample, sample_idx):
    """Write the audio of one dataset sample to the WAV cache and return its path.

    If ``AUDIO_CACHE_DIR / "sample_<sample_idx>.wav"`` already exists it is
    returned as is, without touching ``sample``.

    Args:
        sample: Dataset row; its ``'audio'`` entry must either expose
            ``get_all_samples()`` (whose result carries the waveform in
            ``.data``) or be a dict with an ``'array'`` key.
        sample_idx: Index used to build the cache file name.

    Returns:
        The path of the cached WAV file as a string.

    Raises:
        ValueError: If the samples cannot be read, if no waveform can be
            extracted from the audio object, or if the waveform is empty.
    """
    audio_file = AUDIO_CACHE_DIR / f"sample_{sample_idx}.wav"

    # Already exported by a previous run: nothing to decode or write.
    if audio_file.exists():
        return str(audio_file)

    audio_obj = sample['audio']

    audio_array = None
    # Fallback rate; matches the 16 kHz the audio column is cast to.
    sr = 16000

    if hasattr(audio_obj, 'get_all_samples'):
        try:
            samples = audio_obj.get_all_samples()
            if hasattr(samples, 'data'):
                audio_array = np.array(samples.data, dtype=np.float32)
        except Exception as e:
            # Chain the original error so the root cause stays in the traceback.
            raise ValueError(f"Error getting samples: {e}") from e
        # Prefer the rate reported by the decoded samples when it is available.
        sr = getattr(samples, 'sample_rate', None) or sr
    elif isinstance(audio_obj, dict) and audio_obj.get('array') is not None:
        # Dict-style decoded audio: {'array': ..., 'sampling_rate': ...}.
        audio_array = audio_obj['array']
        sr = audio_obj.get('sampling_rate') or sr

    # Previously this case fell through to an UnboundLocalError.
    if audio_array is None:
        raise ValueError(
            f"Could not extract audio samples for sample {sample_idx} "
            f"from object of type {type(audio_obj).__name__}"
        )

    audio_array = np.asarray(audio_array, dtype=np.float32)

    # NOTE(review): flattening is only correct for single-channel audio;
    # multi-channel input would have its channels concatenated.
    if audio_array.ndim > 1:
        audio_array = audio_array.flatten()

    if len(audio_array) == 0:
        raise ValueError(f"Empty audio array for sample {sample_idx}")

    # Scale down only when the peak exceeds 1.0, so the 16-bit conversion
    # below receives values within [-1, 1].
    max_val = np.abs(audio_array).max()
    if max_val > 1.0:
        audio_array = audio_array / max_val

    sf.write(str(audio_file), audio_array, int(sr), subtype='PCM_16', format='WAV')
    return str(audio_file)

# Export the audio of every sample into the WAV cache; samples that are
# already cached are returned immediately by download_audio_from_sample.
for i, sample in enumerate(tqdm(eval_dataset, desc="Downloading audio")):
    download_audio_from_sample(sample, i)

### Whisper

In [5]:
def transcribe_whisper(audio_input, processor=None, model=None, sr=None, language='ru'):
    """Transcribe one audio file with a Whisper model and return lower-cased text.

    Args:
        audio_input: Audio file path (anything ``librosa.load`` accepts).
        processor: ``WhisperProcessor`` used for feature extraction and
            decoding. Required, despite the ``None`` default.
        model: ``WhisperForConditionalGeneration`` used for generation.
            Required, despite the ``None`` default.
        sr: Ignored; kept only for backward compatibility. The audio is always
            loaded as mono and resampled to 16 kHz.
        language: Language passed to ``model.generate``.

    Returns:
        The decoded transcription of the first sequence, lower-cased.

    Raises:
        ValueError: If ``processor`` or ``model`` is ``None``.
    """
    # The previous `processor = processor` / `model = model` fallbacks were
    # no-ops; fail early with a clear message instead of a late opaque error.
    if processor is None or model is None:
        raise ValueError("transcribe_whisper requires both `processor` and `model`")

    WHISPER_SR = 16000

    audio_array, _ = librosa.load(audio_input, sr=WHISPER_SR, mono=True)

    features = processor(
        audio_array,
        sampling_rate=WHISPER_SR,
        return_tensors="pt",
    ).input_features
    # Put the inputs on the device the model itself lives on instead of
    # relying on a module-level `device` that may not match it.
    input_features = features.to(model.device)

    generated_ids = model.generate(
        input_features,
        language=language,
        task="transcribe"
    )

    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return transcription.lower()

def create_whisper_transcriber(model_name):
    """Load a Whisper checkpoint and return a one-argument transcription callable.

    The processor and the model (moved to ``device``) are loaded once here;
    the returned callable takes an audio input and forwards it to
    ``transcribe_whisper`` together with them.
    """
    whisper_processor = WhisperProcessor.from_pretrained(model_name)
    whisper_model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device)

    return lambda audio_input: transcribe_whisper(
        audio_input,
        processor=whisper_processor,
        model=whisper_model,
    )

### GigaAM-v3

In [6]:
# Revision of the "ai-sage/GigaAM-v3" repository to load.
revision = "e2e_rnnt"
# Module-level model object used by transcribe_gigaam.
# NOTE(review): trust_remote_code=True presumably runs model code shipped in
# the repository — confirm the source is trusted.
gigaam_model = AutoModel.from_pretrained(
    "ai-sage/GigaAM-v3",
    revision=revision,
    trust_remote_code=True,
)

In [7]:
def transcribe_gigaam(audio_input):
    """Transcribe one audio file with GigaAM and return lower-cased text.

    ``gigaam_model.transcribe`` is tried first. If it raises a ``ValueError``
    whose message contains "Too long", ``transcribe_longform`` is used and its
    segments are joined with spaces.

    Args:
        audio_input: Audio file path passed to the model.

    Returns:
        The transcription, lower-cased.

    Raises:
        ValueError: Any ``ValueError`` from ``transcribe`` that is not the
            "Too long" case is re-raised unchanged.
    """
    try:
        transcription = gigaam_model.transcribe(audio_input)
    except ValueError as e:
        if "Too long" not in str(e):
            raise
        segments = gigaam_model.transcribe_longform(audio_input)
        if isinstance(segments, str):
            transcription = segments
        else:
            # Per the GigaAM README the long-form API yields utterance dicts
            # with a "transcription" key; calling .lower() on that list
            # directly would fail, so join the texts first.
            transcription = " ".join(
                seg["transcription"] if isinstance(seg, dict) else str(seg)
                for seg in segments
            ).strip()
    return transcription.lower()

### T-one

In [8]:
# Local checkout of the T-one repository; cloned only if it is not there yet.
tone_repo_path = Path("/tmp/T-one")
if not tone_repo_path.exists():
    import subprocess
    # check=True: a failed clone raises instead of failing later at import.
    subprocess.run(["git", "clone", "https://github.com/voicekit-team/T-one.git", str(tone_repo_path)], check=True)

# Make the checkout importable; it must be on sys.path before the import below.
if str(tone_repo_path) not in sys.path:
    sys.path.insert(0, str(tone_repo_path))

from tone import StreamingCTCPipeline, read_audio

# Module-level pipeline object used by transcribe_tone.
tone_pipeline = StreamingCTCPipeline.from_hugging_face()

In [9]:
def transcribe_tone(audio_input):
    """Transcribe one audio file with the T-one pipeline and return lower-cased text.

    The audio is read with ``read_audio`` and run through
    ``tone_pipeline.forward_offline``; the ``text`` of every returned phrase
    is joined with single spaces and stripped.
    """
    waveform = read_audio(audio_input)
    recognized = tone_pipeline.forward_offline(waveform)
    texts = (phrase.text for phrase in recognized)
    return " ".join(texts).strip().lower()

### NVIDIA Parakeet-TDT-0.6B-v3

In [10]:
import nemo.collections.asr as nemo_asr

# Load the pretrained Parakeet checkpoint by name.
parakeet_model = nemo_asr.models.ASRModel.from_pretrained(model_name="nvidia/parakeet-tdt-0.6b-v3")

parakeet_model.eval()

# Switch off CUDA graphs wherever the decoding objects expose that setting.
# Every attribute is probed with hasattr, so nothing happens if it is absent.
if hasattr(parakeet_model, 'decoding'):
    if hasattr(parakeet_model.decoding, 'decoding_computer'):
        decoding_computer = parakeet_model.decoding.decoding_computer
        if hasattr(decoding_computer, 'cuda_graphs_mode'):
            decoding_computer.cuda_graphs_mode = None
        # Only reached when `decoding_computer` exists (nested under that check).
        if hasattr(parakeet_model.decoding, 'cfg') and hasattr(parakeet_model.decoding.cfg, 'cuda_graphs_mode'):
            parakeet_model.decoding.cfg.cuda_graphs_mode = None

# NOTE(review): for a torch.nn.Module, .cpu() presumably moves the module in
# place and returns it, so both names would refer to the same CPU model — confirm.
parakeet_model_cpu = parakeet_model.cpu()

[NeMo I 2025-12-07 07:03:50 nemo_logging:393] Tokenizer SentencePieceTokenizer initialized with 8192 tokens


[NeMo W 2025-12-07 07:03:55 nemo_logging:405] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    use_lhotse: true
    skip_missing_manifest_entries: true
    input_cfg: null
    tarred_audio_filepaths: null
    manifest_filepath: null
    sample_rate: 16000
    shuffle: true
    num_workers: 2
    pin_memory: true
    max_duration: 10.0
    min_duration: 1.0
    text_field: answer
    batch_duration: null
    max_tps: null
    use_bucketing: true
    bucket_duration_bins: null
    bucket_batch_size: null
    num_buckets: 30
    bucket_buffer_size: 20000
    shuffle_buffer_size: 10000
    
[NeMo W 2025-12-07 07:03:55 nemo_logging:405] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    V

[NeMo I 2025-12-07 07:03:55 nemo_logging:393] PADDING: 0
[NeMo I 2025-12-07 07:04:00 nemo_logging:393] Using RNNT Loss : tdt
    Loss tdt_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0, 'durations': [0, 1, 2, 3, 4], 'sigma': 0.02, 'omega': 0.1}
[NeMo I 2025-12-07 07:04:00 nemo_logging:393] Using RNNT Loss : tdt
    Loss tdt_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0, 'durations': [0, 1, 2, 3, 4], 'sigma': 0.02, 'omega': 0.1}
[NeMo I 2025-12-07 07:04:00 nemo_logging:393] Using RNNT Loss : tdt
    Loss tdt_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0, 'durations': [0, 1, 2, 3, 4], 'sigma': 0.02, 'omega': 0.1}
[NeMo I 2025-12-07 07:04:03 nemo_logging:393] Model EncDecRNNTBPEModel was successfully restored from /home/sallundina/.cache/huggingface/hub/models--nvidia--parakeet-tdt-0.6b-v3/snapshots/6d590f77001d318fb17a0b5bf7ee329a91b52598/parakeet-tdt-0.6b-v3.nemo.


In [11]:
def transcribe_parakeet(audio_input):
    """Transcribe one audio file with the Parakeet model and return lower-cased text.

    Uses ``parakeet_model_cpu`` for both the CUDA-graphs switch and the
    transcription. Previously the switch was applied to ``parakeet_model_cpu``
    while transcription went through ``parakeet_model``.

    Args:
        audio_input: Audio file path; passed to ``transcribe`` as a
            one-element list.

    Returns:
        The ``text`` attribute of the first output (or the output itself when
        it has no such attribute), lower-cased.
    """
    model = parakeet_model_cpu

    # Re-apply the CUDA-graphs switch before each call, if the attribute exists.
    decoding = getattr(model, 'decoding', None)
    decoding_computer = getattr(decoding, 'decoding_computer', None)
    if decoding_computer is not None and hasattr(decoding_computer, 'cuda_graphs_mode'):
        decoding_computer.cuda_graphs_mode = None

    outputs = model.transcribe([audio_input], batch_size=1, num_workers=0)
    first = outputs[0]
    # Outputs may be objects carrying `.text` or plain strings.
    text = getattr(first, "text", first)
    return text.lower()



### Evaluation


In [12]:
def normalize_text(text):
    """Normalize text for WER/CER comparison.

    Lower-cases and strips the input, removes every character that is neither
    a word character nor whitespace, collapses whitespace runs into a single
    space and strips the result.
    """
    lowered = text.lower().strip()
    without_punct = re.sub(r'[^\w\s]', '', lowered)
    collapsed = re.sub(r'\s+', ' ', without_punct)
    return collapsed.strip()

In [13]:
# Directory that holds the cached "sample_<idx>.wav" files.
AUDIO_CACHE_DIR = Path('/home/sallundina/voice-image-editor/notebooks/audio_cache')

def get_audio_path(sample_idx):
    """Return the cached WAV path for ``sample_idx`` as a string.

    The path is only constructed; the file's existence is not checked.
    """
    return str(AUDIO_CACHE_DIR.joinpath(f"sample_{sample_idx}.wav"))

In [14]:
# Let's look at a single example: transcribe one cached sample with every
# model and print the raw/normalized text together with its WER and CER.
test_sample_idx = 1
test_sample = eval_dataset[test_sample_idx]

print(f"\nReference (raw): {test_sample['EDITING_INSTRUCTION_RU']}")
print(f"Reference (normalized): {normalize_text(test_sample['EDITING_INSTRUCTION_RU'])}")

# The WAV file must already be in the cache; get_audio_path only builds the path.
audio_path = get_audio_path(test_sample_idx)

# --- Whisper large-v3 (processor and model are loaded here and kept in module scope) ---
print("\n" + "-" * 100)
print("WHISPER LARGE-V3:")
whisper_large_processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
whisper_large_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3").to(device)
whisper_large_raw = transcribe_whisper(audio_path, processor=whisper_large_processor, model=whisper_large_model)
whisper_large_norm = normalize_text(whisper_large_raw)
print(f"Raw transcription: {whisper_large_raw}")
print(f"Normalized transcription: {whisper_large_norm}")
print(f"WER: {wer(normalize_text(test_sample['EDITING_INSTRUCTION_RU']), whisper_large_norm):.4f}")
print(f"CER: {cer(normalize_text(test_sample['EDITING_INSTRUCTION_RU']), whisper_large_norm):.4f}")

# --- Whisper small ---
print("\n" + "-" * 100)
print("WHISPER SMALL:")
whisper_small_processor = WhisperProcessor.from_pretrained("openai/whisper-small")
whisper_small_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)
whisper_small_raw = transcribe_whisper(audio_path, processor=whisper_small_processor, model=whisper_small_model)
whisper_small_norm = normalize_text(whisper_small_raw)
print(f"Raw transcription: {whisper_small_raw}")
print(f"Normalized transcription: {whisper_small_norm}")
print(f"WER: {wer(normalize_text(test_sample['EDITING_INSTRUCTION_RU']), whisper_small_norm):.4f}")
print(f"CER: {cer(normalize_text(test_sample['EDITING_INSTRUCTION_RU']), whisper_small_norm):.4f}")

# --- GigaAM-v3 ---
print("\n" + "-" * 100)
print("GIGAAM-V3:")
gigaam_raw = transcribe_gigaam(audio_path)
gigaam_norm = normalize_text(gigaam_raw)
print(f"Raw transcription: {gigaam_raw}")
print(f"Normalized transcription: {gigaam_norm}")
print(f"WER: {wer(normalize_text(test_sample['EDITING_INSTRUCTION_RU']), gigaam_norm):.4f}")
print(f"CER: {cer(normalize_text(test_sample['EDITING_INSTRUCTION_RU']), gigaam_norm):.4f}")

# --- T-one ---
print("\n" + "-" * 100)
print("T-ONE:")
tone_raw = transcribe_tone(audio_path)
tone_norm = normalize_text(tone_raw)
print(f"Raw transcription: {tone_raw}")
print(f"Normalized transcription: {tone_norm}")
print(f"WER: {wer(normalize_text(test_sample['EDITING_INSTRUCTION_RU']), tone_norm):.4f}")
print(f"CER: {cer(normalize_text(test_sample['EDITING_INSTRUCTION_RU']), tone_norm):.4f}")

# --- NVIDIA Parakeet ---
print("\n" + "-" * 100)
print("NVIDIA PARAKEET:")
parakeet_raw = transcribe_parakeet(audio_path)
parakeet_norm = normalize_text(parakeet_raw)
print(f"Raw transcription: {parakeet_raw}")
print(f"Normalized transcription: {parakeet_norm}")
print(f"WER: {wer(normalize_text(test_sample['EDITING_INSTRUCTION_RU']), parakeet_norm):.4f}")
print(f"CER: {cer(normalize_text(test_sample['EDITING_INSTRUCTION_RU']), parakeet_norm):.4f}")


Reference (raw): Заставь птицу сложить крылья.
Reference (normalized): заставь птицу сложить крылья

----------------------------------------------------------------------------------------------------
WHISPER LARGE-V3:


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Raw transcription:  заставь птицу сложить крылья.
Normalized transcription: заставь птицу сложить крылья
WER: 0.0000
CER: 0.0000

----------------------------------------------------------------------------------------------------
WHISPER SMALL:
Raw transcription:  застав птицу сложить крылья.
Normalized transcription: застав птицу сложить крылья
WER: 0.2500
CER: 0.0357

----------------------------------------------------------------------------------------------------
GIGAAM-V3:
Raw transcription: заставь птицу сложить крылья.
Normalized transcription: заставь птицу сложить крылья
WER: 0.0000
CER: 0.0000

----------------------------------------------------------------------------------------------------
T-ONE:


[NeMo W 2025-12-07 07:04:16 nemo_logging:405] The following configuration keys are ignored by Lhotse dataloader: use_start_end_token
[NeMo W 2025-12-07 07:04:16 nemo_logging:405] You are using a non-tarred dataset and requested tokenization during data sampling (pretokenize=True). This will cause the tokenization to happen in the main (GPU) process,possibly impacting the training speed if your tokenizer is very large.If the impact is noticable, set pretokenize=False in dataloader config.(note: that will disable token-per-second filtering and 2D bucketing features)


Raw transcription: заставь птицу сложить крылья
Normalized transcription: заставь птицу сложить крылья
WER: 0.0000
CER: 0.0000

----------------------------------------------------------------------------------------------------
NVIDIA PARAKEET:


Transcribing: 1it [00:00,  2.07it/s]

Raw transcription: заставь птицу сложить крылья.
Normalized transcription: заставь птицу сложить крылья
WER: 0.0000
CER: 0.0000





In [15]:
def evaluate_model(dataset, transcribe_func, model_name, max_samples=None):
    """Evaluate one transcription function on the dataset and compute WER/CER.

    For each evaluated index ``i`` the reference is the normalized
    ``'EDITING_INSTRUCTION_RU'`` field of ``dataset[i]`` and the hypothesis is
    the normalized output of ``transcribe_func(get_audio_path(i))``.

    A sample for which anything fails (loading, transcription or metric
    computation) is reported on stdout and skipped. It is left out of both the
    per-sample results and the corpus-level metrics.

    Args:
        dataset: Indexable, sized dataset whose rows contain
            ``'EDITING_INSTRUCTION_RU'``.
        transcribe_func: Callable taking an audio path and returning text.
        model_name: Label used in progress output and in the result.
        max_samples: Upper bound on the number of leading samples to evaluate;
            a falsy value (``None`` or ``0``) means the whole dataset.

    Returns:
        A dict with keys ``'model_name'``, ``'wer'``, ``'cer'``,
        ``'num_samples'`` (number of successfully evaluated samples) and
        ``'results'`` (list of per-sample dicts). ``'wer'`` and ``'cer'`` are
        ``None`` when no sample was evaluated successfully.
    """
    results = []
    all_references = []
    all_hypotheses = []

    num_samples = min(len(dataset), max_samples) if max_samples else len(dataset)

    print(f"\nEvaluating {model_name} on {num_samples} samples...")

    for i in tqdm(range(num_samples), desc=f"{model_name}"):
        try:
            sample = dataset[i]

            reference_raw = sample['EDITING_INSTRUCTION_RU']
            reference = normalize_text(reference_raw)

            audio_path = get_audio_path(i)

            hypothesis_raw = transcribe_func(audio_path)
            hypothesis = normalize_text(hypothesis_raw)

            sample_wer = wer(reference, hypothesis)
            sample_cer = cer(reference, hypothesis)
        except Exception as e:
            # Deliberate best-effort: one bad sample must not abort a long run.
            print(f"\nError processing sample {i}: {e}")
            continue

        # Record the sample only after its metrics were computed successfully.
        # Appending earlier left the pair in the corpus lists when wer/cer
        # raised (e.g. on an empty reference), so the lists disagreed with
        # `results` and the corpus-level computation below could fail.
        all_references.append(reference)
        all_hypotheses.append(hypothesis)
        results.append({
            'sample_id': i,
            'reference_raw': reference_raw,
            'reference_normalized': reference,
            'hypothesis_raw': hypothesis_raw,
            'hypothesis_normalized': hypothesis,
            'wer': sample_wer,
            'cer': sample_cer
        })

    if not all_references:
        return {
            'model_name': model_name,
            'wer': None,
            'cer': None,
            'num_samples': 0,
            'results': []
        }

    # Corpus-level metrics over all successfully evaluated samples.
    overall_wer = wer(all_references, all_hypotheses)
    overall_cer = cer(all_references, all_hypotheses)

    return {
        'model_name': model_name,
        'wer': overall_wer,
        'cer': overall_cer,
        'num_samples': len(all_references),
        'results': results
    }

In [16]:
# Upper bound on the number of samples each model is evaluated on
# (passed as `max_samples` to evaluate_model).
MAX_SAMPLES = 1000

In [17]:
# The large-v3 transcriber gets its own, correctly named variable; it used to
# be bound to the copy-pasted name `transcribe_whisper_small`.
transcribe_whisper_large = create_whisper_transcriber("openai/whisper-large-v3")
results_whisper_large = evaluate_model(eval_dataset, transcribe_whisper_large, "Whisper Large", MAX_SAMPLES)
# Drop this reference to the transcriber (and the model it closes over) once
# the evaluation is finished.
del transcribe_whisper_large


Evaluating Whisper Large on 1000 samples...


Whisper Large: 100%|██████████| 1000/1000 [09:25<00:00,  1.77it/s]


In [18]:
# Build a transcriber for the "openai/whisper-small" checkpoint and evaluate it.
transcribe_whisper_small = create_whisper_transcriber("openai/whisper-small")
results_whisper_small = evaluate_model(eval_dataset, transcribe_whisper_small, "Whisper Small", MAX_SAMPLES)


Evaluating Whisper Small on 1000 samples...


Whisper Small: 100%|██████████| 1000/1000 [04:00<00:00,  4.15it/s]


In [19]:
# Evaluate GigaAM-v3 on the first MAX_SAMPLES samples.
results_gigaam = evaluate_model(eval_dataset, transcribe_gigaam, "GigaAM-v3", MAX_SAMPLES)


Evaluating GigaAM-v3 on 1000 samples...


GigaAM-v3: 100%|██████████| 1000/1000 [04:21<00:00,  3.83it/s]


In [20]:
# Evaluate T-one on the first MAX_SAMPLES samples.
results_tone = evaluate_model(eval_dataset, transcribe_tone, "T-one", MAX_SAMPLES)


Evaluating T-one on 1000 samples...


T-one: 100%|██████████| 1000/1000 [21:40<00:00,  1.30s/it]


In [None]:
# Evaluate NVIDIA Parakeet on the first MAX_SAMPLES samples.
results_parakeet = evaluate_model(eval_dataset, transcribe_parakeet, "NVIDIA Parakeet", MAX_SAMPLES)

In [22]:
# One summary row per evaluated model, in presentation order.
summary_data = [
    {
        'Model': model_results['model_name'],
        'WER': model_results['wer'],
        'CER': model_results['cer'],
        'Samples': model_results['num_samples'],
    }
    for model_results in (
        results_whisper_large,
        results_whisper_small,
        results_gigaam,
        results_tone,
        results_parakeet,
    )
]

In [25]:
# Tabular view of the per-model metrics (columns: Model, WER, CER, Samples).
summary_df = pd.DataFrame(summary_data)

In [28]:
# Print the whole table without the DataFrame index column.
print(summary_df.to_string(index=False))

          Model      WER      CER  Samples
  Whisper Large 0.066721 0.024128     1000
  Whisper Small 0.108343 0.034571     1000
      GigaAM-v3 0.052747 0.015834     1000
          T-one 0.061060 0.017139     1000
NVIDIA Parakeet 0.060302 0.016288     1000


Лидер — GigaAM-v3, хотя результаты всех моделей, кроме Whisper-small, достаточно близки. С учётом размера моделей (Whisper-large и Parakeet больше), скорости инференса (T-one заметно медленнее) и примеров, которые мы просмотрели вручную, мы остановили свой выбор на GigaAM-v3. Кроме того, GigaAM-v3 возвращает текст с правильной пунктуацией, капитализацией и т. д., что очень удобно.