# Setup

In [None]:
# Input directory containing the audio files. Leave as None to fall back to
# the default under the current working directory assigned by the setup code below.
audios_dir = None
# Output directory for the recognition results. Leave as None to fall back to
# the default under the current working directory assigned by the setup code below.
save_dir = None

In [None]:
import os

# Resolve the working directory and its parent once.
dir_path = os.getcwd()
parent_dir_path = os.path.dirname(dir_path)

# Keep any user-supplied location; otherwise default to a folder under the
# current working directory.
audios_dir = audios_dir or f'{dir_path}/Audio'
save_dir = save_dir or f'{dir_path}/audio_recognition'

In [None]:
! pip install tqdm
! pip install torch
! pip install transformers
! pip install librosa



In [None]:
import os
import json
import glob
from tqdm import tqdm
import torch
import librosa
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import gc

  from .autonotebook import tqdm as notebook_tqdm


# Parse data path and audio info

In [None]:
def parse_audio_info(audios_dir):
    """Collect the .mp3 files found under a two-level directory layout.

    The expected layout is ``audios_dir/<data_part>/<audio_id>/*.mp3``.
    Entries that are not directories (stray files at either level) are
    skipped instead of raising or producing empty bogus entries.

    Parameters:
    - audios_dir: Root directory that contains one sub-directory per data part.

    Returns:
    - dict mapping data part name -> dict mapping audio id -> sorted list of
      .mp3 paths inside that audio id's directory (empty list if none).
    """
    all_audio_paths = {}

    for data_part in sorted(os.listdir(audios_dir)):
        data_part_path = f'{audios_dir}/{data_part}'
        if not os.path.isdir(data_part_path):
            # A plain file next to the part directories would make
            # os.listdir() below raise NotADirectoryError.
            continue

        part_audio_paths = {}
        for audio_id in sorted(os.listdir(data_part_path)):
            audio_dir_path = f'{data_part_path}/{audio_id}'
            if not os.path.isdir(audio_dir_path):
                continue
            # Escape the directory portion so glob metacharacters in folder
            # names ('[', '*', '?') are matched literally.
            pattern = f'{glob.escape(audio_dir_path)}/*.mp3'
            part_audio_paths[audio_id] = sorted(glob.glob(pattern))

        all_audio_paths[data_part] = part_audio_paths

    return all_audio_paths

# Audio transcription and translation

In [None]:
def create_directory(path):
    """Create a directory (and any missing parents) if it does not exist.

    Uses ``exist_ok=True`` rather than a separate existence check, so there
    is no window between the check and the creation in which another process
    could create the directory and trigger an error.

    Parameters:
    - path: Directory path to create.

    Raises:
    - FileExistsError: if ``path`` already exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)


def load_whisper_model():
    """Build the Whisper large v3 processor and model.

    The model's configured ``forced_decoder_ids`` are cleared, and the model
    is moved to CUDA when available, otherwise to the CPU.

    Returns:
    - (processor, model) tuple.
    """
    checkpoint = "openai/whisper-large-v3"
    whisper_processor = WhisperProcessor.from_pretrained(checkpoint)
    whisper_model = WhisperForConditionalGeneration.from_pretrained(checkpoint)

    # Drop any decoder prompt stored in the config.
    whisper_model.config.forced_decoder_ids = None

    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"

    return whisper_processor, whisper_model.to(target_device)

def transcribe_and_translate(audio_path, processor, model, target_language=None):
    """
    Transcribe and translate audio using Whisper ASR model.

    Parameters:
    - audio_path: Path to the audio file.
    - processor: Whisper processor.
    - model: Whisper model.
    - target_language: Language passed to the processor when building the
      decoder prompt for the "translate" task. When omitted, the module-level
      ``target_language`` variable is used if it exists (the previous
      behaviour of this function), otherwise "english".
      NOTE(review): Whisper's "translate" task is generally documented as
      producing English, with the language token describing the spoken
      language of the audio - confirm this value is what is intended.

    Returns:
    - transcription: Transcribed and translated text from the audio, or an
      empty string if any step fails (the error is printed, not raised).
    """
    if target_language is None:
        # Backward compatibility: earlier versions read a notebook global
        # instead of taking the language as an argument.
        target_language = globals().get("target_language", "english")

    try:
        # Load audio file, resampled to 16 kHz to match the sampling rate
        # declared to the processor below
        audio, _ = librosa.load(audio_path, sr=16000)

        # Process audio
        input_features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
        input_features = input_features.to(model.device)

        # Generate token ids
        forced_decoder_ids = processor.get_decoder_prompt_ids(language=target_language, task="translate")
        with torch.no_grad():
            predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)

        # Decode token ids to text
        return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    except Exception as e:
        # Deliberate best-effort: one bad file must not abort a long batch run.
        print(f"Error processing audio {audio_path}: {e}")
        return ""
    finally:
        # Clear CUDA cache on both the success and the failure path
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

def process_and_save(all_audio_paths, save_dir, processor, model):
    """Transcribe and translate audio for each audio file and save the results as JSON files.

    For every part in ``all_audio_paths`` a sub-directory is created under
    ``save_dir``; for every audio id in that part one ``<audio_id>.json`` file
    is written containing a list of records with the keys ``segment_id``,
    ``audio_path`` and ``transcription``.

    Parameters:
    - all_audio_paths: dict mapping part name -> dict mapping audio id ->
      list of audio file paths.
    - save_dir: Root output directory (created if missing).
    - processor: Whisper processor, forwarded to transcribe_and_translate.
    - model: Whisper model, forwarded to transcribe_and_translate.
    """
    create_directory(save_dir)

    for key in tqdm(all_audio_paths.keys(), desc="Processing parts"):
        key_dir = os.path.join(save_dir, key)
        create_directory(key_dir)

        audio_paths_dict = all_audio_paths[key]

        for audio_id in tqdm(sorted(audio_paths_dict), desc=f"Processing {key}"):
            transcriptions = []
            for audio_path in audio_paths_dict[audio_id]:
                transcription = transcribe_and_translate(audio_path, processor, model)
                # File name up to its first dot; basename() honours the
                # platform's path separator instead of assuming "/".
                segment_id = os.path.basename(audio_path).split(".")[0]
                transcriptions.append({
                    "segment_id": segment_id,
                    "audio_path": audio_path,
                    "transcription": transcription
                })

            json_path = os.path.join(key_dir, f"{audio_id}.json")
            with open(json_path, 'w', encoding='utf-8') as f:
                json.dump(transcriptions, f, ensure_ascii=False, indent=4)
            print(f"Transcription and translation saved to {json_path}")

            # Force garbage collection between audio ids
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()


In [None]:
# Language setting read by transcribe_and_translate (edit to suit your data)
target_language = "english"

# Discover the audio files on disk
all_audio_paths = parse_audio_info(audios_dir)

# Instantiate the Whisper processor and model
processor, model = load_whisper_model()

# Run recognition and write one JSON file per audio id
process_and_save(all_audio_paths, save_dir, processor, model)

Processing parts:   0%|          | 0/1 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Transcription and translation saved to /home/heigatvu/my-project/competition/HCMC-AI/pipeline-hcm-ai/notebooks/data_extraction/audio/audio_recognition/L01/V001.json


Processing L01: 100%|██████████| 2/2 [02:55<00:00, 87.55s/it]
Processing parts: 100%|██████████| 1/1 [02:55<00:00, 175.10s/it]

Transcription and translation saved to /home/heigatvu/my-project/competition/HCMC-AI/pipeline-hcm-ai/notebooks/data_extraction/audio/audio_recognition/L01/V002.json



