In [None]:
# !pip install datasets
# !pip install transformers
# !pip install huggingface
# !pip install ffmpeg-python
# !pip install bitsandbytes accelerate loralib

In [None]:
# Notebook configuration: checkpoint id and transcription settings.
# Hugging Face Hub id of the Whisper checkpoint loaded by the cells below.
model_name = "openai/whisper-small"
# Language code ("sw" is the ISO 639-1 code for Swahili).
language = "sw"
# Whisper task name.
task = "transcribe"
dataset_name = "whispere"

# NOTE(review): process_audio_and_create_vtt writes its output to this path
# whenever it is truthy, so the "<audio_file_name>.vtt" fallback is never used
# while this is set. Confirm that "/models" is really the intended output file.
output_file_name = "/models"

In [None]:
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, WhisperForConditionalGeneration, WhisperTokenizer, WhisperProcessor, AutomaticSpeechRecognitionPipeline
from transformers import pipeline
import ffmpeg
import torch


# Default speech-recognition pipeline built from the configured checkpoint.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model_name,
)

In [None]:
# Instantiate the processor and the seq2seq model directly from the checkpoint.
processor = AutoProcessor.from_pretrained(
    model_name,
)
# local_files_only=True: the model weights must already be in the local cache.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_name,
    local_files_only=True,
)

In [None]:
# ASR pipeline used for transcription. Audio is processed in 30-second chunks,
# on the GPU when one is available.
whisper_asr = pipeline(
    "automatic-speech-recognition",
    model=model_name,
    device="cuda" if torch.cuda.is_available() else "cpu",
    chunk_length_s=30,
    # Forward the configured language/task to generation; without this the
    # settings from the config cell are ignored and Whisper falls back to
    # auto-detecting the spoken language.
    generate_kwargs={"language": language, "task": task},
)

In [None]:
def format_time(seconds):
    """Format a time offset as a WebVTT cue timestamp ``HH:MM:SS.mmm``.

    Args:
        seconds: Non-negative offset in seconds (int or float).

    Returns:
        The offset as a zero-padded ``HH:MM:SS.mmm`` string. WebVTT requires
        the millisecond part and uses a dot (not a comma) as the separator.
    """
    # Work in integer milliseconds so the fractional part is kept and the
    # individual fields cannot drift apart through float rounding.
    total_ms = int(round(seconds * 1000))
    hours, remainder_ms = divmod(total_ms, 3_600_000)
    minutes, remainder_ms = divmod(remainder_ms, 60_000)
    secs, millis = divmod(remainder_ms, 1000)
    return f"{hours:02}:{minutes:02}:{secs:02}.{millis:03}"

In [None]:
def process_audio_and_create_vtt(audio_file_name, audio_type, whisper_asr):
    """Transcribe an audio file and write the transcript as a WebVTT file.

    Args:
        audio_file_name: Path of the audio file without its extension.
        audio_type: File extension without the leading dot (e.g. ``"mp3"``).
        whisper_asr: Callable ASR pipeline. It is called with the audio path
            and ``return_timestamps=True`` and must return a mapping with a
            ``"chunks"`` list whose items have ``"timestamp"`` (a
            ``(start, end)`` pair) and ``"text"``.

    The output is written to the module-level ``output_file_name`` when that
    is truthy, otherwise to ``<audio_file_name>.vtt``.
    """
    prediction = whisper_asr(f"{audio_file_name}.{audio_type}", return_timestamps=True)

    vtt_file = output_file_name if output_file_name else f"{audio_file_name}.vtt"

    with open(vtt_file, "w", encoding='utf-8') as f:
        # A WebVTT file must begin with the "WEBVTT" signature line.
        f.write("WEBVTT\n\n")
        for chunk in prediction['chunks']:
            start, end = chunk['timestamp']
            if end is None:
                # The pipeline can leave the final end timestamp unset (e.g.
                # audio cut off mid-segment); reuse the start time so the
                # text is still written instead of crashing in format_time.
                end = start
            start_time = format_time(start)
            end_time = format_time(end)
            text = chunk['text']
            f.write(f"{start_time} --> {end_time}\n{text}\n\n")

In [None]:
# Transcribe ./mp3/train.mp3 and write out its subtitle file.
process_audio_and_create_vtt(
    audio_file_name='./mp3/train',
    audio_type='mp3',
    whisper_asr=whisper_asr,
)