In [7]:
import time
from datetime import datetime
from pathlib import Path

import pandas as pd
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "openai/whisper-large-v3-turbo"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, 
    torch_dtype=torch_dtype, 
    low_cpu_mem_usage=True, 
    attn_implementation="flash_attention_2"
)

model.to(device)
processor = AutoProcessor.from_pretrained(model_id)

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


In [10]:
# Manifest of the audio clips to transcribe; preview the first rows.
audios_csv = 'data/audios.csv'
df_audios = pd.read_csv(filepath_or_buffer=audios_csv)
df_audios.head(n=5)

Unnamed: 0,path,sentence,duration,sr
0,data/audio/00010.mp3,Já é uma sorte que você é poupado de um acidente.,9.648,48000
1,data/audio/00076.mp3,"esquadrão da morte, jogo do bicho, máfia do ap...",9.756,48000
2,data/audio/00083.mp3,"burnout, anorexia nervosa, bulimia, periódica,...",9.972,48000
3,data/audio/00084.mp3,Duas mulheres e uma menina levantam com troféus.,10.176,48000
4,data/audio/00092.mp3,Mesa de sinuca no bar,10.62,48000


In [11]:
# Settings for the speech-recognition pipeline, gathered in one place so they
# are easy to tweak. `chunk_length_s` is intentionally left unset.
asr_settings = dict(
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    batch_size=16,  # inference batch size; tune to the available device
    torch_dtype=torch_dtype,
    device=device,
)

# Wrap the already-loaded model and processor parts in an ASR pipeline.
pipe = pipeline("automatic-speech-recognition", **asr_settings)

Device set to use cuda:0


In [14]:
# Transcribe every clip listed in the manifest, one file per call, and record
# (file name, transcription text, elapsed seconds) for each.
executions = []

for p in df_audios.path:
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; datetime.now() is wall-clock time and can jump if
    # the system clock is adjusted mid-run.
    start = time.perf_counter()
    result = pipe(inputs=p, generate_kwargs={"language": "portuguese"})
    tot_seconds = time.perf_counter() - start
    # NOTE(review): the first iteration is likely to include one-off warm-up
    # cost, so its timing may not be representative - confirm before averaging.
    executions.append((Path(p).name, result['text'], tot_seconds))

In [13]:
# Turn the collected per-file tuples into a table, persist it as CSV without
# the index column, and preview the first rows.
result_columns = ['file', 'transcription', 'total_seconds']
df = pd.DataFrame(data=executions, columns=result_columns)
df.to_csv(path_or_buf='data/whisper-large-v3-turbo.csv', index=False)
df.head(n=5)

Unnamed: 0,file,transcription,total_seconds
0,00010.mp3,já é uma sorte que você é poupado de um acidente,0.67328
1,00076.mp3,"Esquadrão da Morte, Jogo do Bicho, Máfia do A...",0.285897
2,00083.mp3,"...tornal, temorexia nervosa, bulimia periódi...",0.316535
3,00084.mp3,Duas mulheres e uma menina levantam com troféus.,0.185692
4,00092.mp3,Mesa de sinuca no bar.,0.143915
