In [1]:
import pandas as pd
from datasets import Dataset, Audio
from tqdm import tqdm
import os
from pydub import AudioSegment
import torch
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor, Seq2SeqTrainingArguments, Seq2SeqTrainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset, Audio
import evaluate

  from .autonotebook import tqdm as notebook_tqdm
W0214 16:17:32.662000 16480 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


In [2]:

print(torch.cuda.is_available())        # True olmalı
print(torch.version.cuda)               # 11.8 yazmalı
print(torch.cuda.get_device_name(0))    # GeForce RTX 3050ti


True
11.8
NVIDIA GeForce RTX 3050 Ti Laptop GPU


In [3]:
#mp3 den wav a dönüştürme scripti
def prepare_wav_files(csv_path, output_folder):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    df = pd.read_csv(csv_path)
    print(f"Toplam {len(df)} dosya işleniyor...")

    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        mp3_path = row['filename']
        file_name = os.path.basename(mp3_path).replace(".mp3", ".wav")
        dest_path = os.path.join(output_folder, file_name)

        if not os.path.exists(dest_path):
            try:
                audio = AudioSegment.from_mp3(mp3_path)
                # Whisper standardı: 16kHz, Mono
                audio = audio.set_frame_rate(16000).set_channels(1)
                audio.export(dest_path, format="wav")
            except Exception as e:
                continue

    return output_folder

WAV_DIR = prepare_wav_files("cv-valid-train.csv", "cv-valid-train-wav")

Toplam 195776 dosya işleniyor...


100%|██████████| 195776/195776 [5:19:45<00:00, 10.20it/s]  


In [3]:
df = pd.read_csv("cv-valid-train.csv")
df['audio'] = df['filename'].apply(lambda x: os.path.join("cv-valid-train-wav", os.path.basename(x).replace(".mp3", ".wav")))
df = df.rename(columns={'text': 'sentence'})[['audio', 'sentence']]

df

Unnamed: 0,audio,sentence
0,cv-valid-train-wav\sample-000000.wav,learn to recognize omens and follow them the o...
1,cv-valid-train-wav\sample-000001.wav,everything in the universe evolved he said
2,cv-valid-train-wav\sample-000002.wav,you came so that you could learn about your dr...
3,cv-valid-train-wav\sample-000003.wav,so now i fear nothing because it was those ome...
4,cv-valid-train-wav\sample-000004.wav,if you start your emails with greetings let me...
...,...,...
195771,cv-valid-train-wav\sample-195771.wav,the englishman said nothing
195772,cv-valid-train-wav\sample-195772.wav,the irish man sipped his tea
195773,cv-valid-train-wav\sample-195773.wav,what do you know about that
195774,cv-valid-train-wav\sample-195774.wav,the phone rang while she was awake


In [4]:
dataset = Dataset.from_pandas(df)
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

# Tüm veriyi işlemek yerine önce küçük bir kısmını seç
dataset = dataset.select(range(50000)) # Sadece ilk 20 bin dosyayı al
# Sonra yukarıdaki optimize edilmiş map işlemini yap

dataset

Dataset({
    features: ['audio', 'sentence'],
    num_rows: 50000
})

In [5]:
model_id = "openai/whisper-base"
processor = WhisperProcessor.from_pretrained(model_id)
model = WhisperForConditionalGeneration.from_pretrained(
    model_id,
    load_in_4bit=True,
    device_map="auto"
)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [6]:
model = prepare_model_for_kbit_training(model)
config = LoraConfig(
    r=32, lora_alpha=64, target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05, bias="none"
)
model = get_peft_model(model, config)

In [7]:
model

PeftModel(
  (base_model): LoraModel(
    (model): WhisperForConditionalGeneration(
      (model): WhisperModel(
        (encoder): WhisperEncoder(
          (conv1): Conv1d(80, 512, kernel_size=(3,), stride=(1,), padding=(1,))
          (conv2): Conv1d(512, 512, kernel_size=(3,), stride=(2,), padding=(1,))
          (embed_positions): Embedding(1500, 512)
          (layers): ModuleList(
            (0-5): 6 x WhisperEncoderLayer(
              (self_attn): WhisperSdpaAttention(
                (k_proj): Linear4bit(in_features=512, out_features=512, bias=False)
                (v_proj): lora.Linear4bit(
                  (base_layer): Linear4bit(in_features=512, out_features=512, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=512, out_features=32, bias=False)
                  )
                  (lora_B): 

In [8]:
def prepare_dataset(batch):
    audio = batch["audio"]
    
    # Modelin beklediği giriş özelliklerini (input_features) hesapla
    batch["input_features"] = processor(
        audio["array"], 
        sampling_rate=audio["sampling_rate"]
    ).input_features[0]
    
    # Hedef metni (sentence) tokenize ederek etiketlere (labels) çevir
    batch["labels"] = processor.tokenizer(batch["sentence"]).input_ids
    return batch

In [10]:
print(f"Mevcut Sütunlar: {dataset['train'].column_names}")

Mevcut Sütunlar: ['input_features', 'labels']


In [9]:

dataset = dataset.map(
    prepare_dataset, 
    remove_columns=dataset.column_names,
    batched=False,             # Her seferinde tek dosya işle
    batch_size=1,              # RAM'de birikmeyi önle
    keep_in_memory=False,      # RAM yerine diski kullan (ÇOK KRİTİK!)
    load_from_cache_file=True, # Daha önce işlenenleri tekrar yapma
    num_proc=1                 # Çoklu işlem bazen RAM hatasını tetikler, 1 ile dene
)

# Önce böl
dataset = dataset.train_test_split(test_size=0.1)

dataset["train"] = dataset["train"].map(
    prepare_dataset, 
    # Sütunları silerken spesifik olarak train setinin sütunlarını hedefle
    remove_columns=dataset["train"].column_names, 
    keep_in_memory=False,
 ) # RAM yerine disk kullan

dataset["test"] = dataset["test"].map(
    prepare_dataset, 
    remove_columns=dataset["test"].column_names, 
    keep_in_memory=False,
    batched=False
)

Map: 100%|██████████| 50000/50000 [29:46<00:00, 27.99 examples/s]   
Map:   0%|          | 0/45000 [00:00<?, ? examples/s]


KeyError: 'audio'

In [12]:
metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # Pad tokenlarını (özel karakterleri) temizle
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

# Eğitim Parametreleri
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-omer-project",
    per_device_train_batch_size=1, # VRAM dolmaması için 1
    gradient_accumulation_steps=16, # Efektif batch size = 16
    learning_rate=1e-3,
    max_steps=5000,
    gradient_checkpointing=True,
    fp16=True, # 3050 Ti'ın hızını kullanmak için
    evaluation_strategy="steps",
    save_steps=1000,
    logging_steps=25,
    eval_steps=100,
    predict_with_generate=True,
    remove_unused_columns=False,
    per_device_eval_batch_size=1,     # Değerlendirme sırasında tek tek işle
    eval_accumulation_steps=1,        # Bellekte sonuç biriktirme, hemen boşalt
    generation_max_length=128,        # Cümle uzunluğunu sınırla (VRAM tasarrufu)
    logging_first_step=True,
    
)

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"], # Sadece bu kalsın
    eval_dataset=dataset["test"],
    tokenizer=processor.feature_extractor,
    compute_metrics=compute_metrics,
)

# Eğitimi başlat
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
wandb: ERROR Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: omerfarukcirpan (omerfarukcirpan-none) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
                                                   

{'loss': 7.1076, 'grad_norm': 17.79623794555664, 'learning_rate': 0.0009998000000000001, 'epoch': 0.0}


                                                    

{'loss': 0.9048, 'grad_norm': 1.119550347328186, 'learning_rate': 0.000995, 'epoch': 0.01}


                                                    

{'loss': 0.2879, 'grad_norm': 0.5590431094169617, 'learning_rate': 0.00099, 'epoch': 0.02}


                                                    

{'loss': 0.2789, 'grad_norm': 0.6967057585716248, 'learning_rate': 0.000985, 'epoch': 0.03}


                                                     

{'loss': 0.2496, 'grad_norm': 0.8860144019126892, 'learning_rate': 0.00098, 'epoch': 0.04}


  2%|▏         | 100/5000 [13:16<10:37:22,  7.80s/it]Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.

  0%|          | 0/5000 [00:00<?, ?it/s]
  0%|          | 2/5000 [00:00<33:50,  2.46it/s]
  0%|          | 3/5000 [00:04<2:27:12,  1.77s/it]
  0%|          | 4/5000 [00:05<1:59:04,  1.43s/it]
  0%|          | 5/5000 [00:06<1:38:47,  1.19s/it]
  0%|          | 6/5000 [00:06<1:22:24,  1.01it/s]
  0%|          | 7/5000 [00:07<1:18:00,  1.07it/s]
  0%|          | 8/5000 [00:08<1:10:56,  1.17it/s]
  0%|          | 9/5000 [00:09<1:12:50,  1.14it/s]
  0%|          | 10/5000 [00:09<1:06:39,  1.25it/s]
  0%|          | 11/5000 [00:10<1:06:46,  1.25it/s]
  0%|          | 12/5000 [00:11<1

{'eval_loss': 0.24041089415550232, 'eval_wer': 46.63064208518754, 'eval_runtime': 5152.4765, 'eval_samples_per_second': 0.97, 'eval_steps_per_second': 0.97, 'epoch': 0.04}


  2%|▏         | 100/5000 [1:39:08<10:37:22,  7.80s/it]
100%|██████████| 5000/5000 [1:25:50<00:00,  1.24it/s]
                                                           

{'loss': 0.2275, 'grad_norm': 0.855502724647522, 'learning_rate': 0.000975, 'epoch': 0.04}


                                                       

{'loss': 0.2503, 'grad_norm': 0.682451069355011, 'learning_rate': 0.0009699999999999999, 'epoch': 0.05}


                                                       

{'loss': 0.2304, 'grad_norm': 0.668432891368866, 'learning_rate': 0.000965, 'epoch': 0.06}


                                                       

{'loss': 0.2225, 'grad_norm': 0.7974264621734619, 'learning_rate': 0.00096, 'epoch': 0.07}


  4%|▍         | 200/5000 [1:52:04<10:28:14,  7.85s/it]
  0%|          | 0/5000 [00:00<?, ?it/s]
  0%|          | 2/5000 [00:00<31:29,  2.64it/s]
  0%|          | 3/5000 [00:01<57:12,  1.46it/s]
  0%|          | 4/5000 [00:02<59:01,  1.41it/s]
  0%|          | 5/5000 [00:03<59:55,  1.39it/s]
  0%|          | 6/5000 [00:04<57:42,  1.44it/s]
  0%|          | 7/5000 [00:04<1:03:09,  1.32it/s]
  0%|          | 8/5000 [00:05<1:00:25,  1.38it/s]
  0%|          | 9/5000 [00:06<1:05:29,  1.27it/s]
  0%|          | 10/5000 [00:07<1:03:33,  1.31it/s]
  0%|          | 11/5000 [00:08<1:07:03,  1.24it/s]
  0%|          | 12/5000 [00:08<1:04:37,  1.29it/s]
  0%|          | 13/5000 [00:09<1:03:13,  1.31it/s]
  0%|          | 14/5000 [00:10<1:07:00,  1.24it/s]
  0%|          | 15/5000 [00:11<1:10:51,  1.17it/s]
  0%|          | 16/5000 [00:12<1:07:04,  1.24it/s]
  0%|          | 17/5000 [00:12<1:05:57,  1.26it/s]
  0%|          | 18/5000 [00:13<1:10:02,  1.19it/s]
  0%|          | 19/5000 [00:14<1:09:

{'eval_loss': 0.22043775022029877, 'eval_wer': 13.030303030303031, 'eval_runtime': 4980.2317, 'eval_samples_per_second': 1.004, 'eval_steps_per_second': 1.004, 'epoch': 0.07}


  4%|▍         | 200/5000 [3:15:05<10:28:14,  7.85s/it]
100%|██████████| 5000/5000 [1:22:58<00:00,  1.31it/s]
                                                           

{'loss': 0.21, 'grad_norm': 0.6116511821746826, 'learning_rate': 0.000955, 'epoch': 0.08}


                                                       

{'loss': 0.2231, 'grad_norm': 0.6082059741020203, 'learning_rate': 0.00095, 'epoch': 0.09}


                                                       

{'loss': 0.2254, 'grad_norm': 0.8561515808105469, 'learning_rate': 0.000945, 'epoch': 0.1}


                                                       

{'loss': 0.1917, 'grad_norm': 1.6629853248596191, 'learning_rate': 0.00094, 'epoch': 0.11}


  6%|▌         | 300/5000 [3:28:24<11:25:24,  8.75s/it]
  0%|          | 0/5000 [00:00<?, ?it/s]
  0%|          | 2/5000 [00:00<32:33,  2.56it/s]
  0%|          | 3/5000 [00:01<59:39,  1.40it/s]
  0%|          | 4/5000 [00:02<1:01:49,  1.35it/s]
  0%|          | 5/5000 [00:03<1:02:56,  1.32it/s]
  0%|          | 6/5000 [00:04<1:00:11,  1.38it/s]
  0%|          | 7/5000 [00:05<1:05:16,  1.28it/s]
  0%|          | 8/5000 [00:05<1:02:10,  1.34it/s]
  0%|          | 9/5000 [00:06<1:09:02,  1.20it/s]
  0%|          | 10/5000 [00:07<1:05:58,  1.26it/s]
  0%|          | 11/5000 [00:08<1:08:02,  1.22it/s]
  0%|          | 12/5000 [00:09<1:05:46,  1.26it/s]
  0%|          | 13/5000 [00:09<1:04:29,  1.29it/s]
  0%|          | 14/5000 [00:10<1:08:10,  1.22it/s]
  0%|          | 15/5000 [00:11<1:11:20,  1.16it/s]
  0%|          | 16/5000 [00:12<1:07:34,  1.23it/s]
  0%|          | 17/5000 [00:13<1:06:45,  1.24it/s]
  0%|          | 18/5000 [00:14<1:11:06,  1.17it/s]
  0%|          | 19/5000 [00:15

{'eval_loss': 0.21554690599441528, 'eval_wer': 24.10680228862047, 'eval_runtime': 4817.5108, 'eval_samples_per_second': 1.038, 'eval_steps_per_second': 1.038, 'epoch': 0.11}


  6%|▌         | 300/5000 [4:48:41<11:25:24,  8.75s/it]
100%|██████████| 5000/5000 [1:20:16<00:00,  1.82it/s]
                                                           

{'loss': 0.2242, 'grad_norm': 1.1119201183319092, 'learning_rate': 0.0009350000000000001, 'epoch': 0.12}


                                                      

{'loss': 0.2439, 'grad_norm': 1.1242116689682007, 'learning_rate': 0.00093, 'epoch': 0.12}


                                                      

{'loss': 0.2177, 'grad_norm': 0.7278023958206177, 'learning_rate': 0.000925, 'epoch': 0.13}


                                                      

{'loss': 0.2328, 'grad_norm': 1.481063723564148, 'learning_rate': 0.00092, 'epoch': 0.14}


  8%|▊         | 400/5000 [4:58:05<7:13:26,  5.65s/it]
  0%|          | 0/5000 [00:00<?, ?it/s]
  0%|          | 2/5000 [00:00<24:33,  3.39it/s]
  0%|          | 3/5000 [00:01<43:47,  1.90it/s]
  0%|          | 4/5000 [00:02<45:26,  1.83it/s]
  0%|          | 5/5000 [00:02<44:13,  1.88it/s]
  0%|          | 6/5000 [00:02<40:07,  2.07it/s]
  0%|          | 7/5000 [00:03<42:00,  1.98it/s]
  0%|          | 8/5000 [00:03<38:49,  2.14it/s]
  0%|          | 9/5000 [00:04<42:00,  1.98it/s]
  0%|          | 10/5000 [00:04<39:49,  2.09it/s]
  0%|          | 11/5000 [00:05<40:59,  2.03it/s]
  0%|          | 12/5000 [00:05<39:01,  2.13it/s]
  0%|          | 13/5000 [00:06<38:24,  2.16it/s]
  0%|          | 14/5000 [00:06<40:34,  2.05it/s]
  0%|          | 15/5000 [00:07<42:14,  1.97it/s]
  0%|          | 16/5000 [00:07<39:49,  2.09it/s]
  0%|          | 17/5000 [00:08<39:34,  2.10it/s]
  0%|          | 18/5000 [00:08<42:45,  1.94it/s]
  0%|          | 19/5000 [00:09<43:56,  1.89it/s]
  0%|       

{'eval_loss': 0.2140578329563141, 'eval_wer': 18.14155541428269, 'eval_runtime': 3908.1227, 'eval_samples_per_second': 1.279, 'eval_steps_per_second': 1.279, 'epoch': 0.14}


  8%|▊         | 400/5000 [6:03:14<7:13:26,  5.65s/it]
100%|██████████| 5000/5000 [1:05:07<00:00,  1.25it/s]
                                                           

{'loss': 0.2365, 'grad_norm': 0.7594870924949646, 'learning_rate': 0.000915, 'epoch': 0.15}


                                                       

{'loss': 0.1952, 'grad_norm': 0.923914909362793, 'learning_rate': 0.00091, 'epoch': 0.16}


                                                       

{'loss': 0.2473, 'grad_norm': 1.7052769660949707, 'learning_rate': 0.0009050000000000001, 'epoch': 0.17}


                                                       

{'loss': 0.2142, 'grad_norm': 0.7207598686218262, 'learning_rate': 0.0009000000000000001, 'epoch': 0.18}


 10%|█         | 500/5000 [6:18:36<11:31:21,  9.22s/it]
  0%|          | 0/5000 [00:00<?, ?it/s]
  0%|          | 2/5000 [00:00<37:44,  2.21it/s]
  0%|          | 3/5000 [00:02<1:06:41,  1.25it/s]
  0%|          | 4/5000 [00:03<1:08:40,  1.21it/s]
  0%|          | 5/5000 [00:03<1:09:06,  1.20it/s]
  0%|          | 6/5000 [00:04<1:05:28,  1.27it/s]
  0%|          | 7/5000 [00:05<1:10:03,  1.19it/s]
  0%|          | 8/5000 [00:06<1:06:56,  1.24it/s]
  0%|          | 9/5000 [00:07<1:12:34,  1.15it/s]
  0%|          | 10/5000 [00:08<1:09:35,  1.19it/s]
  0%|          | 11/5000 [00:09<1:12:21,  1.15it/s]
  0%|          | 12/5000 [00:09<1:09:34,  1.19it/s]
  0%|          | 13/5000 [00:10<1:07:14,  1.24it/s]
  0%|          | 14/5000 [00:11<1:10:38,  1.18it/s]
  0%|          | 15/5000 [00:12<1:15:03,  1.11it/s]
  0%|          | 16/5000 [00:13<1:11:31,  1.16it/s]
  0%|          | 17/5000 [00:14<1:12:53,  1.14it/s]
  0%|          | 18/5000 [00:15<1:18:47,  1.05it/s]
  0%|          | 19/5000 [00:

{'eval_loss': 0.20556758344173431, 'eval_wer': 18.317440135621954, 'eval_runtime': 5340.9605, 'eval_samples_per_second': 0.936, 'eval_steps_per_second': 0.936, 'epoch': 0.18}


 10%|█         | 500/5000 [7:47:37<11:31:21,  9.22s/it]
100%|██████████| 5000/5000 [1:28:59<00:00,  1.22it/s]
                                                           

{'loss': 0.1876, 'grad_norm': 0.6129462718963623, 'learning_rate': 0.0008950000000000001, 'epoch': 0.19}


                                                       

{'loss': 0.2023, 'grad_norm': 0.5926770567893982, 'learning_rate': 0.0008900000000000001, 'epoch': 0.2}


                                                       

{'loss': 0.2192, 'grad_norm': 0.9693043231964111, 'learning_rate': 0.000885, 'epoch': 0.2}


                                                       

{'loss': 0.1961, 'grad_norm': 1.1430362462997437, 'learning_rate': 0.00088, 'epoch': 0.21}


 12%|█▏        | 600/5000 [8:02:41<11:04:08,  9.06s/it]
  0%|          | 0/5000 [00:00<?, ?it/s]
  0%|          | 2/5000 [00:00<33:23,  2.50it/s]
  0%|          | 3/5000 [00:02<1:02:19,  1.34it/s]
  0%|          | 4/5000 [00:02<1:04:39,  1.29it/s]
  0%|          | 5/5000 [00:03<1:06:04,  1.26it/s]
  0%|          | 6/5000 [00:04<1:02:49,  1.32it/s]
  0%|          | 7/5000 [00:05<1:07:06,  1.24it/s]
  0%|          | 8/5000 [00:05<1:04:11,  1.30it/s]
  0%|          | 9/5000 [00:06<1:09:00,  1.21it/s]
  0%|          | 10/5000 [00:07<1:05:43,  1.27it/s]
  0%|          | 11/5000 [00:08<1:08:59,  1.21it/s]
  0%|          | 12/5000 [00:09<1:06:44,  1.25it/s]
  0%|          | 13/5000 [00:10<1:05:25,  1.27it/s]
  0%|          | 14/5000 [00:11<1:12:03,  1.15it/s]
  0%|          | 15/5000 [00:12<1:16:33,  1.09it/s]
  0%|          | 16/5000 [00:12<1:12:40,  1.14it/s]
  0%|          | 17/5000 [00:13<1:11:32,  1.16it/s]
  0%|          | 18/5000 [00:14<1:15:18,  1.10it/s]
  0%|          | 19/5000 [00:

{'eval_loss': 0.20183782279491425, 'eval_wer': 13.763509218054674, 'eval_runtime': 5160.3838, 'eval_samples_per_second': 0.969, 'eval_steps_per_second': 0.969, 'epoch': 0.21}


 12%|█▏        | 600/5000 [9:28:42<11:04:08,  9.06s/it]
100%|██████████| 5000/5000 [1:25:58<00:00,  1.27it/s]
                                                           

{'loss': 0.2032, 'grad_norm': 0.695920467376709, 'learning_rate': 0.000875, 'epoch': 0.22}


                                                       

{'loss': 0.1874, 'grad_norm': 0.4890848696231842, 'learning_rate': 0.00087, 'epoch': 0.23}


                                                       

{'loss': 0.2185, 'grad_norm': 1.2768309116363525, 'learning_rate': 0.0008652, 'epoch': 0.24}


                                                       

{'loss': 0.2026, 'grad_norm': 0.4620298743247986, 'learning_rate': 0.0008602, 'epoch': 0.25}


 14%|█▍        | 700/5000 [9:43:19<10:35:11,  8.86s/it]
  0%|          | 0/5000 [00:00<?, ?it/s]
  0%|          | 2/5000 [00:00<33:27,  2.49it/s]
  0%|          | 3/5000 [00:02<1:01:13,  1.36it/s]
  0%|          | 4/5000 [00:02<1:03:21,  1.31it/s]
  0%|          | 5/5000 [00:03<1:05:12,  1.28it/s]
  0%|          | 6/5000 [00:04<1:02:35,  1.33it/s]
  0%|          | 7/5000 [00:05<1:06:43,  1.25it/s]
  0%|          | 8/5000 [00:05<1:03:08,  1.32it/s]
  0%|          | 9/5000 [00:06<1:08:26,  1.22it/s]
  0%|          | 10/5000 [00:07<1:05:01,  1.28it/s]
  0%|          | 11/5000 [00:08<1:09:49,  1.19it/s]
  0%|          | 12/5000 [00:09<1:06:37,  1.25it/s]
  0%|          | 13/5000 [00:09<1:04:38,  1.29it/s]
  0%|          | 14/5000 [00:10<1:08:25,  1.21it/s]
  0%|          | 15/5000 [00:11<1:13:18,  1.13it/s]
  0%|          | 16/5000 [00:12<1:10:20,  1.18it/s]
  0%|          | 17/5000 [00:13<1:10:05,  1.18it/s]
  0%|          | 18/5000 [00:14<1:14:24,  1.12it/s]
  0%|          | 19/5000 [00:

{'eval_loss': 0.19779914617538452, 'eval_wer': 14.085611358338632, 'eval_runtime': 5078.1627, 'eval_samples_per_second': 0.985, 'eval_steps_per_second': 0.985, 'epoch': 0.25}


 14%|█▍        | 700/5000 [11:07:58<10:35:11,  8.86s/it]
100%|██████████| 5000/5000 [1:24:36<00:00,  1.30it/s]
                                                            

{'loss': 0.1992, 'grad_norm': 0.8980417251586914, 'learning_rate': 0.0008552, 'epoch': 0.26}


                                                        

{'loss': 0.1897, 'grad_norm': 0.7098341584205627, 'learning_rate': 0.0008502, 'epoch': 0.27}


                                                        

{'loss': 0.1985, 'grad_norm': 0.8628239035606384, 'learning_rate': 0.0008451999999999999, 'epoch': 0.28}


                                                        

{'loss': 0.1985, 'grad_norm': 0.5362813472747803, 'learning_rate': 0.0008401999999999999, 'epoch': 0.28}


 16%|█▌        | 800/5000 [11:22:32<10:21:02,  8.87s/it]
  0%|          | 0/5000 [00:00<?, ?it/s]
  0%|          | 2/5000 [00:00<32:28,  2.57it/s]
  0%|          | 3/5000 [00:01<1:00:28,  1.38it/s]
  0%|          | 4/5000 [00:02<1:03:42,  1.31it/s]
  0%|          | 5/5000 [00:03<1:04:34,  1.29it/s]
  0%|          | 6/5000 [00:04<1:01:19,  1.36it/s]
  0%|          | 7/5000 [00:05<1:05:53,  1.26it/s]
  0%|          | 8/5000 [00:05<1:02:18,  1.34it/s]
  0%|          | 9/5000 [00:06<1:07:52,  1.23it/s]
  0%|          | 10/5000 [00:07<1:04:53,  1.28it/s]
  0%|          | 11/5000 [00:08<1:07:20,  1.23it/s]
  0%|          | 12/5000 [00:09<1:04:46,  1.28it/s]
  0%|          | 13/5000 [00:09<1:03:19,  1.31it/s]
  0%|          | 14/5000 [00:10<1:07:59,  1.22it/s]
  0%|          | 15/5000 [00:11<1:12:33,  1.15it/s]
  0%|          | 16/5000 [00:12<1:09:36,  1.19it/s]
  0%|          | 17/5000 [00:13<1:09:10,  1.20it/s]
  0%|          | 18/5000 [00:14<1:13:25,  1.13it/s]
  0%|          | 19/5000 [00

{'eval_loss': 0.19235444068908691, 'eval_wer': 13.693579148124602, 'eval_runtime': 5089.0931, 'eval_samples_per_second': 0.982, 'eval_steps_per_second': 0.982, 'epoch': 0.28}


 16%|█▌        | 800/5000 [12:47:21<10:21:02,  8.87s/it]
100%|██████████| 5000/5000 [1:24:47<00:00,  1.30it/s]
                                                            

{'loss': 0.1905, 'grad_norm': 0.6919968128204346, 'learning_rate': 0.0008352, 'epoch': 0.29}


                                                        

{'loss': 0.1988, 'grad_norm': 2.833432197570801, 'learning_rate': 0.0008302, 'epoch': 0.3}


                                                        

{'loss': 0.1872, 'grad_norm': 0.4417954385280609, 'learning_rate': 0.0008252000000000001, 'epoch': 0.31}


                                                        

{'loss': 0.1834, 'grad_norm': 1.200722336769104, 'learning_rate': 0.0008202000000000001, 'epoch': 0.32}


 18%|█▊        | 900/5000 [13:02:00<10:06:23,  8.87s/it]
  0%|          | 0/5000 [00:00<?, ?it/s]
  0%|          | 2/5000 [00:00<33:04,  2.52it/s]
  0%|          | 3/5000 [00:02<1:05:02,  1.28it/s]
  0%|          | 4/5000 [00:02<1:06:30,  1.25it/s]
  0%|          | 5/5000 [00:03<1:06:23,  1.25it/s]
  0%|          | 6/5000 [00:04<1:02:40,  1.33it/s]
  0%|          | 7/5000 [00:05<1:06:40,  1.25it/s]
  0%|          | 8/5000 [00:05<1:03:03,  1.32it/s]
  0%|          | 9/5000 [00:06<1:08:08,  1.22it/s]
  0%|          | 10/5000 [00:07<1:04:40,  1.29it/s]
  0%|          | 11/5000 [00:08<1:07:54,  1.22it/s]
  0%|          | 12/5000 [00:09<1:05:50,  1.26it/s]
  0%|          | 13/5000 [00:09<1:04:30,  1.29it/s]
  0%|          | 14/5000 [00:10<1:09:00,  1.20it/s]
  0%|          | 15/5000 [00:11<1:13:48,  1.13it/s]
  0%|          | 16/5000 [00:12<1:10:14,  1.18it/s]
  0%|          | 17/5000 [00:13<1:09:51,  1.19it/s]
  0%|          | 18/5000 [00:14<1:14:51,  1.11it/s]
  0%|          | 19/5000 [00

{'eval_loss': 0.18540406227111816, 'eval_wer': 11.792752701843611, 'eval_runtime': 5124.9982, 'eval_samples_per_second': 0.976, 'eval_steps_per_second': 0.976, 'epoch': 0.32}


 18%|█▊        | 900/5000 [14:27:25<10:06:23,  8.87s/it]
100%|██████████| 5000/5000 [1:25:23<00:00,  1.29it/s]
                                                            

{'loss': 0.1816, 'grad_norm': 1.0943645238876343, 'learning_rate': 0.0008152000000000001, 'epoch': 0.33}


                                                        

{'loss': 0.1913, 'grad_norm': 0.8008790016174316, 'learning_rate': 0.0008102000000000001, 'epoch': 0.34}


                                                        

{'loss': 0.2259, 'grad_norm': 1.0650213956832886, 'learning_rate': 0.0008052000000000001, 'epoch': 0.35}


                                                        

{'loss': 0.1773, 'grad_norm': 1.293294906616211, 'learning_rate': 0.0008002, 'epoch': 0.36}


 20%|██        | 1000/5000 [14:42:04<9:56:37,  8.95s/it]
  0%|          | 0/5000 [00:00<?, ?it/s]
  0%|          | 2/5000 [00:00<32:53,  2.53it/s]
  0%|          | 3/5000 [00:02<1:02:52,  1.32it/s]
  0%|          | 4/5000 [00:02<1:05:05,  1.28it/s]
  0%|          | 5/5000 [00:03<1:05:36,  1.27it/s]
  0%|          | 6/5000 [00:04<1:02:34,  1.33it/s]
  0%|          | 7/5000 [00:05<1:06:17,  1.26it/s]
  0%|          | 8/5000 [00:05<1:02:48,  1.32it/s]
  0%|          | 9/5000 [00:06<1:08:05,  1.22it/s]
  0%|          | 10/5000 [00:07<1:06:14,  1.26it/s]
  0%|          | 11/5000 [00:08<1:10:59,  1.17it/s]
  0%|          | 12/5000 [00:09<1:07:56,  1.22it/s]
  0%|          | 13/5000 [00:10<1:06:29,  1.25it/s]
  0%|          | 14/5000 [00:11<1:09:58,  1.19it/s]
  0%|          | 15/5000 [00:12<1:13:41,  1.13it/s]
  0%|          | 16/5000 [00:12<1:09:47,  1.19it/s]
  0%|          | 17/5000 [00:13<1:09:38,  1.19it/s]
  0%|          | 18/5000 [00:14<1:13:16,  1.13it/s]
  0%|          | 19/5000 [00

{'eval_loss': 0.18308520317077637, 'eval_wer': 12.049162958253866, 'eval_runtime': 5192.0933, 'eval_samples_per_second': 0.963, 'eval_steps_per_second': 0.963, 'epoch': 0.36}


 20%|██        | 1000/5000 [16:08:36<9:56:37,  8.95s/it]
100%|██████████| 5000/5000 [1:26:30<00:00,  1.27it/s]
  return fn(*args, **kwargs)
                                                             

{'loss': 0.1987, 'grad_norm': 0.8330914378166199, 'learning_rate': 0.0007952, 'epoch': 0.36}


                                                         

{'loss': 0.1604, 'grad_norm': 0.8253604769706726, 'learning_rate': 0.0007902, 'epoch': 0.37}


                                                         

{'loss': 0.2007, 'grad_norm': 0.4940810203552246, 'learning_rate': 0.0007852, 'epoch': 0.38}


                                                         

{'loss': 0.1879, 'grad_norm': 0.9557390809059143, 'learning_rate': 0.0007802, 'epoch': 0.39}


 22%|██▏       | 1100/5000 [16:23:46<9:58:58,  9.21s/it]
  0%|          | 0/5000 [00:00<?, ?it/s]
  0%|          | 2/5000 [00:00<34:31,  2.41it/s]
  0%|          | 3/5000 [00:02<1:04:42,  1.29it/s]
  0%|          | 4/5000 [00:02<1:06:27,  1.25it/s]
  0%|          | 5/5000 [00:03<1:08:23,  1.22it/s]
  0%|          | 6/5000 [00:04<1:05:13,  1.28it/s]
  0%|          | 7/5000 [00:05<1:09:29,  1.20it/s]
  0%|          | 8/5000 [00:06<1:05:51,  1.26it/s]
  0%|          | 9/5000 [00:07<1:10:38,  1.18it/s]
  0%|          | 10/5000 [00:07<1:08:05,  1.22it/s]
  0%|          | 11/5000 [00:08<1:10:49,  1.17it/s]
  0%|          | 12/5000 [00:09<1:08:13,  1.22it/s]
  0%|          | 13/5000 [00:10<1:06:44,  1.25it/s]
  0%|          | 14/5000 [00:11<1:11:06,  1.17it/s]
  0%|          | 15/5000 [00:12<1:16:24,  1.09it/s]
  0%|          | 16/5000 [00:13<1:12:38,  1.14it/s]
  0%|          | 17/5000 [00:13<1:11:31,  1.16it/s]
  0%|          | 18/5000 [00:15<1:16:05,  1.09it/s]
  0%|          | 19/5000 [00

{'eval_loss': 0.1812552958726883, 'eval_wer': 10.180122907395635, 'eval_runtime': 5363.7879, 'eval_samples_per_second': 0.932, 'eval_steps_per_second': 0.932, 'epoch': 0.39}


 22%|██▏       | 1100/5000 [17:53:10<9:58:58,  9.21s/it]
100%|██████████| 5000/5000 [1:29:22<00:00,  1.20it/s]
                                                             

{'loss': 0.187, 'grad_norm': 0.5019859075546265, 'learning_rate': 0.0007752, 'epoch': 0.4}


                                                         

{'loss': 0.1996, 'grad_norm': 0.9426894783973694, 'learning_rate': 0.0007702, 'epoch': 0.41}


                                                         

{'loss': 0.176, 'grad_norm': 0.5935992002487183, 'learning_rate': 0.0007652000000000001, 'epoch': 0.42}


                                                         

{'loss': 0.1703, 'grad_norm': 0.8239011764526367, 'learning_rate': 0.0007602, 'epoch': 0.43}


 24%|██▍       | 1200/5000 [18:08:09<10:00:08,  9.48s/it]
  0%|          | 0/5000 [00:00<?, ?it/s]
  0%|          | 2/5000 [00:00<35:51,  2.32it/s]
  0%|          | 3/5000 [00:02<1:05:50,  1.26it/s]
  0%|          | 4/5000 [00:03<1:08:03,  1.22it/s]
  0%|          | 5/5000 [00:03<1:08:49,  1.21it/s]
  0%|          | 6/5000 [00:04<1:05:41,  1.27it/s]
  0%|          | 7/5000 [00:05<1:09:45,  1.19it/s]
  0%|          | 8/5000 [00:06<1:06:22,  1.25it/s]
  0%|          | 9/5000 [00:07<1:11:51,  1.16it/s]
  0%|          | 10/5000 [00:08<1:09:22,  1.20it/s]
  0%|          | 11/5000 [00:09<1:14:28,  1.12it/s]
  0%|          | 12/5000 [00:09<1:12:26,  1.15it/s]
  0%|          | 13/5000 [00:10<1:11:02,  1.17it/s]
  0%|          | 14/5000 [00:11<1:15:59,  1.09it/s]
  0%|          | 15/5000 [00:12<1:22:07,  1.01it/s]
  0%|          | 16/5000 [00:13<1:17:43,  1.07it/s]
  0%|          | 17/5000 [00:14<1:16:12,  1.09it/s]
  0%|          | 18/5000 [00:15<1:21:33,  1.02it/s]
  0%|          | 19/5000 [0

{'eval_loss': 0.1795942783355713, 'eval_wer': 11.18457300275482, 'eval_runtime': 5357.9996, 'eval_samples_per_second': 0.933, 'eval_steps_per_second': 0.933, 'epoch': 0.43}


 24%|██▍       | 1200/5000 [19:37:27<10:00:08,  9.48s/it]
100%|██████████| 5000/5000 [1:29:16<00:00,  1.15it/s]
                                                             

{'loss': 0.1944, 'grad_norm': 1.0388617515563965, 'learning_rate': 0.0007552, 'epoch': 0.44}


                                                         

{'loss': 0.1865, 'grad_norm': 0.5866698026657104, 'learning_rate': 0.0007502, 'epoch': 0.44}


                                                        

{'loss': 0.1744, 'grad_norm': 0.9438706040382385, 'learning_rate': 0.0007452, 'epoch': 0.45}


                                                         

{'loss': 0.1811, 'grad_norm': 1.963351845741272, 'learning_rate': 0.0007402, 'epoch': 0.46}


 26%|██▌       | 1300/5000 [19:53:04<9:50:20,  9.57s/it]
  0%|          | 0/5000 [00:00<?, ?it/s]
  0%|          | 2/5000 [00:00<36:10,  2.30it/s]
  0%|          | 3/5000 [00:02<1:07:01,  1.24it/s]
  0%|          | 4/5000 [00:03<1:09:59,  1.19it/s]
  0%|          | 5/5000 [00:04<1:13:44,  1.13it/s]
  0%|          | 6/5000 [00:04<1:09:11,  1.20it/s]
  0%|          | 7/5000 [00:05<1:13:24,  1.13it/s]
  0%|          | 8/5000 [00:06<1:10:27,  1.18it/s]
  0%|          | 9/5000 [00:07<1:16:17,  1.09it/s]
  0%|          | 10/5000 [00:08<1:12:01,  1.15it/s]
  0%|          | 11/5000 [00:09<1:14:23,  1.12it/s]
  0%|          | 12/5000 [00:10<1:11:12,  1.17it/s]
  0%|          | 13/5000 [00:10<1:09:49,  1.19it/s]
  0%|          | 14/5000 [00:11<1:14:56,  1.11it/s]
  0%|          | 15/5000 [00:13<1:18:57,  1.05it/s]
  0%|          | 16/5000 [00:13<1:15:09,  1.11it/s]
  0%|          | 17/5000 [00:14<1:14:27,  1.12it/s]
  0%|          | 18/5000 [00:15<1:19:11,  1.05it/s]
  0%|          | 19/5000 [00

{'eval_loss': 0.1810889095067978, 'eval_wer': 12.400932400932401, 'eval_runtime': 5421.0607, 'eval_samples_per_second': 0.922, 'eval_steps_per_second': 0.922, 'epoch': 0.46}


 26%|██▌       | 1300/5000 [21:23:25<9:50:20,  9.57s/it]
100%|██████████| 5000/5000 [1:30:19<00:00,  1.23it/s]
                                                             

{'loss': 0.16, 'grad_norm': 0.6998048424720764, 'learning_rate': 0.0007352, 'epoch': 0.47}


                                                        

{'loss': 0.1691, 'grad_norm': 1.2596937417984009, 'learning_rate': 0.0007302, 'epoch': 0.48}


                                                        

{'loss': 0.223, 'grad_norm': 2.3699045181274414, 'learning_rate': 0.0007252, 'epoch': 0.49}


                                                        

{'loss': 0.1693, 'grad_norm': 0.9143420457839966, 'learning_rate': 0.0007201999999999999, 'epoch': 0.5}


 28%|██▊       | 1400/5000 [21:37:53<8:05:09,  8.09s/it]
  0%|          | 0/5000 [00:00<?, ?it/s]
  0%|          | 2/5000 [00:00<36:23,  2.29it/s]
  0%|          | 3/5000 [00:01<59:17,  1.40it/s]
  0%|          | 4/5000 [00:02<1:03:03,  1.32it/s]
  0%|          | 5/5000 [00:03<1:06:38,  1.25it/s]
  0%|          | 6/5000 [00:04<1:03:30,  1.31it/s]
  0%|          | 7/5000 [00:05<1:08:42,  1.21it/s]
  0%|          | 8/5000 [00:05<1:02:09,  1.34it/s]
  0%|          | 9/5000 [00:06<1:05:38,  1.27it/s]
  0%|          | 10/5000 [00:07<1:00:43,  1.37it/s]
  0%|          | 11/5000 [00:08<1:06:19,  1.25it/s]
  0%|          | 12/5000 [00:09<1:04:47,  1.28it/s]
  0%|          | 13/5000 [00:09<1:00:35,  1.37it/s]
  0%|          | 14/5000 [00:10<1:03:09,  1.32it/s]
  0%|          | 15/5000 [00:11<1:07:20,  1.23it/s]
  0%|          | 16/5000 [00:12<1:05:09,  1.27it/s]
  0%|          | 17/5000 [00:12<1:06:10,  1.26it/s]
  0%|          | 18/5000 [00:13<1:09:55,  1.19it/s]
  0%|          | 19/5000 [00:1

{'eval_loss': 0.18024279177188873, 'eval_wer': 10.057215511760967, 'eval_runtime': 5190.8358, 'eval_samples_per_second': 0.963, 'eval_steps_per_second': 0.963, 'epoch': 0.5}


 28%|██▊       | 1400/5000 [23:04:24<8:05:09,  8.09s/it]
100%|██████████| 5000/5000 [1:26:29<00:00,  1.22it/s]
                                                             

{'loss': 0.1898, 'grad_norm': 0.948160707950592, 'learning_rate': 0.0007151999999999999, 'epoch': 0.51}


                                                        

{'loss': 0.1774, 'grad_norm': 0.9148449897766113, 'learning_rate': 0.0007102, 'epoch': 0.52}


                                                        

{'loss': 0.1897, 'grad_norm': 0.6572932004928589, 'learning_rate': 0.0007052, 'epoch': 0.52}


                                                        

{'loss': 0.2097, 'grad_norm': 0.5185699462890625, 'learning_rate': 0.0007002000000000001, 'epoch': 0.53}


 30%|███       | 1500/5000 [23:17:52<7:55:06,  8.14s/it]
  0%|          | 0/5000 [00:00<?, ?it/s]
  0%|          | 2/5000 [00:00<33:20,  2.50it/s]
  0%|          | 3/5000 [00:02<1:02:46,  1.33it/s]
  0%|          | 4/5000 [00:02<1:04:45,  1.29it/s]
  0%|          | 5/5000 [00:03<1:06:02,  1.26it/s]
  0%|          | 6/5000 [00:04<1:03:20,  1.31it/s]
  0%|          | 7/5000 [00:05<1:08:12,  1.22it/s]
  0%|          | 8/5000 [00:06<1:04:49,  1.28it/s]
  0%|          | 9/5000 [00:07<1:10:07,  1.19it/s]
  0%|          | 10/5000 [00:07<1:06:57,  1.24it/s]
  0%|          | 11/5000 [00:08<1:09:12,  1.20it/s]
  0%|          | 12/5000 [00:09<1:06:58,  1.24it/s]
  0%|          | 13/5000 [00:10<1:05:12,  1.27it/s]
  0%|          | 14/5000 [00:11<1:09:21,  1.20it/s]
  0%|          | 15/5000 [00:12<1:13:48,  1.13it/s]
  0%|          | 16/5000 [00:12<1:09:40,  1.19it/s]
  0%|          | 17/5000 [00:13<1:09:10,  1.20it/s]
  0%|          | 18/5000 [00:14<1:14:22,  1.12it/s]
  0%|          | 19/5000 [00

{'eval_loss': 0.17224851250648499, 'eval_wer': 9.970332697605425, 'eval_runtime': 5243.2151, 'eval_samples_per_second': 0.954, 'eval_steps_per_second': 0.954, 'epoch': 0.53}


 30%|███       | 1500/5000 [24:45:15<7:55:06,  8.14s/it]
100%|██████████| 5000/5000 [1:27:21<00:00,  1.28it/s]
                                                             

{'loss': 0.2034, 'grad_norm': 1.3878836631774902, 'learning_rate': 0.0006952000000000001, 'epoch': 0.54}


                                                        

{'loss': 0.1867, 'grad_norm': 1.5498467683792114, 'learning_rate': 0.0006902000000000001, 'epoch': 0.55}


                                                        

{'loss': 0.1802, 'grad_norm': 0.964836835861206, 'learning_rate': 0.0006852000000000001, 'epoch': 0.56}


                                                        

{'loss': 0.1898, 'grad_norm': 0.7967449426651001, 'learning_rate': 0.0006802, 'epoch': 0.57}


 32%|███▏      | 1600/5000 [24:58:38<7:40:48,  8.13s/it]
  0%|          | 0/5000 [00:00<?, ?it/s]
  0%|          | 2/5000 [00:00<33:51,  2.46it/s]
  0%|          | 3/5000 [00:02<1:02:38,  1.33it/s]
  0%|          | 4/5000 [00:02<1:04:35,  1.29it/s]
  0%|          | 5/5000 [00:03<1:05:06,  1.28it/s]
  0%|          | 6/5000 [00:04<1:02:07,  1.34it/s]
  0%|          | 7/5000 [00:05<1:06:15,  1.26it/s]
  0%|          | 8/5000 [00:05<1:02:54,  1.32it/s]
  0%|          | 9/5000 [00:06<1:09:32,  1.20it/s]
  0%|          | 10/5000 [00:07<1:07:09,  1.24it/s]
  0%|          | 11/5000 [00:08<1:10:15,  1.18it/s]
  0%|          | 12/5000 [00:09<1:08:12,  1.22it/s]
  0%|          | 13/5000 [00:10<1:06:04,  1.26it/s]
  0%|          | 14/5000 [00:11<1:09:58,  1.19it/s]
  0%|          | 15/5000 [00:12<1:14:43,  1.11it/s]
  0%|          | 16/5000 [00:12<1:11:04,  1.17it/s]
  0%|          | 17/5000 [00:13<1:10:06,  1.18it/s]
  0%|          | 18/5000 [00:14<1:14:39,  1.11it/s]
  0%|          | 19/5000 [00

KeyboardInterrupt: 

In [3]:
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from peft import PeftModel
import librosa
import numpy as np

# 1. Ayarlar ve Dosya Yolları
model_id = "openai/whisper-base" 
checkpoint_path = "./whisper-omer-project/checkpoint-1000" 
test_audio_file = "mehmet_deneme.mp3" # Buraya kendi dosyanı yaz

print("Model ve İşlemci yükleniyor...")

# 2. Processor ve Model Yükleme (Öncekiyle aynı)
processor = WhisperProcessor.from_pretrained(
    model_id, 
    language="English", 
    task="transcribe",
    local_files_only=True
)

model = WhisperForConditionalGeneration.from_pretrained(
    model_id, 
    load_in_8bit=False, 
    device_map="auto",
    local_files_only=True
)

# LoRA Adapter entegrasyonu
model = PeftModel.from_pretrained(model, checkpoint_path)

print("Model hazır! Ses dosyası işleniyor...")

# 5. KUSURSUZ DÖNÜŞTÜRME KATMANI
# librosa.load fonksiyonuna sr=16000 eklediğimizde, dosya ne olursa olsun 
# otomatik olarak 16kHz'e resample edilir (yeniden örneklenir).
# mono=True ise sesi tek kanala indirger.
speech, sr = librosa.load(test_audio_file, sr=16000, mono=True)

# 5.1. Normalizasyon (Opsiyonel ama önerilir)
# Ses çok kısık veya çok yüksekse dengeler.
if np.abs(speech).max() > 0:
    speech = speech / np.abs(speech).max()

# 6. Özellik Çıkarımı ve Tahmin
input_features = processor(speech, sampling_rate=16000, return_tensors="pt").input_features.to("cuda")

with torch.no_grad():
    # max_length ekleyerek çok uzun dosyalarda kesilmesini önleyebilirsin
    predicted_ids = model.generate(input_features, max_length=255)

# 7. Sonuç
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

print("\n" + "="*40)
print(f"MODEL ÇIKTISI: {transcription}")
print("="*40)

Model ve İşlemci yükleniyor...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Model hazır! Ses dosyası işleniyor...

MODEL ÇIKTISI:  Can you hear me baby i'm coming for you you are a beautiful woman and i'll fuck you
