### Audio PreProcess

=> Extracting the timestamp and the subtitles from our dataset and preparing it for the next step: Segment the Audio using Timestamps.

In [2]:
import pandas as pd

# Assuming your TXT is tab-separated or can be read as CSV
# You might need to adjust delimiter based on how you extracted it
subtitle_df = pd.read_csv('data/raw_subtitles/eerwi1.txt', sep='\t')
print(subtitle_df.head())

       Time  Unnamed: 1                                           Subtitle  \
0  00:00:00         NaN                                         [تصفيق]      
1  00:00:06         NaN   ثم زوج اخوه واحد قليل وواحد غني والغني هو الل...   
2  00:00:13         NaN   هاك القليل عنده مجريش اولاد والنهار كله هو وا...   
3  00:00:22         NaN       في ساق تغرق ساق ساعه ساعه يمشي الاخوه لكن      
4  00:00:27         NaN   مبسوط يا اخويا عاون يا اخويا راني في اشد التع...   

   Unnamed: 3                                        Translation  
0         NaN                                        [Applause]   
1         NaN  Then his brother  married  one  of  them,  one...  
2         NaN  Here is the poor one who has no children,  and...  
3         NaN  leg, and the other sinks, hour  by  hour,  the...  
4         NaN  happy, my brother. Help, my brother. I was in ...  


In [3]:
print(subtitle_df.columns)

Index(['Time', 'Unnamed: 1', 'Subtitle', 'Unnamed: 3', 'Translation'], dtype='object')


In [4]:
subtitle_df = subtitle_df.drop(columns=["Unnamed: 1", "Unnamed: 3"])

In [5]:
subtitle_df['Next_Time'] = subtitle_df['Time'].shift(-1)
subtitle_df['Time_Range'] = subtitle_df['Time'] + ' - ' + subtitle_df['Next_Time']
print(subtitle_df.columns)
print(subtitle_df.head())
print('--------------------------------------------------------------')
print('--------------------------------------------------------------')

subtitle_df = subtitle_df.drop(index=664)
print(subtitle_df.tail())

Index(['Time', 'Subtitle', 'Translation', 'Next_Time', 'Time_Range'], dtype='object')
       Time                                           Subtitle  \
0  00:00:00                                         [تصفيق]      
1  00:00:06   ثم زوج اخوه واحد قليل وواحد غني والغني هو الل...   
2  00:00:13   هاك القليل عنده مجريش اولاد والنهار كله هو وا...   
3  00:00:22       في ساق تغرق ساق ساعه ساعه يمشي الاخوه لكن      
4  00:00:27   مبسوط يا اخويا عاون يا اخويا راني في اشد التع...   

                                         Translation Next_Time  \
0                                        [Applause]   00:00:06   
1  Then his brother  married  one  of  them,  one...  00:00:13   
2  Here is the poor one who has no children,  and...  00:00:22   
3  leg, and the other sinks, hour  by  hour,  the...  00:00:27   
4  happy, my brother. Help, my brother. I was in ...  00:00:37   

            Time_Range  
0  00:00:00 - 00:00:06  
1  00:00:06 - 00:00:13  
2  00:00:13 - 00:00:22  
3  00:00:22 - 00:00:

In [6]:
subtitle_df = subtitle_df.drop(columns=["Time", "Next_Time"])

In [7]:
# # Rename columns for clarity (if not already done)
# subtitle_df.columns = ['Time_Range', 'Subtitle', 'Translation']

# Parse Time_Range into start and end times
# This will depend on the exact format of your 'Time' column.
# Example format: "00:00:05.123 - 00:00:08.456"
def parse_time_range(time_str):
    start_str, end_str = time_str.split(' - ')
    # Convert HH:MM:SS.mmm to seconds
    def to_seconds(time_part_str):
        h, m, s = map(float, time_part_str.split(':'))
        return h * 3600 + m * 60 + s
    return to_seconds(start_str), to_seconds(end_str)

subtitle_df[['Start_Time', 'End_Time']] = subtitle_df['Time_Range'].apply(lambda x: pd.Series(parse_time_range(x)))

# Now you have subtitle_df with 'Start_Time', 'End_Time', and 'Subtitle'
print(subtitle_df.head())
print('--------------------------------------------------------------')
print('--------------------------------------------------------------')
print(subtitle_df.tail())

                                            Subtitle  \
0                                         [تصفيق]      
1   ثم زوج اخوه واحد قليل وواحد غني والغني هو الل...   
2   هاك القليل عنده مجريش اولاد والنهار كله هو وا...   
3       في ساق تغرق ساق ساعه ساعه يمشي الاخوه لكن      
4   مبسوط يا اخويا عاون يا اخويا راني في اشد التع...   

                                         Translation           Time_Range  \
0                                        [Applause]   00:00:00 - 00:00:06   
1  Then his brother  married  one  of  them,  one...  00:00:06 - 00:00:13   
2  Here is the poor one who has no children,  and...  00:00:13 - 00:00:22   
3  leg, and the other sinks, hour  by  hour,  the...  00:00:22 - 00:00:27   
4  happy, my brother. Help, my brother. I was in ...  00:00:27 - 00:00:37   

   Start_Time  End_Time  
0         0.0       6.0  
1         6.0      13.0  
2        13.0      22.0  
3        22.0      27.0  
4        27.0      37.0  
--------------------------------------------

In [8]:
subtitle_df = subtitle_df.drop(columns="Translation")

In [9]:
print(subtitle_df.head())
print('--------------------------------------------------------------')
print('--------------------------------------------------------------')
print(subtitle_df.tail())

                                            Subtitle           Time_Range  \
0                                         [تصفيق]     00:00:00 - 00:00:06   
1   ثم زوج اخوه واحد قليل وواحد غني والغني هو الل...  00:00:06 - 00:00:13   
2   هاك القليل عنده مجريش اولاد والنهار كله هو وا...  00:00:13 - 00:00:22   
3       في ساق تغرق ساق ساعه ساعه يمشي الاخوه لكن     00:00:22 - 00:00:27   
4   مبسوط يا اخويا عاون يا اخويا راني في اشد التع...  00:00:27 - 00:00:37   

   Start_Time  End_Time  
0         0.0       6.0  
1         6.0      13.0  
2        13.0      22.0  
3        22.0      27.0  
4        27.0      37.0  
--------------------------------------------------------------
--------------------------------------------------------------
                                              Subtitle           Time_Range  \
659   يعرفوش الناس يفطروا في الصباح وهو زاد عمل انف...  01:30:47 - 01:30:55   
660   صباح بقهم خير شد النيه وزورونا وتزكم الايام ا...  01:30:55 - 01:31:02   
661       اخره كيف

In [10]:
subtitle_df.to_csv("data/eerwi1.csv", index=False)

### Audio Segmentation

=> This step is for associating audio with text.

In [11]:
from pydub import AudioSegment

full_audio = AudioSegment.from_wav("data/full_audio/eerwi1.wav")

AudioSegment.converter = "C:\ffmpeg-2025-06-26-git-09cd38e9d5-essentials_build\ffmpeg-2025-06-26-git-09cd38e9d5-essentials_build\bin\ffmpeg.exe"
output_audio_dir = 'data/processed_segments/audio'
output_transcripts_dir = 'data/processed_segments/transcripts'

import os
os.makedirs(output_audio_dir, exist_ok=True)
os.makedirs(output_transcripts_dir, exist_ok=True)

data_records = [] # To store paths and transcripts for later use

for index, row in subtitle_df.iterrows():
    start_ms = int(row['Start_Time'] * 1000) # pydub uses milliseconds
    end_ms = int(row['End_Time'] * 1000)
    subtitle_text = row['Subtitle']
    
    if end_ms - start_ms < 500: # Minimum 0.5 seconds for a valid speech segment
        print(f"Skipping very short segment at index {index}: Duration {end_ms - start_ms} ms")
        continue

    # Generate unique filename for the segment
    segment_filename = f"segment_{index:05d}.wav" # e.g., segment_00000.wav, segment_00001.wav
    segment_filepath = os.path.join(output_audio_dir, segment_filename)

    segment_filepath_relative = os.path.join('data/processed_segments/audio', segment_filename)

    # Save the audio segment
    segment_audio = full_audio[start_ms:end_ms]
    segment_audio.export(segment_filepath, format="wav")

    # Save the corresponding transcript in a separate file or keep in memory
    transcript_filename = f"segment_{index:05d}.txt"
    transcript_filepath = os.path.join(output_transcripts_dir, transcript_filename)
    with open(transcript_filepath, 'w', encoding='utf-8') as f:
        f.write(subtitle_text)

    data_records.append({
        "audio": {"path": segment_filepath_relative},
        "text": subtitle_text
    })
# Now, `data_records` will contain a list of dictionaries,
# each with the path to an audio segment and its corresponding transcript.
# This list or a similar structure will be used to create our Hugging Face Dataset.

In [12]:
print("Audio segments and transcripts have been saved successfully.")
print(f"Total segments created: {len(data_records)}")
print('Exemple record:', data_records[8])

Audio segments and transcripts have been saved successfully.
Total segments created: 664
Exemple record: {'audio': {'path': 'data/processed_segments/audio\\segment_00008.wav'}, 'text': ' هذا ربي سخر وين زعما وخلقه ولا صنيعه وبنادم ولا   '}


In [13]:
import json

# Save the metadata to a .jsonl file
metadata_jsonl_path = os.path.join('data', 'metadata.jsonl')
with open(metadata_jsonl_path, 'w', encoding='utf-8') as f:
    for record in data_records:
        f.write(json.dumps(record, ensure_ascii=False) + '\n') # ensure_ascii=False for Arabic characters

print(f"Generated {len(data_records)} audio segments and saved metadata to {metadata_jsonl_path}")

Generated 664 audio segments and saved metadata to data\metadata.jsonl


### Adapting Data to Whisper's Requirements 

=> Objective: To transform your raw audio segments and metadata.jsonl into a DatasetDict object, ready for Whisper's Trainer.

In [14]:
from datasets import load_dataset, Audio
import os


data_dir = "data/processed_segments" 

# Load the dataset from your metadata.jsonl file
# The 'audio' feature needs to be explicitly cast to Audio type for automatic loading
# and potentially resampling.
dataset = load_dataset(
    "json",
    data_files=os.path.join('data', "metadata.jsonl"),
    split="train" # Load as a single 'train' split initially, we'll split it later
)

# Cast the 'audio' column to Audio feature with the target sampling rate for Whisper
# This will automatically load and resample the audio files to 16kHz
target_sampling_rate = 16000
dataset = dataset.cast_column("audio", Audio(sampling_rate=target_sampling_rate))

print(f"Dataset loaded. Number of examples: {len(dataset)}")
print(dataset[0]) # Print first example to inspect its structure

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 664 examples [00:00, 15132.68 examples/s]


Dataset loaded. Number of examples: 664
{'audio': {'path': 'data/processed_segments/audio\\segment_00000.wav', 'array': array([0.        , 0.        , 0.        , ..., 0.01946811, 0.017743  ,
       0.01702571], shape=(96000,)), 'sampling_rate': 16000}, 'text': ' [تصفيق]   '}


In [15]:
# Load Whisper's Processor (Feature Extractor & Tokenizer)
from transformers import WhisperFeatureExtractor, WhisperTokenizer
# from transformers import WhisperProcessor

# Choose the Whisper model size you'll fine-tune. 'small' or 'base' are good starts.
model_path = "openai/whisper-small" # Or "openai/whisper-base"

# Feature Extractor: processes audio to model inputs
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_path)

# Tokenizer: converts text labels to token IDs
# We specify the language ('ar' for Arabic) and the task ('transcribe')
# This is crucial for Whisper's multilingual capabilities.
tokenizer = WhisperTokenizer.from_pretrained(model_path, language="ar", task="transcribe")

# Processor: combines Feature Extractor and Tokenizer
# processor = WhisperProcessor.from_pretrained(model_path, language="ar", task="transcribe")

print("Whisper processor loaded.")

Whisper processor loaded.


=> the next cell prepares Features and Labels (Map Function)

In [None]:
def prepare_dataset(batch, feature_extractor, tokenizer):
    # Load and resample audio data to 16kHz (handled by .cast_column already, but process method ensures it)
    audio = batch["audio"]

    # Compute log-Mel spectrogram from the audio input
    batch["input_features"] = feature_extractor(
        audio["array"], sampling_rate=audio["sampling_rate"]
    ).input_features[0]

    # Encode target text to label IDs
    batch["labels"] = tokenizer(batch["text"]).input_ids
    return batch

# Apply the preparation function to the entire dataset
# `num_proc` can speed this up if you have multiple CPU cores available (Kaggle usually does).
# Start with a small num_proc (e.g., 2 or 4) to avoid memory issues on large datasets.
from functools import partial
processed_dataset = dataset.map(
    partial(prepare_dataset, feature_extractor=feature_extractor, tokenizer=tokenizer),
    remove_columns=dataset.column_names, # Remove original columns to save memory
    num_proc=4 # Use available CPU cores or default to 4
)

print("\nProcessed dataset example:")
print(processed_dataset[0].keys()) # Should now have 'input_features' and 'labels'

Map (num_proc=16): 100%|██████████| 664/664 [00:37<00:00, 17.79 examples/s] 



Processed dataset example:
dict_keys(['input_features', 'labels'])


In [21]:
processed_dataset[0]

{'input_features': [[-0.8111094236373901,
   -0.8111094236373901,
   -0.8111094236373901,
   -0.8111094236373901,
   -0.17479372024536133,
   0.24991202354431152,
   0.60640549659729,
   0.6224281191825867,
   0.26175254583358765,
   0.16883915662765503,
   0.1319785714149475,
   0.08948028087615967,
   0.04776662588119507,
   0.0061359405517578125,
   -0.030986666679382324,
   -0.07121026515960693,
   -0.11262989044189453,
   -0.15393221378326416,
   -0.19498848915100098,
   -0.23596429824829102,
   -0.2777611017227173,
   -0.3156243562698364,
   -0.36704015731811523,
   -0.3268296718597412,
   0.03241366147994995,
   0.1494801640510559,
   -0.22148120403289795,
   0.3248598575592041,
   0.3478587865829468,
   0.4405497908592224,
   0.4976752996444702,
   0.08109349012374878,
   0.34023046493530273,
   0.24251878261566162,
   0.2666301727294922,
   0.28643572330474854,
   0.2568800449371338,
   0.2929171323776245,
   0.4187272787094116,
   0.4060930609703064,
   0.17437034845352173,
 

In [23]:
from datasets import DatasetDict

# First, create a temporary train-test split for the test set
# We'll use 5-10% for testing. Let's start with 5%.
# stratify_by_column: Not directly applicable for speech data without specific speaker info.
# We'll rely on shuffle.
train_test_split = processed_dataset.train_test_split(test_size=0.05, seed=42) # 5% for test

# Now, split the 'train' part into actual train and validation sets
# Use 5-10% of the *remaining* data for validation
train_val_split = train_test_split['train'].train_test_split(test_size=0.05, seed=42) # 5% of 95% = ~4.75% of total

# Combine them into a DatasetDict
dataset_dict = DatasetDict({
    'train': train_val_split['train'],
    'validation': train_val_split['test'],
    'test': train_test_split['test']
})

print("\nDataset split into:")
print(f"Train size: {len(dataset_dict['train'])}")
print(f"Validation size: {len(dataset_dict['validation'])}")
print(f"Test size: {len(dataset_dict['test'])}")


Dataset split into:
Train size: 598
Validation size: 32
Test size: 34


In [24]:
# You can optionally save these splits to disk if you want to ensure reproducibility
# and avoid re-processing every time, especially for very large datasets.
dataset_dict.save_to_disk("./data/splits")

Saving the dataset (2/2 shards): 100%|██████████| 598/598 [00:02<00:00, 269.74 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 32/32 [00:00<00:00, 35.49 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 34/34 [00:00<00:00, 56.86 examples/s]


In [25]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 598
    })
    validation: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 32
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 34
    })
})

### Fine-Tuning Whisper

I will be only testing 30 fine-tuning steps, to identify if the code works without any errors. This is necessary not to waste time and identify from the start code errors.

=> Data Loading & Preparation, for now as i am using a notebook, we will not take into consideration this part

In [1]:
# 03_fine_tuning_whisper.ipynb or .py

import os
from datasets import load_dataset, Audio, DatasetDict
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor
import torch
import evaluate # For WER metric
import gc # For garbage collection
from functools import partial

# --- 1. Define Paths ---
# Adjust this path to where your processed_segments folder is located locally
data_dir = "data/processed_segments"
model_name_or_path = "openai/whisper-small" # Using 'small' as a good starting point
target_sampling_rate = 16000

# --- 2. Load Dataset ---
print("Loading dataset...")
dataset = load_dataset(
    "json",
    data_files=os.path.join('data', "metadata.jsonl"),
    split="train"
)
dataset = dataset.cast_column("audio", Audio(sampling_rate=target_sampling_rate))
print(f"Dataset loaded. Number of examples: {len(dataset)}")
print(dataset[0])

# --- 3. Load Whisper Processor ---
print("Loading Whisper processor...")
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name_or_path)
tokenizer = WhisperTokenizer.from_pretrained(model_name_or_path, language="Arabic", task="transcribe")
processor = WhisperProcessor.from_pretrained(model_name_or_path, language="Arabic", task="transcribe")
print("Whisper processor loaded.")

# --- 4. Prepare Features and Labels ---
print("Preparing features and labels...")
def prepare_dataset(batch, feature_extractor, tokenizer):
    audio = batch["audio"]
    batch["input_features"] = feature_extractor(
        audio["array"], sampling_rate=audio["sampling_rate"]
    ).input_features[0]
    batch["labels"] = tokenizer(batch["text"]).input_ids
    return batch
print("Starting dataset mapping...")

processed_dataset = dataset.map(
    partial(prepare_dataset, feature_extractor=feature_extractor, tokenizer=tokenizer),
    remove_columns=dataset.column_names, # Remove original columns to save memory
    num_proc=4 # Use available CPU cores or default to 4
)

print("Features and labels prepared.")

# --- 5. Data Splitting ---
print("Splitting dataset into train, validation, and test sets...")
# First, create a temporary train-test split for the test set
train_test_split = processed_dataset.train_test_split(test_size=0.05, seed=42) # 5% for test

# Now, split the 'train' part into actual train and validation sets
train_val_split = train_test_split['train'].train_test_split(test_size=0.05, seed=42) # 5% of 95% = ~4.75% of total

dataset_dict = DatasetDict({
    'train': train_val_split['train'],
    'validation': train_val_split['test'],
    'test': train_test_split['test']
})

print("\nDataset split into:")
print(f"Train size: {len(dataset_dict['train'])}")
print(f"Validation size: {len(dataset_dict['validation'])}")
print(f"Test size: {len(dataset_dict['test'])}")

# Clean up memory
del dataset, processed_dataset, train_test_split, train_val_split
gc.collect()
torch.cuda.empty_cache() # Clear GPU cache if using CUDA

  from .autonotebook import tqdm as notebook_tqdm


Loading dataset...
Dataset loaded. Number of examples: 664
{'audio': {'path': 'data/processed_segments/audio\\segment_00000.wav', 'array': array([0.        , 0.        , 0.        , ..., 0.01946811, 0.017743  ,
       0.01702571], shape=(96000,)), 'sampling_rate': 16000}, 'text': ' [تصفيق]   '}
Loading Whisper processor...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Whisper processor loaded.
Preparing features and labels...
Starting dataset mapping...


Map (num_proc=4): 100%|██████████| 664/664 [00:18<00:00, 36.80 examples/s]


Features and labels prepared.
Splitting dataset into train, validation, and test sets...

Dataset split into:
Train size: 598
Validation size: 32
Test size: 34


=> Defining the Data Collator

In [2]:
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import torch

from transformers import WhisperProcessor
processor = WhisperProcessor.from_pretrained("openai/whisper-small")

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Split inputs and labels since they have different padding strategies
        # First, pad the inputs (audio features)
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # Then, pad the labels (transcriptions)
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # Replace padding token with -100 to ignore it in loss computation
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # If bos token is appended in previous tokenization step,
        # cut bos token from start of label ids, so that it does not receive a gradient
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
print("Data collator initialized.")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Data collator initialized.


In [3]:
import transformers
print(f"Transformers version: {transformers.__version__}")

Transformers version: 4.40.0


=> Defining Evaluation Metrics

In [16]:
import evaluate

metric = evaluate.load("wer")

def compute_metrics(pred):
    import numpy as np
    if isinstance(pred.predictions, tuple):
        pred_ids = pred.predictions[0]  # Take the first element if it's a tuple
    else:
        pred_ids = pred.predictions

    # Ensure pred_ids is a tensor and convert logits to token IDs
    if isinstance(pred_ids, torch.Tensor):
        pred_ids = pred_ids.detach().cpu().numpy()  # Move to CPU and convert to NumPy
    elif isinstance(pred_ids, np.ndarray):
        pass  # Already a NumPy array
    else:
        raise ValueError(f"Unexpected type for pred_ids: {type(pred_ids)}")

    # If pred_ids are logits (shape: batch_size, seq_len, vocab_size), take argmax to get token IDs
    if pred_ids.ndim == 3:  # Check if it's logits
        pred_ids = np.argmax(pred_ids, axis=-1)  # Convert logits to token IDs (batch_size, seq_len)

    # Ensure label_ids are properly handled
    if isinstance(pred.label_ids, torch.Tensor):
        label_ids = pred.label_ids.detach().cpu().numpy()
    else:
        label_ids = pred.label_ids

    # Replace -100 in the labels (used for padding/ignored tokens in loss computation)
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # Decode predictions and labels
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Compute WER
    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

print("Metrics function defined.")

Metrics function defined.


=> Loading the Pre-trained Whisper Model for Fine-tuning

In [None]:
from transformers import WhisperForConditionalGeneration

model_name_or_path = "openai/whisper-small"  
# Load the model. It automatically loads the pre-trained weights.
#The next two lines are to correct the "Missing Keys" Error: ['proj_out.weight']
model = WhisperForConditionalGeneration.from_pretrained(model_path) #will be using this for now, but when training again i will have to uncomment this and comment the next line
# model = WhisperForConditionalGeneration.from_pretrained(model_path, ignore_mismatched_sizes=True)
print('model loaded')

# Important: Configure the model for sequence generation
# This ensures it generates tokens correctly during inference/evaluation.
model.config.forced_decoder_ids = None
print("Model configured for sequence generation.")
model.config.suppress_tokens = [] # You might suppress certain tokens like numbers if desired, but for general ASR, keep empty.
print("Model suppress tokens set to empty list.")

# If you have multiple GPUs and want to use DataParallel (simple multi-GPU)
# if torch.cuda.device_count() > 1:
#     print(f"Using {torch.cuda.device_count()} GPUs for DataParallel.")
#     model = torch.nn.DataParallel(model)

print(f"Whisper model '{model_name_or_path}' loaded.")
print(f"Model device: {model.device}")

model loaded
Model configured for sequence generation.
Model suppress tokens set to empty list.
Whisper model 'openai/whisper-small' loaded.
Model device: cpu


=> Defining Training Arguments

In [6]:
import torch
print(torch.cuda.is_available())

True


In [17]:
from transformers import Seq2SeqTrainingArguments
import os

# Define your output directory. This is where checkpoints will be saved.
output_dir = "whisper_tunisian_small" # Name your model output directory

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=8, # Good starting point for RTX 3050 8GB VRAM
    gradient_accumulation_steps=1, # Keep at 1 for now, increase if batch_size needs to be lower
    learning_rate=1e-5, # Common learning rate for fine-tuning
    warmup_steps=500, # Steps for learning rate warm-up
    max_steps=30, # Total number of training steps. Adjust based on dataset size.
                    # For 20-30 hours, 5000-10000 steps could be a good start.
                    # This overrides num_train_epochs if both are set.
    gradient_checkpointing=True, # Saves memory, but slightly slower. Highly recommended for Whisper.
    fp16=True, # Use mixed-precision training (FP16). Requires NVIDIA GPU. Significantly faster.
    per_device_eval_batch_size=8, # Batch size for evaluation
    generation_max_length=225, # Max tokens for generated output during evaluation
    logging_steps=5, # Log training progress every N steps (very frequent for detailed monitoring)
    eval_steps=10, # Evaluate model on validation set every N steps
    save_steps=10, # Save checkpoint every N steps
    evaluation_strategy="steps", # Evaluate at specified steps
    report_to=["tensorboard"], # Report metrics to TensorBoard for visualization
    load_best_model_at_end=True, # Load the best model based on eval_metric at the end of training
    metric_for_best_model="wer", # Which metric to monitor for best model
    greater_is_better=False, # For WER, lower is better
    push_to_hub=False, # Set to True if you want to push to Hugging Face Hub (requires login)
    dataloader_num_workers=0, # Number of processes for data loading. Using all cores (or 4 if `os.cpu_count()` is None)
    
    # --- CRITICAL FIX ---
    do_train=True,  # Explicitly set to True to enable training!
    do_eval=True,   # Keep this to enable evaluation during training
    
    # Optional but good for clarity if using max_steps:
    # Remove num_train_epochs if you are primarily controlling training with max_steps.
    # If num_train_epochs was implicitly set to a default (e.g., 3.0 from your logs),
    # max_steps will override it, but it's clearer to remove it.
    # num_train_epochs=3.0, # <--- Consider removing if max_steps is your primary control
)

print("Training arguments defined.")

PyTorch: setting up devices


Training arguments defined.


=> Setting up the Trainer and Starting Training

In [8]:
import torch
import transformers;
print('PyTorch:', torch.__version__)
print('Transformers:', transformers.__version__)

PyTorch: 2.7.1+cu126
Transformers: 4.40.0


In [1]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.device_count())
if torch.cuda.is_available():
    print(torch.cuda.current_device())
    print(torch.cuda.get_device_name(0))

True
1
0
NVIDIA GeForce RTX 3050 Laptop GPU


In [18]:
from transformers import Seq2SeqTrainer
from transformers import logging
logging.enable_progress_bar()
logging.set_verbosity_info()

# Initialize the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer, # Pass tokenizer for saving with the model
    # feature_extractor=feature_extractor, # Pass feature_extractor for saving with the model
)

# Start training!
print("Starting training...")
train_result = trainer.train()
# To resume training from a specific checkpoint: just uncomment the following line and comment the previous one
# trainer.train(resume_from_checkpoint=True) # Resumes from latest checkpoint in output_dir
# OR
# trainer.train(resume_from_checkpoint="./whisper-tunisian-small/checkpoint-2500") # Specify exact path
# Save the final model after training finishes
trainer.save_model()
processor.save_pretrained(training_args.output_dir) # Save processor along with the model
print(f"Training finished. Model and processor saved to {training_args.output_dir}")

# You can also evaluate on the test set after training
print("Evaluating on test set...")
metrics = trainer.evaluate(eval_dataset=dataset_dict["test"])
print(f"Test Set Metrics: {metrics}")

max_steps is given, it will override any value given in num_train_epochs
Using auto half precision backend
 33%|███▎      | 10/30 [21:29<42:58, 128.91s/it]


Starting training...


***** Running training *****
  Num examples = 598
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 30
  Number of trainable parameters = 240,582,912
  return fn(*args, **kwargs)
 17%|█▋        | 5/30 [03:48<21:40, 52.00s/it]

                                               


[A[A[A                                    
 17%|█▋        | 5/30 [03:50<21:40, 52.00s/it]

[A[A

{'loss': 0.0907, 'grad_norm': 4.74794340133667, 'learning_rate': 8e-08, 'epoch': 0.07}


 33%|███▎      | 10/30 [08:44<19:23, 58.17s/it]

                                               


[A[A[A                                    
 33%|███▎      | 10/30 [08:46<19:23, 58.17s/it]

[A[A***** Running Evaluation *****
  Num examples = 32
  Batch size = 8


{'loss': 0.1002, 'grad_norm': 7.114495277404785, 'learning_rate': 1.8e-07, 'epoch': 0.13}



[A                                          

                                               


[A[A[A                                    
 33%|███▎      | 10/30 [09:31<19:23, 58.17s/it]

[A[ASaving model checkpoint to whisper_tunisian_small\checkpoint-10
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}
Configuration saved in whisper_tunisian_small\checkpoint-10\config.json
Configuration saved in whisper_tunisian_small\checkpoint-10\generation_config.json


{'eval_loss': 0.9757904410362244, 'eval_wer': 62.264150943396224, 'eval_runtime': 44.9377, 'eval_samples_per_second': 0.712, 'eval_steps_per_second': 0.089, 'epoch': 0.13}


Model weights saved in whisper_tunisian_small\checkpoint-10\model.safetensors
tokenizer config file saved in whisper_tunisian_small\checkpoint-10\tokenizer_config.json
Special tokens file saved in whisper_tunisian_small\checkpoint-10\special_tokens_map.json
added tokens file saved in whisper_tunisian_small\checkpoint-10\added_tokens.json
  return fn(*args, **kwargs)
 50%|█████     | 15/30 [15:02<16:46, 67.12s/it]

                                               


[A[A[A                                    
 50%|█████     | 15/30 [15:04<16:46, 67.12s/it]

[A[A

{'loss': 0.1272, 'grad_norm': 7.351455211639404, 'learning_rate': 2.8e-07, 'epoch': 0.2}


 67%|██████▋   | 20/30 [20:15<10:26, 62.62s/it]

                                               


[A[A[A                                    
 67%|██████▋   | 20/30 [20:15<10:26, 62.62s/it]

[A[A***** Running Evaluation *****
  Num examples = 32
  Batch size = 8


{'loss': 0.1237, 'grad_norm': inf, 'learning_rate': 3.6e-07, 'epoch': 0.27}




                                               


[A[A[A                                    
[A                                          
 67%|██████▋   | 20/30 [20:58<10:26, 62.62s/it]

[A[ASaving model checkpoint to whisper_tunisian_small\checkpoint-20
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}
Configuration saved in whisper_tunisian_small\checkpoint-20\config.json
Configuration saved in whisper_tunisian_small\checkpoint-20\generation_config.json


{'eval_loss': 0.976307213306427, 'eval_wer': 61.45552560646901, 'eval_runtime': 43.625, 'eval_samples_per_second': 0.734, 'eval_steps_per_second': 0.092, 'epoch': 0.27}


Model weights saved in whisper_tunisian_small\checkpoint-20\model.safetensors
tokenizer config file saved in whisper_tunisian_small\checkpoint-20\tokenizer_config.json
Special tokens file saved in whisper_tunisian_small\checkpoint-20\special_tokens_map.json
added tokens file saved in whisper_tunisian_small\checkpoint-20\added_tokens.json
  return fn(*args, **kwargs)
 83%|████████▎ | 25/30 [26:00<05:18, 63.65s/it]

                                               


[A[A[A                                    
 83%|████████▎ | 25/30 [26:02<05:18, 63.65s/it]

[A[A

{'loss': 0.1224, 'grad_norm': 5.600057601928711, 'learning_rate': 4.6000000000000004e-07, 'epoch': 0.33}


100%|██████████| 30/30 [31:00<00:00, 60.64s/it]

                                               


[A[A[A                                    
100%|██████████| 30/30 [31:02<00:00, 60.64s/it]

[A[A***** Running Evaluation *****
  Num examples = 32
  Batch size = 8


{'loss': 0.1444, 'grad_norm': 5.30686616897583, 'learning_rate': 5.6e-07, 'epoch': 0.4}




                                               


[A[A[A                                    
[A                                          
100%|██████████| 30/30 [31:45<00:00, 60.64s/it]

[A[ASaving model checkpoint to whisper_tunisian_small\checkpoint-30
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}
Configuration saved in whisper_tunisian_small\checkpoint-30\config.json
Configuration saved in whisper_tunisian_small\checkpoint-30\generation_config.json


{'eval_loss': 0.9773169755935669, 'eval_wer': 62.264150943396224, 'eval_runtime': 43.3609, 'eval_samples_per_second': 0.738, 'eval_steps_per_second': 0.092, 'epoch': 0.4}


Model weights saved in whisper_tunisian_small\checkpoint-30\model.safetensors
tokenizer config file saved in whisper_tunisian_small\checkpoint-30\tokenizer_config.json
Special tokens file saved in whisper_tunisian_small\checkpoint-30\special_tokens_map.json
added tokens file saved in whisper_tunisian_small\checkpoint-30\added_tokens.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from whisper_tunisian_small\checkpoint-20 (score: 61.45552560646901).
There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


                                               


[A[A[A                                    
100%|██████████| 30/30 [31:54<00:00, 60.64s/it]

100%|██████████| 30/30 [31:54<00:00, 63.81s/it]
Saving model checkpoint to whisper_tunisian_small
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}
Configuration saved in whisper_tunisian_small\

{'train_runtime': 1914.3443, 'train_samples_per_second': 0.125, 'train_steps_per_second': 0.016, 'train_loss': 0.11811718742052714, 'epoch': 0.4}


Model weights saved in whisper_tunisian_small\model.safetensors
tokenizer config file saved in whisper_tunisian_small\tokenizer_config.json
Special tokens file saved in whisper_tunisian_small\special_tokens_map.json
added tokens file saved in whisper_tunisian_small\added_tokens.json
Feature extractor saved in whisper_tunisian_small\preprocessor_config.json
tokenizer config file saved in whisper_tunisian_small\tokenizer_config.json
Special tokens file saved in whisper_tunisian_small\special_tokens_map.json
added tokens file saved in whisper_tunisian_small\added_tokens.json
***** Running Evaluation *****
  Num examples = 34
  Batch size = 8


Training finished. Model and processor saved to whisper_tunisian_small
Evaluating on test set...


100%|██████████| 5/5 [00:43<00:00,  8.77s/it]

Test Set Metrics: {'eval_loss': 1.0989844799041748, 'eval_wer': 69.82543640897757, 'eval_runtime': 58.64, 'eval_samples_per_second': 0.58, 'eval_steps_per_second': 0.085, 'epoch': 0.4}





### Inference: transcription

In [None]:
import torch
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
import soundfile as sf
import os

# --- 1. Define the path to your saved model ---
# This should be the same as your training_args.output_dir
model_path = "whisper_tunisian_small"

# --- 2. Load the fine-tuned processor and model ---
# It's crucial to load the processor saved with your fine-tuned model
print(f"Loading processor from {model_path}...")
processor = WhisperProcessor.from_pretrained(model_path)
print(f"Loading model from {model_path}...")
#The next two lines are to correct the "Missing Keys" Error: ['proj_out.weight']
model = WhisperForConditionalGeneration.from_pretrained(model_path) #will be using this for now, but when training again i will have to uncomment this and comment the next line
# model = WhisperForConditionalGeneration.from_pretrained(model_path, ignore_mismatched_sizes=True)

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading processor from whisper_tunisian_small...
Loading model from whisper_tunisian_small...


In [2]:
# Ensure the model is on GPU if available
if torch.cuda.is_available():
    model = model.to("cuda")
    print("Model moved to GPU (CUDA).")
else:
    print("CUDA not available. Model will run on CPU (slower).")

Model moved to GPU (CUDA).


In [3]:
# --- 3. Create a Hugging Face Pipeline for easy inference ---
# Using the pipeline simplifies the inference process
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=30, # Whisper processes audio in 30-second chunks by default
    device=0 if torch.cuda.is_available() else -1 # 0 for first GPU, -1 for CPU
)
print("ASR pipeline created.")

ASR pipeline created.


In [4]:
# --- 4. Prepare some audio for inference ---
# Option A: Use one of your existing processed audio files
audio_dir = "data/processed_segments/audio" # Adjust path if needed
sample_audio_files = [os.path.join(audio_dir, f) for f in os.listdir(audio_dir) if f.endswith(".wav")]

if sample_audio_files:
    inference_audio_path = sample_audio_files[7] # Testing for the 8th audio file
    print(f"\nPerforming inference on: {inference_audio_path}")

    # The pipeline can directly take a file path
    prediction = pipe(inference_audio_path)
    print(f"Transcription: {prediction['text']}")
    # Option B: Create a dummy audio file (if you don't have existing ones ready)
    # from scipy.io.wavfile import write
    # import numpy as np
    # sampling_rate = 16000
    # duration = 5 # seconds
    # dummy_audio = np.random.uniform(low=-0.2, high=0.2, size=sampling_rate * duration).astype(np.float32)
    # dummy_audio_path = "dummy_audio.wav"
    # write(dummy_audio_path, sampling_rate, dummy_audio)
    # print(f"\nPerforming inference on dummy audio: {dummy_audio_path}")
    # prediction = pipe(dummy_audio_path)
    # print(f"Transcription: {prediction['text']}")
    # os.remove(dummy_audio_path) # Clean up

else:
    print("\nNo audio files found in the processed_segments/audio directory for inference.")
    print("Please ensure your audio files are there or create a dummy one for testing.")


Performing inference on: data/processed_segments/audio\segment_00007.wav


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


Transcription:  المسكين حتى طفت في الهم على قلبه المسكين قال   


The predicted transcript was:  المسكين حتى طفت في الهم على قلبه المسكين قال

The real/trained on transcript is:  ابوه حتى طفت في الهم على قلبه المسكين قال بيكلي ربي سخر   

In [9]:
import torch
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
import soundfile as sf
import os

# --- 1. Define the path to your saved model ---
model_path = "./whisper_tunisian_small"

# --- 2. Load the fine-tuned processor and model ---
print(f"Loading processor from {model_path}...")
processor = WhisperProcessor.from_pretrained(model_path)
print(f"Loading model from {model_path}...")
model = WhisperForConditionalGeneration.from_pretrained(
    model_path,
    ignore_mismatched_sizes=True # Keep this to address the missing keys warning
)

# Ensure the model is on GPU if available
if torch.cuda.is_available():
    model = model.to("cuda")
    print("Model moved to GPU (CUDA).")
else:
    print("CUDA not available. Model will run on CPU (slower).")

# --- 3. Create a Hugging Face Pipeline for translation ---
# Crucially, set the 'task' to "translate" and 'language' to "english"
pipe_translation = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=30,
    device=0 if torch.cuda.is_available() else -1,
    # *** Corrected parameters for translation ***
    # For Whisper, when you want translation, you pass `language` and `task`
    # as arguments that will be forwarded to the model's `generate` method.
    generate_kwargs={"task": "translate", "language": "english"}
)
print("ASR pipeline configured for Tunisian to English translation.")

# --- 4. Prepare some audio for inference ---
audio_dir = "./data/processed_segments/audio"
sample_audio_files = [os.path.join(audio_dir, f) for f in os.listdir(audio_dir) if f.endswith(".wav")]

if sample_audio_files:
    inference_audio_path = sample_audio_files[7]
    print(f"\nPerforming translation on: {inference_audio_path}")

    translation_result = pipe_translation(inference_audio_path)
    print(f"Tunisian to English Translation: {translation_result['text']}")
else:
    print("\nNo audio files found for inference.")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading processor from ./whisper_tunisian_small...
Loading model from ./whisper_tunisian_small...
Model moved to GPU (CUDA).
ASR pipeline configured for Tunisian to English translation.

Performing translation on: ./data/processed_segments/audio\segment_00007.wav
Tunisian to English Translation:  the heart of the poor.


Whisper can translate to english, it will just need further training, but it is possible. For now it only understood two words - قلبه المسكين - to which it was translated to the heart of the poor.