In [1]:
import os
import shutil

import torch
import torch.nn as nn

import json

import pandas as pd
#---------------------------- Dependencies ---------------------------------------#
import transformers
from transformers import WhisperFeatureExtractor, WhisperForConditionalGeneration,WhisperProcessor
from transformers import WhisperTokenizer
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Trainer, TrainingArguments

import torchaudio
import torchaudio.transforms as T

from tqdm import tqdm

import noisereduce as nr

In [2]:
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="English", task="transcribe")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
processor.save_pretrained("models")

[]

In [None]:
# import shutil
# if os.path.isdir("results"):
#     shutil.rmtree("results")

In [2]:
from datasets import load_dataset,load_from_disk
from datasets import Dataset as hf_dataset
from sklearn.model_selection import train_test_split

In [None]:
print(torch.__version__)

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [5]:
#Helper Functions

"""
Process file path for actual data path
"""
def process_data_files(data_path):
    file_list = os.listdir(data_path)
    # total_length = len(os.listdir(data_path))
    new_list = [None] * len(file_list)
    for i in range(len(file_list)):
        corrected_file = f'{file_list[i][0:6]}{i}.wav'
        new_list[i] = os.path.join(data_path,corrected_file)
    
    return new_list

"""
Manual Process Json metadata file 
"""
def process_json(annotation_path:str) -> dict:
    json_dict = {}
    with open (annotation_path,'r') as json_file:
        json_data = list(json_file)
    for idx,line in enumerate(json_data):
        json_dict[idx] = line
    
    return json_dict

"""
Process and resample the audio
"""
def process_audio(self,waveform,original_sample_rate,new_sample_rate=16000):
    waveform = torch.Tensor(waveform)
    if original_sample_rate != new_sample_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=original_sample_rate, new_freq=new_sample_rate)
        waveform = resampler(waveform)
        print(type(waveform))
        print(f"Resampled waveform to {new_sample_rate} Hz")
    # Normalize audio to [-1, 1]
    # waveform = waveform / torch.max(torch.abs(waveform))
    return waveform,new_sample_rate

def denoise_data(audio,rate):
    # Perform noise reduction
    noisy_part = audio[0:int(rate*0.5)]  # Identify the noisy part
    reduced_noise_audio = nr.reduce_noise(y=audio, sr=rate, y_noise=noisy_part)
    return reduced_noise_audio

# Function to load and preprocess audio
def preprocess_data(examples):
    input_values = []
    attention_masks = []
    labels = []

    for audio_path, transcript in zip(examples['data'], examples['annotations']):
        speech_array, sampling_rate = torchaudio.load(audio_path)
        processed = processor(speech_array.squeeze(0), sampling_rate=sampling_rate, return_tensors="pt", padding=True)

        # Process labels with the same processor settings
        with processor.as_target_processor():
            label = processor(transcript, return_tensors="pt", padding=True)

        input_values.append(processed.input_values.squeeze(0))
        # Create attention masks based on the input values
        attention_mask = torch.ones_like(processed.input_values)
        attention_mask[processed.input_values == processor.tokenizer.pad_token_id] = 0  # Set padding tokens to 0
        attention_masks.append(attention_mask.squeeze(0))
        
        # Ensure labels are padded to the same length as inputs if needed
        padded_label = torch.full(processed.input_values.shape[1:], -100, dtype=torch.long)
        actual_length = label.input_ids.shape[1]
        padded_label[:actual_length] = label.input_ids.squeeze(0)
        labels.append(padded_label)

    # Concatenate all batches
    examples['input_values'] = torch.stack(input_values)
    examples['attention_mask'] = torch.stack(attention_masks)
    examples['labels'] = torch.stack(labels)

    return examples

In [29]:
annotation_path = 'advanced/asr.jsonl'

# Pandas Way
json_files = pd.read_json(annotation_path,lines=True)

annotations_list = json_files['transcript'].to_list()
file_list = process_data_files('advanced/audio')

audio_features = []
for file in tqdm(file_list):
    audio_wave, sample_rate = torchaudio.load(file)
    audio_wave = denoise_data(audio_wave,sample_rate)
    # target,new_sample_rate = process_audio(audio_wave,sample_rate,16000)
    audio_features.append({'waveform': audio_wave, 'sampling_rate':sample_rate })

# model_name = "openai/whisper-small"
# processor = WhisperProcessor.from_pretrained(model_name, language="English", task="transcribe")
# tokenizer = WhisperTokenizer.from_pretrained(model_name, language="English", task="transcribe")
# feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name)
# model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

100%|██████████| 3500/3500 [10:12<00:00,  5.71it/s]


In [30]:
X_train, X_test, y_train, y_test = train_test_split(audio_features, annotations_list, test_size=0.3, random_state=1)

# X_test, X_val, y_test, y_val  = train_test_split(X_test, y_test, test_size=0.5, random_state=1) 

# len(X_train),len(X_test),len(X_val)



In [None]:
train_dataset = hf_dataset.from_dict({'data':X_train, 'annotations': y_train})
# val_dataset = hf_dataset.from_dict({'data':X_val, 'annotations': y_val})
test_dataset = hf_dataset.from_dict({'data':X_test, 'annotations': y_test})

In [None]:
# Crashes the kernel

# dataset_dict = \
# {
#     'data':audio_features,
#     'annotations': annotations_list
# }

# # Convert to a Hugging Face dataset
# dataset = hf_dataset.from_dict(dataset_dict)



# # Shuffle the dataset
# dataset = dataset.shuffle(seed=42)

# # Split the dataset into training, validation, and test sets
# train_size = int(0.8 * len(dataset))
# val_size = int(0.1 * len(dataset))
# test_size = len(dataset) - train_size - val_size

# train_dataset = dataset.select(range(train_size))
# val_dataset = dataset.select(range(train_size, train_size + val_size))
# test_dataset = dataset.select(range(train_size + val_size, train_size + val_size + test_size))

In [11]:
train_dataset

Dataset({
    features: ['data', 'annotations'],
    num_rows: 80
})

In [None]:
# model_name = "openai/whisper-small"

model_name = "openai/whisper-small"
processor = WhisperProcessor.from_pretrained(model_name, language="English", task="transcribe")
tokenizer = WhisperTokenizer.from_pretrained(model_name, language="English", task="transcribe")
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)
model.generation_config.forced_decoder_ids = None
model.to(device)

In [8]:
# Dataset prep functions

def prepare_dataset(batch):
   # load audio data
    audio = batch["data"]

    # compute log-Mel input features from input audio array 
    batch["input_features"] = feature_extractor(audio["waveform"], sampling_rate=audio["sampling_rate"]).input_features[0]
    

    # encode target text to label ids 
    batch["labels"] = tokenizer(batch["annotations"]).input_ids
    return batch


def augment_dataset(batch):
    # load and (possibly) resample audio data to 16kHz
    audio = batch["data"]

    # apply augmentation
    augmented_waveform = augmentation(audio["waveform"], sample_rate=audio["sampling_rate"])
    batch["data"]["waveform"] = augmented_waveform

    return batch

In [14]:
train_dataset = train_dataset.map(prepare_dataset,remove_columns=train_dataset.column_names)
# val_dataset = val_dataset.map(prepare_dataset,remove_columns=val_dataset.column_names)
test_dataset = test_dataset.map(prepare_dataset,remove_columns=test_dataset.column_names)

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [15]:
# For reproducibility
train_dataset.save_to_disk("audio_train_denoised.hf")
# val_dataset.save_to_disk("audio_val_denoised.hf")
test_dataset.save_to_disk("audio_test_denoised.hf")


# train_dataset.save_to_disk("audio_train.hf")
# val_dataset.save_to_disk("audio_val.hf")
# test_dataset.save_to_disk("audio_test.hf")

Saving the dataset (0/1 shards):   0%|          | 0/80 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/20 [00:00<?, ? examples/s]

<h1> Model Training Phase </h1>

In [54]:
train_dataset = load_from_disk("audio_train_denoised.hf")
val_dataset = load_from_disk("audio_val_denoised.hf")
test_dataset = load_from_disk("audio_test_denoised.hf")

# train_dataset = load_from_disk("audio_train.hf")
# val_dataset = load_from_disk("audio_val.hf")
# test_dataset = load_from_disk("audio_test.hf")

In [10]:
train_dataset

Dataset({
    features: ['input_features', 'labels'],
    num_rows: 2450
})

In [11]:
# Taken from https://huggingface.co/blog/fine-tune-whisper

from dataclasses import dataclass
from typing import Any, Dict, List, Union
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch


In [12]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

In [13]:
import evaluate

metric = evaluate.load("wer")

In [14]:
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [66]:
# not tuned 

from transformers import Seq2SeqTrainingArguments,EarlyStoppingCallback

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",  # change to a repo name of your choice
    per_device_train_batch_size = 4,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    logging_steps = 1,
    # warmup_steps=500,
    # max_steps=5000,
    overwrite_output_dir = True,
    num_train_epochs = 5,
    gradient_checkpointing=True,
    fp16 = True,
    evaluation_strategy="steps",
    eval_steps = 100,
    save_strategy = "steps",
    per_device_eval_batch_size=2,
    predict_with_generate=True,
    generation_max_length=200,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    # push_to_hub=True,
    save_safetensors=False
)


In [71]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args = training_args,
    model = model,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    data_collator = data_collator,
    compute_metrics = compute_metrics,
    tokenizer= processor.feature_extractor
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Step,Training Loss,Validation Loss,Wer
100,0.1099,0.055672,4.517054
200,0.1394,0.059806,4.568268
300,0.047,0.055305,3.94346
400,0.0115,0.05457,3.871761
500,0.0833,0.058693,4.209772
600,0.0817,0.053524,3.800061
700,0.0409,0.062116,3.912732
800,0.001,0.058827,4.035645
900,0.0013,0.052568,3.318652
1000,0.0423,0.053094,3.666906


Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 325

TrainOutput(global_step=3065, training_loss=0.014581798161748004, metrics={'train_runtime': 15737.6645, 'train_samples_per_second': 0.778, 'train_steps_per_second': 0.195, 'total_flos': 3.53517115392e+18, 'train_loss': 0.014581798161748004, 'epoch': 5.0})

In [72]:
trainer.save_model('models/best_ASR_model_small')

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


<h1> INFERENCE CODE </h1>

In [6]:
def denoise_data(audio,rate):
    # Perform noise reduction
    noisy_part = audio[0:int(rate*0.5)]  # Identify the noisy part
    reduced_noise_audio = nr.reduce_noise(y=audio, sr=rate, y_noise=noisy_part)
    return reduced_noise_audio

def prepare_for_inference(audio_path,processor):
   # load audio data
    waveform,sample_rate = torchaudio.load(audio_path)
    print(waveform.shape)
    waveform = denoise_data(waveform,sample_rate)
    # compute log-Mel input features from input audio array 
    input_features = processor(waveform.squeeze(0), sampling_rate = sample_rate,return_tensors="pt").input_features
    
    return input_features

new_model = WhisperForConditionalGeneration.from_pretrained('til-24-base/asr/src/best_ASR_model')
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="English", task="transcribe")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="English", task="transcribe")
new_model.generation_config.forced_decoder_ids = None
new_model.to(device)

to_be_predicted = prepare_for_inference('advanced/audio/audio_0.wav',processor).to(device)

prediction_ids = new_model.generate(to_be_predicted)

transcription = processor.batch_decode(prediction_ids, skip_special_tokens=True)

transcription

OSError: Incorrect path_or_model_id: 'til-24-base/asr/src/best_ASR_model'. Please provide either the path to a local folder or the repo_id of a model on the Hub.


<b>Currently Obsolete Code<b>

    

In [None]:
from torch.utils.data import Dataset

class ASRDataset(Dataset):
    def __init__(self,data_list,annotations_list,processor,tokenizer):
        self.processor = processor
        self.tokenizer = tokenizer
        # self.transforms = transforms
        self.data_dict = \
        {
            'data':data_list,
            'annotations':annotations_list
        }
        self.dataset = hf_dataset.from_dict(self.data_dict)
            
    """
    Resample if sample rate not 16000
    """
    def process_audio(self,waveform,original_sample_rate,new_sample_rate=16000):
        if waveform.shape[1] != new_sample_rate:
            resampler = torchaudio.transforms.Resample(orig_freq=original_sample_rate, new_freq=new_sample_rate)
            waveform = resampler(waveform)
            print(f"Resampled waveform to {new_sample_rate} Hz")
        # Normalize audio to [-1, 1]
        waveform = waveform / torch.max(torch.abs(waveform))
        return waveform , new_sample_rate     
    
    def __len__(self):
        return len(self.data_dict['data'])
    
    def __getitem__(self,index):
        file_path = self.data_dict['data'][index]
        label = self.data_dict['annotations'][index]
        waveform, sample_rate = torchaudio.load(file_path)
        # new_waveform, new_sample_rate = self.process_audio(waveform,original_sample_rate = sample_rate,new_sample_rate = 16000)
        target = self.preprocess_data(waveform,sample_rate,label)

        return target
    
    
        # Function to load and preprocess audio
    def preprocess_data(self,speech_array,sampling_rate,text):
        target = {}
        processed = self.processor(speech_array.squeeze(0), sampling_rate=sampling_rate, return_tensors="pt", padding=True)
        # Process labels with the same processor settings
        with self.processor.as_target_processor():
            label = self.processor(text, return_tensors="pt", padding=True)
            
        input_values = processed.input_values.squeeze(0)
        # Create attention masks based on the input values
        attention_mask = torch.ones_like(processed.input_values)
        attention_mask[processed.input_values == self.processor.tokenizer.pad_token_id] = 0  # Set padding tokens to 0
        attention_masks = attention_mask.squeeze(0)

        # Ensure labels are padded to the same length as inputs if needed
        padded_label = torch.full(processed.input_values.shape[1:], -100, dtype=torch.long)
        actual_length = label.input_ids.shape[1]
        padded_label[:actual_length] = label.input_ids.squeeze(0)
        labels = padded_label

        # Concatenate all batches
        target['input_values'] = input_values
        target['attention_mask'] = attention_masks
        target['labels'] = labels

        return target



In [None]:
class WhisperModelWrapper(self,model,device,weights):
    
    self.model = model
    self.device = device
    
    

In [None]:
model_name = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)

In [None]:
# Apply preprocessing
train_dataset = train_dataset.map(preprocess_data, batched=True, batch_size=1, remove_columns=train_dataset.column_names)
val_dataset = val_dataset.map(preprocess_data, batched=True, batch_size=1, remove_columns=val_dataset.column_names)
test_dataset = test_dataset.map(preprocess_data, batched=True, batch_size=1, remove_columns=test_dataset.column_names)


In [None]:
# Main Train Loop
audio_dataset = ASRDataset(data_list = file_list, annotations_list = annotations_list,processor = processor,tokenizer = tokenizer)

main_dataset = audio_dataset.dataset

print(main_dataset)
# Shuffle the dataset
dataset = main_dataset.shuffle(seed=42)

# Split the dataset into training, validation, and test sets
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size

train_dataset = dataset.select(range(train_size))
val_dataset = dataset.select(range(train_size, train_size + val_size))
test_dataset = dataset.select(range(train_size + val_size, train_size + val_size + test_size))



# dataloader = torch.utils.data.DataLoader(audio_dataset, batch_size=2)

# print(audio_dataset.__getitem__(50))


# # validation
# print("-------Actual data-------")
# print(file_list[50])
# print(json_files['audio'][50])
# print(annotations_list[50])

# print("-----Dataset Class------")
# print(audio_dataset.data_dict['data'][50])
# print(audio_dataset.data_dict['annotations'][50])




In [None]:
# # Define training arguments
# training_args = TrainingArguments(
#     output_dir="./results",
#     evaluation_strategy="steps",
#     learning_rate=1e-4,
#     per_device_train_batch_size=1,  # Reduce to one for simplicity
#     num_train_epochs=1,
#     weight_decay=0.005,
#     save_steps=500,
#     eval_steps=500,
#     logging_steps=10,
#     load_best_model_at_end=True
# )

In [None]:
# # Initialize the Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset,  # Use the validation dataset for evaluation
#     tokenizer=processor.feature_extractor
# )

# # Train the model
# trainer.train()