In [2]:
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.0 responses-0.18.0


In [1]:
import logging
import os
import sys
import pandas as pd
import numpy as np
import warnings
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

import datasets
import evaluate
import torch
from datasets import DatasetDict, load_dataset, Dataset, Audio

import transformers
from transformers import (
    AutoConfig,
    AutoFeatureExtractor,
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    AutoTokenizer,
    HfArgumentParser,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint, is_main_process
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version

In [8]:

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """
    Collate speech seq2seq examples into one dynamically padded batch.

    Model inputs and label token ids have different lengths and need different
    padding, so each side is padded by its own component of the processor.

    Args:
        processor ([`WhisperProcessor`])
            Processor whose `feature_extractor` pads the model inputs and whose
            `tokenizer` pads the labels.
        decoder_start_token_id (`int`)
            Id of the decoder's begin-of-sentence token.
        forward_attention_mask (`bool`)
            Whether to add `attention_mask` to the returned batch.
    """

    processor: Any
    decoder_start_token_id: int
    forward_attention_mask: bool

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_key = self.processor.model_input_names[0]

        # Model-input side: keep only the input column and let the feature extractor pad it.
        audio_examples = [{input_key: example[input_key]} for example in features]
        # Label side: the tokenizer pads entries keyed by "input_ids".
        label_examples = [{"input_ids": example["labels"]} for example in features]

        padded = self.processor.feature_extractor.pad(audio_examples, return_tensors="pt")

        if self.forward_attention_mask:
            masks = [example["attention_mask"] for example in features]
            padded["attention_mask"] = torch.LongTensor(masks)

        padded_labels = self.processor.tokenizer.pad(label_examples, return_tensors="pt")

        # Padded label positions become -100 so the loss skips them.
        is_padding = padded_labels.attention_mask.ne(1)
        target_ids = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # If every row already begins with the decoder start token (added during
        # tokenization), drop it here because it is prepended again later.
        starts_with_bos = (target_ids[:, 0] == self.decoder_start_token_id).all().cpu().item()
        if starts_with_bos:
            target_ids = target_ids[:, 1:]

        padded["labels"] = target_ids
        return padded


In [9]:
# # 1. Parse input arguments
# # See all possible arguments in src/transformers/training_args.py
# # or by passing the --help flag to this script.
# # We now keep distinct sets of args, for a cleaner separation of concerns.
# parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))

# if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
#     # If we pass only one argument to the script and it's the path to a json file,
#     # let's parse it to get our arguments.
#     model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
# else:
#     model_args, data_args, training_args = parser.parse_args_into_dataclasses()

# if model_args.use_auth_token is not None:
#     warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning)
#     if model_args.token is not None:
#         raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
#     model_args.token = model_args.use_auth_token

# # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
# # information sent is the one passed as arguments along with your Python/PyTorch versions.
# send_example_telemetry("run_speech_recognition_seq2seq", model_args, data_args)


In [10]:
output_dir = "/scratch/users/gmenon/hf_seq2seq"
# 3. Detecting last checkpoint and eventually continue from last checkpoint.
# On a first run output_dir may not exist yet; in that case there is nothing to
# resume from, so the lookup is skipped instead of failing on the missing folder.
last_checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None
# Set seed before initializing model.
set_seed(123)


In [16]:
TRAIN_FILE_PATH = "/scratch/users/gmenon/train_song_metadata_en_demucs_cleaned_filtered_095.csv"
TEST_FILE_PATH = "/scratch/users/gmenon/validation_song_metadata_en_demucs_cleaned_filtered_005.csv"


def build_audio_dataset(metadata_df, sampling_rate=16_000):
    """Build a `Dataset` with `audio` and `transcription` columns from a metadata frame.

    Args:
        metadata_df: DataFrame with a `consolidated_file_path` column (audio file
            paths) and a `transcription` column (target text).
        sampling_rate: Sampling rate passed to the `Audio` feature the path column
            is cast to.

    Returns:
        A `Dataset` whose `audio` column is cast to `Audio(sampling_rate=sampling_rate)`.
    """
    columns = {
        "audio": list(metadata_df["consolidated_file_path"]),
        "transcription": list(metadata_df["transcription"]),
    }
    return Dataset.from_dict(columns).cast_column("audio", Audio(sampling_rate=sampling_rate))


train_df = pd.read_csv(TRAIN_FILE_PATH)
validation_df = pd.read_csv(TEST_FILE_PATH)

# Both splits are built the same way; only the source CSV differs.
train_dataset = build_audio_dataset(train_df)
val_dataset = build_audio_dataset(validation_df)

In [17]:

# 4. Load dataset
# Bundle both splits into one DatasetDict under the "train" / "eval" keys.
audio_dataset = DatasetDict({"train": train_dataset, "eval": val_dataset})

audio_dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription'],
        num_rows: 9538
    })
    eval: Dataset({
        features: ['audio', 'transcription'],
        num_rows: 507
    })
})

In [18]:
#audio_dataset.push_to_hub("gmenon/slt-lyrics-audio")

Map:   0%|          | 0/795 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/12 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Map:   0%|          | 0/795 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Map:   0%|          | 0/795 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Map:   0%|          | 0/795 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Map:   0%|          | 0/795 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Map:   0%|          | 0/795 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Map:   0%|          | 0/795 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Map:   0%|          | 0/795 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Map:   0%|          | 0/795 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Map:   0%|          | 0/795 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Map:   0%|          | 0/794 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Map:   0%|          | 0/794 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/507 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/631 [00:00<?, ?B/s]

In [None]:
MODEL_BACKBONE = "" #FILL IN
FEATURE_EXTRACTOR_NAME="" #FILL IN
TOKENIZER_NAME="" #FILL IN
model_type_whiser = False #FILL IN
decoder_start_token_id=101 #FILL IN
freeze_feature_encoder=True #FILL IN
freeze_encoder=True  #FILL IN

# Fail early, naming the placeholder(s) still left empty, instead of letting
# from_pretrained("") fail with a less obvious error.
_unfilled = [
    name
    for name, value in (
        ("MODEL_BACKBONE", MODEL_BACKBONE),
        ("FEATURE_EXTRACTOR_NAME", FEATURE_EXTRACTOR_NAME),
        ("TOKENIZER_NAME", TOKENIZER_NAME),
    )
    if not value
]
if _unfilled:
    raise ValueError(f"Fill in {', '.join(_unfilled)} before running this cell.")

# 5. Load pretrained model, tokenizer, and feature extractor
#
# Distributed training:
# The .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.
config = AutoConfig.from_pretrained(MODEL_BACKBONE)
config.update({"forced_decoder_ids": None, "suppress_tokens": None})

# SpecAugment is only switched on for whisper models
config.update({"apply_spec_augment": bool(model_type_whiser)})

feature_extractor = AutoFeatureExtractor.from_pretrained(FEATURE_EXTRACTOR_NAME) #See if some other name is required
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) #See if some other name is required
model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_BACKBONE,config=config)

# Use the value filled in above only when the checkpoint's config does not
# already define a decoder start token.
if model.config.decoder_start_token_id is None:
    model.config.decoder_start_token_id = decoder_start_token_id

if model.config.decoder_start_token_id is None:
    raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")

# NOTE(review): not every speech seq2seq architecture exposes both freeze methods
# (or a `model.model.encoder` attribute) — confirm the flags above match the chosen backbone.
if freeze_feature_encoder:
    model.freeze_feature_encoder()

if freeze_encoder:
    model.freeze_encoder()
    model.model.encoder.gradient_checkpointing = False


In [None]:

# 7. Preprocessing the datasets.
NUM_WORKERS = 4
do_lower_case = True

# Column names of the DatasetDict built earlier in the notebook.
audio_column_name = "audio"
text_column_name = "transcription"

# Accepted clip duration, in seconds.
# NOTE(review): 0-20 s mirrors the defaults of the upstream
# run_speech_recognition_seq2seq example — tune for this dataset.
MIN_DURATION_IN_SECONDS = 0.0
MAX_DURATION_IN_SECONDS = 20.0

# We need to read the audio files as arrays and tokenize the targets.
# Durations are converted to a number of samples at the feature extractor's rate.
max_input_length = MAX_DURATION_IN_SECONDS * feature_extractor.sampling_rate
min_input_length = MIN_DURATION_IN_SECONDS * feature_extractor.sampling_rate
num_workers = NUM_WORKERS

model_input_name = feature_extractor.model_input_names[0]

logger = logging.getLogger(__name__)

# if SpecAugment is used for whisper models, return attention_mask to guide the mask along time axis
forward_attention_mask = (
    getattr(config, "model_type", None) == "whisper"
    and getattr(config, "apply_spec_augment", False)
    and getattr(config, "mask_time_prob", 0) > 0
)


def prepare_dataset(batch):
    """Turn one raw example into model inputs and label ids.

    Reads the decoded audio from `audio_column_name`, runs it through the feature
    extractor, records the raw sample count as `input_length` (used for length
    filtering afterwards) and tokenizes the (optionally lower-cased) text from
    `text_column_name` into `labels`.
    """
    # process audio
    sample = batch[audio_column_name]
    inputs = feature_extractor(
        sample["array"], sampling_rate=sample["sampling_rate"], return_attention_mask=forward_attention_mask
    )
    # process audio length
    batch[model_input_name] = inputs.get(model_input_name)[0]
    batch["input_length"] = len(sample["array"])
    if forward_attention_mask:
        batch["attention_mask"] = inputs.get("attention_mask")[0]

    # process targets
    input_str = batch[text_column_name].lower() if do_lower_case else batch[text_column_name]
    batch["labels"] = tokenizer(input_str).input_ids
    return batch


# The raw columns are dropped so only the model inputs, labels and input_length remain.
vectorized_datasets = audio_dataset.map(
    prepare_dataset,
    remove_columns=next(iter(audio_dataset.values())).column_names,
    num_proc=num_workers,
    desc="preprocess train dataset"
)


# filter data that is shorter than min_input_length or longer than
# max_input_length
def is_audio_in_length_range(length):
    """Return True when `length` (in samples) lies strictly inside the accepted range."""
    return length > min_input_length and length < max_input_length


vectorized_datasets = vectorized_datasets.filter(
    is_audio_in_length_range,
    num_proc=num_workers,
    input_columns=["input_length"],
)

# Report where the preprocessed splits were cached so they can be reused later.
cache = {k: v.cache_files for k, v in vectorized_datasets.items()}
logger.info(f"Data preprocessing finished. Files cached at {cache}.")


In [None]:

# 8. Load Metric
# Word error rate ("wer") loaded through the `evaluate` library; compute_metrics
# below calls metric.compute on the decoded predictions and references.
metric = evaluate.load("wer")

def compute_metrics(pred):
    """Compute the word error rate between generated and reference transcriptions.

    Args:
        pred: Prediction object exposing `predictions` (generated token ids) and
            `label_ids` (reference token ids, with -100 at ignored positions).

    Returns:
        A dict `{"wer": value}` with the value returned by `metric.compute`.
    """
    pred_ids = pred.predictions

    # -100 marks positions ignored by the loss; swap in the pad token so the ids
    # can be decoded. np.where builds a new array, leaving pred.label_ids untouched
    # for the caller.
    label_ids = np.where(pred.label_ids == -100, tokenizer.pad_token_id, pred.label_ids)

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}


In [None]:

# 9. Create a single speech processor
# The notebook runs as a single process, so the feature extractor, tokenizer and
# config are saved directly to output_dir (no main-process guard is needed);
# AutoProcessor then loads them back from that folder as one processor.
feature_extractor.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
config.save_pretrained(output_dir)

processor = AutoProcessor.from_pretrained(output_dir)

# 10. Define data collator
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
    forward_attention_mask=forward_attention_mask,
)


In [None]:

# 11. Initialize Trainer
# The argument-parsing cell is disabled in this notebook, so the training
# arguments are built here. Everything not listed keeps the library default.
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    do_train=True,  #FILL IN
    do_eval=True,  #FILL IN
    predict_with_generate=True,  # compute_metrics decodes generated ids, so generation must be on
    seed=123,  # same seed as the set_seed call earlier in the notebook
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
    eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
    tokenizer=feature_extractor,
    data_collator=data_collator,
    compute_metrics=compute_metrics if training_args.predict_with_generate else None,
)


In [None]:

# 12. Training
# An explicit resume_from_checkpoint on the trainer's arguments wins; otherwise
# fall back to the last checkpoint detected in output_dir (None starts fresh).
resume_from = trainer.args.resume_from_checkpoint
checkpoint = resume_from if resume_from is not None else last_checkpoint

train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model()  # Saves the feature extractor too for easy upload

metrics = train_result.metrics
# No sample cap is configured in this notebook, so the whole split is reported.
metrics["train_samples"] = len(vectorized_datasets["train"])
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()


In [None]:
NUM_BEAMS = 5
# 13. Evaluation
# The logger is created here so this cell does not depend on one defined elsewhere.
logger = logging.getLogger(__name__)

results = {}
logger.info("*** Evaluate ***")
metrics = trainer.evaluate(
    metric_key_prefix="eval",
    num_beams=NUM_BEAMS
)
max_eval_samples = len(vectorized_datasets["eval"])
metrics["eval_samples"] = max_eval_samples

trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)


In [None]:

# 14. Write Training Stats
# Optional Hub id of the training dataset; leave as None to omit dataset
# metadata from the model card.
DATASET_NAME = None  #FILL IN

kwargs = {"finetuned_from": MODEL_BACKBONE, "tasks": "automatic-speech-recognition"}
if DATASET_NAME is not None:
    kwargs["dataset_tags"] = DATASET_NAME
    kwargs["dataset"] = DATASET_NAME

# Either publish to the Hub (which also writes the card) or only write the card locally.
if trainer.args.push_to_hub:
    trainer.push_to_hub(**kwargs)
else:
    trainer.create_model_card(**kwargs)