## Fine Tune Testing

In [1]:
# installations
!pip install torch
!pip install huggingface_hub
!pip install datasets
!pip install transformers
!pip install numpy==1.22.4
!pip install scipy==1.13.1
!pip install pandas==2.2.2
!pip install opencv-python==4.10.0.82
!pip install numba==0.60.0
!pip install evaluate
!pip install jiwer
!pip install transformers==4.31.0 accelerate==0.30.1
!pip3 install deepspeed
!pip install ipywidgets
!pip install --upgrade transformers huggingface_hub

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip instal

In [2]:
import torch

mdl = 'openai/whisper-large-v2'
dts = 'Jzuluaga/atcosim_corpus'
opd = './' + mdl.split('/')[-1] + '-' + dts.split('/')[-1]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print('Training Model : {}'.format(mdl))
print('On Dataset     : {}'.format(dts))
print('Output Dir.    : {}'.format(opd))
print('Device         : {}'.format(device))

Training Model : openai/whisper-large-v2
On Dataset     : Jzuluaga/atcosim_corpus
Output Dir.    : ./whisper-large-v2-atcosim_corpus
Device         : cuda


### Initializing Hugging Face

In [3]:
import os

os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "9991"  # modify if RuntimeError: Address already in use
os.environ["RANK"] = "0"
os.environ["LOCAL_RANK"] = "0"
os.environ["WORLD_SIZE"] = "1"

In [4]:
from dotenv import load_dotenv
import os
from huggingface_hub import login

load_dotenv()
token = os.getenv("HF_TOKEN")
login(token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/theresa_lyu/.cache/huggingface/token
Login successful


### Load Dataset

In [5]:
from datasets import load_dataset, DatasetDict

dataset = DatasetDict()

dataset['train'] = load_dataset(dts, split="train")
dataset['test']  = load_dataset(dts, split="test")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'audio', 'text', 'segment_start_time', 'segment_end_time', 'duration'],
        num_rows: 7638
    })
    test: Dataset({
        features: ['id', 'audio', 'text', 'segment_start_time', 'segment_end_time', 'duration'],
        num_rows: 1901
    })
})


### Import pretrained Whisper models

In [6]:
from transformers import WhisperFeatureExtractor

feature_extractor = WhisperFeatureExtractor.from_pretrained(mdl)

from transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained(mdl, language="English", task="transcribe")

from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained(mdl, language="English", task="transcribe")

# make sure sampling rate is 16k otherwise incompatible
from datasets import Audio

dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

dataset

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


DatasetDict({
    train: Dataset({
        features: ['id', 'audio', 'text', 'segment_start_time', 'segment_end_time', 'duration'],
        num_rows: 7638
    })
    test: Dataset({
        features: ['id', 'audio', 'text', 'segment_start_time', 'segment_end_time', 'duration'],
        num_rows: 1901
    })
})

In [7]:
def prepare_dataset(batch):
    # load and resample audio data from 48 to 16kHz
    audio = batch["audio"]

    # compute log-Mel input features from input audio array
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

    # encode target text to label ids
    batch["labels"] = tokenizer(batch["text"]).input_ids
    return batch

# num_proc has to be one, because os.fork is incompatible with multithreaded code
dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names["train"], num_proc=1)

dataset["train"] = dataset["train"].select(range(80))
dataset["test"] = dataset["test"].select(range(20))

# dataset should now contain 'input_features' and 'labels'
dataset

DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 80
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 20
    })
})

In [8]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [9]:
import evaluate

metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [10]:
import json

deepspeed_config = {
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },
    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "last_batch_iteration": -1,
            "total_num_steps": "auto",
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": True
        },
        "allgather_partitions": True,
        "allgather_bucket_size": 2e8,
        "overlap_comm": True,
        "reduce_scatter": True,
        "reduce_bucket_size": 2e8,
        "contiguous_gradients": True
    },
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto"
}

# Save DeepSpeed configuration to a JSON file
import json
deepspeed_config_path = 'deepspeed_config.json'
with open(deepspeed_config_path, 'w') as f:
    json.dump(deepspeed_config, f)

In [11]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir=opd,
    num_train_epochs=50,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=1,  # [gradient_accumulation_steps] * [Num of GPUs] = 64
    learning_rate=1e-5,
    warmup_steps=250,
    max_steps=-1,
    gradient_checkpointing=True,
    # fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1000,
    eval_steps=250,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    # push_to_hub=True,
    deepspeed=deepspeed_config_path,
)



[2024-07-12 03:09:27,742] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


[2024-07-12 03:09:28,324] [INFO] [comm.py:637:init_distributed] cdb=None
[2024-07-12 03:09:28,325] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl


In [12]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained(mdl)
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

model.to(device)

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1280)
      (layers): ModuleList(
        (0-31): 32 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (fc2): Linear(in_features=5120, out_features=1280, bias

In [13]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

torch.cuda.empty_cache()

In [14]:
import transformers

transformers.logging.set_verbosity_info()
trainer.train()
trainer.save_model(opd)

kwargs = {
    "dataset_tags": dts.split('/')[-1],
    "dataset": "ATCOSIM - CORPUS",
    "dataset_args": "config: en, split: train",
    "language": "en",
    "model_name": "Whisper Large - ATCOSIM - CORPUS",
    "finetuned_from": mdl,
    "tasks": "automatic-speech-recognition",
    "tags": "hf-asr-leaderboard",
}
trainer.push_to_hub()

[2024-07-12 03:09:30,864] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.14.4, git-hash=unknown, git-branch=unknown
[2024-07-12 03:09:31,303] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
Installed CUDA version 11.5 does not match the version torch was compiled with 11.8 but since the APIs are compatible, accepting this combination


Using /home/theresa_lyu/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...
Emitting ninja build file /home/theresa_lyu/.cache/torch_extensions/py310_cu118/cpu_adam/build.ninja...
Building extension module cpu_adam...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


ninja: no work to do.
Time to load cpu_adam op: 0.31367015838623047 seconds
[2024-07-12 03:09:31,788] [INFO] [logging.py:96:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adamw as basic optimizer
Adam Optimizer #0 is created with AVX2 arithmetic capability.
Config: alpha=0.000010, betas=(0.900000, 0.999000), weight_decay=0.000000, adam_w=1
[2024-07-12 03:09:31,789] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer


Loading extension module cpu_adam...


[2024-07-12 03:09:31,940] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = DeepSpeedCPUAdam
[2024-07-12 03:09:31,941] [INFO] [utils.py:56:is_zero_supported_optimizer] Checking ZeRO support for optimizer=DeepSpeedCPUAdam type=<class 'deepspeed.ops.adam.cpu_adam.DeepSpeedCPUAdam'>
[2024-07-12 03:09:31,941] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.float32 ZeRO stage 2 optimizer
[2024-07-12 03:09:31,942] [INFO] [stage_1_and_2.py:148:__init__] Reduce bucket size 200000000
[2024-07-12 03:09:31,943] [INFO] [stage_1_and_2.py:149:__init__] Allgather bucket size 200000000
[2024-07-12 03:09:31,943] [INFO] [stage_1_and_2.py:150:__init__] CPU Offload: True
[2024-07-12 03:09:31,944] [INFO] [stage_1_and_2.py:151:__init__] Round robin gradient partitioning: False
[2024-07-12 03:09:43,615] [INFO] [utils.py:781:see_memory_usage] Before initializing optimizer states
[2024-07-12 03:09:43,616] [INFO] [utils.py:782:see_memory_usage] MA 6.0 GB         Max_MA 6.0 GB         

***** Running training *****
  Num examples = 80
  Num Epochs = 50
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 150
  Number of trainable parameters = 1,541,384,960
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss


Saving model checkpoint to ./whisper-large-v2-atcosim_corpus/checkpoint-150
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}
Configuration saved in ./whisper-large-v2-atcosim_corpus/checkpoint-150/config.json
Configuration saved in ./whisper-large-v2-atcosim_corpus/checkpoint-150/generation_config.json
The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 2 checkpoint shards. You can find where each parameters has been saved in the index located at ./whisper-large-v2-atcosim_corpus/checkpoint-150/model.safetensors.index.json.
Feature extractor saved in ./whisper-large-v2-atcosim_corpus/checkpoint-150/preprocessor_config.json


[2024-07-12 03:44:51,172] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step150 is about to be saved!
[2024-07-12 03:44:51,187] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: ./whisper-large-v2-atcosim_corpus/checkpoint-150/global_step150/mp_rank_00_model_states.pt
[2024-07-12 03:44:51,187] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ./whisper-large-v2-atcosim_corpus/checkpoint-150/global_step150/mp_rank_00_model_states.pt...
[2024-07-12 03:45:01,106] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ./whisper-large-v2-atcosim_corpus/checkpoint-150/global_step150/mp_rank_00_model_states.pt.
[2024-07-12 03:45:01,109] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ./whisper-large-v2-atcosim_corpus/checkpoint-150/global_step150/zero_pp_rank_0_mp_rank_00_optim_states.pt...
[2024-07-12 03:45:20,803] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ./whisper-large-v2-atcosim_corpus/checkpoint-150/global_step1



Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./whisper-large-v2-atcosim_corpus
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}
Configuration saved in ./whisper-large-v2-atcosim_corpus/config.json
Configuration saved in ./whisper-large-v2-atcosim_corpus/generation_config.json
The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 2 checkpoint shards. You can find where each parameters has been saved in the index located at ./whisper-large-v2-atcosim_corpus/model.safetensors.index.json.
Feature extractor saved in ./whisper-large-v2-atcosim_corpus/preprocessor_config.json
Saving model checkpoint to ./whisper-large-v2-atcosim_corpus
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}
Configuration saved in ./whisper-large-v2-atcosim_corpus/config.json
Conf

Upload 5 LFS files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.45G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

events.out.tfevents.1720753789.watvis-whale.344117.0:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

events.out.tfevents.1720753555.watvis-whale.343639.0:   0%|          | 0.00/22.7k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/6.97k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/daisyyedda/whisper-large-v2-atcosim_corpus/commit/f84eff1189ddd6a18022b7c00e3a98e7b8d9a532', commit_message='End of training', commit_description='', oid='f84eff1189ddd6a18022b7c00e3a98e7b8d9a532', pr_url=None, pr_revision=None, pr_num=None)