In [1]:
# Move the working directory one level up; presumably so the project-local
# imports further down (`constants`, `training`) resolve -- TODO confirm.
# NOTE(review): not idempotent -- every re-run of this cell climbs one more level.
import os
os. chdir('../')

In [3]:
# Print the version of the `python` executable found on the shell PATH
# (this may differ from the interpreter backing the kernel).
! python --version

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Python 3.8.6


In [2]:
import re
import torch
import json
import pandas as pd
import argparse
import numpy as np
from torch import nn
from torchmetrics.text import WordErrorRate
from typing import Optional
from pytorch_lightning import LightningModule, LightningDataModule, Trainer
from pytorch_lightning.loggers import WandbLogger
from constants.mir_constants import TrainingArgs, WAV2VEC2_ARGS
from dataclasses import dataclass, asdict, field # type: ignore
from torch.utils.data import Dataset, DataLoader
from transformers import WhisperTokenizer, WhisperFeatureExtractor, Wav2Vec2Processor, BertTokenizer,WhisperForConditionalGeneration,BartForConditionalGeneration
from transformers import AutoTokenizer,AutoModelForCausalLM,AutoModelForCTC, AutoModelForSeq2SeqLM,AutoFeatureExtractor
from datasets import load_dataset, Dataset, Audio
from typing import Any, Dict, List, Optional, Union
from flash.audio import SpeechRecognition, SpeechRecognitionData
from training.wav2vec2_finetune import Wav2Vec2SpeechRecognition, SpeechRecognitionData

In [3]:
# Create a Weights & Biases logger without model-checkpoint upload.
# NOTE(review): `run()` below builds its Trainer without `logger=wandb_logger`,
# so this logger is currently unused by the training started from this notebook.
wandb_logger = WandbLogger(project="SLG - Whisper transfer learning",log_model=False,)

# Dump the shared training configuration dataclass as indented JSON for reference.
print(json.dumps(asdict(WAV2VEC2_ARGS), indent = 4))


[34m[1mwandb[0m: Currently logged in as: [33mgreeshmasmenon[0m ([33msongslyricstranscription[0m). Use [1m`wandb login --relogin`[0m to force relogin


{
    "TRAIN_FILE_PATH": "/scratch/users/gmenon/train_song_metadata_en_demucs_cleaned_filtered_095.csv",
    "TEST_FILE_PATH": "/scratch/users/gmenon/validation_song_metadata_en_demucs_cleaned_filtered_005.csv",
    "MODEL_BACKBONE": "facebook/wav2vec2-large-960h-lv60-self",
    "BATCH_SIZE": 1,
    "NUM_EPOCHS": 15,
    "MODEL_SAVE_PATH": "/scratch/users/gmenon//model_artefacts/wav2vec2_demucs_en_large-960h-lv60-self_freeze_unfreeze_15epochs_adamw.pt",
    "FINETUNE_STRATEGY": [
        "freeze_unfreeze",
        10
    ],
    "LR_SCHEDULER": "reduce_on_plateau_schedule"
}


In [4]:
# Release cached, currently unused GPU memory held by PyTorch's CUDA allocator.
torch.cuda.empty_cache()

In [12]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate speech-to-text examples into a single padded batch.

    Audio features and token labels have different lengths and need different
    padding, so they are padded separately: the audio through
    ``feature_extractor.pad`` and the labels through ``tokenizer.pad``.

    The returned batch contains:

    * ``input_features`` -- whatever ``feature_extractor.pad`` produces,
    * ``labels`` -- padded label ids with padded positions replaced by ``-100``
      and the first position (the start token added by the tokenizer) removed,
    * ``label_attention_mask`` -- the matching 1/0 mask, also without the first
      position.
    """

    # Object exposing ``pad(list_of_dicts, return_tensors=...)`` for audio features.
    # Typed as ``Any`` because the ``Auto*`` factories are never the runtime type.
    feature_extractor: Any
    # Object exposing ``pad(list_of_dicts, padding=..., return_tensors=...)``.
    tokenizer: Any
    # Padding strategy forwarded to ``tokenizer.pad``.
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad ``features`` and return the batch dictionary described on the class.

        Each example must provide ``input_features`` and ``labels``. A
        ``label_attention_mask`` entry may be present but is not needed: the
        mask is taken from the tokenizer's own padding result.
        """
        audio_features = [{"input_features": example["input_features"]} for example in features]
        batch = self.feature_extractor.pad(audio_features, return_tensors="pt")

        label_features = [{"input_ids": example["labels"]} for example in features]
        labels_batch = self.tokenizer.pad(label_features, padding=self.padding, return_tensors="pt")

        # Use the mask produced while padding the ids. Re-padding the stored mask
        # as if it were ``input_ids`` would fill it with ``pad_token_id``, which is
        # only a valid mask value when that id happens to be 0.
        label_mask = labels_batch["attention_mask"]

        # -100 is the index ignored by the cross-entropy loss.
        labels = labels_batch["input_ids"].masked_fill(label_mask.ne(1), -100)

        # The tokenizer prepends a start token; it is re-added when the decoder
        # inputs are built by shifting, so drop it from the targets and the mask.
        batch["labels"] = labels[:, 1:]
        batch["label_attention_mask"] = label_mask[:, 1:]

        return batch



In [13]:
class SpeechRecognitionDataModule(LightningDataModule):
    """Lightning data module that turns the metadata CSVs into padded batches.

    The CSVs named by ``TRAIN_FILE_PATH`` / ``TEST_FILE_PATH`` on the training
    arguments must have the columns ``consolidated_file_path`` (audio file) and
    ``transcription`` (target text). Audio is decoded at 16 kHz and converted
    with a Whisper feature extractor; text is tokenised with the tokenizer of
    ``hparams.lm_model``.

    Args:
        WAV2VEC2_ARGS: Training arguments providing ``BATCH_SIZE``,
            ``TRAIN_FILE_PATH`` and ``TEST_FILE_PATH``.
        num_workers: Worker count passed to every ``DataLoader``.
        hparams: Namespace providing ``lm_model`` and ``whisper_model``.
        train_rows: Maximum number of training rows to use; ``None`` uses all.
        val_rows: Maximum number of validation/test rows to use; ``None`` uses all.
    """

    def __init__(self, WAV2VEC2_ARGS: WAV2VEC2_ARGS, num_workers, hparams,
                 train_rows: Optional[int] = 100, val_rows: Optional[int] = 10):
        super().__init__()
        # Keep the arguments that were passed in; `setup` must not fall back to
        # the module-level global of the same name.
        self.training_args = WAV2VEC2_ARGS
        self.batch_size = WAV2VEC2_ARGS.BATCH_SIZE
        self.num_workers = num_workers
        self.train_rows = train_rows
        self.val_rows = val_rows
        self.tokenizer = AutoTokenizer.from_pretrained(hparams.lm_model)
        self.feature_extractor = WhisperFeatureExtractor.from_pretrained(hparams.whisper_model)
        self.data_collator = DataCollatorSpeechSeq2SeqWithPadding(
            feature_extractor=self.feature_extractor,
            tokenizer=self.tokenizer,
            padding=True,
        )

    def setup(self, stage=None):
        """Build the datasets needed for ``stage`` (``'fit'``, ``'test'`` or ``None`` for both)."""
        validation_df = self._read_metadata(self.training_args.TEST_FILE_PATH, self.val_rows)

        if stage == 'fit' or stage is None:
            print("In Stage = Fit")
            train_df = self._read_metadata(self.training_args.TRAIN_FILE_PATH, self.train_rows)
            self.train_dataset = self._build_split(train_df)
            self.val_dataset = self._build_split(validation_df)

        if stage == 'test' or stage is None:
            print("In Stage = Test")
            # Built from its own frame: the previous code mapped a local that only
            # exists in the 'fit' branch and therefore failed for stage == 'test'.
            self.test_dataset = self._build_split(validation_df)

    @staticmethod
    def _read_metadata(csv_path, max_rows):
        """Read a metadata CSV, optionally keeping only the first ``max_rows`` rows."""
        frame = pd.read_csv(csv_path)
        return frame if max_rows is None else frame.head(max_rows)

    def _build_split(self, metadata_df):
        """Turn a metadata frame into a dataset of model-ready features."""
        raw_split = Dataset.from_dict(
            {
                "audio": list(metadata_df["consolidated_file_path"]),
                "transcription": list(metadata_df["transcription"]),
            }
        ).cast_column("audio", Audio(sampling_rate=16_000))
        # Drop the raw columns so only the fields added by `prepare_dataset` remain.
        return raw_split.map(self.prepare_dataset, remove_columns=raw_split.column_names)

    def _make_loader(self, split):
        """Wrap ``split`` in a ``DataLoader`` that uses the shared collator."""
        return DataLoader(
            split.with_format("torch"),
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            collate_fn=self.data_collator,
        )

    def train_dataloader(self):
        print("entering train data loader")
        return self._make_loader(self.train_dataset)

    def val_dataloader(self):
        print("entering val data loader")
        return self._make_loader(self.val_dataset)

    def test_dataloader(self):
        print("entering test data loader")
        return self._make_loader(self.test_dataset)

    def prepare_dataset(self, batch):
        """Add audio features and tokenised labels to a single example."""
        audio = batch["audio"]
        # The extractor returns a batch of one; index 0 un-batches it again.
        batch["input_features"] = self.feature_extractor(
            audio["array"], sampling_rate=audio["sampling_rate"]
        ).input_features[0]
        # NOTE(review): this is the size of the first axis of the feature matrix,
        # which may be the number of mel bins rather than a time length -- confirm.
        batch["input_length"] = len(batch["input_features"])
        # Tokenise once and reuse both outputs.
        encoded = self.tokenizer(batch["transcription"])
        batch["labels"] = encoded.input_ids
        batch["label_attention_mask"] = encoded.attention_mask
        return batch

In [14]:
class Wav2SeqModel(LightningModule):
    """Whisper encoder feeding a causal language-model decoder via cross-attention.

    The encoder of ``hparams.whisper_model`` produces audio states, a linear
    "bridging" layer projects them to the decoder's hidden size, and the causal
    LM loaded from ``hparams.lm_model`` attends to them while predicting the
    transcription tokens (teacher forcing).

    Args:
        hparams: Namespace providing ``batch_size``, ``learning_rate``,
            ``lm_model`` and ``whisper_model``.
    """

    def __init__(self, hparams):
        super().__init__()
        self.batch_size = hparams.batch_size
        self.learning_rate = hparams.learning_rate
        self.save_hyperparameters()
        self.tokenizer = AutoTokenizer.from_pretrained(hparams.lm_model)
        self.whisper = WhisperForConditionalGeneration.from_pretrained(hparams.whisper_model).model.encoder
        # The decoder flags must be part of the config *while the model is built*:
        # flipping them afterwards (or setting `add_cross_attention` on the model
        # object instead of its config) creates no cross-attention layers, and the
        # encoder states passed in `forward` would be silently ignored.
        self.seq2seq = AutoModelForCausalLM.from_pretrained(
            hparams.lm_model,
            is_decoder=True,
            add_cross_attention=True,
        )
        self.bridging_layer = nn.Linear(self.whisper.config.hidden_size, self.seq2seq.config.hidden_size)

    def forward(self, audio, labels, label_attention_mask):
        """Run encoder, bridge and decoder with teacher forcing.

        Args:
            audio: Input features for the Whisper encoder.
            labels: Target token ids; ``-100`` entries are treated as padding.
            label_attention_mask: Accepted for interface compatibility; not used.
                The loss already ignores ``-100`` targets.

        Returns:
            The decoder's output object (its ``logits`` are used by the steps).
        """
        # Only switches the encoder's layer mode; it does not disable gradients.
        # The decoder's train/eval mode is left to Lightning / the caller so that
        # validation and `model.eval()` inference really run in eval mode.
        self.whisper.eval()
        encoder_hidden_states = self.bridging_layer(self.whisper(audio)[0])
        decoder_input_ids = self.shift_tokens_right(labels)
        return self.seq2seq(
            input_ids=decoder_input_ids,
            encoder_hidden_states=encoder_hidden_states,
        )

    @staticmethod
    def _token_cross_entropy(logits, labels):
        """Mean cross-entropy over all non-ignored tokens, for any batch size."""
        # Flatten (batch, seq, vocab) / (batch, seq) instead of `.squeeze()`, which
        # only produced valid shapes when the batch size was exactly 1.
        return nn.functional.cross_entropy(
            logits.reshape(-1, logits.size(-1)),
            labels.reshape(-1),
            ignore_index=-100,
        )

    def training_step(self, batch, batch_idx):
        """Return the token-level cross-entropy loss for one training batch."""
        labels = batch["labels"]
        logits = self(batch["input_features"], labels, batch["label_attention_mask"]).logits
        return self._token_cross_entropy(logits, labels)

    def validation_step(self, batch, batch_idx):
        """Log ``val_loss`` and print the first reference/prediction pair of the batch."""
        labels = batch["labels"]
        logits = self(batch["input_features"], labels, batch["label_attention_mask"]).logits
        loss = self._token_cross_entropy(logits, labels)
        predicted_ids = torch.argmax(logits, dim=-1)
        print(f"original text = {self.tokenizer.decode(labels[0],skip_special_tokens=False)}, labels = {labels[0]}")
        print(f"Predicted text = {self.tokenizer.decode(predicted_ids[0],skip_special_tokens=False)}, predicted ids = {predicted_ids[0]}")
        self.log('val_loss', loss, on_step=True, on_epoch=True)

    def configure_optimizers(self):
        """Adam over all parameters with the configured learning rate."""
        print("Entering Optimization Step")
        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)

    @staticmethod
    def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int=0, decoder_start_token_id: int=101):
        """Build decoder inputs by shifting ``input_ids`` one position to the right.

        Position 0 becomes ``decoder_start_token_id`` and any ``-100`` carried over
        from the labels is replaced by ``pad_token_id``.

        Raises:
            ValueError: If ``decoder_start_token_id`` or ``pad_token_id`` is ``None``.
        """
        shifted_input_ids = input_ids.new_zeros(input_ids.shape)
        shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
        if decoder_start_token_id is None:
            raise ValueError("Make sure to set the decoder_start_token_id attribute of the model's configuration.")
        shifted_input_ids[:, 0] = decoder_start_token_id

        if pad_token_id is None:
            raise ValueError("Make sure to set the pad_token_id attribute of the model's configuration.")
        # replace possible -100 values in labels by `pad_token_id`
        shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

        return shifted_input_ids

    @staticmethod
    def shift_tokens_right_mask(input_ids: torch.Tensor):
        """Shift a mask one position to the right, filling position 0 with 0.

        NOTE(review): a 0 in position 0 marks the decoder start token as masked;
        confirm that is intended before feeding the result to a model.
        """
        shifted_mask = input_ids.new_zeros(input_ids.shape)
        shifted_mask[:, 1:] = input_ids[:, :-1].clone()
        shifted_mask[:, 0] = 0
        return shifted_mask

In [15]:
def run(hparams):
    """Train a ``Wav2SeqModel`` on a single GPU and return ``(model, trainer)``.

    Args:
        hparams: Namespace passed to the model and the data module. An optional
            ``max_epochs`` attribute sets the number of epochs; it defaults to 1.
    """
    print(hparams)
    model = Wav2SeqModel(hparams)
    # Honour `hparams.max_epochs` when present instead of always training 1 epoch.
    trainer = Trainer(max_epochs=getattr(hparams, "max_epochs", 1), devices=1, accelerator="gpu")
    datamodule = SpeechRecognitionDataModule(WAV2VEC2_ARGS, num_workers=4, hparams=hparams)
    trainer.fit(model, datamodule)
    return model, trainer

In [16]:
# Hyper-parameters for this experiment, collected in one namespace.
hparams = argparse.Namespace(
    wav2vec2_model='facebook/wav2vec2-large-960h-lv60-self',
    whisper_model='openai/whisper-large-v2',  # alternative: 'openai/whisper-large'
    lm_model='bert-base-uncased',
    vocab_size=20000,
    learning_rate=1e-7,
    batch_size=1,
)

# Train and keep both objects around for the inspection cells that follow.
model, trainer = run(hparams=hparams)

Namespace(batch_size=1, learning_rate=1e-07, lm_model='bert-base-uncased', vocab_size=20000, wav2vec2_model='facebook/wav2vec2-large-960h-lv60-self', whisper_model='openai/whisper-large-v2')


KeyboardInterrupt: 

In [None]:
# Rebuild the preprocessing objects outside the Trainer so batches can be
# inspected by hand in the following cells.
feature_extractor = WhisperFeatureExtractor.from_pretrained(hparams.whisper_model)
tokenizer = AutoTokenizer.from_pretrained(hparams.lm_model)
data_collator = DataCollatorSpeechSeq2SeqWithPadding(feature_extractor=feature_extractor, tokenizer=tokenizer, padding=True)
dataset = SpeechRecognitionDataModule(WAV2VEC2_ARGS,num_workers=4,hparams=hparams)
# Called without a stage, so the 'fit' and 'test' branches of setup both run.
dataset.setup()

In [None]:

# Fetch one validation batch and run a single forward pass on it.
data = next(iter(DataLoader(
            dataset.val_dataset.with_format("torch"),
            batch_size=1,
            num_workers=1,
            collate_fn = data_collator)))
model.eval()
# Token ids must be an integer tensor; build it directly on the model's device.
prompt_ids = torch.tensor([[101,0,0,0,0,0,0,0]], dtype=torch.long, device=model.device)
# Inference only: skip autograd bookkeeping and keep every input on the same
# device as the model.
with torch.no_grad():
    output = model(
        data["input_features"].to(model.device),
        prompt_ids,
        data["label_attention_mask"].to(model.device),
    )
tokenizer.decode(torch.argmax(output.logits,dim=-1)[0]),tokenizer.decode(data["labels"][0])

In [None]:
# Show the label ids of the batch fetched above.
data["labels"]

In [None]:
# Check which type the model returns for its logits.
type(output.logits)

In [None]:
# Hand-made label ids: the start token (101) followed by padding.
# The closing parenthesis was missing, and token ids must be integers,
# so build the tensor with an explicit integer dtype.
labels = torch.tensor([[101, 0, 0, 0, 0]], dtype=torch.long)

In [None]:
# Decode the first validation example's label ids back to text.
# (`input_features` holds audio features, not token ids, so it cannot be
# passed to the text tokenizer.)
tokenizer.decode(dataset.val_dataset[0]["labels"])

In [None]:
# {'input_features': tensor([[[-0.0521, -0.1011, -0.1179,  ..., -0.6646, -0.6646, -0.6646],
#          [-0.2894, -0.3708, -0.3870,  ..., -0.6646, -0.6646, -0.6646],
#          [-0.3774, -0.6646, -0.5762,  ..., -0.6646, -0.6646, -0.6646],
#          ...,
#          [-0.3111, -0.6646, -0.6646,  ..., -0.6646, -0.6646, -0.6646],
#          [-0.3033, -0.6646, -0.6646,  ..., -0.6646, -0.6646, -0.6646],
#          [-0.2985, -0.6646, -0.6646,  ..., -0.6646, -0.6646, -0.6646]]],
#        device='cuda:0'), 'labels': tensor([[ 2061,  4553,  2013,  2115, 12051,   102]], device='cuda:0'), 'label_attention_mask': tensor([[1, 1, 1, 1, 1, 0]], device='cuda:0')}

In [None]:
# WhisperConfig {
#   "_name_or_path": "openai/whisper-large",
#   "activation_dropout": 0.0,
#   "activation_function": "gelu",
#   "apply_spec_augment": false,
#   "architectures": [
#     "WhisperForConditionalGeneration"
#   ],
#   "attention_dropout": 0.0,
#   "begin_suppress_tokens": [
#     220,
#     50257
#   ],
#   "bos_token_id": 50257,
#   "classifier_proj_size": 256,
#   "d_model": 1280,
#   "decoder_attention_heads": 20,
#   "decoder_ffn_dim": 5120,
#   "decoder_layerdrop": 0.0,
#   "decoder_layers": 32,
#   "decoder_start_token_id": 50258,
#   "dropout": 0.0,
#   "encoder_attention_heads": 20,
#   "encoder_ffn_dim": 5120,
#   "encoder_layerdrop": 0.0,
#   "encoder_layers": 32,
#   "eos_token_id": 50257,
#   "forced_decoder_ids": [
#     [
#       1,
#       50358
#     ],
#     [
#       2,
#       50363
#     ]
#   ],
#   "init_std": 0.02,
#   "is_encoder_decoder": true,
#   "mask_feature_length": 10,
#   "mask_feature_min_masks": 0,
#   "mask_feature_prob": 0.0,
#   "mask_time_length": 10,
#   "mask_time_min_masks": 2,
#   "mask_time_prob": 0.05,
#   "max_length": 448,
#   "max_source_positions": 1500,
#   "max_target_positions": 448,
#   "median_filter_width": 7,
#   "model_type": "whisper",
#   "num_hidden_layers": 32,
#   "num_mel_bins": 80,
#   "pad_token_id": 50257,
#   "scale_embedding": false,
#   "suppress_tokens": [
#     1,
#     2,
#     7,
#     8,
#     9,
#     10,
#     14,
#     25,
#     26,
#     27,
#     28,
#     29,
#     31,
#     58,
#     59,
#     60,
#     61,
#     62,
#     63,
#     90,
#     91,
#     92,
#     93,
#     359,
#     503,
#     522,
#     542,
#     873,
#     893,
#     902,
#     918,
#     922,
#     931,
#     1350,
#     1853,
#     1982,
#     2460,
#     2627,
#     3246,
#     3253,
#     3268,
#     3536,
#     3846,
#     3961,
#     4183,
#     4667,
#     6585,
#     6647,
#     7273,
#     9061,
#     9383,
#     10428,
#     10929,
#     11938,
#     12033,
#     12331,
#     12562,
#     13793,
#     14157,
#     14635,
#     15265,
#     15618,
#     16553,
#     16604,
#     18362,
#     18956,
#     20075,
#     21675,
#     22520,
#     26130,
#     26161,
#     26435,
#     28279,
#     29464,
#     31650,
#     32302,
#     32470,
#     36865,
#     42863,
#     47425,
#     49870,
#     50254,
#     50258,
#     50358,
#     50359,
#     50360,
#     50361,
#     50362
#   ],
#   "torch_dtype": "float32",
#   "transformers_version": "4.33.0.dev0",
#   "use_cache": true,
#   "use_weighted_layer_sum": false,
#   "vocab_size": 51865
# }

In [1]:
# Switch to the project's `src` directory and import the training module.
# NOTE(review): the path is absolute and user-specific; a later cell imports the
# same module as `from training import lyrics_finetune` -- confirm which form is
# correct for the current layout.
import os
os. chdir('/home/users/gmenon/workspace/songsLyricsGenerator/src')
import lyrics_finetune

[34m[1mwandb[0m: Currently logged in as: [33mgreeshmasmenon[0m ([33msongslyricstranscription[0m). Use [1m`wandb login --relogin`[0m to force relogin


{
    "TRAIN_FILE_PATH": "/scratch/users/gmenon/train_song_metadata_en_demucs_cleaned_filtered_095.csv",
    "TEST_FILE_PATH": "/scratch/users/gmenon/validation_song_metadata_en_demucs_cleaned_filtered_005.csv",
    "MODEL_BACKBONE": "facebook/wav2vec2-large-960h-lv60-self",
    "BATCH_SIZE": 1,
    "NUM_EPOCHS": 15,
    "MODEL_SAVE_PATH": "/scratch/users/gmenon//model_artefacts/wav2vec2_demucs_en_large-960h-lv60-self_freeze_unfreeze_15epochs_adamw.pt",
    "FINETUNE_STRATEGY": [
        "freeze_unfreeze",
        10
    ],
    "LR_SCHEDULER": "reduce_on_plateau_schedule"
}
Namespace(batch_size=1, learning_rate=1e-05, lm_model='bert-base-uncased', max_epochs=3, vocab_size=20000, wav2vec2_model='facebook/wav2vec2-large-960h-lv60-self', whisper_model='openai/whisper-large-v2')


If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In Stage = Fit


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type            | Params
---------------------------------------------------
0 | whisper        | WhisperEncoder  | 636 M 
1 | seq2seq        | BertLMHeadModel | 109 M 
2 | bridging_layer | Linear          | 983 K 
3 | wer            | WordErrorRate   | 0     
---------------------------------------------------
747 M     Trainable params
0         Non-trainable params
747 M     Total params
2,989.131 Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Entering Optimization Step


Sanity Checking: 0it [00:00, ?it/s]

entering val data loader
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


original text = so learn from your mistakes [SEP], labels = tensor([ 2061,  4553,  2013,  2115, 12051,   102], device='cuda:0')
Predicted text = and and and and its and, predicted ids = tensor([1998, 1998, 1998, 1998, 2049, 1998], device='cuda:0')
original text = i've been connected to the right line [SEP], labels = tensor([1045, 1005, 2310, 2042, 4198, 2000, 1996, 2157, 2240,  102],
       device='cuda:0')
Predicted text = and.. t been. to the right line, predicted ids = tensor([1998, 1012, 1012, 1056, 2042, 1012, 2000, 1996, 2157, 2240],
       device='cuda:0')
entering train data loader


  rank_zero_warn(


Training: 0it [00:00, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Validation: 0it [00:00, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


original text = so learn from your mistakes [SEP], labels = tensor([ 2061,  4553,  2013,  2115, 12051,   102], device='cuda:0')
Predicted text = and i i i i and, predicted ids = tensor([1998, 1045, 1045, 1045, 1045, 1998], device='cuda:0')
original text = i've been connected to the right line [SEP], labels = tensor([1045, 1005, 2310, 2042, 4198, 2000, 1996, 2157, 2240,  102],
       device='cuda:0')
Predicted text = and........., predicted ids = tensor([1998, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012],
       device='cuda:0')
original text = the truth to be found [SEP], labels = tensor([1996, 3606, 2000, 2022, 2179,  102], device='cuda:0')
Predicted text = and ". " be., predicted ids = tensor([1998, 1000, 1012, 1000, 2022, 1012], device='cuda:0')
original text = he said the way myblue eyes shined [SEP], labels = tensor([ 2002,  2056,  1996,  2126,  2026, 16558,  5657,  2159, 12342,  2094,
          102], device='cuda:0')
Predicted text = and.. "..... " ", predicted ids = ten

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Validation: 0it [00:00, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


original text = so learn from your mistakes [SEP], labels = tensor([ 2061,  4553,  2013,  2115, 12051,   102], device='cuda:0')
Predicted text = . i i i you., predicted ids = tensor([1012, 1045, 1045, 1045, 2017, 1012], device='cuda:0')
original text = i've been connected to the right line [SEP], labels = tensor([1045, 1005, 2310, 2042, 4198, 2000, 1996, 2157, 2240,  102],
       device='cuda:0')
Predicted text = ...''.. ".., predicted ids = tensor([1012, 1012, 1012, 1005, 1005, 1012, 1012, 1000, 1012, 1012],
       device='cuda:0')
original text = the truth to be found [SEP], labels = tensor([1996, 3606, 2000, 2022, 2179,  102], device='cuda:0')
Predicted text = . " " " " ", predicted ids = tensor([1012, 1000, 1000, 1000, 1000, 1000], device='cuda:0')
original text = he said the way myblue eyes shined [SEP], labels = tensor([ 2002,  2056,  1996,  2126,  2026, 16558,  5657,  2159, 12342,  2094,
          102], device='cuda:0')
Predicted text = .. " " ".. " " " ", predicted ids = tensor

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Validation: 0it [00:00, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


original text = so learn from your mistakes [SEP], labels = tensor([ 2061,  4553,  2013,  2115, 12051,   102], device='cuda:0')
Predicted text = and'i and you., predicted ids = tensor([1998, 1005, 1045, 1998, 2017, 1012], device='cuda:0')
original text = i've been connected to the right line [SEP], labels = tensor([1045, 1005, 2310, 2042, 4198, 2000, 1996, 2157, 2240,  102],
       device='cuda:0')
Predicted text = and''''.'' '., predicted ids = tensor([1998, 1005, 1005, 1005, 1005, 1012, 1005, 1005, 1005, 1012],
       device='cuda:0')
original text = the truth to be found [SEP], labels = tensor([1996, 3606, 2000, 2022, 2179,  102], device='cuda:0')
Predicted text = and'''' ', predicted ids = tensor([1998, 1005, 1005, 1005, 1005, 1005], device='cuda:0')
original text = he said the way myblue eyes shined [SEP], labels = tensor([ 2002,  2056,  1996,  2126,  2026, 16558,  5657,  2159, 12342,  2094,
          102], device='cuda:0')
Predicted text = and.. '. '...'', predicted ids = tensor(

`Trainer.fit` stopped: `max_epochs=3` reached.


	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# Switch to the project's `src` directory and import the training module
# from the `training` package.
# NOTE(review): the path is absolute and user-specific.
import os
os. chdir('/home/users/gmenon/workspace/songsLyricsGenerator/src')
from training import lyrics_finetune
# NOTE(review): the next cell uses `argparse`; it is only in scope if the import
# cell near the top of the notebook has been run, because the import below is
# commented out.
#from constants.mir_constants import TrainingArgs, WAV2VEC2_ARGS
#import argparse


In [None]:
# Hyper-parameters for the run that goes through the `lyrics_finetune` module.
hparams = argparse.Namespace(
    wav2vec2_model='facebook/wav2vec2-large-960h-lv60-self',
    whisper_model='openai/whisper-large-v2',  # alternative: 'openai/whisper-large'
    lm_model='bert-base-uncased',
    vocab_size=20000,
    learning_rate=1e-6,
    batch_size=1,
)

# Train via the module's entry point and keep both returned objects.
model, trainer = lyrics_finetune.run(hparams=hparams)