In [2]:
from datasets import load_dataset, Dataset, Audio
from transformers import AutoProcessor, WhisperModel, AutoTokenizer, AutoModelForCausalLM
import torch
import wave

  from .autonotebook import tqdm as notebook_tqdm
2024-06-03 17:41:07.978106: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-03 17:41:08.201891: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Load the Audio Data
list_of_audio_files = ["data/sub/De95Osq7p1c_trimmed_segment_1.wav", "data/sub/De95Osq7p1c_trimmed_segment_2.wav"]

In [3]:
processor = AutoProcessor.from_pretrained('whisper-medium', local_files_only=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
model = WhisperModel.from_pretrained('whisper-medium', local_files_only=True).to("cuda")

# Experimenting Adaptors

In [6]:
from datasets import Dataset, Audio
from transformers import AutoProcessor, WhisperModel, AutoTokenizer, AutoModelForCausalLM, AutoModel
import torch
import pytorch_lightning as pl


class TranslateModel(pl.LightningModule):
    def __init__(self, audio_encoder="./whisper-medium", llm="./sea-lion-7b-instruct"):
        super().__init__()

        # Device
        self.device_type = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Load the Whisper model and processor
        print("Loading Audio Encoder")
        self.audio_processor = AutoProcessor.from_pretrained(audio_encoder, local_files_only=True)
        # self.audio_encoder = WhisperModel.from_pretrained(audio_encoder, local_files_only=True).to(self.device_type)

        # Define the Adaptor
        self.adaptor = torch.nn.Linear(1024, 4096)  # Do we need bias?

        # Load the LLM and its tokenizer
        print("Loading LLM")

        self.generation_kwargs = {
            "do_sample": False,  # set to true if temperature is not 0
            "temperature": None,
            "max_new_tokens": 256,
            "top_k": 50,
            "top_p": 0.7,
            "repetition_penalty": 1.2,
        }
        
        self.tokenizer = AutoTokenizer.from_pretrained(
            llm, 
            trust_remote_code=True,
            local_files_only=True
        )

        # self.llm = AutoModelForCausalLM.from_pretrained(
        #     llm,
        #     trust_remote_code=True,
        #     device_map="auto",
        #     local_files_only=True
        # )

        #self.prefix_embeddings = self.embed_prompt_tokens("### USER:\nTranslate the following to English. ")
        #self.suffix_embeddings = self.embed_prompt_tokens(" \n\n### RESPONSE:\n")


    def forward(self, list_audio_filepaths):
        # Encode Audio
        # (batch_size, 1500, 1024)
        audio_embeddings = self.process_and_encode_audio(list_audio_filepaths)

        # Adapt audio embeddings
        adapted_audio_embeddings = self.adaptor(audio_embeddings).to()

        # Concat audio embeddings with prompt
        input_embeddings = torch.cat([self.prefix_embeddings.unsqueeze(0), adapted_audio_embeddings, self.suffix_embeddings.unsqueeze(0)], dim=1)

        # Feed into LLM
        tokenised_output = self.llm.generate(
            input_embeds = input_embeddings,
            **self.generation_kwargs
        )

        # Get translated output
        translated_output = self.tokenizer.decode(
            tokenised_output[0], 
            skip_special_tokens= True
        )

        return translated_output

    def process_and_encode_audio(self, list_audio_filepaths):
        print("Loading Dataset")

        def prepare_dataset(batch):
            audio = batch["audio"]
            batch["input_features"] = self.audio_processor.feature_extractor(
                audio["array"],
                sampling_rate=audio["sampling_rate"],
                return_tensors='pt'
            )['input_features'][0]
            return batch

        audio_dataset = Dataset.from_dict({
            "audio": list_audio_filepaths
        })

        
        audio_dataset = audio_dataset.cast_column("audio", Audio(sampling_rate=16000))
        print(audio_dataset[0])

        # Maps the audio files into Huggingface Dataset Format
        print("Mapping")
        audio_dataset = audio_dataset.map(prepare_dataset)

        
        print("Mapped")
        inputs = torch.tensor(audio_dataset['input_features']).to(self.device_type)

        return inputs

        # # Ensuring No Gradient Updates during Encoding
        # with torch.no_grad():
        #     encoder_outputs = self.audio_encoder.encoder(inputs, output_hidden_states=True)

        # return encoder_outputs.last_hidden_state
    
    # def embed_prompt_tokens(self, string):
    #     tokens = self.tokenizer(string, return_tensors="pt")
    #     token_embeddings = self.llm.transformer.wte(tokens['input_ids'])
    #     return token_embeddings
    

    # def training_step(self, batch, batch_idx):
    #     # Define the training step logic
    #     inputs, target = batch
    #     output = self(inputs, target)

    #     # Return loss
    #     pass

    # def configure_optimizers(self):
    #     # Define the optimizers and schedulers
    #     pass




In [7]:
import os

In [8]:
os.cpu_count()

28

In [7]:
translate = TranslateModel()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading Audio Encoder
Loading LLM


In [8]:
list_of_audio_files

['data/sub/De95Osq7p1c_trimmed_segment_1.wav',
 'data/sub/De95Osq7p1c_trimmed_segment_2.wav']

In [9]:
translate.process_and_encode_audio(list_of_audio_files)

Loading Dataset
{'audio': {'path': 'data/sub/De95Osq7p1c_trimmed_segment_1.wav', 'array': array([ 0.        ,  0.        ,  0.        , ..., -0.00323486,
        0.00119019, -0.00064087]), 'sampling_rate': 16000}}
Mapping


Map: 100%|██████████| 2/2 [00:00<00:00, 18.58 examples/s]


Mapped


tensor([[[-0.8276, -0.8276, -0.8276,  ..., -0.8276, -0.8276, -0.8276],
         [-0.8276, -0.8276, -0.8276,  ..., -0.8276, -0.8276, -0.8276],
         [-0.8276, -0.8276, -0.8276,  ..., -0.8276, -0.8276, -0.8276],
         ...,
         [-0.8276, -0.8276, -0.8276,  ..., -0.8276, -0.8276, -0.8276],
         [-0.8276, -0.8276, -0.8276,  ..., -0.8276, -0.8276, -0.8276],
         [-0.8276, -0.8276, -0.8276,  ..., -0.8276, -0.8276, -0.8276]],

        [[ 0.2992,  0.0724,  0.1169,  ..., -0.8499, -0.8499, -0.8499],
         [ 0.2086,  0.0056,  0.0539,  ..., -0.8499, -0.8499, -0.8499],
         [ 0.1671,  0.0290,  0.0586,  ..., -0.8499, -0.8499, -0.8499],
         ...,
         [-0.0828,  0.0063, -0.0152,  ..., -0.8499, -0.8499, -0.8499],
         [-0.1165, -0.1588, -0.1714,  ..., -0.8499, -0.8499, -0.8499],
         [-0.1584, -0.2315, -0.2529,  ..., -0.8499, -0.8499, -0.8499]]],
       device='cuda:0')

: 

In [5]:
audio_dataset = Dataset.from_dict({
            "audio": list_of_audio_files
        })

audio_dataset = audio_dataset.cast_column("audio", Audio(sampling_rate=16000))

In [6]:

for row in audio_dataset:
    input = processor(
        row["audio"]["array"],
        return_tensors="pt",
        return_attention_mask=True,
        sampling_rate=16000
    )

input


{'input_features': tensor([[[ 0.2992,  0.0724,  0.1169,  ..., -0.8499, -0.8499, -0.8499],
         [ 0.2086,  0.0056,  0.0539,  ..., -0.8499, -0.8499, -0.8499],
         [ 0.1671,  0.0290,  0.0586,  ..., -0.8499, -0.8499, -0.8499],
         ...,
         [-0.0828,  0.0063, -0.0152,  ..., -0.8499, -0.8499, -0.8499],
         [-0.1165, -0.1588, -0.1714,  ..., -0.8499, -0.8499, -0.8499],
         [-0.1584, -0.2315, -0.2529,  ..., -0.8499, -0.8499, -0.8499]]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0]], dtype=torch.int32)}

In [12]:
row = audio_dataset[0]

inputs = processor(
    row["audio"]["array"],
    return_tensors="pt",
    return_attention_mask=True,
    sampling_rate=16000
)

In [13]:
inputs

{'input_features': tensor([[[-0.8276, -0.8276, -0.8276,  ..., -0.8276, -0.8276, -0.8276],
         [-0.8276, -0.8276, -0.8276,  ..., -0.8276, -0.8276, -0.8276],
         [-0.8276, -0.8276, -0.8276,  ..., -0.8276, -0.8276, -0.8276],
         ...,
         [-0.8276, -0.8276, -0.8276,  ..., -0.8276, -0.8276, -0.8276],
         [-0.8276, -0.8276, -0.8276,  ..., -0.8276, -0.8276, -0.8276],
         [-0.8276, -0.8276, -0.8276,  ..., -0.8276, -0.8276, -0.8276]]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0]], dtype=torch.int32)}