In [3]:
!pip install accelerate -U
!pip install datasets

Requirement already up-to-date: accelerate in /home/gautam/.local/lib/python3.8/site-packages (0.28.0)


In [1]:
from datasets import Dataset
import pandas as pd
from datasets import Audio
import gc


  from .autonotebook import tqdm as notebook_tqdm


In [2]:


## we will load the both of the data here.
train_df = pd.read_csv("/home/gautam/Documents/wspace/video_Search/summary/atrain.csv")
test_df = pd.read_csv("/home/gautam/Documents/wspace/video_Search/summary/stt_test_new.csv")

## we will rename the columns as "audio", "sentence".
train_df.columns = ["audio", "sentence"]
test_df.columns = ["audio", "sentence"]

## convert the pandas dataframes to dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

## convert the sample rate of every audio files using cast_column function
train_dataset = train_dataset.cast_column("audio", Audio(sampling_rate=16000))
test_dataset = test_dataset.cast_column("audio", Audio(sampling_rate=16000))

In [3]:
train_dataset


Dataset({
    features: ['audio', 'sentence'],
    num_rows: 14
})

In [4]:
test_dataset

Dataset({
    features: ['audio', 'sentence'],
    num_rows: 2
})

In [5]:
from transformers import WhisperFeatureExtractor


In [6]:
## import feature extractor

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")

## Load WhisperTokenizer
from transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="English", task="transcribe")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
from transformers import WhisperProcessor


In [8]:
## Combine To Create A WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="English", task="transcribe")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
def prepare_dataset(examples):
    # compute log-Mel input features from input audio array
    audio = examples["audio"]
    examples["input_features"] = feature_extractor(
        audio["array"], sampling_rate=16000).input_features[0]
    del examples["audio"]
    sentences = examples["sentence"]

    # encode target text to label ids
    examples["labels"] = tokenizer(sentences).input_ids
    del examples["sentence"]
    return examples

In [11]:
!pip show soundfile


Name: soundfile
Version: 0.12.1
Summary: An audio library based on libsndfile, CFFI and NumPy
Home-page: https://github.com/bastibe/python-soundfile
Author: Bastian Bechtold
Author-email: basti@bastibe.de
License: BSD 3-Clause License
Location: /home/gautam/.local/lib/python3.8/site-packages
Requires: cffi
Required-by: librosa


In [13]:
train_dataset = train_dataset.map(prepare_dataset, num_proc=1)


Map:   0%|          | 0/14 [00:00<?, ? examples/s]


RuntimeError: Decoding 'mp3' files requires system library 'libsndfile'>=1.1.0, You can try to update `soundfile` python library: `pip install "soundfile>=0.12.1"`. 

In [8]:
test_dataset = test_dataset.map(prepare_dataset, num_proc=1)

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [9]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")
        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]
        batch["labels"] = labels
        return batch

## lets initiate the data collator
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [10]:
!pip install jiwer
!pip install evaluate
import evaluate

metric = evaluate.load('wer')

Collecting jiwer
  Downloading jiwer-3.0.3-py3-none-any.whl (21 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Downloading rapidfuzz-3.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.0.3 rapidfuzz-3.7.0
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.1 responses-0.18.0


Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [11]:
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    wer = 100 * metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

In [12]:
# Load a Pre-Trained Checkpoint
from transformers import WhisperForConditionalGeneration
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

In [13]:
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

In [14]:
from transformers import Seq2SeqTrainer
from transformers import Seq2SeqTrainingArguments

In [15]:
training_args = Seq2SeqTrainingArguments(
   output_dir="./whisper-base-en6",
    per_device_train_batch_size=5,
    gradient_accumulation_steps=1,
    learning_rate=5e-5,
    warmup_steps=0,
    max_steps=-1,
    gradient_checkpointing=True,
    #fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=1,
    predict_with_generate=True,
    save_steps=0.2,
    # logging_steps=25,
    report_to=["tensorboard"],
    #load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False,
    save_total_limit=5
)

In [16]:

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [17]:

## start the model training
trainer.train()

`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss


Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


TrainOutput(global_step=6, training_loss=0.9963014125823975, metrics={'train_runtime': 79.6958, 'train_samples_per_second': 0.339, 'train_steps_per_second': 0.075, 'total_flos': 7791805808640000.0, 'train_loss': 0.9963014125823975, 'epoch': 3.0})

In [18]:
trainer.evaluate()

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


{'eval_loss': 0.5965615510940552,
 'eval_wer': 18.91891891891892,
 'eval_runtime': 4.0367,
 'eval_samples_per_second': 0.495,
 'eval_steps_per_second': 0.495,
 'epoch': 3.0}

In [19]:
trainer.predict(test_dataset).predictions


array([[50258, 50352, 50359, 50363,   286,   528,   281,  1884,   257,
         3090,   724, 20769,   304, 10852, 12249, 22962,   293,   286,
          528,   294,   452,  8963,   257,  3637,   295,   805,  3808,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
      

In [27]:
processor.save_pretrained('/content/whisper-base-en6/checkpoint-6')

[]

In [28]:
trainer.save_model()

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


In [30]:
from transformers import pipeline
whisper  = pipeline("automatic-speech-recognition",
                    "/content/whisper-base-en6/checkpoint-6"
                    )
transcription = whisper("/content/a10.mp3",
                        chunk_length_s=30)
print(transcription["text"])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


 please create a role open__small__3496 and assign it to user user__brain__3498 and add it to a dataset with entity find__simvol__2318
