In [None]:
import torch
import torchaudio
# Fix the PyTorch RNG seed and pick the compute device up front, then report
# the library versions and the device that was selected.
torch.random.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(torch.__version__)
print(torchaudio.__version__)
print(device)

In [None]:
import IPython
import matplotlib.pyplot as plt
from torchaudio.utils import download_asset

# The clip is a local file, so reference it directly. `download_asset` expects
# a key relative to torchaudio's asset server rather than a filesystem path;
# passing an absolute local path only worked when the file already existed and
# otherwise attempted a download from a non-existent URL.
speech_file = "/content/0017_001731.wav"

In [None]:
# Select the pretrained wav2vec 2.0 ASR bundle and show the sample rate and
# label set it reports.
bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
print("sample rate:", bundle.sample_rate)
print("labels:", bundle.get_labels())

In [None]:
# Build the model from the bundle, move it to the selected device, and show
# which class was instantiated.
model = bundle.get_model()
model = model.to(device)
print(type(model))

In [None]:
# Render an inline audio player for the clip (last expression of the cell).
IPython.display.Audio(speech_file)

In [None]:
# Read the clip and place the samples on the same device as the model.
waveform, sample_rate = torchaudio.load(speech_file)
waveform = waveform.to(device)

# Resample only when the file's rate differs from the rate the bundle reports.
if sample_rate != bundle.sample_rate:
    waveform = torchaudio.functional.resample(
        waveform, orig_freq=sample_rate, new_freq=bundle.sample_rate
    )

In [None]:
# Inference only: collect the model's intermediate features for the waveform.
# The second return value is not needed here and is discarded.
with torch.inference_mode():
    features, _ = model.extract_features(waveform)

In [None]:
# One subplot per entry in `features`, stacked vertically.
# squeeze=False makes plt.subplots always return a 2-D array of Axes; taking
# the single column keeps `ax` a 1-D array even when there is only one entry,
# where the default would return a bare Axes and make `ax[i]` fail.
fig, ax = plt.subplots(
    len(features), 1, figsize=(16, 4.3 * len(features)), squeeze=False
)
ax = ax[:, 0]
for i, feats in enumerate(features):
    # Plot the first batch item; copy to CPU so matplotlib can read it.
    ax[i].imshow(feats[0].cpu(), interpolation="nearest")
    ax[i].set_title(f"Feature from transformer layer {i+1}")
    ax[i].set_xlabel("Feature dimension")
    ax[i].set_ylabel("Frame (time-axis)")
fig.tight_layout()

In [None]:
# Run the full model on the waveform without gradient tracking. Only the first
# return value (the emission consumed by the plotting and decoding cells) is
# kept; the second is discarded.
with torch.inference_mode():
    emission, _ = model(waveform)

In [None]:
# Heat-map of the emission for the first batch item, transposed so that frames
# run along the x-axis and classes along the y-axis.
plt.imshow(emission[0].cpu().T, interpolation="nearest")
plt.xlabel("Frame (time-axis)")
plt.ylabel("Class")
plt.title("Classification result")
plt.tight_layout()

# List the labels that the class axis refers to.
print("Class labels:", bundle.get_labels())

In [None]:
class GreedyCTCDecoder(torch.nn.Module):
    """Greedy CTC decoder: picks the best class per frame and joins the labels.

    Args:
        labels: Indexable collection mapping a class index to its output symbol.
        blank: Class index treated as the CTC blank token. Defaults to ``0``.
    """

    def __init__(self, labels, blank=0):
        super().__init__()
        self.labels = labels
        self.blank = blank

    def forward(self, emission: torch.Tensor) -> str:
        """Decode the first batch item of ``emission`` into a string.

        Args:
            emission: Tensor of shape ``(batch, frame, num_labels)``. Only the
                first item of the batch is decoded.

        Returns:
            The concatenated labels after collapsing consecutive repeats and
            dropping blank tokens.
        """
        # Highest-scoring class for every frame of the first batch item.
        best_path = emission[0].argmax(dim=-1)  # shape: (frame,)
        # Merge runs of identical consecutive predictions into one entry.
        collapsed = torch.unique_consecutive(best_path, dim=-1)
        # Drop blanks and map the remaining class indices to their symbols.
        kept = (idx for idx in collapsed.tolist() if idx != self.blank)
        return "".join(self.labels[idx] for idx in kept)

In [None]:
# Build a greedy decoder over the bundle's label set (default blank index) and
# decode the emission computed earlier into text.
decoder = GreedyCTCDecoder(labels=bundle.get_labels())
transcript = decoder(emission)

In [None]:
# Show the text produced by the greedy CTC decoder.
print(transcript)

In [None]:
import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

In [None]:
# Load the same clip with librosa, requesting a 16000 Hz sample rate, then
# show the decoded samples and the rate that was returned.
audio, rate = librosa.load("/content/0017_001731.wav", sr=16000)
print(audio)
print(rate)

In [None]:
# Load the Hugging Face tokenizer and CTC model from the
# "facebook/wav2vec2-base-960h" checkpoint.
# NOTE(review): recent transformers releases appear to deprecate
# Wav2Vec2Tokenizer in favour of Wav2Vec2Processor — confirm against the
# installed version before relying on it.
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
# This rebinds `model`, replacing the torchaudio model assigned earlier in the
# notebook; re-running the earlier torchaudio cells afterwards would pick up
# this object instead.
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

In [None]:
# Convert the raw samples into PyTorch model inputs ("pt" tensors).
input_values = tokenizer(audio, return_tensors="pt").input_values

In [None]:
# Pure inference: run the forward pass without autograd bookkeeping, in line
# with the other inference cells of this notebook that already use
# torch.inference_mode(). This avoids building a gradient graph that is never
# used.
with torch.inference_mode():
    logits = model(input_values).logits

In [None]:
# Index of the highest-scoring entry along the last dimension of the logits.
prediction = logits.argmax(dim=-1)

In [None]:
# Turn the predicted ids back into text; keep the first (only) batch entry and
# display it.
transcription = tokenizer.batch_decode(prediction)[0]
print(transcription)