In [None]:
# Install libraries
!pip install torch #pytorch
!pip install transformers #huggingface transformer
!pip install librosa #for managing audio files

In [None]:
# Import libraries
import torch
import librosa
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer #import Wav2Vec huggingface transformer
import os
import numpy as np
import librosa.display
import matplotlib
import pandas as pd

## Load Audio Data from Google Drive

In [None]:
# Make the Google Drive that holds the WAV files available to this runtime.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

In [None]:
# Collect the paths of the WAV files to transcribe with librosa's utility function.
# Note: files smaller than 100KB (roughly under 2 seconds) were excluded in a
# previous step, as librosa.load cannot resample them to 16000Hz.

# Directory where the WAV files are located; recurse=False means that
# sub-directories are not searched.
wav_dir = 'path-containing-wav-files'

# Look up the matching file paths and store them as a NumPy array.
files = np.asarray(librosa.util.find_files(wav_dir, ext='wav', recurse=False))

# Uncomment to inspect the result:
#len(files)
#files

In [None]:
# Read every WAV file with librosa.load, resampling to 16,000 Hz.
# Each call contributes one entry that later becomes a ('value', 'sr') row.
# The original author notes that librosa.load(path) without sr=16000 also works.
# NOTE(review): the model presumably expects 16 kHz input - confirm before dropping sr.
df = [librosa.load(wav_path, sr=16000) for wav_path in files]

# One row per file: the loaded audio in 'value' and its sampling rate in 'sr'.
librosa_values = pd.DataFrame(df, columns=['value', 'sr'])

# Uncomment to inspect the intermediate results:
#print(df)
#librosa_values
#value = librosa_values.value

In [None]:
# Load the pretrained Wav2Vec2 tokenizer and CTC model from Hugging Face.
# Both are created from the same checkpoint name;
# "facebook/wav2vec2-base-960h" is an alternative checkpoint.
# NOTE(review): recent transformers releases reportedly deprecate
# Wav2Vec2Tokenizer in favour of Wav2Vec2Processor - confirm against the
# installed version before upgrading.
checkpoint = "facebook/wav2vec2-large-960h-lv60-self"
tokenizer = Wav2Vec2Tokenizer.from_pretrained(checkpoint)
model = Wav2Vec2ForCTC.from_pretrained(checkpoint)


In [None]:
# Process the audio by passing each audio array through the tokenizer.
# return_tensors="pt" requests the result as PyTorch tensors.

# Tokenized inputs, one entry per row of 'librosa_values', in the same order.
input_values = []

# Loop through the audio values in 'librosa_values'; enumerate supplies the
# running index used for progress tracking.
for index, audio in enumerate(librosa_values["value"]):
    # Tokenize the audio and keep only the resulting input values.
    input_values.append(tokenizer(audio, return_tensors="pt").input_values)

    # Print the index to track progress.
    print(index)

# 'input_values' now contains the tokenized input in PyTorch format.

In [None]:
# If an out-of-RAM error occurs when executing the next code chunk, save the resulting tensors and data to files.
# You can then continue with the following code chunks on a local machine.

# Save the 'librosa_values' DataFrame to a pickle file named 'data.pkl'.
# ('data' itself is only created in the last cell, so it cannot be saved here.)
#librosa_values.to_pickle('data.pkl')

# Save 'input_values' (tensors) to a PyTorch file named 'input_values.pt'.
#torch.save(input_values, 'input_values.pt')

In [None]:
# Run the model on every tokenized input and collect the logits
# (non-normalized prediction values).

# Progress counter, printed once per processed input.
index = 0

# Logits are gathered here, one entry per element of 'input_values'.
logits = []

# No training happens here, so gradient calculation is switched off.
with torch.no_grad():
    for batch in input_values:
        # Forward pass through the ASR model; only the logits are kept.
        outputs = model(batch)
        logits.append(outputs.logits)

        # Report progress, then advance the counter.
        print(index)
        index += 1

# 'logits' now contains the non-normalized prediction values.

In [None]:
# Turn the logits (non-normalized prediction values) into predicted ids.
# torch.argmax is applied directly along the last dimension of each logits
# tensor. No softmax is computed: softmax does not change which entry is the
# largest, so the argmax of the raw logits selects the same ids.
prediction = [torch.argmax(clip_logits, dim=-1) for clip_logits in logits]

# 'prediction' now holds one tensor of predicted ids per entry of 'logits';
# these are decoded to text by the tokenizer in a later step.


In [None]:
# Convert the predicted values to text with the tokenizer.
# batch_decode is called once per prediction and only the first decoded
# string of its result is kept.
transcription = [
    tokenizer.batch_decode(predicted_ids)[0]
    for predicted_ids in prediction
]

# 'transcription' now contains the text transcriptions.


In [None]:
# Collect the results in a 'data' DataFrame: one row per audio file, holding
# the file path ('file') and the transcribed text ('transcription').
# 'files' and 'transcription' are in the same order because every step above
# processed the files sequentially.
# Note: the earlier version assigned to a plain list with a string key and
# read an undefined name 'strings', so it could not run.
data = pd.DataFrame({"file": files, "transcription": transcription})
data