In [None]:
# Install Nvidia NeMo toolkit and its dependencies, including all optional components.
!pip install nemo_toolkit['all']

# Import NeMo's core package.
import nemo

# Import NeMo's ASR collection, which includes complete ASR models and building blocks.
import nemo.collections.asr as nemo_asr

In [None]:
# Import other necessary libraries for data manipulation and audio file management.
import numpy as np
import pandas as pd
import librosa

## Import Data

In [None]:
# Mount Google Drive at /content/drive so the WAV files stored there can be reached
# through ordinary file paths.
# NOTE(review): this relies on the `google.colab` package, so it is expected to run
# only inside Google Colab — confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Collect the WAV files that will be processed.
# Note: files smaller than 100KB (roughly under 2 seconds of audio) were removed in an
# earlier step, because librosa.load cannot resample them to 16000Hz.

# List the '.wav' files in the source directory (recurse=False is passed to find_files)
# and keep the resulting paths as a NumPy array.
files = np.asarray(
    librosa.util.find_files('path-containing-wav-files', ext='wav', recurse=False)
)

# For a quick check, inspect `len(files)` or `files` here.

In [None]:
# Report the sample rate of every WAV file, one "<path>: <rate> Hz" line per file.
# `map` is lazy, so each file is still probed right before its line is printed.

for path, rate in zip(files, map(librosa.get_samplerate, files)):
    print(f'{path}: {rate} Hz')

In [None]:
# Convert every listed file to 16000 Hz, overwriting each WAV file at its existing path.

from scipy.io import wavfile

# Target sample rate in Hz.
new_sample_rate = 16000


def _resample_in_place(wav_path, target_rate):
    """Load one WAV file, resample it to ``target_rate`` and write it back to ``wav_path``."""
    # sr=None is passed so the rate returned by librosa.load is used as the
    # original rate for the resampling step below.
    samples, native_rate = librosa.load(wav_path, sr=None)
    resampled = librosa.resample(samples, orig_sr=native_rate, target_sr=target_rate)
    # The output path is the input path, so the source file is replaced.
    wavfile.write(wav_path, target_rate, resampled)


for filepath in files:
    _resample_in_place(filepath, new_sample_rate)

In [None]:
# Re-list the WAV files now that they have been resampled.
# `recurse=False` is not passed here (unlike the first listing), so librosa's default
# recursion setting applies.
files_split1 = librosa.util.find_files('path-containing-resampled-wav-files', ext='wav')
files = np.asarray(files_split1)

# Show how many files were found (2993 when this notebook was last run).
len(files)

2993

In [None]:
 # Spot-check one resampled file: read the sample rate of the entry at index 1 only.
librosa.get_samplerate(files[1])

# Expected output: 16000

16000

## Import Model

In [None]:
# Load the pretrained ASR (Automatic Speech Recognition) model used for transcription.
# The checkpoint is selected by name ("stt_en_conformer_transducer_xxlarge") and loaded
# through NeMo's EncDecRNNTBPEModel class.
# NOTE(review): from_pretrained presumably downloads the checkpoint on first use — confirm.
asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(model_name="stt_en_conformer_transducer_xxlarge")

## Transcription

In [None]:
# Use "%%capture" to capture and suppress the output of the following cell.
%%capture

# Initialize an empty list 'transcriptions_single' to store single transcriptions.
transcriptions = []

# Iterate through the list of transcriptions and extract the second element (transcribed text) from each.
for i in files:
  x = asr_model.transcribe([i])
  transcriptions.append(x)

In [None]:
# Show how many transcription results were collected (2993 when this notebook was last run).
len(transcriptions)

2993

In [None]:
# Keep only the element at index 1 of each raw transcription result.
# NOTE(review): this assumes every result is a sequence with at least two items,
# e.g. a (best hypotheses, all hypotheses) pair — confirm against the installed NeMo version.
transcriptions_single = [result[1] for result in transcriptions]

In [None]:
# Build the 'data' DataFrame holding the transcribed text in a 'transcription' column.
# `data` has to be a DataFrame: with a plain list (`data = []`), the assignment
# `data["transcription"] = ...` raises TypeError because lists cannot be indexed by string.
data = pd.DataFrame({"transcription": transcriptions_single})
data

# To look at the transcription stored for a specific sample, you can use:
# data["transcription"].iloc[0]