Import required libraries

In [1]:
import os
import re
import librosa
import unicodedata
import numpy as np
import pandas as pd
from tqdm import tqdm

# Utility functions

## preprocess_audio_slr

Builds the path of an audio clip by joining `DATA_DIR` with the clip's base name and a `.wav` extension, loads the file at its native sampling rate, and returns the file path together with the clip's duration in seconds.

In [2]:
def preprocess_audio_slr(audio):
    """Locate a clip's .wav file under DATA_DIR and measure its duration.

    Args:
        audio: Clip file name as listed in the TSV (e.g. ``'clip_001.mp3'``).
            Its final extension, if any, is replaced with ``.wav``.

    Returns:
        Tuple ``(audio_file, duration)``: the path of the .wav file inside the
        module-level ``DATA_DIR`` and the clip's duration in seconds.
    """
    # splitext drops only the last extension, so a name with extra dots
    # (e.g. 'clip.v2.mp3') is not cut off at the first dot.
    stem = os.path.splitext(audio)[0]
    audio_file = os.path.join(DATA_DIR, stem + '.wav')
    # sr=None keeps the file's native sampling rate instead of resampling.
    y, sr = librosa.load(audio_file, sr=None)
    # Pass by keyword: librosa >= 0.10 no longer accepts these positionally.
    duration = librosa.get_duration(y=y, sr=sr)
    return audio_file, duration

## clean_text

Cleans the transcript text by removing punctuation, other unnecessary symbols, and English letters (the `+-_` range in the pattern also strips ASCII digits). Returns the cleaned text normalized to Unicode NFC form.

In [3]:
def clean_text(text):
    """Strip unwanted characters from a transcript and NFC-normalize the result.

    The text is split on single spaces, every character matched by the
    pattern below is deleted from each word, words that end up empty are
    dropped, and the survivors are re-joined with single spaces.

    Args:
        text: Raw transcript string.

    Returns:
        The cleaned transcript, normalized to Unicode NFC.
    """
    # Inside the character class `+-_` is a range (U+002B..U+005F), so besides
    # the listed symbols and ASCII letters it also removes ASCII digits and
    # the characters `[`, `\` and `]`.
    # NOTE(review): there appears to be an invisible zero-width character
    # between `.` and `?` in the pattern -- keep it intact when editing.
    unwanted = '[!.‌?/\\{|},><,,+-_=*(^&%$#@)|:;’"‘–”\'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ]'

    kept_words = []
    for word in text.split(' '):
        stripped = re.sub(unwanted, '', word)
        if stripped != '':
            kept_words.append(stripped)

    return unicodedata.normalize("NFC", ' '.join(kept_words))

## preprocess_slr

Reads a TSV file from the given path and takes the audio file names and transcript text from it. For each row it resolves the audio file's path and duration and cleans the transcript text, then writes the results to a new TSV file in the format used for TensorflowASR.


In [4]:
def preprocess_slr(df_path, out_path):
    """Convert a dataset TSV into a PATH/DURATION/TRANSCRIPT TSV.

    Reads ``df_path`` (tab-separated, UTF-8). For every row, the audio path
    and duration are obtained from ``preprocess_audio_slr`` and the sentence
    is cleaned with ``clean_text``. The rows are then written to ``out_path``
    under the header ``PATH<TAB>DURATION<TAB>TRANSCRIPT``, with the duration
    formatted to two decimals.

    Args:
        df_path: Path of the input TSV; it must have ``path`` and
            ``sentence`` columns.
        out_path: Path of the TSV to write (opened in ``"w"`` mode, so an
            existing file is overwritten).
    """
    df = pd.read_csv(df_path, sep='\t', encoding='utf-8')

    files = df['path'].astype('str').values.tolist()
    texts = df['sentence'].astype('str').values.tolist()

    print('Pre-processing!')

    transcripts = []

    # Loading each clip to measure its duration is the slow step, so this is
    # the loop that needs progress feedback.
    rows = zip(files, texts)
    for audio_name, sentence in tqdm(rows, total=len(files), desc="[Pre-processing]"):
        audio_file, duration = preprocess_audio_slr(audio_name)
        text = clean_text(sentence)
        transcripts.append(f"{audio_file}\t{duration:.2f}\t{text}\n")

    # Output is written only after every row has been processed, so a failure
    # on any clip leaves out_path untouched.
    with open(out_path, "w", encoding="utf-8") as out:
        out.write("PATH\tDURATION\tTRANSCRIPT\n")
        for line in tqdm(transcripts, desc="[Writing]"):
            out.write(line)

    print('Completed!')

Preprocess and prepare tsv files for new training and testing data

In [5]:
# Folder holding the .wav clips; preprocess_audio_slr reads this global.
DATA_DIR = 'clips/'

# Dataset TSV files to convert; each one is written out as 'new_<name>'.
tsvs = [
    'train.tsv',
    'test.tsv',
    'dev.tsv',
    'validated.tsv',
    'invalidated.tsv',
    'other.tsv',
]

for tsv in tsvs:
    preprocess_slr(tsv, f'new_{tsv}')

Pre-processing!


[Writing]: 100%|███████████████████████████████████████████████████████████████| 1896/1896 [00:00<00:00, 237683.08it/s]


Completed!
Pre-processing!


[Writing]: 100%|███████████████████████████████████████████████████████████████| 1638/1638 [00:00<00:00, 541904.87it/s]


Completed!
Pre-processing!


[Writing]: 100%|███████████████████████████████████████████████████████████████| 1668/1668 [00:00<00:00, 834657.49it/s]


Completed!
Pre-processing!


[Writing]: 100%|█████████████████████████████████████████████████████████████| 12163/12163 [00:00<00:00, 406464.18it/s]


Completed!
Pre-processing!


[Writing]: 100%|█████████████████████████████████████████████████████████████████| 538/538 [00:00<00:00, 134911.85it/s]


Completed!
Pre-processing!


[Writing]: 100%|███████████████████████████████████████████████████████████████| 5241/5241 [00:00<00:00, 374397.03it/s]

Completed!



