# Speech To Text

Speech to Text, also known as speech recognition, is a technology that converts spoken language into written text. By utilizing natural language processing algorithms and models, this technology enables the automatic transcription of spoken words into a readable text format.

In [2]:
import whisperx
import gc 
import pandas as pd
import torch
import warnings
import os, sys

from pydub import AudioSegment
from src.commons.common_tools import check_directories

warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
device = "cuda" 
compute_type = "float16" # change to "int8" if low on GPU mem (may reduce accuracy)
batch_size = 16 # reduce if low on GPU mem

parameters = {
    'path': os.path.dirname(sys.path[0]),
    'path_in': os.path.join(os.path.dirname(sys.path[0]), '01_data'),
    'path_out': os.path.join(os.path.dirname(sys.path[0]), '03_output'),
    'curated': os.path.join(os.path.dirname(sys.path[0]), '01_data', 'curated'),
    'matching': os.path.join(os.path.dirname(sys.path[0]), '01_data', 'matching_dbs'),
    'topic_modeling': os.path.join(os.path.dirname(sys.path[0]), '03_output', 'topic_modeling'),
    'speech': os.path.join(os.path.dirname(sys.path[0]), '03_output', 'speech_to_text'),
    'audios': os.path.join(sys.path[0], '01_data', 'audios_ccenter'),
    'speakers': os.path.join(sys.path[0], '03_output', 'speech_to_text', 'audios_speakers'),
    'transcription': os.path.join(sys.path[0], '03_output', 'speech_to_text', 'transcription')
}

In [4]:
def separateChannels(file, savePath):
    stereoAudio = AudioSegment.from_file(file, format="mp3")

    monoAudios = stereoAudio.split_to_mono()
    
    audioRightFile = "speaker_cliente.mp3"
    audioLeftFile = "speaker_asesor.mp3"

    monoAudios[0].export(os.path.join(savePath, audioLeftFile), format="mp3")
    monoAudios[1].export(os.path.join(savePath, audioRightFile), format="mp3")
    
    return os.path.join(savePath, audioLeftFile), os.path.join(savePath, audioRightFile)
    
def whisperTranscription(file):
    model = whisperx.load_model("large-v2", device, compute_type=compute_type)

    audio = whisperx.load_audio(file)
    result = model.transcribe(audio, batch_size=batch_size, language="es")

    gc.collect(); torch.cuda.empty_cache(); del model
    
    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
    result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
    return result

def filterModelMistakes(df):
    df = df.assign(T_DELTA=df.end-df.start)
    df = df.assign(WORD_LENGHT=df.text.str.split(' {1,}').str.len())
    df = df.assign(RATIO=df.WORD_LENGHT/df.T_DELTA)
    df = df.assign(VALID=df.RATIO<20)
    return df

def concatTranscript(asesor, cliente):
    transcriptCliente = pd.DataFrame.from_dict(cliente['segments'])
    transcriptCliente = transcriptCliente.assign(speaker='cliente')
    transcriptCliente[['start', 'end', 'text', 'speaker']]

    transcriptAsesor = pd.DataFrame.from_dict(asesor['segments'])
    transcriptAsesor = transcriptAsesor.assign(speaker='asesor')
    transcriptAsesor[['start', 'end', 'text', 'speaker']]

    concat = pd.concat([transcriptCliente, transcriptAsesor])[['start', 'end', 'text', 'speaker']].sort_values('start')

    return filterModelMistakes(concat)

def exportText(df, filePath, fileName):
    with open(os.path.join(filePath, fileName), 'w') as file:
        for i, row in df.iterrows():
            line = f"{row['speaker']}: {row['text']}\n"
            file.write(line)    

In [None]:
check_directories(parameters)

In [26]:
audioFile = os.path.join(parameters['audios'], "test_audio_1.mp3")
speakersPath = os.path.join(parameters['speakers'], "test_audio_1")

In [27]:
cliente, asesor = separateChannels(audioFile, speakersPath)

In [28]:
resultTranscriptCliente = whisperTranscription(cliente)
resultTranscriptAsesor = whisperTranscription(asesor)

Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.0.6. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint --file ../../root/.cache/torch/whisperx-vad-segmentation.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
Model was trained with pyannote.audio 0.0.1, yours is 2.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.0.1+cu117. Bad things might happen unless you revert torch to 1.x.


Downloading: "https://download.pytorch.org/torchaudio/models/wav2vec2_voxpopuli_base_10k_asr_es.pt" to /root/.cache/torch/hub/checkpoints/wav2vec2_voxpopuli_base_10k_asr_es.pt
100%|██████████| 360M/360M [00:01<00:00, 374MB/s] 
Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.0.6. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint --file ../../root/.cache/torch/whisperx-vad-segmentation.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
Model was trained with pyannote.audio 0.0.1, yours is 2.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.0.1+cu117. Bad things might happen unless you revert torch to 1.x.


In [34]:
df_transcript = concatTranscript(resultTranscriptCliente, resultTranscriptAsesor)
df_transcript

Unnamed: 0,start,end,text,speaker,T_DELTA,WORD_LENGHT,RATIO,VALID
0,1.783,12.61,A ver si tengo el número de Radical.,cliente,10.827,9,0.831255,True
0,4.946,8.207,"Muy buena tarde, bienvenido a las líneas del ...",asesor,3.261,12,3.679853,True
1,8.207,13.57,"Habla con Yorami Shapono, luego con el señor S...",asesor,5.363,9,1.678165,True
1,12.61,17.733,"Sí, señora.",cliente,5.123,2,0.390396,True
2,13.57,15.652,¿Cómo puede ayudarle el día de hoy?,asesor,2.082,7,3.362152,True
2,17.733,18.133,Gracias.,cliente,0.4,1,2.5,True
3,18.133,18.734,"Mira, un favor.",cliente,0.601,3,4.991681,True
4,18.734,26.199,Es que yo llamo el mes de mayo para que me hic...,cliente,7.465,25,3.348962,True
5,26.199,27.82,Porque a mí en general me pagan los 5.,cliente,1.621,9,5.552128,True
6,29.664,40.848,me hicieron todo el proceso de como media hor...,cliente,11.184,36,3.218884,True


In [38]:
exportText(df_transcript, parameters['transcription'], "transcript_1.txt")