# Speech To Text

Speech to Text, also known as speech recognition, is a technology that converts spoken language into written text. By utilizing natural language processing algorithms and models, this technology enables the automatic transcription of spoken words into a readable text format.

In [3]:
import whisper
import gc 
import pandas as pd
import torch
import warnings
import os, sys
import json

from pydub import AudioSegment
from src.commons.common_tools import check_directories, parameters

warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
compute_type = "float16" # change to "int8" if low on GPU mem (may reduce accuracy)
batch_size = 16 # reduce if low on GPU mem

In [5]:
def separateChannels(file, savePath):
    """
    Separate a stereo audio file into two mono channels and export as MP3 files.

    Parameters
    ----------
    file : str
        Path to the stereo audio file.
    savePath : str
        Path to the directory where separated mono audio files will be saved.

    Returns
    -------
    str, str
        Paths to the separated mono audio files.
    """
    stereoAudio = AudioSegment.from_file(file, format="mp3")

    monoAudios = stereoAudio.split_to_mono()
    
    audioRightFile = f"cliente.mp3"
    audioLeftFile = f"asesor.mp3"
    
    if not os.path.exists(savePath):
        os.mkdir(savePath)
        print(f"Se ha creado el directorio '{savePath}'")
    else:
        print(f"El directorio '{savePath}' ya existe")

    monoAudios[0].export(os.path.join(savePath, audioLeftFile), format="mp3")
    monoAudios[1].export(os.path.join(savePath, audioRightFile), format="mp3")
    
    return os.path.join(savePath, audioLeftFile), os.path.join(savePath, audioRightFile)
    
def whisperTranscription(file):
    """
    Perform speech transcription using the Whisper model.
    
    Parameters
    ----------
    file : str
        Path to the audio file for transcription.
    
    Returns
    -------
    dict
        Transcription results including segments, start times, end times, and transcribed text.
    """
    model = whisper.load_model("large")
    
    audio = whisper.load_audio(file)
    result = model.transcribe(
        audio,
        language ="Spanish",
        word_timestamps=True
    )
    gc.collect(); torch.cuda.empty_cache()
    return result

def filterModelMistakes(df):
    """
    Filter transcribed segments based on text length and duration.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing transcribed segments with timing and text information.

    Returns
    -------
    pandas.DataFrame
        DataFrame with additional columns for timing analysis and validity checks.
    """
    df = df.assign(T_DELTA=df.end-df.start)
    df = df.assign(WORD_LENGHT=df.text.str.split(' {1,}').str.len())
    df = df.assign(RATIO=df.WORD_LENGHT/df.T_DELTA)
    df = df.assign(VALID=df.RATIO<20)
    return df

def concatTranscript(asesor, cliente):
    """
    Concatenates and processes transcriptions of advisor and client.

    Parameters
    ----------
    asesor : dict
        Transcription of the advisor.
    cliente : dict
        Transcription of the client.

    Returns
    -------
    pandas.DataFrame
        Concatenated and processed transcriptions.
    """
    transcriptCliente = pd.DataFrame.from_dict(cliente['segments'])
    transcriptCliente = transcriptCliente.assign(speaker='cliente')
    transcriptCliente[['start', 'end', 'text', 'speaker']]

    transcriptAsesor = pd.DataFrame.from_dict(asesor['segments'])
    transcriptAsesor = transcriptAsesor.assign(speaker='asesor')
    transcriptAsesor[['start', 'end', 'text', 'speaker']]

    concat = pd.concat([transcriptCliente, transcriptAsesor])[['start', 'end', 'text', 'speaker']].sort_values('start')

    return filterModelMistakes(concat)

def exportText(df, filePath, fileName):
    """
    Export transcribed text to a text file.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing transcribed segments with speaker and text information.
    filePath : str
        Path to the directory where the text file will be saved.
    fileName : str
        Name of the text file.

    Returns
    -------
    None
    """
    with open(os.path.join(filePath, fileName), 'w') as file:
        for i, row in df.iterrows():
            line = f"{row['speaker']}: {row['text']}\n"
            file.write(line)    

In [6]:
check_directories(parameters)
with open(os.path.join(parameters['parametric'], 'speech2text.json'), 'r') as f:
        parametric = json.loads(f.read())

In [7]:
nameFolder = '.'.join(parametric['audio_file'].split('.')[:-1])

audioFile = os.path.join(parameters['audios'], parametric['audio_file'])

speakersPath = os.path.join(parameters['speakers'], nameFolder)

In [9]:
cliente, asesor = separateChannels(audioFile, speakersPath)

El directorio '/notebooks/03_output/speech_to_text/audios_speakers/vys_01TUP21LACA578J11C4H5B5AES1AV3TM_2023-07-07_19-53-36 Chacon Ramirez Jurany' ya existe


In [10]:
resultTranscriptCliente = whisperTranscription(cliente)
resultTranscriptAsesor = whisperTranscription(asesor)

100%|█████████████████████████████████████| 2.87G/2.87G [01:43<00:00, 29.8MiB/s]


In [11]:
df_transcript = concatTranscript(resultTranscriptCliente, resultTranscriptAsesor)
df_transcript

Unnamed: 0,start,end,text,speaker,T_DELTA,WORD_LENGHT,RATIO,VALID
0,0.76,2.68,"Gracias, si tengo el número de Radical.",cliente,1.92,8,4.166667,True
0,4.58,9.36,"Muy buena tarde, bienvenido a la línea del Ba...",asesor,4.78,16,3.34728,True
1,6.38,7.86,¿Radical es el que ha dedicado?,cliente,1.48,7,4.72973,True
2,8.2,9.12,¿Es el que ha dedicado?,cliente,0.92,6,6.521739,True
1,9.4,16.62,luego con el señor Sergio. ¿Vos saludarlo con...,asesor,7.22,16,2.216066,True
3,11.66,12.1,¿Es el que ha dedicado?,cliente,0.44,6,13.636364,True
4,12.1,12.64,"Sí, señora.",cliente,0.54,3,5.555556,True
5,13.9,14.52,¿Cuál es?,cliente,0.62,3,4.83871,True
2,16.62,17.12,¿Sí?,asesor,0.5,2,4.0,True
6,17.52,26.1,"Gracias, mire, un favor, es que yo llamo el m...",cliente,8.58,30,3.496503,True


In [38]:
exportText(df_transcript, parameters['transcription'], parametric['transcript_file'])