# Simple Speech-To-Text Implementation

About:
This notebook takes a WAV audio file, converts it to a signed-integer PCM WAV file if it is not already in that format, and then transcribes the speech to text using `pocketsphinx`.

In [2]:
#!/usr/bin/env python3
"""
Recognize a single utterance from a WAV file.

Supporting other file types is left as an exercise to the reader.
"""

# MIT license (c) 2022, see LICENSE for more information.
# Author: David Huggins-Daines <dhdaines@gmail.com>

from pocketsphinx import Decoder
import argparse
import wave
import sox
import os

In [3]:
def ConvertToSignedIntPCM(file):
    """Re-encode an audio file at 16-bit depth and return the new file's path.

    The converted audio is written alongside the input as
    ``<name>_s16pcm<ext>`` (same directory, same extension).

    Args:
        file: Path of the audio file to convert.

    Returns:
        Path of the converted file.

    Raises:
        RuntimeError: If sox reports that the conversion did not succeed.
    """
    stem, extension = os.path.splitext(file)
    out_file = f"{stem}_s16pcm{extension}"

    # Build a transformer whose only job is to force a 16-bit output depth.
    tfm = sox.Transformer()
    tfm.convert(bitdepth=16)

    # Fail loudly: returning out_file after an unsuccessful build would hand
    # the caller a path to a file that was never written.
    if not tfm.build(file, out_file):
        raise RuntimeError(f"sox failed to convert '{file}' to '{out_file}'")

    # Report what was actually produced so the result can be checked by eye.
    encoding = sox.file_info.encoding(out_file)
    bitdepth = sox.file_info.bitdepth(out_file)
    print(f"{file} converted successfully\n'{out_file}' details: \nencoding: '{encoding}'\nbitdepth: '{bitdepth}'")

    return out_file

In [13]:
# Recording to transcribe.
audio_file = "/home/regal/devel/ws_cacti/src/hri_cacti_xr/data/go_right/30x_4mono_10hz/20240218_222152_go_right.wav"

# Re-encode only when the file is not already 16-bit; the converted copy
# then replaces the original path for the cells that follow.
needs_conversion = sox.file_info.bitdepth(audio_file) != 16
if needs_conversion:
    audio_file = ConvertToSignedIntPCM(audio_file)

In [14]:
# Show the sample rate, sample count, and encoding of the file being decoded.
for probe in (
    sox.file_info.sample_rate,
    sox.file_info.num_samples,
    sox.file_info.encoding,
):
    print(probe(audio_file))


16000.0
133440
Signed Integer PCM


In [15]:
# Decode the whole recording as a single utterance and print the transcript.
with wave.open(audio_file, "rb") as audio:
    # process_raw() is handed the sample bytes as-is, so the file must
    # already be 16-bit mono PCM; any other layout would be misread.
    if audio.getsampwidth() != 2 or audio.getnchannels() != 1:
        raise ValueError(
            f"expected 16-bit mono audio, got {8 * audio.getsampwidth()}-bit "
            f"audio with {audio.getnchannels()} channel(s): {audio_file}"
        )

    decoder = Decoder(samprate=audio.getframerate())

    # readframes() returns only the audio frames; reading the underlying
    # file object instead would also pull in any chunks stored after the
    # sample data.
    pcm = audio.readframes(audio.getnframes())

decoder.start_utt()
decoder.process_raw(pcm, full_utt=True)
decoder.end_utt()

# hyp() yields None when the decoder produced no hypothesis.
hypothesis = decoder.hyp()
print(hypothesis.hypstr if hypothesis is not None else "(no speech recognized)")

i love it the whole of but
