Things to track:
- Pitch
- Content of speech ("um", "uhh", laughing)
- Response time (hesitation)
- How specific is the answer

Consider normalizing speech



In [2]:
import pyaudio
import wave
import keyboard

# Record mono 16-bit audio from the default input device until 'q' is
# pressed, then write everything that was captured to output.wav.

# Initialize pyaudio
audio = pyaudio.PyAudio()

# Define stream parameters
CHANNELS = 1
FORMAT = pyaudio.paInt16
RATE = 44100
CHUNK = 1024

frames = []

try:
    # Open stream
    stream = audio.open(format=FORMAT, channels=CHANNELS,
                        rate=RATE, input=True,
                        frames_per_buffer=CHUNK)
    try:
        print("Recording... Press 'q' to stop.")

        # Record audio until 'q' is pressed; each read returns CHUNK frames.
        while not keyboard.is_pressed('q'):
            frames.append(stream.read(CHUNK))

        print("Finished recording.")
    finally:
        # Stop and close the stream even if a read failed part-way through,
        # so the input device is not left open.
        stream.stop_stream()
        stream.close()

    # Bytes per sample for FORMAT; needed for the WAV header below.
    sample_width = audio.get_sample_size(FORMAT)
finally:
    audio.terminate()

# Save the recorded data to a file. The context manager closes the writer
# (and finalizes the header) even if writing raises.
with wave.open("output.wav", 'wb') as wf:
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(sample_width)
    wf.setframerate(RATE)
    wf.writeframes(b''.join(frames))


Recording... Press 'q' to stop.
Finished recording.


Testing Google's Cloud Speech-To-Text
https://cloud.google.com/speech-to-text/docs/transcribe-streaming-audio#speech-streaming-recognize-python

This transcribes streaming audio to text.

To install the library:
pip install --upgrade google-cloud-speech

Download the gcloud CLI: https://cloud.google.com/sdk/docs/install.

Configure the development environment. 
<img src="attachment:image.png" width="300"/>

Before running the `gcloud init` command, you first have to accept the Google Cloud terms of service at the link below:
https://console.cloud.google.com/terms/universal?pli=

You have to request access to use this, because it looks like one specific person "owns" the project. Other people can then be added directly to the project.

Info:
Project name: truthinators
Project number: 399310020607
Project ID: truthinators

You have to enable the Speech to Text API for the specific project + add billing :(

API Request limits: 
https://cloud.google.com/speech-to-text/quotas
For our purposes, we should stay well within the API limits (roughly 480 hours of audio per day and 900 requests per 60 seconds).







In [24]:

import pyaudio

# List every audio device that can record, together with the index that
# PyAudio uses to address it.
p = pyaudio.PyAudio()

for i in range(p.get_device_count()):
    info = p.get_device_info_by_index(i)
    # Skip devices that expose no input channels (output-only devices).
    if info["maxInputChannels"] <= 0:
        continue
    print(f"Index {i}: {info['name']}")

p.terminate()


Index 0: Microsoft Sound Mapper - Input
Index 1: Microphone Array (IntelÂ® Smart 
Index 2: Headset (IntelÂ® Smart Sound Tec
Index 6: Primary Sound Capture Driver
Index 7: Microphone Array (IntelÂ® Smart Sound Technology for Digital Microphones)
Index 8: Headset (IntelÂ® Smart Sound Technology for BluetoothÂ® Audio)
Index 14: Headset (IntelÂ® Smart Sound Technology for BluetoothÂ® Audio)
Index 15: Microphone Array (IntelÂ® Smart Sound Technology for Digital Microphones)
Index 18: Input (@System32\drivers\bthhfenum.sys,#2;%1 Hands-Free%0
;(Jabra Evolve2 75))
Index 20: Headset 1 (@System32\drivers\bthhfenum.sys,#2;%1 Hands-Free%0
;(Jabra Evolve2 75))
Index 21: Headset 2 (@System32\drivers\bthhfenum.sys,#2;%1 Hands-Free%0
;(Jabra Evolve2 75))
Index 24: Input (@System32\drivers\btha2dp.sys,#1;%1%0
;(Jabra Evolve2 75))
Index 26: Microphone (Mic in at front panel (black))
Index 27: Stereo Mix (Realtek HD Audio Stereo input)
Index 30: PC Speaker (Realtek HD Audio output with SST)
Index 33: PC 

In [None]:
import queue
import re
import sys
import wave
import threading
from google.cloud import speech
import pyaudio
import keyboard

# Audio recording parameters
# Sample rate in Hz. Used both to open the microphone stream and as
# sample_rate_hertz in the recognition config built in main().
RATE = 16000
# Frames per buffer: one tenth of a second of audio at RATE (100 ms).
CHUNK = int(RATE / 10)  # 100ms
# Sample format used for the input stream and for the saved WAV file.
FORMAT = pyaudio.paInt16
# Number of channels used for the input stream and for the saved WAV file.
CHANNELS = 1
# PyAudio index of the capture device. This value is machine-specific; the
# device-listing cell earlier in this notebook prints the available indices.
INPUT_DEVICE_INDEX = 2
# Language passed to the recognizer as language_code in main().
LANGUAGE_CODE = "en-US"
'''
Common language codes:
English: en-US
French: fr-FR
Spanish: es-ES
German: de-DE
Chinese (simplified): cmn-Hans-CN
Japanese: ja-JP
Italian: it-IT

Full list: https://cloud.google.com/speech-to-text/v2/docs/speech-to-text-supported-languages
'''

class MicrophoneStream:
    """Context manager that records from the microphone via a PyAudio callback.

    While the ``with`` block is active, every captured buffer is pushed onto
    an internal queue (consumed by :meth:`generator`) and also appended to
    ``audio_frames`` so the whole session can be written out with
    :meth:`save_audio`.

    Attributes:
        closed: True until ``__enter__`` succeeds and again after ``__exit__``.
        audio_frames: Raw byte buffers captured so far, in arrival order.
        should_stop: Set to True by callers to make :meth:`generator` stop
            yielding after the chunk it is currently waiting for.
    """

    def __init__(self, rate=RATE, chunk=CHUNK):
        """Store the stream settings; no audio device is opened yet.

        Args:
            rate: Sample rate in Hz used to open the stream and write the WAV.
            chunk: Frames per buffer requested from PyAudio.
        """
        self._rate = rate
        self._chunk = chunk
        self._buff = queue.Queue()
        self.closed = True
        self.audio_frames = []  # Store audio frames for saving
        self.should_stop = False  # Flag for keyboard interrupt

    def __enter__(self):
        """Open the input stream on INPUT_DEVICE_INDEX and start capturing."""
        self._audio_interface = pyaudio.PyAudio()
        try:
            self._audio_stream = self._audio_interface.open(
                format=FORMAT,
                channels=CHANNELS,
                rate=self._rate,
                input=True,
                frames_per_buffer=self._chunk,
                stream_callback=self._fill_buffer,
                input_device_index=INPUT_DEVICE_INDEX
            )
        except Exception:
            # __exit__ is not run when __enter__ raises, so release the
            # PyAudio instance here before propagating the error.
            self._audio_interface.terminate()
            raise

        self.closed = False
        return self

    def __exit__(self, type, value, traceback):
        """Stop capturing, unblock the generator and release PyAudio."""
        try:
            self._audio_stream.stop_stream()
            self._audio_stream.close()
        finally:
            # Run the rest of the teardown even if stopping the stream failed:
            # the None sentinel tells generator() to finish, and terminate()
            # releases the PyAudio instance.
            self.closed = True
            self._buff.put(None)
            self._audio_interface.terminate()

    def _getMicrophone(self):
        """Return the device name PyAudio reports for INPUT_DEVICE_INDEX."""
        info = self._audio_interface.get_device_info_by_index(INPUT_DEVICE_INDEX)
        return info["name"]

    def _fill_buffer(self, in_data, frame_count, time_info, status_flags):
        """Stream callback: queue the captured buffer and keep a copy.

        Returns ``(None, pyaudio.paContinue)`` so the stream keeps running.
        """
        self._buff.put(in_data)
        self.audio_frames.append(in_data)  # Save audio data for WAV file
        return None, pyaudio.paContinue

    def generator(self):
        """Yield captured audio as byte strings until stopped or closed.

        Each iteration blocks for at least one buffer, then drains whatever
        else is already queued so one yield carries all audio available at
        that moment. A ``None`` item in the queue ends the generator.
        """
        while not self.closed and not self.should_stop:
            chunk = self._buff.get()
            if chunk is None:
                return
            data = [chunk]

            # Drain the backlog without blocking.
            while True:
                try:
                    chunk = self._buff.get(block=False)
                except queue.Empty:
                    break
                if chunk is None:
                    return
                data.append(chunk)

            yield b"".join(data)

    def save_audio(self, filename):
        """Save the recorded audio to a WAV file.

        Args:
            filename: Path of the WAV file to create or overwrite.
        """
        # The context manager closes the file even if a write fails.
        with wave.open(filename, 'wb') as wf:
            wf.setnchannels(CHANNELS)
            wf.setsampwidth(self._audio_interface.get_sample_size(FORMAT))
            wf.setframerate(self._rate)
            wf.writeframes(b''.join(self.audio_frames))
        print(f"\nAudio saved to {filename}")

def check_for_quit(stream):
    """Wait for the 'q' key, then ask the stream to stop.

    main() runs this in a daemon thread so the wait does not hold up the
    recognition loop.

    Args:
        stream: Object whose ``should_stop`` attribute is set to True once
            the key press has been seen.
    """
    keyboard.wait('q')
    stream.should_stop = True
    print("\nStopped by keyboard ('q' pressed)")

def listen_print_save_loop(responses, stream, name):
    """Print streaming recognition results and save transcript and audio.

    Interim results are rewritten in place on one terminal line. Each final
    result is printed on its own line and written to
    ``{name}_transcript.txt``. The loop ends when the responses run out,
    when ``stream.should_stop`` is set, or when a final result contains the
    word "exit" or "quit". Afterwards the audio is saved to
    ``{name}_audio.wav`` via ``stream.save_audio``.

    Args:
        responses: Iterable of response objects exposing ``results``; the
            first result's first alternative supplies the transcript text.
        stream: Object with a ``should_stop`` flag and a
            ``save_audio(filename)`` method.
        name: Prefix used for the transcript and audio file names.

    Returns:
        All final transcripts joined by single spaces, including the
        utterance that triggered a voice stop (it is in the file as well).
    """
    stop_pattern = re.compile(r"\b(exit|quit)\b", re.I)
    num_chars_printed = 0
    final_parts = []

    with open(f'{name}_transcript.txt', "w", encoding="utf-8") as file:
        for response in responses:
            if stream.should_stop:
                break

            if not response.results:
                continue

            result = response.results[0]
            if not result.alternatives:
                continue

            transcript = result.alternatives[0].transcript
            # Padding that blanks out the tail of a longer interim line
            # previously drawn on the terminal. Display only.
            overwrite_chars = " " * (num_chars_printed - len(transcript))

            if not result.is_final:
                sys.stdout.write(transcript + overwrite_chars + "\r")
                sys.stdout.flush()
                num_chars_printed = len(transcript)
                continue

            print(transcript + overwrite_chars)
            # Write the text only: the padding is a terminal artifact and
            # would leave trailing spaces in the transcript file.
            file.write(transcript + "\n")
            final_parts.append(transcript)
            num_chars_printed = 0

            if stop_pattern.search(transcript):
                print("\nStopped by voice command ('exit' or 'quit' detected)")
                stream.should_stop = True
                break

    # Save audio file before exiting
    stream.save_audio(f"{name}_audio.wav")
    print(f"Transcript saved to {name}_transcript.txt")
    return " ".join(final_parts).strip()

def main():
    """Record from the microphone, stream it for transcription and save both.

    Prompts for a participant name, which becomes the prefix of the
    transcript and audio files. Streams microphone audio to the Speech
    client and hands the responses to listen_print_save_loop. If
    recognition fails or is interrupted, the audio captured so far is still
    written to ``{participant_name}_audio.wav``.
    """
    print("Enter participant name: ")
    participant_name = input().strip()

    print("Recording... Say 'exit' or 'quit' OR press 'q' to stop.")

    client = speech.SpeechClient()
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=LANGUAGE_CODE,
    )

    streaming_config = speech.StreamingRecognitionConfig(
        config=config, interim_results=True
    )

    with MicrophoneStream(RATE, CHUNK) as stream:
        print(f"Recording from {stream._getMicrophone()}...")
        # Start keyboard monitoring in a separate thread. It is a daemon so
        # that a still-pending key wait does not keep the process alive.
        keyboard_thread = threading.Thread(
            target=check_for_quit, args=(stream,), daemon=True
        )
        keyboard_thread.start()

        # Wrap each audio chunk in a request lazily, as it is captured.
        requests = (
            speech.StreamingRecognizeRequest(audio_content=content)
            for content in stream.generator()
        )

        try:
            responses = client.streaming_recognize(streaming_config, requests)
            listen_print_save_loop(responses, stream, participant_name)
        except KeyboardInterrupt:
            # Ctrl+C / kernel interrupt is not an Exception subclass; handle
            # it here so the recording is not lost.
            print("\nInterrupted; saving the audio captured so far.")
            stream.save_audio(f"{participant_name}_audio.wav")
        except Exception as e:
            print(f"\nAn error occurred: {e}")
            stream.save_audio(f"{participant_name}_audio.wav")

if __name__ == "__main__":
    main()

Enter participant name: 
Recording... Say 'exit' or 'quit' OR press 'q' to stop.
Recording from Headset (IntelÂ® Smart Sound Tec...
你好吗
我叫王晓阳我的朋友

Stopped by keyboard ('q' pressed)

Audio saved to test_audio.wav
Transcript saved to test_transcript.txt
