# main

In [1]:
import requests
import os
import openai
import speech_recognition as sr
from dotenv import load_dotenv
import time
import re

# setup
load_dotenv() # Load environment variables from .env file
openai.api_key = os.getenv("AZURE_OPENAI_KEY")
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
openai.api_type = 'azure'
openai.api_version = '2023-05-15'
BODY_URL = "http://localhost:5004"  # Assuming it runs locally

class NaoStream:
    def __init__(self, audio_generator):
        self.audio_generator = audio_generator

    def read(self, size=-1):  # added size parameter, default -1
        try:
            return next(self.audio_generator)
        except StopIteration:
            return b''

class NaoAudioSource(sr.AudioSource):
    def __init__(self, server_url='http://localhost:5004'): 
        self.server_url = server_url
        self.stream = None
        self.is_listening = False
        self.CHUNK = 1024
        self.SAMPLE_RATE = 16000
        self.SAMPLE_WIDTH = 2

    def __enter__(self): # this is called when using the "with" statement
        requests.post(f"{self.server_url}/start_listening")
        self.is_listening = True
        self.stream = NaoStream(self.audio_generator())  # wrap the generator
        return self # return object (self) to be used in the with statement

    def audio_generator(self): # generator function that continuously fetches audio chunks from the server as long as 'self.is_listening' is True
        while self.is_listening:
            response = requests.get(f"{self.server_url}/get_audio_chunk")
            yield response.content 
            # yield is used to return a value from a generator function, but unlike return, it doesn't terminate the function
            # instead, it suspends the function and saves its state for later resumption
            time.sleep(0.05)

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.is_listening = False
        requests.post(f"{self.server_url}/stop_listening")

def get_user_text():

    max_tries = 5
    for i in range(max_tries):
            
        # Record audio
        filename = "input.wav"
        start = time.time()
        print("Recording...")
        with NaoAudioSource() as source:
            recognizer = sr.Recognizer()
            source.pause_threshold = 1 # seconds of non-speaking audio before a phrase is considered complete
            audio = recognizer.listen(source, phrase_time_limit=None, timeout=None)
            with open(filename, "wb") as f:
                f.write(audio.get_wav_data())
        end = time.time()
        print(f"Recording took {end - start} seconds")

        # Transcribe audio to text
        try: 
            start = time.time()
            print("Transcribing...")
            text = recognizer.recognize_google(audio)
            end = time.time()
            print(f"Transcribing took {end - start} seconds")
            print("You said: " + text)
            return text
        except sr.UnknownValueError:
            print(f"Google Speech Recognition could not understand audio, retrying ({i+1}/{max_tries})")
    
    return "" # Return empty string if transcription failed after max_tries, GPT will then handle it

# posture command pattern
posture_command_pattern = r"\^\(posture: (\w+)\)"

# conversation loop ############################################################################################################################################################################

with open("system_prompt.txt", "r") as f:
    system_prompt = f.read() # Read system prompt from file

conversation_context = [{"role": "system", "content": system_prompt}] # Initialize conversation context with system prompt

running = True
counter = 0
while running:
    counter += 1
    user_message = get_user_text() # Get the user's message
    conversation_context.append({"role": "user", "content": user_message}) # Add the user's message to the conversation context

    # Process the received input with GPT-4
    start = time.time()
    response = openai.ChatCompletion.create(
        engine="NAO35",
        messages=conversation_context
    )
    end = time.time()
    print(f"{response.engine} took {end - start} seconds to respond")

    # Extract the GPT-4 response
    gpt4_message = response['choices'][0]['message']['content']
    print(f"Nao: {gpt4_message}")

    # Add the GPT-4 response to the conversation context
    conversation_context.append({"role": "assistant", "content": gpt4_message})
    
    # Write conversation context to file for easier debugging
    with open("conversation_context.txt", "w") as f:
        for entry in conversation_context:
            role = entry['role'].capitalize()  # Capitalize the role for formatting
            content = entry['content']
            f.write(f"{role}:\n{content}\n\n")

    # Define the pattern to match the whole command including the parentheses and caret
    posture_command_pattern_full = r"(\^\(posture: \w+\))"

    # Split the response into segments of talk and posture commands
    segments = re.split(posture_command_pattern_full, gpt4_message)

    for segment in segments:
        if re.match(posture_command_pattern_full, segment):
            # If the segment matches the posture command pattern, extract the command and send it
            posture_command = re.findall(r"\^\(posture: (\w+)\)", segment)[0]
            response = requests.post(f"{BODY_URL}/change_posture", json={"posture": posture_command})
            if response.json().get("success"):
                print(f"Posture changed to {posture_command}")
            else:
                print(f"Failed to change posture to {posture_command}")
        else:
            # If the segment is not a command, it's part of the talk message
            # Send the segment to be spoken by the robot, if it is not empty
            if segment.strip():  # Check if the segment is not just whitespace
                response = requests.post(f"{BODY_URL}/talk", json={"message": segment.strip()})
                if response.ok:  # Check if the request was successful
                    print(f"Speaking: {segment.strip()}")
                else:
                    print("Failed to send the talk command.")

Recording...
Recording took 5.398397922515869 seconds
Transcribing...
Transcribing took 1.1892950534820557 seconds
You said: I don't know please stand up
NAO35 took 0.40936803817749023 seconds to respond
Nao: ^(posture: Stand)
Posture changed to Stand
Recording...
Recording took 3.0684778690338135 seconds
Transcribing...
Google Speech Recognition could not understand audio, retrying (1/5)
Recording...
Recording took 2.154995918273926 seconds
Transcribing...
Transcribing took 0.9202220439910889 seconds
You said: repeat again
NAO35 took 0.21056675910949707 seconds to respond
Nao: ^(posture: Stand)
Posture changed to Stand
Recording...
Recording took 8.011915922164917 seconds
Transcribing...
Transcribing took 0.9687099456787109 seconds
You said: please say it again
NAO35 took 0.41015076637268066 seconds to respond
Nao: I apologize for the confusion. Here it is again: ^(posture: Stand)
Speaking: I apologize for the confusion. Here it is again:
Posture changed to Stand
Recording...
Recordin

ConnectionError: HTTPConnectionPool(host='localhost', port=5004): Max retries exceeded with url: /stop_listening (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fc92c710880>: Failed to establish a new connection: [Errno 61] Connection refused'))

## debugging

### trying noise reduction library
=> result was not good

In [3]:
import requests
import os
import openai
import speech_recognition as sr
from dotenv import load_dotenv
import time
import noisereduce as nr
import soundfile as sf
import librosa

In [4]:
def speech_to_text(filename):
    recognizer = sr.Recognizer()
    with sr.AudioFile(filename) as source:
        audio = recognizer.record(source)
    try:
        return recognizer.recognize_google(audio)
    except:
        print("Skipping unknown error")

In [5]:
audio_clip, sample_rate = librosa.load("input.wav", sr=16000) # open input.wav for noise reduction
reduced_noise = nr.reduce_noise(y=audio_clip, sr=sample_rate) # reduce noise
sf.write("input_noise_reduced.wav", reduced_noise, sample_rate) # write noise reduced audio to a new file

In [6]:
speech_to_text("input.wav")

"an apple is around edible fruit produced by an apple tree apple trees are cultivated worldwide and are the most likely grown species in the Guinness models the tree originated in Central Asia where it's wild ancestor models CA vercy is still found apple trees have been grown for thousands of years in Asia and Europe and were introduced to North America by European colonists apples have religious and mythological significance in many cultures including North Greek and European Christian traditional"

In [7]:
speech_to_text("input_noise_reduced.wav")

"edible fruit produced by an apple tree apple trees are cultivated for a flight not most likely grown species in the Game of Thrones the tree originated in Central Asia where it's about ancestors is still on at 3 7 5"

### trying adjust_for_ambient_noise
=> result was not good

In [8]:
import requests
import os
import openai
import speech_recognition as sr
from dotenv import load_dotenv
import time
import noisereduce as nr
import soundfile as sf
import random

In [9]:
def speech_to_text_with_ambient_noise(filename):
    recognizer = sr.Recognizer()
    with sr.AudioFile(filename) as source:
        recognizer.adjust_for_ambient_noise(source)
        recognizer.energy_threshold = 400  # Example value, tweak as needed
        audio = recognizer.record(source)
    try:
        return recognizer.recognize_google(audio)
    except:
        print("Skipping unknown error")

In [10]:
speech_to_text_with_ambient_noise("input.wav")

"edible fruit produced by an apple tree apple trees are cultivated worldwide and are the most likely grown species in the Guinness models the tree originated in Central Asia where it's wild ancestor models CA vercy is still found apple trees have been grown for thousands of years in Asia and Europe and were introduced to North America by European colonists apples have religious and mythological significance in many cultures including North Greek and European Christian traditional"

### trying spectral subtraction
=> result was not good at all, worst so far

In [11]:
import numpy as np
import librosa
import requests
import os
import openai
import speech_recognition as sr
from dotenv import load_dotenv
import time
import noisereduce as nr
import soundfile as sf

def spectral_subtraction(y, sample_rate, frame_size=2048, hop_length=512):
    # Compute the short-time Fourier transform
    D = librosa.stft(y, n_fft=frame_size, hop_length=hop_length)

    # Estimate the noise spectrum by averaging consecutive frames in silent or quiet parts of the audio
    # For simplicity, we're taking the mean of all the frames assuming the audio has more noise than speech.
    # This can be replaced by a more robust noise estimation technique if required.
    noise_spectrum = np.mean(np.abs(D), axis=1)

    # Subtract the noise magnitude from the noisy speech magnitude
    clean_mag = np.abs(D) - noise_spectrum[:, np.newaxis]
    clean_mag = np.maximum(clean_mag, 0)  # Ensure there are no negative values

    # Retain the phase from the original transform to reconstruct the time-domain signal
    clean_signal = clean_mag * np.exp(1j * np.angle(D))

    # Inverse STFT to convert back to time domain
    y_clean = librosa.istft(clean_signal, hop_length=hop_length)

    return y_clean

def speech_to_text(filename):
    recognizer = sr.Recognizer()
    with sr.AudioFile(filename) as source:
        audio = recognizer.record(source)
    try:
        return recognizer.recognize_google(audio)
    except:
        print("Skipping unknown error")

In [12]:
# Load your audio file
y, sample_rate = librosa.load("input.wav", sr=16000)

# Apply spectral subtraction
y_clean = spectral_subtraction(y, sample_rate)

# Save the cleaned audio
sf.write("input_spectral_subtraction.wav", y_clean, sample_rate)

In [13]:
speech_to_text("input_spectral_subtraction.wav")

"apple is around edible fruit reduce high and apple tree apple trees are cultivated worldwide and are the most likely grown species in the Guinness models the tree originated in Central Asia where it's valid ancestor calls Sierra C is still found apple trees have been grown for thousands of years in Asia and Europe and were introduced to North America by European colonists apples have religious and mythological significance in many cultures including Norse Greek and European Christian traditional"

# experimental stuff

## start audio, get chunk, stop audio

In [None]:
import requests
from io import BytesIO
import time
import numpy as np
import matplotlib.pyplot as plt


BODY_URL = "http://localhost:5003"  # Assuming it runs locally

In [None]:
# test audio start recording
requests.post(f"{BODY_URL}/start_listening")

In [None]:
# test get audio chunk
audio_response = requests.get(f"{BODY_URL}/get_audio_chunk")

In [None]:
# test audio stop recording
requests.post(f"{BODY_URL}/stop_listening")

In [None]:
print(audio_response)
print(audio_response.headers)
print(audio_response.encoding)
print(audio_response.elapsed)
print(audio_response.status_code)

# find out how long an audio chunk is
audio_chunk = BytesIO(audio_response.content)
print(len(audio_chunk.getvalue()))

## digital audio processing of audio from nao

### get the audio 

In [None]:
# Start listening on the NAO robot
requests.post(f"{BODY_URL}/start_listening")

is_listening = True
audio_chunks = []

for i in range(30):

    time.sleep(0.1)  # Sleep for a short duration before trying again

    # Request a chunk of audio from the NAO server
    audio_response = requests.get(f"{BODY_URL}/get_audio_chunk")
    
    # Use BytesIO to handle byte data directly
    audio_chunk = BytesIO(audio_response.content)
    audio_chunks.append(audio_chunk)

# Stop listening on the NAO robot
requests.post(f"{BODY_URL}/stop_listening")

# Concatenate all the audio chunks to make them one continuous audio stream
full_audio = BytesIO()

for chunk in audio_chunks:
    full_audio.write(chunk.getvalue())

### play the audio

In [None]:
import sounddevice as sd


# Convert the concatenated audio stream to a numpy array
audio_data = np.frombuffer(full_audio.getvalue(), dtype=np.int16)

# Play the audio
sample_rate = 16000  # Adjust this according to your audio's sample rate
sd.play(audio_data, samplerate=sample_rate)
sd.wait()  # Wait until audio playback is done

### visualize the audio

In [None]:
# Convert the audio bytes to a numpy array of 16-bit integers
audio_array = np.frombuffer(full_audio.getvalue(), dtype=np.int16)

# Plot the audio waveform
plt.figure(figsize=(14, 4))
plt.plot(audio_array)
plt.title('Audio Waveform')
plt.xlabel('Samples')
plt.ylabel('Amplitude')
plt.show()

## simple speech to text with microphone from laptop

In [None]:
import speech_recognition as sr

def speech_to_text(filename):
    recognizer = sr.Recognizer()
    with sr.AudioFile(filename) as source:
        audio = recognizer.record(source)
    try:
        return recognizer.recognize_google(audio)
    except:
        print("Skipping unknown error")

def main():
    while True:
        # Wait for user to say "Nao"
        print("Say 'Genius' to start recording...")
        with sr.Microphone() as source:
            recognizer = sr.Recognizer()
            audio = recognizer.listen(source)
            try:
                transcription = recognizer.recognize_google(audio)
                print("You said: " + transcription)
                if transcription.lower() == "genius":
                    # Record audio
                    filename = "input.wav"
                    print("Recording...")
                    with sr.Microphone() as source:
                        recognizer = sr.Recognizer()
                        source.pause_threshold = 1
                        audio = recognizer.listen(source, phrase_time_limit=None, timeout=None)
                        with open(filename, "wb") as f:
                            f.write(audio.get_wav_data())

                    # Transcribe audio to text
                    text = speech_to_text(filename)
                    if text:
                        print("You said: " + text)
                        response = "Hello, I am Nao."
                        print("Nao said: " + response)
            except Exception as e:
                print("An error occurred: " + str(e))


if __name__ == "__main__":
    main()

### even simpler

In [None]:
import speech_recognition as sr

def speech_to_text(filename):
    recognizer = sr.Recognizer()
    with sr.AudioFile(filename) as source:
        audio = recognizer.record(source)
    try:
        return recognizer.recognize_google(audio)
    except:
        print("Skipping unknown error")


while True:
    # Record audio
    filename = "input.wav"
    print("Recording...")
    with sr.Microphone() as source:
        recognizer = sr.Recognizer()
        source.pause_threshold = 1 # seconds of non-speaking audio before a phrase is considered complete
        audio = recognizer.listen(source, phrase_time_limit=None, timeout=None)
        with open(filename, "wb") as f:
            f.write(audio.get_wav_data())

    # Transcribe audio to text
    text = speech_to_text(filename)
    if text:
        print("You said: " + text)
        response = "Hello, I am Nao."
        print("Nao said: " + response)

## speech recognition with NAO microphone

### speech recognition 23.10.2023 first try
-> once got a successful transcription... but usually does not work...

In [None]:
import requests
from io import BytesIO
import time
import numpy as np
import speech_recognition as sr

BODY_URL = "http://localhost:5003"  # Assuming it runs locally

def recognize_speech_from_nao():
    # Start listening on the NAO robot
    requests.post(f"{BODY_URL}/start_listening")

    # Use the Recognizer class from speech_recognition package
    r = sr.Recognizer()

    # Accumulated audio buffer
    audio_buffer = BytesIO()

    SILENCE_THRESHOLD = 320
    SILENCE_LIMIT = 30
    silent_chunks = 0

    print("Listening...")

    while True:
        # Fetch an audio chunk from the NAO robot
        audio_response = requests.get(f"{BODY_URL}/get_audio_chunk")
        audio_chunk = audio_response.content

        # Append the chunk to our buffer
        audio_buffer.write(audio_chunk)

        # Detecting silence by checking the amplitude of the audio
        if max(audio_chunk) < SILENCE_THRESHOLD:
            silent_chunks += 1
        else:
            silent_chunks = 0

        # If enough silent chunks detected, assume end of speech
        if silent_chunks > SILENCE_LIMIT:
            break
        
        time.sleep(0.01)  # Sleep for a short duration before fetching next chunk

    # Convert the audio bytes into an AudioData object for the recognizer
    audio_data = sr.AudioData(audio_buffer.getvalue(), 16000, 2)

    print("Recognizing...")
    try:
        # Use Google's speech recognition
        text = r.recognize_google(audio_data)
        print("You said:", text)
    except sr.UnknownValueError:
        print("Google Speech Recognition could not understand audio")
    except sr.RequestError:
        print("Could not request results from Google Speech Recognition service")

    # Stop listening on the NAO robot
    requests.post(f"{BODY_URL}/stop_listening")

recognize_speech_from_nao()

### speech recognition 23.10 second try

In [None]:
import requests
from io import BytesIO
import speech_recognition as sr

BODY_URL = "http://localhost:5003"

def transcribe_realtime():
    r = sr.Recognizer()
    
    requests.post(f"{BODY_URL}/start_listening")

    while True:
        try:
            # Fetch audio data
            audio_response = requests.get(f"{BODY_URL}/get_audio_chunk")
            audio_chunk = BytesIO(audio_response.content)
            
            # Use the Recognizer instance to recognize the audio
            audio_data = sr.AudioData(audio_chunk.getvalue(), sample_rate=16000, sample_width=2)
            text = r.recognize_google(audio_data)
            print(text)  # You can process the transcribed text further or store it
            
        except sr.UnknownValueError:
            print("Could not understand audio")
        except sr.RequestError:
            print("API unavailable or unresponsive")
            break

    requests.post(f"{BODY_URL}/stop_listening")

transcribe_realtime()

### speech recognition 23.10 third try

In [None]:
import speech_recognition as sr
import requests
from io import BytesIO

BODY_URL = "http://localhost:5004"

class NaoAudioSource(sr.AudioSource):
    def __init__(self):
        self.pause_threshold = 4

    def __enter__(self):
        self.audio_chunks = []
        requests.post(f"{BODY_URL}/start_listening")
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        requests.post(f"{BODY_URL}/stop_listening")

    def listen(self, recognizer, timeout=None):
        start_time = time.time()
        while True:
            audio_response = requests.get(f"{BODY_URL}/get_audio_chunk")
            audio_chunk = BytesIO(audio_response.content)
            self.audio_chunks.append(audio_chunk)

            # Check for pause threshold to determine end of phrase
            if time.time() - start_time > self.pause_threshold:
                full_audio = BytesIO()
                for chunk in self.audio_chunks:
                    full_audio.write(chunk.getvalue())
                return sr.AudioData(full_audio.getvalue(), 16000, 2) # Assuming 16kHz rate, 16-bit depth

def speech_to_text(audio_data):
    recognizer = sr.Recognizer()
    try:
        return recognizer.recognize_google(audio_data)
    except:
        print("Skipping unknown error")

while True:
    # Record audio
    print("Recording...")
    with NaoAudioSource() as source:
        recognizer = sr.Recognizer()
        audio = source.listen(recognizer)

    # Transcribe audio to text
    text = speech_to_text(audio)
    if text:
        print("You said: " + text)
        response = "Hello, I am Nao."
        print("Nao said: " + response)


### speech recognition 23.10 fourth try

In [None]:
import requests
import speech_recognition as sr
from io import BytesIO

BODY_URL = "http://localhost:5004"

class NaoAudioSource(sr.AudioSource):
    CHUNK = 2048  # Adjust as needed
    SAMPLE_RATE = 16000  # As used in NAO code
    SAMPLE_WIDTH = 2  
    
    def __init__(self):
        self.stream = BytesIO()
    
    def __enter__(self):
        requests.post(f"{BODY_URL}/start_listening")
        return self
    
    def __exit__(self, exc_type, exc_value, traceback):
        requests.post(f"{BODY_URL}/stop_listening")

    def read(self, size=-1):
        audio_response = requests.get(f"{BODY_URL}/get_audio_chunk")
        audio_chunk = BytesIO(audio_response.content)
        self.stream.write(audio_chunk.getvalue())
        return audio_chunk.getvalue()

def speech_to_text(filename):
    recognizer = sr.Recognizer()
    with sr.AudioFile(filename) as source:
        audio = recognizer.record(source)
    try:
        return recognizer.recognize_google(audio)
    except Exception as e:
        print(f"Error: {e}")
        print("Skipping unknown error")

while True:
    # Record audio
    filename = "input.wav"
    print("Recording...")
    with NaoAudioSource() as source:
        recognizer = sr.Recognizer()
        source.pause_threshold = 3 # seconds of non-speaking audio before a phrase is considered complete
        audio = recognizer.listen(source, phrase_time_limit=None, timeout=None)
        with open(filename, "wb") as f:
            f.write(audio.get_wav_data())

    # Transcribe audio to text
    text = speech_to_text(filename)
    if text:
        print("You said: " + text)
        response = "Hello, I am Nao."
        print("Nao said: " + response)

### speech recognition 23.10 fifth try

In [14]:
import speech_recognition as sr
import requests

class NaoStream:
    def __init__(self, audio_generator):
        self.audio_generator = audio_generator

    def read(self, size=-1):  # added size parameter, default -1
        try:
            return next(self.audio_generator)
        except StopIteration:
            return b''

class NaoAudioSource(sr.AudioSource):
    def __init__(self, server_url='http://localhost:5004'):
        self.server_url = server_url
        self.stream = None
        self.is_listening = False
        self.CHUNK = 1024
        self.SAMPLE_RATE = 16000
        self.SAMPLE_WIDTH = 2

    def __enter__(self):
        requests.post(f"{self.server_url}/start_listening")
        self.is_listening = True
        self.stream = NaoStream(self.audio_generator())  # Wrap the generator
        return self

    def audio_generator(self):
        while self.is_listening:
            response = requests.get(f"{self.server_url}/get_audio_chunk")
            print(response.content)
            yield response.content

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.is_listening = False
        requests.post(f"{self.server_url}/stop_listening")

In [16]:
# Test the NaoAudioSource class
filename = "input.wav"
print("Recording...")
with NaoAudioSource() as source:
    recognizer = sr.Recognizer()
    source.pause_threshold = 1 # seconds of non-speaking audio before a phrase is considered complete
    audio = recognizer.listen(source, phrase_time_limit=None, timeout=None)
    with open(filename, "wb") as f:
        f.write(audio.get_wav_data())

# Transcribe audio to text
text = recognizer.recognize_google(audio)
print("You said: " + text)

Recording...
b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\x02\x00\xfd\xff\x03\x00\xfc\xff\x03\x00\xfe\xff\xfe\xff\x0c\x00]\xff\x00\xff9\xff;\xff?\xff\x1c\xff\x1a\xff3\xffB\xff\x85\xff\x9d\xffp\xff\x9c\xff\xdb\xff\xc0\xff\xbd\xff\xc9\xff\xbd\xff\x9b\xffP\xff\r\xff\x06\xff:\xffO\xfff\xffg\xff8\xffo\xff\xe4\xff\r\x00\x0e\x00&\x009\x00:\x00,\x00\x1d\x00+\x00\x16\x00\xea\xff\xdc\xff\xc8\xff\xb2\xff\xa2\xff\xc1\xff\xe2\xff\xe6\xff\xf7\xff\r\x00\x14\x00#\x003\x00#\x00-\x006\x00^\x00\x8d\x00\x82\x00\xaa\x00\xbf\x00\xab\x00\x97\x00\x89\x00\x8e\x00d\x00^\x00r\x00H\x00E\x00i\x00\x8a\x00\xc2\x00\xf3\x00$\x01T\x01)\x01\x11\x01O\x01#\x01\x08\x01\x1f\x01\xd9\x00\x95\x00m\x00r\x00\x92\x00\x91\x00\xa5\x00\xc9\x00\xd2\x00\xed\x00\x16\x01\x1e\x01\xf3\x00\xce\x00\xda\x00\xfa\x00\x13\x01\x07\x01\xf3\x00\xfb\x00\xfa\x00\xc9\x00\xa1\x00\x89\x00\x88\x00\x8f\x00q\x00A\x00C\x00\x87\x00\xa1\x00\x97\x00I\x00\x0e\x00T\x00Q\x00E\x00G\x00\xf2\xff\xde\xff\xb8\xff\x80\xffp\xff9\xff@\xffQ\xff8\xff

In [None]:
with NaoAudioSource() as source:
    recognizer = sr.Recognizer()
    audio = recognizer.listen(source, phrase_time_limit=None, timeout=None)
    with open(filename, "wb") as f:
        f.write(audio.get_wav_data())