From 3f303d710a110e572b54120edf1896ea1e3ca376 Mon Sep 17 00:00:00 2001
From: Damien Murphy
Date: Mon, 15 Apr 2024 09:20:27 -0700
Subject: [PATCH 1/7] Handle is_final and endpointing together with utterance end + clean up the output so it's easier to follow

---
 examples/streaming/microphone/main.py | 37 ++++++++++++++++++++-------
 1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/examples/streaming/microphone/main.py b/examples/streaming/microphone/main.py
index 262ba713..2def7091 100644
--- a/examples/streaming/microphone/main.py
+++ b/examples/streaming/microphone/main.py
@@ -16,6 +16,7 @@
 
 load_dotenv()
 
+is_finals = []
 
 def main():
     try:
@@ -30,31 +31,43 @@ def main():
         dg_connection = deepgram.listen.live.v("1")
 
         def on_open(self, open, **kwargs):
-            print(f"\n\n{open}\n\n")
+            print(f"Deepgram Connection Open")
 
         def on_message(self, result, **kwargs):
+            global is_finals
             sentence = result.channel.alternatives[0].transcript
             if len(sentence) == 0:
                 return
-            print(f"speaker: {sentence}")
+            if result.is_final:
+                is_finals.append(sentence)
+                if result.speech_final:
+                    utterance = ' '.join(is_finals)
+                    print(f"Speech Final: {utterance}")
+                    is_finals = []
+            else:
+                print(f"Interim Results: {sentence}")
 
         def on_metadata(self, metadata, **kwargs):
-            print(f"\n\n{metadata}\n\n")
+            print(f"Deepgram Metadata: {metadata}")
 
         def on_speech_started(self, speech_started, **kwargs):
-            print(f"\n\n{speech_started}\n\n")
+            print(f"Speech Started")
 
         def on_utterance_end(self, utterance_end, **kwargs):
-            print(f"\n\n{utterance_end}\n\n")
+            global is_finals
+            if len(is_finals) > 0:
+                utterance = ' '.join(is_finals)
+                print(f"Utterance End: {utterance}")
+                is_finals = []
 
         def on_close(self, close, **kwargs):
-            print(f"\n\n{close}\n\n")
+            print(f"Deepgram Connection Closed")
 
         def on_error(self, error, **kwargs):
-            print(f"\n\n{error}\n\n")
+            print(f"Deepgram Handled Error: {error}")
 
         def on_unhandled(self, unhandled, **kwargs):
-            print(f"\n\n{unhandled}\n\n")
+            print(f"Deepgram Unhandled Error: {unhandled}")
 
         dg_connection.on(LiveTranscriptionEvents.Open, on_open)
         dg_connection.on(LiveTranscriptionEvents.Transcript, on_message)
@@ -67,8 +80,10 @@ def on_unhandled(self, unhandled, **kwargs):
 
         options: LiveOptions = LiveOptions(
             model="nova-2",
-            punctuate=True,
             language="en-US",
+            # Apply smart formatting to the output
+            smart_format=True,
+            # Raw audio format details
             encoding="linear16",
             channels=1,
             sample_rate=16000,
@@ -76,6 +91,10 @@ def on_unhandled(self, unhandled, **kwargs):
             interim_results=True,
             utterance_end_ms="1000",
             vad_events=True,
+            # Time in milliseconds of silence to wait for before finalizing speech
+            endpointing=300,
+            # Prevent waiting for additional numbers
+            # no_delay=True
        )
 
         print("\n\nPress Enter to stop recording...\n\n")

From b6d27c66aaa1a25edf9ef81a65e09ae58ac336a2 Mon Sep 17 00:00:00 2001
From: Damien Murphy
Date: Mon, 15 Apr 2024 09:25:23 -0700
Subject: [PATCH 2/7] Add some additional details for is_final and interim results

---
 examples/streaming/microphone/main.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/examples/streaming/microphone/main.py b/examples/streaming/microphone/main.py
index 2def7091..05548a7c 100644
--- a/examples/streaming/microphone/main.py
+++ b/examples/streaming/microphone/main.py
@@ -44,20 +44,24 @@ def on_message(self, result, **kwargs):
                     utterance = ' '.join(is_finals)
                     print(f"Speech Final: {utterance}")
                     is_finals = []
+                else:
+                    # These are useful if you need real-time captioning and want to update what the Interim Results produced
+                    print(f"Is Final: {sentence}")
Final: {sentence}") else: + # These are useful if you need real time captioning of what is being spoken print(f"Interim Results: {sentence}") def on_metadata(self, metadata, **kwargs): print(f"Deepgram Metadata: {metadata}") def on_speech_started(self, speech_started, **kwargs): - print(f"Speech Started") + print(f"Deepgram Speech Started") def on_utterance_end(self, utterance_end, **kwargs): global is_finals if len(is_finals) > 0: utterance = ' '.join(is_finals) - print(f"Utterance End: {utterance}") + print(f"Deepgram Utterance End: {utterance}") is_finals = [] def on_close(self, close, **kwargs): From ffa24038f2234bf538aab0778ecc2a0fcdc36d16 Mon Sep 17 00:00:00 2001 From: Damien Murphy Date: Mon, 15 Apr 2024 09:27:49 -0700 Subject: [PATCH 3/7] Add some more comments and a link --- examples/streaming/microphone/main.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/examples/streaming/microphone/main.py b/examples/streaming/microphone/main.py index 05548a7c..f4f30926 100644 --- a/examples/streaming/microphone/main.py +++ b/examples/streaming/microphone/main.py @@ -39,7 +39,12 @@ def on_message(self, result, **kwargs): if len(sentence) == 0: return if result.is_final: + # We need to collect these and concatenate them together when we get a speech_final=true + # See docs: https://developers.deepgram.com/docs/understand-endpointing-interim-results is_finals.append(sentence) + + # Speech Final means we have detected sufficent silence to consider this end of speech + # Speech final is the lowest latency result as it triggers as soon an the endpointing value has triggered if result.speech_final: utterance = ' '.join(is_finals) print(f"Speech Final: {utterance}") From d2e550ed9eaa15ba69d7c1daee7a60901aecf515 Mon Sep 17 00:00:00 2001 From: Damien Murphy Date: Mon, 15 Apr 2024 09:30:20 -0700 Subject: [PATCH 4/7] Remove error from unhandled --- examples/streaming/microphone/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/streaming/microphone/main.py b/examples/streaming/microphone/main.py index f4f30926..357e6d11 100644 --- a/examples/streaming/microphone/main.py +++ b/examples/streaming/microphone/main.py @@ -76,7 +76,7 @@ def on_error(self, error, **kwargs): print(f"Deepgram Handled Error: {error}") def on_unhandled(self, unhandled, **kwargs): - print(f"Deepgram Unhandled Error: {unhandled}") + print(f"Deepgram Unhandled Websocket Message: {unhandled}") dg_connection.on(LiveTranscriptionEvents.Open, on_open) dg_connection.on(LiveTranscriptionEvents.Transcript, on_message) From b48ba43c6201876f145877d2917ffcd6088d3525 Mon Sep 17 00:00:00 2001 From: Damien Murphy Date: Mon, 15 Apr 2024 10:15:52 -0700 Subject: [PATCH 5/7] Add no_delay --- examples/streaming/microphone/main.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/examples/streaming/microphone/main.py b/examples/streaming/microphone/main.py index 357e6d11..7fed22a8 100644 --- a/examples/streaming/microphone/main.py +++ b/examples/streaming/microphone/main.py @@ -101,13 +101,16 @@ def on_unhandled(self, unhandled, **kwargs): utterance_end_ms="1000", vad_events=True, # Time in milliseconds of silence to wait for before finalizing speech - endpointing=300, - # Prevent waiting for additional numbers - # no_delay=True + endpointing=300 ) + addons = { + # Prevent waiting for additional numbers + "no_delay": "true" + } + print("\n\nPress Enter to stop recording...\n\n") - if dg_connection.start(options) is False: + if dg_connection.start(options, addons=addons) is False: 
print("Failed to connect to Deepgram") return From 4acba3f794d238ab99d3ff08d3557ad4db40f7c7 Mon Sep 17 00:00:00 2001 From: Damien Murphy Date: Mon, 15 Apr 2024 10:21:59 -0700 Subject: [PATCH 6/7] Add same changes to the async mic example --- examples/streaming/async_microphone/main.py | 58 ++++++++++++++++----- 1 file changed, 45 insertions(+), 13 deletions(-) diff --git a/examples/streaming/async_microphone/main.py b/examples/streaming/async_microphone/main.py index 85c0db05..3d2b668d 100644 --- a/examples/streaming/async_microphone/main.py +++ b/examples/streaming/async_microphone/main.py @@ -18,6 +18,8 @@ load_dotenv() +# We will collect the is_final=true messages here so we can use them when the person finishes speaking +is_finals = [] async def main(): try: @@ -42,31 +44,52 @@ async def main(): dg_connection = deepgram.listen.asynclive.v("1") async def on_open(self, open, **kwargs): - print(f"\n\n{open}\n\n") + print(f"Deepgram Connection Open") async def on_message(self, result, **kwargs): + global is_finals sentence = result.channel.alternatives[0].transcript if len(sentence) == 0: return - print(f"speaker: {sentence}") + if result.is_final: + # We need to collect these and concatenate them together when we get a speech_final=true + # See docs: https://developers.deepgram.com/docs/understand-endpointing-interim-results + is_finals.append(sentence) + + # Speech Final means we have detected sufficent silence to consider this end of speech + # Speech final is the lowest latency result as it triggers as soon an the endpointing value has triggered + if result.speech_final: + utterance = ' '.join(is_finals) + print(f"Speech Final: {utterance}") + is_finals = [] + else: + # These are useful if you need real time captioning and update what the Interim Results produced + print(f"Is Final: {sentence}") + else: + # These are useful if you need real time captioning of what is being spoken + print(f"Interim Results: {sentence}") async def on_metadata(self, metadata, **kwargs): - print(f"\n\n{metadata}\n\n") + print(f"Deepgram Metadata: {metadata}") async def on_speech_started(self, speech_started, **kwargs): - print(f"\n\n{speech_started}\n\n") + print(f"Deepgram Speech Started") async def on_utterance_end(self, utterance_end, **kwargs): - print(f"\n\n{utterance_end}\n\n") + global is_finals + if len(is_finals) > 0: + utterance = ' '.join(is_finals) + print(f"Deepgram Utterance End: {utterance}") + is_finals = [] - def on_close(self, close, **kwargs): - print(f"\n\n{close}\n\n") + async def on_close(self, close, **kwargs): + print(f"Deepgram Connection Closed") - def on_error(self, error, **kwargs): - print(f"\n\n{error}\n\n") + async def on_error(self, error, **kwargs): + print(f"Deepgram Handled Error: {error}") - def on_unhandled(self, unhandled, **kwargs): - print(f"\n\n{unhandled}\n\n") + async def on_unhandled(self, unhandled, **kwargs): + print(f"Deepgram Unhandled Websocket Message: {unhandled}") dg_connection.on(LiveTranscriptionEvents.Open, on_open) dg_connection.on(LiveTranscriptionEvents.Transcript, on_message) @@ -80,8 +103,10 @@ def on_unhandled(self, unhandled, **kwargs): # connect to websocket options: LiveOptions = LiveOptions( model="nova-2", - punctuate=True, language="en-US", + # Apply smart formatting to the output + smart_format=True, + # Raw audio format deatils encoding="linear16", channels=1, sample_rate=16000, @@ -89,10 +114,17 @@ def on_unhandled(self, unhandled, **kwargs): interim_results=True, utterance_end_ms="1000", vad_events=True, + # Time in milliseconds of 
+            endpointing=300
         )
 
+        addons = {
+            # Prevent waiting for additional numbers
+            "no_delay": "true"
+        }
+
         print("\n\nStart talking! Press Ctrl+C to stop...\n")
-        if await dg_connection.start(options) is False:
+        if await dg_connection.start(options, addons=addons) is False:
             print("Failed to connect to Deepgram")
             return
 

From 10a8b2eea97655cef34c3bb7f92a20bc7607691e Mon Sep 17 00:00:00 2001
From: Damien Murphy
Date: Mon, 15 Apr 2024 10:22:28 -0700
Subject: [PATCH 7/7] add comment

---
 examples/streaming/microphone/main.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/streaming/microphone/main.py b/examples/streaming/microphone/main.py
index 7fed22a8..1e232d38 100644
--- a/examples/streaming/microphone/main.py
+++ b/examples/streaming/microphone/main.py
@@ -16,6 +16,7 @@
 
 load_dotenv()
 
+# We will collect the is_final=true messages here so we can use them when the person finishes speaking
 is_finals = []
 
 def main():