deepgram · davidvonthenen · Jan 26, 2024 · Jan 18, 2024
@@ -20,6 +20,7 @@
 from .client import (
     LiveResultResponse,
     MetadataResponse,
+    SpeechStartedResponse,
     UtteranceEndResponse,
     ErrorResponse,
 )

@@ -21,6 +21,7 @@
 from .clients import (
     LiveResultResponse,
     MetadataResponse,
+    SpeechStartedResponse,
     UtteranceEndResponse,
     ErrorResponse,
 )

@@ -13,6 +13,7 @@
 from .live import (
     LiveResultResponse,
     MetadataResponse,
+    SpeechStartedResponse,
     UtteranceEndResponse,
     ErrorResponse,
 )

@@ -36,6 +36,7 @@
 from .live import (
     LiveResultResponse,
     MetadataResponse,
+    SpeechStartedResponse,
     UtteranceEndResponse,
     ErrorResponse,
 )

@@ -9,6 +9,7 @@
 from .client import (
     LiveResultResponse,
     MetadataResponse,
+    SpeechStartedResponse,
     UtteranceEndResponse,
     ErrorResponse,
 )

@@ -9,6 +9,7 @@
 from .v1.response import (
     LiveResultResponse as LiveResultResponseLatest,
     MetadataResponse as MetadataResponseLatest,
+    SpeechStartedResponse as SpeechStartedResponseLatest,
     UtteranceEndResponse as UtteranceEndResponseLatest,
     ErrorResponse as ErrorResponseLatest,
 )
@@ -45,6 +46,14 @@ class MetadataResponse(MetadataResponseLatest):
     pass
 
 
+class SpeechStartedResponse(SpeechStartedResponseLatest):
+    """
+    pass through for SpeechStartedResponse based on API version
+    """
+
+    # Adds no members of its own: the class only re-exposes the
+    # SpeechStartedResponse imported from .v1.response (aliased as
+    # SpeechStartedResponseLatest) under an unversioned name.
+    pass
+
+
 class UtteranceEndResponse(UtteranceEndResponseLatest):
     """
     pass through for UtteranceEndResponse based on API version

@@ -15,5 +15,6 @@ class LiveTranscriptionEvents(Enum):
     Transcript = "Results"
     Metadata = "Metadata"
     UtteranceEnd = "UtteranceEnd"
+    SpeechStarted = "SpeechStarted"
     Error = "Error"
     Warning = "Warning"
@@ -9,6 +9,7 @@
 from .response import (
     LiveResultResponse,
     MetadataResponse,
+    SpeechStartedResponse,
     UtteranceEndResponse,
     ErrorResponse,
 )
@@ -14,6 +14,7 @@
 from .response import (
     LiveResultResponse,
     MetadataResponse,
+    SpeechStartedResponse,
     UtteranceEndResponse,
     ErrorResponse,
 )
@@ -127,6 +128,19 @@ async def _start(self) -> None:
                             metadata=result,
                             **dict(self.kwargs),
                         )
+                    case LiveTranscriptionEvents.SpeechStarted.value:
+                        self.logger.debug(
+                            "response_type: %s, data: %s", response_type, data
+                        )
+                        result = SpeechStartedResponse.from_json(message)
+                        if result is None:
+                            self.logger.error("SpeechStartedResponse.from_json is None")
+                            continue
+                        await self._emit(
+                            LiveTranscriptionEvents.SpeechStarted,
+                            speech_started=result,
+                            **dict(self.kwargs),
+                        )
                     case LiveTranscriptionEvents.UtteranceEnd.value:
                         self.logger.debug(
                             "response_type: %s, data: %s", response_type, data

@@ -16,6 +16,7 @@
 from .response import (
     LiveResultResponse,
     MetadataResponse,
+    SpeechStartedResponse,
     UtteranceEndResponse,
     ErrorResponse,
 )
@@ -159,6 +160,19 @@ def _listening(self) -> None:
                             metadata=result,
                             **dict(self.kwargs),
                         )
+                    case LiveTranscriptionEvents.SpeechStarted.value:
+                        self.logger.debug(
+                            "response_type: %s, data: %s", response_type, data
+                        )
+                        result = SpeechStartedResponse.from_json(message)
+                        if result is None:
+                            self.logger.error("SpeechStartedResponse.from_json is None")
+                            continue
+                        self._emit(
+                            LiveTranscriptionEvents.SpeechStarted,
+                            speech_started=result,
+                            **dict(self.kwargs),
+                        )
                     case LiveTranscriptionEvents.UtteranceEnd.value:
                         self.logger.debug(
                             "response_type: %s, data: %s", response_type, data

@@ -44,6 +44,7 @@ class LiveOptions:
     tag: Optional[list] = None
     tier: Optional[str] = None
     utterance_end_ms: Optional[str] = None
+    vad_events: Optional[bool] = None
     version: Optional[str] = None
 
     def __getitem__(self, key):

@@ -161,6 +161,25 @@ def __getitem__(self, key):
         return _dict[key]
 
 
+# Speech Started Message
+
+
+@dataclass_json
+@dataclass
+class SpeechStartedResponse:
+    """
+    SpeechStartedResponse Message from the Deepgram Platform
+    """
+
+    # Every field is optional and has a default, so the class can be
+    # instantiated with no arguments.
+    # NOTE(review): the field meanings below are inferred from their names
+    # (message type tag, channel indices, time of the detected speech) -
+    # confirm against the Deepgram streaming API reference.
+    type: Optional[str] = ""
+    channel: Optional[List[int]] = None
+    timestamp: Optional[float] = 0
+
+    def __getitem__(self, key):
+        # Dict-style access (e.g. response["timestamp"]): convert the
+        # instance with to_dict() and index the result by key.
+        # to_dict() is presumably supplied by @dataclass_json - verify.
+        _dict = self.to_dict()
+        return _dict[key]
+
+
 # Utterance End Message
 
 

@@ -39,6 +39,9 @@ def on_message(self, result, **kwargs):
         def on_metadata(self, metadata, **kwargs):
             print(f"\n\n{metadata}\n\n")
 
+        def on_speech_started(self, speech_started, **kwargs):
+            # Callback registered below for LiveTranscriptionEvents.SpeechStarted;
+            # prints the received payload with blank lines before and after.
+            print(f"\n\n{speech_started}\n\n")
+
         def on_utterance_end(self, utterance_end, **kwargs):
             print(f"\n\n{utterance_end}\n\n")
 
@@ -47,11 +50,12 @@ def on_error(self, error, **kwargs):
 
         liveClient.on(LiveTranscriptionEvents.Transcript, on_message)
         liveClient.on(LiveTranscriptionEvents.Metadata, on_metadata)
+        liveClient.on(LiveTranscriptionEvents.SpeechStarted, on_speech_started)
         liveClient.on(LiveTranscriptionEvents.UtteranceEnd, on_utterance_end)
         liveClient.on(LiveTranscriptionEvents.Error, on_error)
 
         # connect to websocket
-        options = LiveOptions(model="nova", interim_results=False, language="en-US")
+        options = LiveOptions(model="nova-2", language="en-US")
         liveClient.start(options)
 
         lock_exit = threading.Lock()

@@ -14,6 +14,7 @@
     Microphone,
     LiveResultResponse,
     MetadataResponse,
+    SpeechStartedResponse,
     UtteranceEndResponse,
     ErrorResponse,
 )
@@ -27,6 +28,7 @@ def __init__(self, config: LiveClient):
         super().__init__(config)
         super().on(LiveTranscriptionEvents.Transcript, self.on_message)
         super().on(LiveTranscriptionEvents.Metadata, self.on_metadata)
+        super().on(LiveTranscriptionEvents.SpeechStarted, self.on_speech_started)
         super().on(LiveTranscriptionEvents.UtteranceEnd, self.on_utterance_end)
         super().on(LiveTranscriptionEvents.Error, self.on_error)
         # self.test = "child"
@@ -54,6 +56,9 @@ def on_message(self, parent, result, **kwargs):
     def on_metadata(self, parent, metadata, **kwargs):
         print(f"\n\n{metadata}\n\n")
 
+    def on_speech_started(self, parent, speech_started, **kwargs):
+        # Callback wired to LiveTranscriptionEvents.SpeechStarted in __init__;
+        # prints the received payload with blank lines before and after.
+        # `parent` and `kwargs` are accepted but not used here.
+        print(f"\n\n{speech_started}\n\n")
+
     def on_utterance_end(self, parent, utterance_end, **kwargs):
         print(f"\n\n{utterance_end}\n\n")
 
@@ -73,6 +78,7 @@ def main():
         liveClient = MyLiveClient(ClientOptionsFromEnv())
 
         options = LiveOptions(
+            model="nova-2",
             punctuate=True,
             language="en-US",
             encoding="linear16",
@@ -81,6 +87,7 @@ def main():
             # To get UtteranceEnd, the following must be set:
             interim_results=True,
             utterance_end_ms="1000",
+            # To get SpeechStarted, the following must be set:
+            vad_events=True,
         )
         liveClient.start(options, addons=dict(myattr="hello"), test="hello")
 

@@ -14,8 +14,7 @@
 API_KEY = os.getenv("DG_API_KEY")
 
 options = LiveOptions(
-    model="nova",
-    interim_results=False,
+    model="nova-2",
     language="en-US",
 )
 
@@ -39,14 +38,18 @@ async def on_message(self, result, **kwargs):
         async def on_metadata(self, metadata, **kwargs):
             print(f"\n\n{metadata}\n\n")
 
-        def on_utterance_end(self, utterance_end, **kwargs):
+        async def on_speech_started(self, speech_started, **kwargs):
+            # Coroutine callback registered below for
+            # LiveTranscriptionEvents.SpeechStarted; prints the received
+            # payload with blank lines before and after.
+            print(f"\n\n{speech_started}\n\n")
+
+        async def on_utterance_end(self, utterance_end, **kwargs):
             print(f"\n\n{utterance_end}\n\n")
 
         async def on_error(self, error, **kwargs):
             print(f"\n\n{error}\n\n")
 
         dg_connection.on(LiveTranscriptionEvents.Transcript, on_message)
         dg_connection.on(LiveTranscriptionEvents.Metadata, on_metadata)
+        dg_connection.on(LiveTranscriptionEvents.SpeechStarted, on_speech_started)
         dg_connection.on(LiveTranscriptionEvents.UtteranceEnd, on_utterance_end)
         dg_connection.on(LiveTranscriptionEvents.Error, on_error)
 

@@ -42,6 +42,9 @@ def on_message(self, result, **kwargs):
         def on_metadata(self, metadata, **kwargs):
             print(f"\n\n{metadata}\n\n")
 
+        def on_speech_started(self, speech_started, **kwargs):
+            # Callback registered below for LiveTranscriptionEvents.SpeechStarted;
+            # prints the received payload with blank lines before and after.
+            print(f"\n\n{speech_started}\n\n")
+
         def on_utterance_end(self, utterance_end, **kwargs):
             print(f"\n\n{utterance_end}\n\n")
 
@@ -50,11 +53,12 @@ def on_error(self, error, **kwargs):
 
         dg_connection.on(LiveTranscriptionEvents.Transcript, on_message)
         dg_connection.on(LiveTranscriptionEvents.Metadata, on_metadata)
+        dg_connection.on(LiveTranscriptionEvents.SpeechStarted, on_speech_started)
         dg_connection.on(LiveTranscriptionEvents.UtteranceEnd, on_utterance_end)
         dg_connection.on(LiveTranscriptionEvents.Error, on_error)
 
         # connect to websocket
-        options = LiveOptions(model="nova", interim_results=False, language="en-US")
+        options = LiveOptions(model="nova-2", language="en-US")
         dg_connection.start(options)
 
         lock_exit = threading.Lock()

@@ -39,6 +39,9 @@ def on_message(self, result, **kwargs):
         def on_metadata(self, metadata, **kwargs):
             print(f"\n\n{metadata}\n\n")
 
+        def on_speech_started(self, speech_started, **kwargs):
+            # Callback registered below for LiveTranscriptionEvents.SpeechStarted;
+            # prints the received payload with blank lines before and after.
+            print(f"\n\n{speech_started}\n\n")
+
         def on_utterance_end(self, utterance_end, **kwargs):
             print(f"\n\n{utterance_end}\n\n")
 
@@ -47,10 +50,12 @@ def on_error(self, error, **kwargs):
 
         dg_connection.on(LiveTranscriptionEvents.Transcript, on_message)
         dg_connection.on(LiveTranscriptionEvents.Metadata, on_metadata)
+        dg_connection.on(LiveTranscriptionEvents.SpeechStarted, on_speech_started)
         dg_connection.on(LiveTranscriptionEvents.UtteranceEnd, on_utterance_end)
         dg_connection.on(LiveTranscriptionEvents.Error, on_error)
 
         options = LiveOptions(
+            model="nova-2",
             punctuate=True,
             language="en-US",
             encoding="linear16",
@@ -59,6 +64,7 @@ def on_error(self, error, **kwargs):
             # To get UtteranceEnd, the following must be set:
             interim_results=True,
             utterance_end_ms="1000",
+            # To get SpeechStarted, the following must be set:
+            vad_events=True,
         )
         dg_connection.start(options, addons=dict(myattr="hello"), test="hello")