feat: introduces speaker diarization configuration

googleapis · Sep 5, 2019 · 8adb72a · 8adb72a
1 parent 3cc8989
commit 8adb72a
Show file tree

Hide file tree

Showing 4 changed files with 116 additions and 5 deletions.
diff --git a/protos/google/cloud/speech/v1/cloud_speech.proto b/protos/google/cloud/speech/v1/cloud_speech.proto
@@ -276,6 +276,16 @@ message RecognitionConfig {
   // premium feature.
   bool enable_automatic_punctuation = 11;
 
+  // *Optional* Config to enable speaker diarization and set additional
+  // parameters to make diarization better suited for your application.
+  // Note: When this is enabled, we send all the words from the beginning of the
+  // audio for the top alternative in every consecutive STREAMING response.
+  // This is done in order to improve our speaker tags as our models learn to
+  // identify the speakers in the conversation over time.
+  // For non-streaming requests, the diarization results will be provided only
+  // in the top alternative of the FINAL SpeechRecognitionResult.
+  SpeakerDiarizationConfig diarization_config = 19;
+
   // *Optional* Metadata regarding this request.
   RecognitionMetadata metadata = 9;
 
@@ -324,6 +334,36 @@ message RecognitionConfig {
   bool use_enhanced = 14;
 }
 
+// Config to enable speaker diarization and bound the number of speakers.
+message SpeakerDiarizationConfig {
+  // *Optional* If 'true', enables speaker detection for each recognized word in
+  // the top alternative of the recognition result using a speaker_tag provided
+  // in the WordInfo.
+  bool enable_speaker_diarization = 1;
+
+  // Note: Set min_speaker_count = max_speaker_count to fix the number of
+  // speakers to be detected in the audio.
+
+  // *Optional*
+  // Minimum number of speakers in the conversation. Together with
+  // max_speaker_count, this defines a range that allows the system to
+  // automatically determine the number of speakers. If not set, defaults to 2.
+  int32 min_speaker_count = 2;
+
+  // *Optional*
+  // Maximum number of speakers in the conversation. Together with
+  // min_speaker_count, this defines a range that allows the system to
+  // automatically determine the number of speakers. If not set, defaults to 6.
+  int32 max_speaker_count = 3;
+
+  // Output only. Identifies which speaker, among the distinct integer values
+  // assigned to each speaker in the audio, was detected to have spoken this
+  // word. Value ranges from '1' to diarization_speaker_count; set only if
+  // enable_speaker_diarization = 'true', and only in the top alternative.
+  // NOTE(review): diarization_speaker_count is not declared here -- confirm.
+  int32 speaker_tag = 5;
+}
+
 // Description of audio data to be recognized.
 message RecognitionMetadata {
   // Use case categories that the audio recognition request can be described

diff --git a/protos/protos.json b/protos/protos.json
@@ -144,6 +144,10 @@
                           "type": "bool",
                           "id": 11
                         },
+                        "diarizationConfig": {
+                          "type": "SpeakerDiarizationConfig",
+                          "id": 19
+                        },
                         "metadata": {
                           "type": "RecognitionMetadata",
                           "id": 9
@@ -172,6 +176,26 @@
                         }
                       }
                     },
+                    "SpeakerDiarizationConfig": {
+                      "fields": {
+                        "enableSpeakerDiarization": {
+                          "type": "bool",
+                          "id": 1
+                        },
+                        "minSpeakerCount": {
+                          "type": "int32",
+                          "id": 2
+                        },
+                        "maxSpeakerCount": {
+                          "type": "int32",
+                          "id": 3
+                        },
+                        "speakerTag": {
+                          "type": "int32",
+                          "id": 5
+                        }
+                      }
+                    },
                     "RecognitionMetadata": {
                       "fields": {
                         "interactionType": {

diff --git a/src/v1/doc/google/cloud/speech/v1/doc_cloud_speech.js b/src/v1/doc/google/cloud/speech/v1/doc_cloud_speech.js
@@ -213,6 +213,18 @@ const StreamingRecognitionConfig = {
  *   to all users. In the future this may be exclusively available as a
  *   premium feature.
  *
+ * @property {Object} diarizationConfig
+ *   *Optional* Config to enable speaker diarization and set additional
+ *   parameters to make diarization better suited for your application.
+ *   Note: When this is enabled, we send all the words from the beginning of the
+ *   audio for the top alternative in every consecutive STREAMING response.
+ *   This is done in order to improve our speaker tags as our models learn to
+ *   identify the speakers in the conversation over time.
+ *   For non-streaming requests, the diarization results will be provided only
+ *   in the top alternative of the FINAL SpeechRecognitionResult.
+ *
+ *   This object should have the same structure as [SpeakerDiarizationConfig]{@link google.cloud.speech.v1.SpeakerDiarizationConfig}
+ *
  * @property {Object} metadata
  *   *Optional* Metadata regarding this request.
  *
@@ -358,6 +370,41 @@ const RecognitionConfig = {
   }
 };
 
+/**
+ * Config to enable speaker diarization and bound the number of speakers.
+ *
+ * @property {boolean} enableSpeakerDiarization
+ *   *Optional* If 'true', enables speaker detection for each recognized word in
+ *   the top alternative of the recognition result using a speaker_tag provided
+ *   in the WordInfo.
+ *
+ * @property {number} minSpeakerCount
+ *   *Optional*
+ *   Minimum number of speakers in the conversation. Together with
+ *   max_speaker_count, this defines a range that allows the system to
+ *   automatically determine the number of speakers. If not set, the default
+ *   value is 2.
+ *   Note: Set min_speaker_count = max_speaker_count to fix the number of
+ *   speakers to be detected in the audio.
+ *
+ * @property {number} maxSpeakerCount
+ *   *Optional*
+ *   Maximum number of speakers in the conversation. Together with
+ *   min_speaker_count, this defines a range that allows the system to
+ *   automatically determine the number of speakers. If not set, the default
+ *   value is 6.
+ *   Note: Set min_speaker_count = max_speaker_count to fix the number of
+ *   speakers to be detected in the audio.
+ *
+ * @property {number} speakerTag
+ *   Output only. A distinct integer value is assigned for every speaker within
+ *   the audio. This field specifies which one of those speakers was detected to
+ *   have spoken this word. Value ranges from '1' to diarization_speaker_count.
+ *   speaker_tag is set only if enable_speaker_diarization = 'true', and only in
+ *   the top alternative.
+ *   NOTE(review): diarization_speaker_count is not a property documented on
+ *   this type -- confirm the intended upper bound.
+ *
+ * @typedef SpeakerDiarizationConfig
+ * @memberof google.cloud.speech.v1
+ * @see [google.cloud.speech.v1.SpeakerDiarizationConfig definition in proto format]{@link https://github.com/googleapis/googleapis/blob/master/google/cloud/speech/v1/cloud_speech.proto}
+ */
+const SpeakerDiarizationConfig = {
+  // This is for documentation. Actual contents will be loaded by gRPC.
+};
+
 /**
  * Description of audio data to be recognized.
  *

diff --git a/synth.metadata b/synth.metadata
@@ -1,19 +1,19 @@
 {
-  "updateTime": "2019-08-31T11:19:24.627851Z",
+  "updateTime": "2019-09-05T11:20:15.683535Z",
   "sources": [
     {
       "generator": {
         "name": "artman",
-        "version": "0.36.1",
-        "dockerImage": "googleapis/artman@sha256:7c20f006c7a62d9d782e2665647d52290c37a952ef3cd134624d5dd62b3f71bd"
+        "version": "0.36.2",
+        "dockerImage": "googleapis/artman@sha256:0e6f3a668cd68afc768ecbe08817cf6e56a0e64fcbdb1c58c3b97492d12418a1"
       }
     },
     {
       "git": {
         "name": "googleapis",
         "remote": "https://github.com/googleapis/googleapis.git",
-        "sha": "82809578652607c8ee29d9e199c21f28f81a03e0",
-        "internalRef": "266247326"
+        "sha": "0930bdac6369674ed7460d3de230a6b9193600b7",
+        "internalRef": "267282771"
       }
     },
     {