feat: Support MP3, TranscriptNormalization and SpeakerLabels in STT V1 API

PiperOrigin-RevId: 578629599
googleapis · Nov 1, 2023 · 08facab · 08facab
1 parent 6381c62
commit 08facab
Show file tree

Hide file tree

Showing 2 changed files with 53 additions and 6 deletions.
diff --git a/google/cloud/speech/v1/cloud_speech.proto b/google/cloud/speech/v1/cloud_speech.proto
@@ -1,4 +1,4 @@
-// Copyright 2022 Google LLC
+// Copyright 2023 Google LLC
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -260,6 +260,12 @@ message RecognitionConfig {
     // wideband is supported. `sample_rate_hertz` must be 16000.
     SPEEX_WITH_HEADER_BYTE = 7;
 
+    // MP3 audio. MP3 encoding is a Beta feature and only available in
+    // v1p1beta1. Supports all standard MP3 bitrates (which range from 32-320
+    // kbps). When using this encoding, `sample_rate_hertz` has to match the
+    // sample rate of the file being used.
+    MP3 = 8;
+
     // Opus encoded audio frames in WebM container
     // ([OggOpus](https://wiki.xiph.org/OggOpus)). `sample_rate_hertz` must be
     // one of 8000, 12000, 16000, 24000, or 48000.
@@ -343,6 +349,13 @@ message RecognitionConfig {
   // When speech adaptation is set it supersedes the `speech_contexts` field.
   SpeechAdaptation adaptation = 20;
 
+  // Optional. Use transcription normalization to automatically replace parts of
+  // the transcript with phrases of your choosing. For StreamingRecognize, this
+  // normalization only applies to stable partial transcripts (stability > 0.8)
+  // and final transcripts.
+  TranscriptNormalization transcript_normalization = 24
+      [(google.api.field_behavior) = OPTIONAL];
+
   // Array of [SpeechContext][google.cloud.speech.v1.SpeechContext].
   // A means to provide context to assist the speech recognition. For more
   // information, see
@@ -463,8 +476,8 @@ message RecognitionConfig {
 // Config to enable speaker diarization.
 message SpeakerDiarizationConfig {
   // If 'true', enables speaker detection for each recognized word in
-  // the top alternative of the recognition result using a speaker_tag provided
-  // in the WordInfo.
+  // the top alternative of the recognition result using a speaker_label
+  // provided in the WordInfo.
   bool enable_speaker_diarization = 1;
 
   // Minimum number of speakers in the conversation. This range gives you more
@@ -956,9 +969,19 @@ message WordInfo {
   // Output only. A distinct integer value is assigned for every speaker within
   // the audio. This field specifies which one of those speakers was detected to
   // have spoken this word. Value ranges from '1' to diarization_speaker_count.
-  // speaker_tag is set if enable_speaker_diarization = 'true' and only in the
+  // speaker_tag is set if enable_speaker_diarization = 'true' and only for the
   // top alternative.
-  int32 speaker_tag = 5 [(google.api.field_behavior) = OUTPUT_ONLY];
+  // Note: Use speaker_label instead.
+  int32 speaker_tag = 5
+      [deprecated = true, (google.api.field_behavior) = OUTPUT_ONLY];
+
+  // Output only. A label value assigned for every unique speaker within the
+  // audio. This field specifies which speaker was detected to have spoken this
+  // word. For some models, like medical_conversation, this can be the actual
+  // speaker role, for example "patient" or "provider", but generally this
+  // would be a number identifying a speaker. This field is only set if
+  // enable_speaker_diarization = 'true' and only for the top alternative.
+  string speaker_label = 6 [(google.api.field_behavior) = OUTPUT_ONLY];
 }
 
 // Information on speech adaptation use in results

diff --git a/google/cloud/speech/v1/resource.proto b/google/cloud/speech/v1/resource.proto
@@ -1,4 +1,4 @@
-// Copyright 2022 Google LLC
+// Copyright 2023 Google LLC
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -147,3 +147,27 @@ message SpeechAdaptation {
   // See specifications: https://www.w3.org/TR/speech-grammar
   ABNFGrammar abnf_grammar = 4;
 }
+
+// Configuration for transcript normalization, which automatically replaces
+// parts of the transcript with phrases of your choosing. For
+// StreamingRecognize, normalization is applied only to stable partial
+// transcripts (stability > 0.8) and to final transcripts.
+message TranscriptNormalization {
+  // A single search-and-replace rule.
+  message Entry {
+    // The text to search for. Maximum length is 100 characters.
+    string search = 1;
+
+    // The replacement text. Maximum length is 100 characters.
+    string replace = 2;
+
+    // Whether matching of `search` is case sensitive.
+    bool case_sensitive = 3;
+  }
+
+  // A list of replacement entries, applied one at a time in list order. For
+  // example, the second entry in ["cat" => "dog", "mountain cat" =>
+  // "mountain dog"] is never applied, because the first entry is always
+  // processed before it. At most 100 entries.
+  repeated Entry entries = 1;
+}