Skip to content

Commit

Permalink
feat: Support MP3, TranscriptNormalization and SpeakerLabels in STT V…
Browse files Browse the repository at this point in the history
…1 API

PiperOrigin-RevId: 578629599
  • Loading branch information
Google APIs authored and Copybara-Service committed Nov 1, 2023
1 parent 6381c62 commit 08facab
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 6 deletions.
33 changes: 28 additions & 5 deletions google/cloud/speech/v1/cloud_speech.proto
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2022 Google LLC
// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -260,6 +260,12 @@ message RecognitionConfig {
// wideband is supported. `sample_rate_hertz` must be 16000.
SPEEX_WITH_HEADER_BYTE = 7;

// MP3 audio. MP3 encoding is a Beta feature and only available in
// v1p1beta1. Supports all standard MP3 bitrates (which range from 32-320
// kbps). When using this encoding, `sample_rate_hertz` has to match the
// sample rate of the file being used.
MP3 = 8;

// Opus encoded audio frames in WebM container
// ([OggOpus](https://wiki.xiph.org/OggOpus)). `sample_rate_hertz` must be
// one of 8000, 12000, 16000, 24000, or 48000.
Expand Down Expand Up @@ -343,6 +349,13 @@ message RecognitionConfig {
// When speech adaptation is set it supersedes the `speech_contexts` field.
SpeechAdaptation adaptation = 20;

// Optional. Use transcription normalization to automatically replace parts of
// the transcript with phrases of your choosing. For StreamingRecognize, this
// normalization only applies to stable partial transcripts (stability > 0.8)
// and final transcripts.
TranscriptNormalization transcript_normalization = 24
[(google.api.field_behavior) = OPTIONAL];

// Array of [SpeechContext][google.cloud.speech.v1.SpeechContext].
// A means to provide context to assist the speech recognition. For more
// information, see
Expand Down Expand Up @@ -463,8 +476,8 @@ message RecognitionConfig {
// Config to enable speaker diarization.
message SpeakerDiarizationConfig {
// If 'true', enables speaker detection for each recognized word in
// the top alternative of the recognition result using a speaker_tag provided
// in the WordInfo.
// the top alternative of the recognition result using a speaker_label
// provided in the WordInfo.
bool enable_speaker_diarization = 1;

// Minimum number of speakers in the conversation. This range gives you more
Expand Down Expand Up @@ -956,9 +969,19 @@ message WordInfo {
// Output only. A distinct integer value is assigned for every speaker within
// the audio. This field specifies which one of those speakers was detected to
// have spoken this word. Value ranges from '1' to diarization_speaker_count.
// speaker_tag is set if enable_speaker_diarization = 'true' and only in the
// speaker_tag is set if enable_speaker_diarization = 'true' and only for the
// top alternative.
int32 speaker_tag = 5 [(google.api.field_behavior) = OUTPUT_ONLY];
// Note: Use speaker_label instead.
int32 speaker_tag = 5
[deprecated = true, (google.api.field_behavior) = OUTPUT_ONLY];

// Output only. A label value assigned for every unique speaker within the
// audio. This field specifies which speaker was detected to have spoken this
// word. For some models, like medical_conversation, this can be the actual
// speaker role, for example "patient" or "provider", but generally this would
// be a number identifying a speaker. This field is only set if
// enable_speaker_diarization = 'true' and only for the top alternative.
string speaker_label = 6 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Information on speech adaptation use in results
Expand Down
26 changes: 25 additions & 1 deletion google/cloud/speech/v1/resource.proto
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2022 Google LLC
// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -147,3 +147,27 @@ message SpeechAdaptation {
// See specifications: https://www.w3.org/TR/speech-grammar
ABNFGrammar abnf_grammar = 4;
}

// Transcription normalization configuration. Use transcription normalization
// to automatically replace parts of the transcript with phrases of your
// choosing. For StreamingRecognize, this normalization only applies to stable
// partial transcripts (stability > 0.8) and final transcripts.
//
// The replacement rules are supplied as an ordered list in `entries`.
message TranscriptNormalization {
// A single replacement configuration: the text named by `search` is replaced
// with the text given in `replace`.
message Entry {
// What to replace, i.e. the text to look for in the transcript. Max length
// is 100 characters.
string search = 1;

// What to replace with, i.e. the text substituted for what `search`
// matched. Max length is 100 characters.
string replace = 2;

// Whether the search is case sensitive.
bool case_sensitive = 3;
}

// A list of replacement entries. Replacement is performed with one entry at
// a time, and an earlier entry is always processed before a later one. For
// example, the second entry in ["cat" => "dog", "mountain cat" =>
// "mountain dog"] will never be applied because the first entry is always
// processed before it. At most 100 entries.
repeated Entry entries = 1;
}

0 comments on commit 08facab

Please sign in to comment.