This repository has been archived by the owner on Apr 20, 2024. It is now read-only.

feat: added alternative_language_codes to RecognitionConfig (#290)
- [ ] Regenerate this pull request now.

PiperOrigin-RevId: 413453425

Source-Link: googleapis/googleapis@2b47b24

Source-Link: googleapis/googleapis-gen@7ffe6e0
Copy-Tag: eyJwIjoiLmdpdGh1Yi8uT3dsQm90LnlhbWwiLCJoIjoiN2ZmZTZlMGExYmY2M2Q4NTQwMDA5Y2U2OTg2NjBlYmI3MWM1NGZmMSJ9

feat: add WEBM_OPUS codec 
feat: add SpeechAdaptation configuration 
feat: add word confidence 
feat: add spoken punctuation and spoken emojis 
feat: add hint boost in SpeechContext
gcf-owl-bot[bot] committed Dec 5, 2021
1 parent d38d2b9 commit 3fdef1f
Showing 8 changed files with 370 additions and 2 deletions.
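
Taken together, the features listed in the commit message above extend the RecognitionConfig surface. The following is a minimal sketch, not part of the diff: the bucket URI, sample rate, and language choices are placeholder values, and it assumes this version of google-cloud-speech is installed.

from google.cloud import speech_v1 as speech
from google.protobuf import wrappers_pb2

client = speech.SpeechClient()

config = speech.RecognitionConfig(
    # WEBM_OPUS is the codec added by this commit (enum value 9).
    encoding=speech.RecognitionConfig.AudioEncoding.WEBM_OPUS,
    sample_rate_hertz=48000,
    language_code="en-US",
    # Up to three alternative BCP-47 tags; the service reports the detected one.
    alternative_language_codes=["es-US", "fr-FR"],
    # Word-level confidence on the top alternative (new enable_word_confidence flag).
    enable_word_confidence=True,
    # BoolValue wrapper field added by this commit.
    enable_spoken_punctuation=wrappers_pb2.BoolValue(value=True),
)

audio = speech.RecognitionAudio(uri="gs://my-bucket/sample.webm")  # placeholder URI

response = client.recognize(config=config, audio=audio)
for result in response.results:
    best = result.alternatives[0]
    print(result.language_code, best.transcript)
    for word in best.words:
        print(f"  {word.word}: confidence {word.confidence:.2f}")
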
6 changes: 6 additions & 0 deletions google/cloud/speech/__init__.py
@@ -35,6 +35,9 @@
from google.cloud.speech_v1.types.cloud_speech import StreamingRecognizeResponse
from google.cloud.speech_v1.types.cloud_speech import TranscriptOutputConfig
from google.cloud.speech_v1.types.cloud_speech import WordInfo
from google.cloud.speech_v1.types.resource import CustomClass
from google.cloud.speech_v1.types.resource import PhraseSet
from google.cloud.speech_v1.types.resource import SpeechAdaptation

__all__ = (
"SpeechClient",
@@ -57,4 +60,7 @@
"StreamingRecognizeResponse",
"TranscriptOutputConfig",
"WordInfo",
"CustomClass",
"PhraseSet",
"SpeechAdaptation",
)
6 changes: 6 additions & 0 deletions google/cloud/speech_v1/__init__.py
@@ -35,6 +35,9 @@
from .types.cloud_speech import StreamingRecognizeResponse
from .types.cloud_speech import TranscriptOutputConfig
from .types.cloud_speech import WordInfo
from .types.resource import CustomClass
from .types.resource import PhraseSet
from .types.resource import SpeechAdaptation

from google.cloud.speech_v1.helpers import SpeechHelpers

@@ -45,15 +48,18 @@ class SpeechClient(SpeechHelpers, SpeechClient):

__all__ = (
"SpeechAsyncClient",
"CustomClass",
"LongRunningRecognizeMetadata",
"LongRunningRecognizeRequest",
"LongRunningRecognizeResponse",
"PhraseSet",
"RecognitionAudio",
"RecognitionConfig",
"RecognitionMetadata",
"RecognizeRequest",
"RecognizeResponse",
"SpeakerDiarizationConfig",
"SpeechAdaptation",
"SpeechClient",
"SpeechContext",
"SpeechRecognitionAlternative",
4 changes: 4 additions & 0 deletions google/cloud/speech_v1/services/speech/async_client.py
@@ -58,6 +58,10 @@ class SpeechAsyncClient:
DEFAULT_ENDPOINT = SpeechClient.DEFAULT_ENDPOINT
DEFAULT_MTLS_ENDPOINT = SpeechClient.DEFAULT_MTLS_ENDPOINT

custom_class_path = staticmethod(SpeechClient.custom_class_path)
parse_custom_class_path = staticmethod(SpeechClient.parse_custom_class_path)
phrase_set_path = staticmethod(SpeechClient.phrase_set_path)
parse_phrase_set_path = staticmethod(SpeechClient.parse_phrase_set_path)
common_billing_account_path = staticmethod(SpeechClient.common_billing_account_path)
parse_common_billing_account_path = staticmethod(
SpeechClient.parse_common_billing_account_path
32 changes: 32 additions & 0 deletions google/cloud/speech_v1/services/speech/client.py
@@ -160,6 +160,38 @@ def transport(self) -> SpeechTransport:
"""
return self._transport

@staticmethod
def custom_class_path(project: str, location: str, custom_class: str,) -> str:
"""Returns a fully-qualified custom_class string."""
return "projects/{project}/locations/{location}/customClasses/{custom_class}".format(
project=project, location=location, custom_class=custom_class,
)

@staticmethod
def parse_custom_class_path(path: str) -> Dict[str, str]:
"""Parses a custom_class path into its component segments."""
m = re.match(
r"^projects/(?P<project>.+?)/locations/(?P<location>.+?)/customClasses/(?P<custom_class>.+?)$",
path,
)
return m.groupdict() if m else {}

@staticmethod
def phrase_set_path(project: str, location: str, phrase_set: str,) -> str:
"""Returns a fully-qualified phrase_set string."""
return "projects/{project}/locations/{location}/phraseSets/{phrase_set}".format(
project=project, location=location, phrase_set=phrase_set,
)

@staticmethod
def parse_phrase_set_path(path: str) -> Dict[str, str]:
"""Parses a phrase_set path into its component segments."""
m = re.match(
r"^projects/(?P<project>.+?)/locations/(?P<location>.+?)/phraseSets/(?P<phrase_set>.+?)$",
path,
)
return m.groupdict() if m else {}

@staticmethod
def common_billing_account_path(billing_account: str,) -> str:
"""Returns a fully-qualified billing_account string."""
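
The path helpers added above are plain staticmethods, so they can be used without constructing a client. A quick illustration; the project, location, and resource IDs are made up:

from google.cloud.speech_v1 import SpeechClient

# Build a fully-qualified resource name.
name = SpeechClient.phrase_set_path("my-project", "global", "my-phrase-set")
# -> "projects/my-project/locations/global/phraseSets/my-phrase-set"

# Parse one back into its segments.
segments = SpeechClient.parse_custom_class_path(
    "projects/my-project/locations/global/customClasses/months"
)
# -> {"project": "my-project", "location": "global", "custom_class": "months"}
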
8 changes: 8 additions & 0 deletions google/cloud/speech_v1/types/__init__.py
@@ -33,6 +33,11 @@
TranscriptOutputConfig,
WordInfo,
)
from .resource import (
CustomClass,
PhraseSet,
SpeechAdaptation,
)

__all__ = (
"LongRunningRecognizeMetadata",
@@ -53,4 +58,7 @@
"StreamingRecognizeResponse",
"TranscriptOutputConfig",
"WordInfo",
"CustomClass",
"PhraseSet",
"SpeechAdaptation",
)
105 changes: 103 additions & 2 deletions google/cloud/speech_v1/types/cloud_speech.py
@@ -15,8 +15,10 @@
#
import proto # type: ignore

from google.cloud.speech_v1.types import resource
from google.protobuf import duration_pb2 # type: ignore
from google.protobuf import timestamp_pb2 # type: ignore
from google.protobuf import wrappers_pb2 # type: ignore
from google.rpc import status_pb2 # type: ignore


@@ -245,6 +247,20 @@ class RecognitionConfig(proto.Message):
language tag. Example: "en-US". See `Language
Support <https://cloud.google.com/speech-to-text/docs/languages>`__
for a list of the currently supported language codes.
alternative_language_codes (Sequence[str]):
A list of up to 3 additional
`BCP-47 <https://www.rfc-editor.org/rfc/bcp/bcp47.txt>`__
language tags, listing possible alternative languages of the
supplied audio. See `Language
Support <https://cloud.google.com/speech-to-text/docs/languages>`__
for a list of the currently supported language codes. If
alternative languages are listed, the recognition result will
contain recognition in the most likely language detected,
including the main language_code. The recognition result
will include the language tag of the language detected in
the audio. Note: This feature is only supported for Voice
Command and Voice Search use cases and performance may vary
for other use cases (e.g., phone call transcription).
max_alternatives (int):
Maximum number of recognition hypotheses to be returned.
Specifically, the maximum number of
@@ -258,6 +274,12 @@
profanities, replacing all but the initial character in each
filtered word with asterisks, e.g. "f***". If set to
``false`` or omitted, profanities won't be filtered out.
adaptation (google.cloud.speech_v1.types.SpeechAdaptation):
Speech adaptation configuration improves the accuracy of
speech recognition. For more information, see the `speech
adaptation <https://cloud.google.com/speech-to-text/docs/adaptation>`__
documentation. When speech adaptation is set it supersedes
the ``speech_contexts`` field.
speech_contexts (Sequence[google.cloud.speech_v1.types.SpeechContext]):
Array of
[SpeechContext][google.cloud.speech.v1.SpeechContext]. A
@@ -269,13 +291,35 @@
start and end time offsets (timestamps) for those words. If
``false``, no word-level time offset information is
returned. The default is ``false``.
enable_word_confidence (bool):
If ``true``, the top result includes a list of words and the
confidence for those words. If ``false``, no word-level
confidence information is returned. The default is
``false``.
enable_automatic_punctuation (bool):
If 'true', adds punctuation to recognition
result hypotheses. This feature is only
available in select languages. Setting this for
requests in other languages has no effect at
all. The default 'false' value does not add
punctuation to result hypotheses.
enable_spoken_punctuation (google.protobuf.wrappers_pb2.BoolValue):
The spoken punctuation behavior for the call. If not set, uses
the default behavior based on the model of choice, e.g.
command_and_search enables spoken punctuation by default. If
'true', replaces spoken punctuation with the corresponding
symbols in the request. For example, "how are you question
mark" becomes "how are you?". See
https://cloud.google.com/speech-to-text/docs/spoken-punctuation
for support. If 'false', spoken punctuation is not replaced.
enable_spoken_emojis (google.protobuf.wrappers_pb2.BoolValue):
The spoken emoji behavior for the call. If not set, uses the
default behavior based on the model of choice. If 'true', adds
spoken emoji formatting for the request, replacing spoken
emojis with the corresponding Unicode symbols in the final
transcript. If 'false', spoken emojis are not replaced.
diarization_config (google.cloud.speech_v1.types.SpeakerDiarizationConfig):
Config to enable speaker diarization and set
additional parameters to make diarization better
@@ -352,7 +396,7 @@ class AudioEncoding(proto.Enum):
codecs are used to capture or transmit audio, particularly if
background noise is present. Lossy codecs include ``MULAW``,
``AMR``, ``AMR_WB``, ``OGG_OPUS``, ``SPEEX_WITH_HEADER_BYTE``,
``MP3``.
``MP3``, and ``WEBM_OPUS``.
The ``FLAC`` and ``WAV`` audio file formats include a header that
describes the included audio content. You can request recognition
@@ -374,19 +418,31 @@
AMR_WB = 5
OGG_OPUS = 6
SPEEX_WITH_HEADER_BYTE = 7
WEBM_OPUS = 9

encoding = proto.Field(proto.ENUM, number=1, enum=AudioEncoding,)
sample_rate_hertz = proto.Field(proto.INT32, number=2,)
audio_channel_count = proto.Field(proto.INT32, number=7,)
enable_separate_recognition_per_channel = proto.Field(proto.BOOL, number=12,)
language_code = proto.Field(proto.STRING, number=3,)
alternative_language_codes = proto.RepeatedField(proto.STRING, number=18,)
max_alternatives = proto.Field(proto.INT32, number=4,)
profanity_filter = proto.Field(proto.BOOL, number=5,)
adaptation = proto.Field(
proto.MESSAGE, number=20, message=resource.SpeechAdaptation,
)
speech_contexts = proto.RepeatedField(
proto.MESSAGE, number=6, message="SpeechContext",
)
enable_word_time_offsets = proto.Field(proto.BOOL, number=8,)
enable_word_confidence = proto.Field(proto.BOOL, number=15,)
enable_automatic_punctuation = proto.Field(proto.BOOL, number=11,)
enable_spoken_punctuation = proto.Field(
proto.MESSAGE, number=22, message=wrappers_pb2.BoolValue,
)
enable_spoken_emojis = proto.Field(
proto.MESSAGE, number=23, message=wrappers_pb2.BoolValue,
)
diarization_config = proto.Field(
proto.MESSAGE, number=19, message="SpeakerDiarizationConfig",
)
@@ -534,9 +590,21 @@ class SpeechContext(proto.Message):
for every month of the year, using the $MONTH class improves
the likelihood of correctly transcribing audio that includes
months.
boost (float):
Hint Boost. Positive value will increase the probability
that a specific phrase will be recognized over other similar
sounding phrases. The higher the boost, the higher the
chance of false positive recognition as well. Negative boost
values would correspond to anti-biasing. Anti-biasing is not
enabled, so negative boost will simply be ignored. Though
``boost`` can accept a wide range of positive values, most
use cases are best served with values between 0 and 20. We
recommend using a binary search approach to finding the
optimal value for your use case.
"""

phrases = proto.RepeatedField(proto.STRING, number=1,)
boost = proto.Field(proto.FLOAT, number=4,)


class RecognitionAudio(proto.Message):
@@ -617,6 +685,12 @@ class LongRunningRecognizeResponse(proto.Message):
total_billed_time (google.protobuf.duration_pb2.Duration):
When available, billed audio seconds for the
corresponding request.
output_config (google.cloud.speech_v1.types.TranscriptOutputConfig):
Original output config if present in the
request.
output_error (google.rpc.status_pb2.Status):
If the transcript output fails this field
contains the relevant error.
"""

results = proto.RepeatedField(
@@ -625,6 +699,10 @@
total_billed_time = proto.Field(
proto.MESSAGE, number=3, message=duration_pb2.Duration,
)
output_config = proto.Field(
proto.MESSAGE, number=6, message="TranscriptOutputConfig",
)
output_error = proto.Field(proto.MESSAGE, number=7, message=status_pb2.Status,)


class LongRunningRecognizeMetadata(proto.Message):
@@ -777,7 +855,7 @@ class StreamingRecognitionResult(proto.Message):
that channel. For audio_channel_count = N, its output values
can range from '1' to 'N'.
language_code (str):
The
Output only. The
`BCP-47 <https://www.rfc-editor.org/rfc/bcp/bcp47.txt>`__
language tag of the language in this result. This language
code was detected to have the most likelihood of being
@@ -812,12 +890,25 @@ class SpeechRecognitionResult(proto.Message):
corresponding to the recognized result for the audio from
that channel. For audio_channel_count = N, its output values
can range from '1' to 'N'.
result_end_time (google.protobuf.duration_pb2.Duration):
Time offset of the end of this result
relative to the beginning of the audio.
language_code (str):
Output only. The
`BCP-47 <https://www.rfc-editor.org/rfc/bcp/bcp47.txt>`__
language tag of the language in this result. This language
code was detected to have the most likelihood of being
spoken in the audio.
"""

alternatives = proto.RepeatedField(
proto.MESSAGE, number=1, message="SpeechRecognitionAlternative",
)
channel_tag = proto.Field(proto.INT32, number=2,)
result_end_time = proto.Field(
proto.MESSAGE, number=4, message=duration_pb2.Duration,
)
language_code = proto.Field(proto.STRING, number=5,)


class SpeechRecognitionAlternative(proto.Message):
Expand Down Expand Up @@ -866,6 +957,15 @@ class WordInfo(proto.Message):
word (str):
The word corresponding to this set of
information.
confidence (float):
The confidence estimate between 0.0 and 1.0. A higher number
indicates an estimated greater likelihood that the
recognized words are correct. This field is set only for the
top alternative of a non-streaming result or of a streaming
result where ``is_final=true``. This field is not guaranteed
to be accurate and users should not rely on it to be always
provided. The default of 0.0 is a sentinel value indicating
``confidence`` was not set.
speaker_tag (int):
Output only. A distinct integer value is assigned for every
speaker within the audio. This field specifies which one of
@@ -878,6 +978,7 @@
start_time = proto.Field(proto.MESSAGE, number=1, message=duration_pb2.Duration,)
end_time = proto.Field(proto.MESSAGE, number=2, message=duration_pb2.Duration,)
word = proto.Field(proto.STRING, number=3,)
confidence = proto.Field(proto.FLOAT, number=4,)
speaker_tag = proto.Field(proto.INT32, number=5,)


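
The adaptation and boost docstrings above interact: when ``adaptation`` is set it supersedes ``speech_contexts``, and boost values are best kept small and tuned empirically. A short sketch of wiring a boosted PhraseSet through the new field; the phrase values, boost number, and audio URI are illustrative only:

from google.cloud import speech_v1 as speech

# A phrase set with a moderate boost; per the docstring, higher boost also
# raises the chance of false positives, so start low (0-20) and tune.
phrase_set = speech.PhraseSet(
    phrases=[speech.PhraseSet.Phrase(value="weather forecast", boost=10.0)],
)

config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code="en-US",
    # When adaptation is set it supersedes the speech_contexts field.
    adaptation=speech.SpeechAdaptation(phrase_sets=[phrase_set]),
)

audio = speech.RecognitionAudio(uri="gs://my-bucket/forecast.wav")  # placeholder
response = speech.SpeechClient().recognize(config=config, audio=audio)
print(response.results[0].alternatives[0].transcript)
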