Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
feat: introduces speaker diarization configuration
Browse files Browse the repository at this point in the history
  • Loading branch information
yoshi-automation authored and bcoe committed Sep 5, 2019
1 parent 3cc8989 commit 8adb72a
Show file tree
Hide file tree
Showing 4 changed files with 116 additions and 5 deletions.
40 changes: 40 additions & 0 deletions protos/google/cloud/speech/v1/cloud_speech.proto
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,16 @@ message RecognitionConfig {
// premium feature.
bool enable_automatic_punctuation = 11;

// *Optional* Config to enable speaker diarization and set additional
// parameters to make diarization better suited for your application.
// Note: When this is enabled, we send all the words from the beginning of the
// audio for the top alternative in every consecutive STREAMING response.
// This is done in order to improve our speaker tags as our models learn to
// identify the speakers in the conversation over time.
// For non-streaming requests, the diarization results will be provided only
// in the top alternative of the FINAL SpeechRecognitionResult.
SpeakerDiarizationConfig diarization_config = 19;

// *Optional* Metadata regarding this request.
RecognitionMetadata metadata = 9;

Expand Down Expand Up @@ -324,6 +334,36 @@ message RecognitionConfig {
bool use_enhanced = 14;
}

// *Optional* Config to enable speaker diarization.
//
// Carries the enable switch plus an optional [min_speaker_count,
// max_speaker_count] range for the number of speakers to detect.
message SpeakerDiarizationConfig {
// *Optional* If 'true', enables speaker detection for each recognized word in
// the top alternative of the recognition result using a speaker_tag provided
// in the WordInfo.
bool enable_speaker_diarization = 1;

// *Optional*
// Minimum number of speakers in the conversation. This range gives you more
// flexibility by allowing the system to automatically determine the correct
// number of speakers. If not set, the default value is 2.
// Note: Set min_speaker_count = max_speaker_count to fix the number of
// speakers to be detected in the audio.
int32 min_speaker_count = 2;

// *Optional*
// Maximum number of speakers in the conversation. This range gives you more
// flexibility by allowing the system to automatically determine the correct
// number of speakers. If not set, the default value is 6.
// Note: Set min_speaker_count = max_speaker_count to fix the number of
// speakers to be detected in the audio.
int32 max_speaker_count = 3;

// NOTE(review): the description below talks about "this word" and about
// diarization_speaker_count, but this message describes a config, not a word,
// and declares no diarization_speaker_count field. The text looks like it was
// written for a per-word speaker tag (see the WordInfo reference on
// enable_speaker_diarization) -- TODO confirm whether this field is ever
// populated and what its upper bound really is.

// Output only. A distinct integer value is assigned for every speaker within
// the audio. This field specifies which one of those speakers was detected to
// have spoken this word. Value ranges from '1' to diarization_speaker_count.
// speaker_tag is set if enable_speaker_diarization = 'true' and only in the
// top alternative.
int32 speaker_tag = 5;
}

// Description of audio data to be recognized.
message RecognitionMetadata {
// Use case categories that the audio recognition request can be described
Expand Down
24 changes: 24 additions & 0 deletions protos/protos.json
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,10 @@
"type": "bool",
"id": 11
},
"diarizationConfig": {
"type": "SpeakerDiarizationConfig",
"id": 19
},
"metadata": {
"type": "RecognitionMetadata",
"id": 9
Expand Down Expand Up @@ -172,6 +176,26 @@
}
}
},
"SpeakerDiarizationConfig": {
"fields": {
"enableSpeakerDiarization": {
"type": "bool",
"id": 1
},
"minSpeakerCount": {
"type": "int32",
"id": 2
},
"maxSpeakerCount": {
"type": "int32",
"id": 3
},
"speakerTag": {
"type": "int32",
"id": 5
}
}
},
"RecognitionMetadata": {
"fields": {
"interactionType": {
Expand Down
47 changes: 47 additions & 0 deletions src/v1/doc/google/cloud/speech/v1/doc_cloud_speech.js
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,18 @@ const StreamingRecognitionConfig = {
* to all users. In the future this may be exclusively available as a
* premium feature.
*
* @property {Object} diarizationConfig
* *Optional* Config to enable speaker diarization and set additional
* parameters to make diarization better suited for your application.
* Note: When this is enabled, we send all the words from the beginning of the
* audio for the top alternative in every consecutive STREAMING response.
* This is done in order to improve our speaker tags as our models learn to
* identify the speakers in the conversation over time.
* For non-streaming requests, the diarization results will be provided only
* in the top alternative of the FINAL SpeechRecognitionResult.
*
* This object should have the same structure as [SpeakerDiarizationConfig]{@link google.cloud.speech.v1.SpeakerDiarizationConfig}
*
* @property {Object} metadata
* *Optional* Metadata regarding this request.
*
Expand Down Expand Up @@ -358,6 +370,41 @@ const RecognitionConfig = {
}
};

/**
* *Optional* Config to enable speaker diarization.
*
* @property {boolean} enableSpeakerDiarization
* *Optional* If 'true', enables speaker detection for each recognized word in
* the top alternative of the recognition result using a speaker_tag provided
* in the WordInfo.
*
* @property {number} minSpeakerCount
* *Optional*
* Minimum number of speakers in the conversation. This range gives you more
* flexibility by allowing the system to automatically determine the correct
* number of speakers. If not set, the default value is 2.
* Note: Set min_speaker_count = max_speaker_count to fix the number of
* speakers to be detected in the audio.
*
* @property {number} maxSpeakerCount
* *Optional*
* Maximum number of speakers in the conversation. This range gives you more
* flexibility by allowing the system to automatically determine the correct
* number of speakers. If not set, the default value is 6.
* Note: Set min_speaker_count = max_speaker_count to fix the number of
* speakers to be detected in the audio.
*
* @property {number} speakerTag
* Output only. A distinct integer value is assigned for every speaker within
* the audio. This field specifies which one of those speakers was detected to
* have spoken this word. Value ranges from '1' to diarization_speaker_count.
* speaker_tag is set if enable_speaker_diarization = 'true' and only in the
* top alternative.
*
* @typedef SpeakerDiarizationConfig
* @memberof google.cloud.speech.v1
* @see [google.cloud.speech.v1.SpeakerDiarizationConfig definition in proto format]{@link https://github.com/googleapis/googleapis/blob/master/google/cloud/speech/v1/cloud_speech.proto}
*/
const SpeakerDiarizationConfig = {
// This is for documentation. Actual contents will be loaded by gRPC.
// NOTE(review): the speakerTag description above refers to "this word" and to
// diarization_speaker_count, neither of which is a property documented on
// this typedef; the wording looks like it was written for a per-word speaker
// tag -- TODO confirm against the service documentation.
};

/**
* Description of audio data to be recognized.
*
Expand Down
10 changes: 5 additions & 5 deletions synth.metadata
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
{
"updateTime": "2019-08-31T11:19:24.627851Z",
"updateTime": "2019-09-05T11:20:15.683535Z",
"sources": [
{
"generator": {
"name": "artman",
"version": "0.36.1",
"dockerImage": "googleapis/artman@sha256:7c20f006c7a62d9d782e2665647d52290c37a952ef3cd134624d5dd62b3f71bd"
"version": "0.36.2",
"dockerImage": "googleapis/artman@sha256:0e6f3a668cd68afc768ecbe08817cf6e56a0e64fcbdb1c58c3b97492d12418a1"
}
},
{
"git": {
"name": "googleapis",
"remote": "https://github.com/googleapis/googleapis.git",
"sha": "82809578652607c8ee29d9e199c21f28f81a03e0",
"internalRef": "266247326"
"sha": "0930bdac6369674ed7460d3de230a6b9193600b7",
"internalRef": "267282771"
}
},
{
Expand Down

0 comments on commit 8adb72a

Please sign in to comment.