Skip to content

Commit

Permalink
Dynamically determine audio parameters for Dictation.
Browse files Browse the repository at this point in the history
Completes TODO by determining audio parameters from the system in
OnDeviceSpeechRecognizer instead of hard-coding them.

Also moves a lot of work from AudioSourceFetcher constructor into
the Start function (do less in constructors).


Bug: 1173135
AX-Relnotes: n/a
Test: SpeechRecognitionServiceBrowsertest, new tests added, and tried
dictation on Chromebook with wifi disabled.
Change-Id: I165957699a4ed215dfbd5592e949f0e467c06361
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2802756
Commit-Queue: Katie Dektar <katie@chromium.org>
Reviewed-by: Yilkal Abe <yilkal@chromium.org>
Reviewed-by: Avi Drissman <avi@chromium.org>
Reviewed-by: Robert Sesek <rsesek@chromium.org>
Reviewed-by: Evan Liu <evliu@google.com>
Cr-Commit-Position: refs/heads/master@{#869322}
  • Loading branch information
Katie Dektar authored and Chromium LUCI CQ committed Apr 5, 2021
1 parent e1b9aeb commit 1ad8e7a
Show file tree
Hide file tree
Showing 14 changed files with 364 additions and 104 deletions.
1 change: 1 addition & 0 deletions chrome/browser/ash/accessibility/dictation_browsertest.cc
Expand Up @@ -140,6 +140,7 @@ class DictationTest : public InProcessBrowserTest,
// Wait for interaction on UI thread.
fake_speech_recognition_manager_->WaitForRecognitionStarted();
} else {
fake_service_->WaitForRecognitionStarted();
// Only one thread, use a RunLoop to ensure mojom messages are done.
base::RunLoop().RunUntilIdle();
}
Expand Down
5 changes: 2 additions & 3 deletions chrome/browser/speech/cros_speech_recognition_service.cc
Expand Up @@ -50,7 +50,6 @@ void CrosSpeechRecognitionService::BindRecognizer(
void CrosSpeechRecognitionService::BindAudioSourceFetcher(
mojo::PendingReceiver<media::mojom::AudioSourceFetcher> fetcher_receiver,
mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient> client,
mojo::PendingRemote<media::mojom::AudioStreamFactory> stream_factory,
BindRecognizerCallback callback) {
base::FilePath binary_path, languagepack_path;
PopulateFilePaths(binary_path, languagepack_path);
Expand All @@ -64,11 +63,11 @@ void CrosSpeechRecognitionService::BindAudioSourceFetcher(
FROM_HERE,
base::BindOnce(
&AudioSourceFetcherImpl::Create, std::move(fetcher_receiver),
std::move(stream_factory),
std::make_unique<CrosSpeechRecognitionRecognizerImpl>(
std::move(client), nullptr /* =SpeechRecognitionService WeakPtr*/,
binary_path, languagepack_path)));
std::move(callback).Run(true);
std::move(callback).Run(
CrosSpeechRecognitionRecognizerImpl::IsMultichannelSupported());
}

void CrosSpeechRecognitionService::PopulateFilePaths(
Expand Down
1 change: 0 additions & 1 deletion chrome/browser/speech/cros_speech_recognition_service.h
Expand Up @@ -41,7 +41,6 @@ class CrosSpeechRecognitionService
mojo::PendingReceiver<media::mojom::AudioSourceFetcher> fetcher_receiver,
mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient>
client,
mojo::PendingRemote<media::mojom::AudioStreamFactory> stream_factory,
BindRecognizerCallback callback) override;

private:
Expand Down
26 changes: 22 additions & 4 deletions chrome/browser/speech/fake_speech_recognition_service.cc
Expand Up @@ -27,27 +27,36 @@ void FakeSpeechRecognitionService::BindRecognizer(
recognizer_client_remote_.set_disconnect_handler(base::BindOnce(
&FakeSpeechRecognitionService::OnRecognizerClientDisconnected,
base::Unretained(this)));
std::move(callback).Run(true /* multichannel supported */);
std::move(callback).Run(is_multichannel_supported_);
}

// Binds the fake AudioSourceFetcher and recognizer client, then reports the
// configured multichannel capability back through |callback|.
// NOTE(review): the scraped source showed both the stale
// `std::move(callback).Run(true);` and its replacement — running a OnceCallback
// twice (the second time on a moved-from callback) is a bug; only the
// `is_multichannel_supported_` call is kept. The stream_factory parameter is
// likewise stale: this commit moves it to Start(), so it is dropped here to
// match the updated interface declaration.
void FakeSpeechRecognitionService::BindAudioSourceFetcher(
    mojo::PendingReceiver<media::mojom::AudioSourceFetcher> fetcher_receiver,
    mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient> client,
    BindRecognizerCallback callback) {
  fetcher_receiver_.Bind(std::move(fetcher_receiver));
  recognizer_client_remote_.Bind(std::move(client));
  // Reset this object's state if the client side goes away, so the fake can
  // be re-used across tests.
  recognizer_client_remote_.set_disconnect_handler(base::BindOnce(
      &FakeSpeechRecognitionService::OnRecognizerClientDisconnected,
      base::Unretained(this)));
  std::move(callback).Run(is_multichannel_supported_);
}

void FakeSpeechRecognitionService::Start() {
void FakeSpeechRecognitionService::Start(
mojo::PendingRemote<media::mojom::AudioStreamFactory> stream_factory,
const std::string& device_id,
const ::media::AudioParameters& audio_parameters) {
capturing_audio_ = true;
device_id_ = device_id;
audio_parameters_ = audio_parameters;
if (recognition_started_closure_) {
std::move(recognition_started_closure_).Run();
}
}
// Fake capture stop: clears every piece of state recorded by Start().
void FakeSpeechRecognitionService::Stop() {
  device_id_ = "";
  audio_parameters_ = base::nullopt;
  capturing_audio_ = false;
}

void FakeSpeechRecognitionService::SendAudioToSpeechRecognitionService(
Expand All @@ -68,11 +77,20 @@ void FakeSpeechRecognitionService::SendSpeechRecognitionError() {
recognizer_client_remote_->OnSpeechRecognitionError();
}

void FakeSpeechRecognitionService::WaitForRecognitionStarted() {
base::RunLoop runner;
recognition_started_closure_ = runner.QuitClosure();
runner.Run();
}

// Disconnect handler: reset everything in case the fake will be re-used.
void FakeSpeechRecognitionService::OnRecognizerClientDisconnected() {
  // Drop all mojo endpoints first, then clear the recorded capture state.
  recognizer_client_remote_.reset();
  recognizer_receiver_.reset();
  fetcher_receiver_.reset();
  device_id_ = "";
  audio_parameters_ = base::nullopt;
  capturing_audio_ = false;
}

} // namespace speech
29 changes: 27 additions & 2 deletions chrome/browser/speech/fake_speech_recognition_service.h
Expand Up @@ -6,6 +6,7 @@
#define CHROME_BROWSER_SPEECH_FAKE_SPEECH_RECOGNITION_SERVICE_H_

#include "chrome/browser/speech/chrome_speech_recognition_service.h"
#include "media/base/audio_parameters.h"
#include "media/mojo/mojom/speech_recognition_service.mojom.h"
#include "mojo/public/cpp/bindings/receiver_set.h"
#include "mojo/public/cpp/bindings/remote.h"
Expand Down Expand Up @@ -40,11 +41,13 @@ class FakeSpeechRecognitionService
mojo::PendingReceiver<media::mojom::AudioSourceFetcher> fetcher_receiver,
mojo::PendingRemote<media::mojom::SpeechRecognitionRecognizerClient>
client,
mojo::PendingRemote<media::mojom::AudioStreamFactory> stream_factory,
BindRecognizerCallback callback) override;

// media::mojom::AudioSourceFetcher:
void Start() override;
void Start(
mojo::PendingRemote<media::mojom::AudioStreamFactory> stream_factory,
const std::string& device_id,
const ::media::AudioParameters& audio_parameters) override;
void Stop() override;

// media::mojom::SpeechRecognitionRecognizer:
Expand All @@ -58,18 +61,40 @@ class FakeSpeechRecognitionService
media::mojom::SpeechRecognitionResultPtr result);
void SendSpeechRecognitionError();

void WaitForRecognitionStarted();

// Whether AudioSourceFetcher is capturing audio.
bool is_capturing_audio() { return capturing_audio_; }

// Whether SendAudioToSpeechRecognitionService has been called.
bool has_received_audio() { return has_received_audio_; }

std::string device_id() { return device_id_; }

const base::Optional<::media::AudioParameters>& audio_parameters() {
return audio_parameters_;
}

void set_multichannel_supported(bool is_multichannel_supported) {
is_multichannel_supported_ = is_multichannel_supported;
}

private:
void OnRecognizerClientDisconnected();

// Whether multichannel audio is supported.
bool is_multichannel_supported_ = false;
// Whether the AudioSourceFetcher has been started.
bool capturing_audio_ = false;
// Whether any audio has been sent to the SpeechRecognitionRecognizer.
bool has_received_audio_ = false;
// The device ID used to capture audio.
std::string device_id_;
// The audio parameters used to capture audio.
base::Optional<::media::AudioParameters> audio_parameters_;

base::OnceClosure recognition_started_closure_;

mojo::Remote<media::mojom::SpeechRecognitionRecognizerClient>
recognizer_client_remote_;

Expand Down
96 changes: 80 additions & 16 deletions chrome/browser/speech/on_device_speech_recognizer.cc
Expand Up @@ -4,6 +4,8 @@

#include "chrome/browser/speech/on_device_speech_recognizer.h"

#include <algorithm>

#include "base/strings/utf_string_conversions.h"
#include "chrome/browser/accessibility/soda_installer.h"
#include "chrome/browser/profiles/profile.h"
Expand All @@ -12,9 +14,49 @@
#include "chrome/browser/speech/speech_recognizer_delegate.h"
#include "content/public/browser/audio_service.h"
#include "content/public/browser/browser_thread.h"
#include "media/audio/audio_system.h"
#include "media/base/audio_parameters.h"
#include "media/base/bind_to_current_loop.h"
#include "media/base/media_switches.h"

namespace {

// Sample rate used by content::SpeechRecognizerImpl, which is used
// by NetworkSpeechRecognizer.
constexpr int kAudioSampleRate = 16000;

// Target number of audio callbacks per second. Web speech recognition polls
// roughly 10 times per second, so we use that conservative figure here; it
// can be raised if recognition seems laggy.
constexpr int kPollingTimesPerSecond = 10;

// Builds the capture parameters for the AudioSourceFetcher. When the device
// reported parameters (|params| non-empty), those are used, with the buffer
// sized for our polling rate and the layout forced to mono unless the
// recognizer supports multichannel audio. Otherwise falls back to fixed
// defaults.
media::AudioParameters GetAudioParameters(
    const base::Optional<media::AudioParameters>& params,
    bool is_multichannel_supported) {
  if (!params) {
    static_assert(kAudioSampleRate % 100 == 0,
                  "Audio sample rate is not divisible by 100");
    return media::AudioParameters(
        media::AudioParameters::AUDIO_PCM_LOW_LATENCY,
        is_multichannel_supported ? media::CHANNEL_LAYOUT_STEREO
                                  : media::CHANNEL_LAYOUT_MONO,
        kAudioSampleRate, kAudioSampleRate / kPollingTimesPerSecond);
  }

  const int sample_rate = params->sample_rate();
  // Never use a buffer smaller than one polling interval's worth of frames.
  const int frames_per_buffer = std::max(
      params->frames_per_buffer(), sample_rate / kPollingTimesPerSecond);
  const media::ChannelLayout channel_layout =
      is_multichannel_supported ? params->channel_layout()
                                : media::CHANNEL_LAYOUT_MONO;
  media::AudioParameters result = params.value();
  result.Reset(params->format(), channel_layout, sample_rate,
               frames_per_buffer);
  return result;
}

}  // namespace

bool OnDeviceSpeechRecognizer::IsOnDeviceSpeechRecognizerAvailable() {
// IsSodaInstalled will DCHECK if kUseSodaForLiveCaption is disabled.
// kUseSodaForLiveCaption is used to track SODA availability on-device.
Expand All @@ -26,26 +68,18 @@ OnDeviceSpeechRecognizer::OnDeviceSpeechRecognizer(
const base::WeakPtr<SpeechRecognizerDelegate>& delegate,
Profile* profile)
: SpeechRecognizer(delegate),
state_(SpeechRecognizerStatus::SPEECH_RECOGNIZER_OFF) {
state_(SpeechRecognizerStatus::SPEECH_RECOGNIZER_OFF),
is_multichannel_supported_(false),
waiting_for_params_(false) {
DCHECK_CURRENTLY_ON(content::BrowserThread::UI);

// Connect the SpeechRecognitionContext.
mojo::PendingReceiver<media::mojom::SpeechRecognitionContext>
speech_recognition_context_receiver =
speech_recognition_context_.BindNewPipeAndPassReceiver();

// Bind to an AudioSourceFetcher in the Speech Recognition service,
// passing the stream factory so it can listen to mic audio.
// TODO(crbug.com/1173135): Get input stream parameters from
// content::CreateAudioSystemForAudioService() if possible, and pass this
// and device_id to the AudioSourceFetcher in BindAudioSourceFetcher().
mojo::PendingRemote<media::mojom::AudioStreamFactory> stream_factory;
content::GetAudioServiceStreamFactoryBinder().Run(
stream_factory.InitWithNewPipeAndPassReceiver());
speech_recognition_context_->BindAudioSourceFetcher(
audio_source_fetcher_.BindNewPipeAndPassReceiver(),
speech_recognition_client_receiver_.BindNewPipeAndPassRemote(),
std::move(stream_factory),
media::BindToCurrentLoop(
base::BindOnce(&OnDeviceSpeechRecognizer::OnRecognizerBound,
weak_factory_.GetWeakPtr())));
Expand All @@ -66,8 +100,15 @@ OnDeviceSpeechRecognizer::~OnDeviceSpeechRecognizer() {
}

// Begins a recognition session. Capture does not start here: this queries the
// AudioSystem for the default input device's parameters, and the async reply
// (StartFetchingOnInputDeviceInfo) starts the AudioSourceFetcher.
void OnDeviceSpeechRecognizer::Start() {
  // Get audio parameters from the AudioSystem, and use these to start
  // recognition from the callback.
  // Lazily created so tests may inject their own audio_system_ beforehand.
  if (!audio_system_)
    audio_system_ = content::CreateAudioSystemForAudioService();
  // Record that the parameter lookup is in flight; UpdateStatus() clears this
  // flag so a stale reply does not start capture after Stop()/error.
  waiting_for_params_ = true;
  audio_system_->GetInputStreamParameters(
      media::AudioDeviceDescription::kDefaultDeviceId,
      base::BindOnce(&OnDeviceSpeechRecognizer::StartFetchingOnInputDeviceInfo,
                     weak_factory_.GetWeakPtr()));
}

void OnDeviceSpeechRecognizer::Stop() {
Expand All @@ -94,17 +135,40 @@ void OnDeviceSpeechRecognizer::OnLanguageIdentificationEvent(
// Do nothing.
}

void OnDeviceSpeechRecognizer::OnRecognizerBound(bool success) {
if (success)
UpdateStatus(SpeechRecognizerStatus::SPEECH_RECOGNIZER_READY);
// Callback for BindAudioSourceFetcher. Records whether the recognizer
// supports multichannel audio (consulted when choosing a channel layout in
// GetAudioParameters) and reports that the recognizer is ready.
void OnDeviceSpeechRecognizer::OnRecognizerBound(
    bool is_multichannel_supported) {
  is_multichannel_supported_ = is_multichannel_supported;
  UpdateStatus(SpeechRecognizerStatus::SPEECH_RECOGNIZER_READY);
}

// Mojo disconnect handler: surface the lost connection to the delegate as a
// recognizer error.
void OnDeviceSpeechRecognizer::OnRecognizerDisconnected() {
  UpdateStatus(SpeechRecognizerStatus::SPEECH_RECOGNIZER_ERROR);
}

// Async reply to AudioSystem::GetInputStreamParameters, requested in Start().
// |params| may be empty (e.g. no input device info available); in that case
// GetAudioParameters falls back to default parameters. Binds a stream factory
// and starts the AudioSourceFetcher capturing from the default device.
void OnDeviceSpeechRecognizer::StartFetchingOnInputDeviceInfo(
    const base::Optional<media::AudioParameters>& params) {
  // waiting_for_params_ was set before requesting audio params from the
  // AudioSystem, which returns here asynchronously. If this has changed, then
  // we shouldn't start up any more.
  if (!waiting_for_params_)
    return;
  waiting_for_params_ = false;

  // Bind to an AudioSourceFetcher in the Speech Recognition service,
  // passing the stream factory so it can listen to mic audio.
  mojo::PendingRemote<media::mojom::AudioStreamFactory> stream_factory;
  content::GetAudioServiceStreamFactoryBinder().Run(
      stream_factory.InitWithNewPipeAndPassReceiver());
  audio_source_fetcher_->Start(
      std::move(stream_factory),
      media::AudioDeviceDescription::kDefaultDeviceId,
      GetAudioParameters(params, is_multichannel_supported_));
  UpdateStatus(SpeechRecognizerStatus::SPEECH_RECOGNIZER_RECOGNIZING);
}

void OnDeviceSpeechRecognizer::UpdateStatus(SpeechRecognizerStatus state) {
DCHECK_CURRENTLY_ON(content::BrowserThread::UI);
waiting_for_params_ = false;
if (state_ == state)
return;
delegate()->OnSpeechRecognitionStateChanged(state);
Expand Down
18 changes: 18 additions & 0 deletions chrome/browser/speech/on_device_speech_recognizer.h
Expand Up @@ -15,6 +15,10 @@
class Profile;
class SpeechRecognizerDelegate;

namespace media {
class AudioSystem;
} // namespace media

// OnDeviceSpeechRecognizer is a wrapper around the on-device speech recognition
// engine that simplifies its use from the browser process.
class OnDeviceSpeechRecognizer
Expand Down Expand Up @@ -49,14 +53,28 @@ class OnDeviceSpeechRecognizer
media::mojom::LanguageIdentificationEventPtr event) override;

private:
friend class OnDeviceSpeechRecognizerBrowsertest;

void OnRecognizerBound(bool success);
void OnRecognizerDisconnected();
void StartFetchingOnInputDeviceInfo(
const base::Optional<media::AudioParameters>& params);

// Helper function to send the delegate updates to SpeechRecognizerStatus
// only when the status has changed.
void UpdateStatus(SpeechRecognizerStatus state);

SpeechRecognizerStatus state_;
bool is_multichannel_supported_;

// Whether we are waiting for the AudioParameters callback to return. Used
// to ensure Start doesn't keep starting if Stop or Error were called
// in between requesting the callback and it running.
bool waiting_for_params_;

// Tests may set audio_system_ after constructing an OnDeviceSpeechRecognizer
// to override default behavior.
std::unique_ptr<media::AudioSystem> audio_system_;

mojo::Remote<media::mojom::SpeechRecognitionContext>
speech_recognition_context_;
Expand Down

0 comments on commit 1ad8e7a

Please sign in to comment.