diff --git a/services/discourse_translator/amazon.rb b/services/discourse_translator/amazon.rb index 9afac2f..ac30278 100644 --- a/services/discourse_translator/amazon.rb +++ b/services/discourse_translator/amazon.rb @@ -6,7 +6,7 @@ module DiscourseTranslator class Amazon < Base require "aws-sdk-translate" - MAXLENGTH = 5000 + MAX_BYTES = 10_000 # Hash which maps Discourse's locale code to Amazon Translate's language code found in # https://docs.aws.amazon.com/translate/latest/dg/what-is-languages.html @@ -88,12 +88,20 @@ class Amazon < Base cy: "cy", } + # The API accepts at most 10k __bytes__ (not characters) of text: return text that fits unchanged, otherwise cut it to MAX_BYTES bytes and then drop trailing bytes one at a time until the encoding is valid again, so a character split by the cut is removed whole + def self.truncate(text) + return text if text.bytesize <= MAX_BYTES + text = text.byteslice(...MAX_BYTES) + text = text.byteslice(...text.bytesize - 1) until text.valid_encoding? + text + end + def self.access_token_key "aws-translator" end def self.detect(topic_or_post) - text = get_text(topic_or_post).truncate(MAXLENGTH, omission: nil) + text = truncate get_text(topic_or_post) return if text.blank? 
@@ -114,7 +122,7 @@ def self.translate(topic_or_post) result = client.translate_text( { - text: get_text(topic_or_post).truncate(MAXLENGTH, omission: nil), + text: truncate(get_text(topic_or_post)), source_language_code: "auto", target_language_code: SUPPORTED_LANG_MAPPING[I18n.locale], }, diff --git a/services/discourse_translator/base.rb b/services/discourse_translator/base.rb index b14eee8..0143329 100644 --- a/services/discourse_translator/base.rb +++ b/services/discourse_translator/base.rb @@ -52,7 +52,7 @@ def self.from_custom_fields(topic_or_post) def self.get_text(topic_or_post) case topic_or_post.class.name when "Post" - text = topic_or_post.cooked + topic_or_post.cooked when "Topic" topic_or_post.title end diff --git a/spec/services/amazon_spec.rb b/spec/services/amazon_spec.rb index b7b1efa..cba7ff0 100644 --- a/spec/services/amazon_spec.rb +++ b/spec/services/amazon_spec.rb @@ -5,10 +5,21 @@ RSpec.describe DiscourseTranslator::Amazon do let(:mock_response) { Struct.new(:status, :body) } + describe ".truncate" do + it "truncates text to 10000 bytes" do + text = "こんにちは" * (described_class::MAX_BYTES / 5) + truncated = described_class.truncate(text) + + expect(truncated.bytesize).to be <= described_class::MAX_BYTES + expect(truncated.valid_encoding?).to eq(true) + expect(truncated[-1]).to eq "に" + end + end + describe ".detect" do let(:post) { Fabricate(:post) } let!(:client) { Aws::Translate::Client.new(stub_responses: true) } - let(:text) { post.cooked.truncate(described_class::MAXLENGTH, omission: nil) } + let(:text) { described_class.truncate(post.cooked) } let(:detected_lang) { "en" } before do @@ -33,12 +44,6 @@ end end - it "should truncate string to 5000 characters and still process the request" do - length = 6000 - post.cooked = rand(36**length).to_s(36) - expect(described_class.detect(post)).to eq(detected_lang) - end - it "should fail graciously when the cooked translated text is blank" do post.cooked = "" expect(described_class.detect(post)).to 
be_nil