Skip to content

Commit

Permalink
FIX: Amazon text size is 10k bytes (#137)
Browse files Browse the repository at this point in the history
The Amazon Translate API accepts text of up to 10k bytes.

This replaces the `MAXLENGTH` constant with `MAX_BYTES` and uses a `truncate` method that works at the byte level instead of the character/grapheme level.

Ref - https://meta.discourse.org/t/increase-amazon-translate-limit-to-10-000-characters/304579
  • Loading branch information
ZogStriP committed Apr 19, 2024
1 parent 37ae6ef commit 60ab557
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 11 deletions.
14 changes: 11 additions & 3 deletions services/discourse_translator/amazon.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ module DiscourseTranslator
class Amazon < Base
require "aws-sdk-translate"

MAXLENGTH = 5000
MAX_BYTES = 10_000

# Hash which maps Discourse's locale code to Amazon Translate's language code found in
# https://docs.aws.amazon.com/translate/latest/dg/what-is-languages.html
Expand Down Expand Up @@ -88,12 +88,20 @@ class Amazon < Base
cy: "cy",
}

# Shortens +text+ so it fits the API limit, which is expressed in
# __bytes__ (MAX_BYTES) rather than characters.
#
# @param text [String, nil] the text about to be sent to the API
# @return [String, nil] +text+ itself when it is nil or already within
#   MAX_BYTES; otherwise a new string of at most MAX_BYTES bytes that is
#   valid in its encoding
def self.truncate(text)
  # Nothing to cut when there is no text at all or it already fits; passing
  # nil through lets callers run their own blank check afterwards.
  return text if text.nil? || text.bytesize <= MAX_BYTES

  # A byte-level cut can land in the middle of a multi-byte character, so
  # drop trailing bytes one at a time until the string is valid again.
  truncated = text.byteslice(0, MAX_BYTES)
  truncated = truncated.byteslice(0, truncated.bytesize - 1) until truncated.valid_encoding?
  truncated
end

# Returns the fixed string "aws-translator".
# NOTE(review): presumably the key under which this translator's access
# token is stored or looked up — confirm against the callers.
def self.access_token_key
"aws-translator"
end

def self.detect(topic_or_post)
text = get_text(topic_or_post).truncate(MAXLENGTH, omission: nil)
text = truncate get_text(topic_or_post)

return if text.blank?

Expand All @@ -114,7 +122,7 @@ def self.translate(topic_or_post)
result =
client.translate_text(
{
text: get_text(topic_or_post).truncate(MAXLENGTH, omission: nil),
text: truncate(get_text(topic_or_post)),
source_language_code: "auto",
target_language_code: SUPPORTED_LANG_MAPPING[I18n.locale],
},
Expand Down
2 changes: 1 addition & 1 deletion services/discourse_translator/base.rb
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def self.from_custom_fields(topic_or_post)
def self.get_text(topic_or_post)
case topic_or_post.class.name
when "Post"
text = topic_or_post.cooked
topic_or_post.cooked
when "Topic"
topic_or_post.title
end
Expand Down
19 changes: 12 additions & 7 deletions spec/services/amazon_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,21 @@
RSpec.describe DiscourseTranslator::Amazon do
let(:mock_response) { Struct.new(:status, :body) }

describe ".truncate" do
  it "truncates text to 10000 bytes" do
    # Every character of "こんにちは" takes 3 bytes in UTF-8, so the text is
    # far larger than MAX_BYTES and a cut at exactly MAX_BYTES bytes lands
    # in the middle of a character.
    text = "こんにちは" * (described_class::MAX_BYTES / 5)
    truncated = described_class.truncate(text)

    expect(truncated.bytesize).to be <= described_class::MAX_BYTES
    expect(truncated.valid_encoding?).to eq(true)
    # The result must be a leading portion of the input, not arbitrary text.
    expect(text).to start_with(truncated)
    # 10_000 bytes hold 3_333 whole characters plus one stray byte; the
    # 3_333rd character of the repeated pattern is "に".
    expect(truncated[-1]).to eq "に"
  end

  it "returns text within the limit unchanged" do
    text = "a" * described_class::MAX_BYTES

    expect(described_class.truncate(text)).to eq(text)
  end
end

describe ".detect" do
let(:post) { Fabricate(:post) }
let!(:client) { Aws::Translate::Client.new(stub_responses: true) }
let(:text) { post.cooked.truncate(described_class::MAXLENGTH, omission: nil) }
let(:text) { described_class.truncate(post.cooked) }
let(:detected_lang) { "en" }

before do
Expand All @@ -33,12 +44,6 @@
end
end

it "should truncate string to 5000 characters and still process the request" do
length = 6000
post.cooked = rand(36**length).to_s(36)
expect(described_class.detect(post)).to eq(detected_lang)
end

it "should fail graciously when the cooked translated text is blank" do
post.cooked = ""
expect(described_class.detect(post)).to be_nil
Expand Down

0 comments on commit 60ab557

Please sign in to comment.