Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion app/jobs/scheduled/automatic_translation_backfill.rb
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def process_batch
backfill_locales.each_with_index do |target_locale, i|
topic_ids =
fetch_untranslated_model_ids(Topic, "title", records_to_translate, target_locale)
post_ids = fetch_untranslated_model_ids(Post, "cooked", records_to_translate, target_locale)
post_ids = fetch_untranslated_model_ids(Post, "raw", records_to_translate, target_locale)

# if we end up translating fewer records than records_to_translate,
# add to the value so that the next locales can have more quota
Expand Down
1 change: 0 additions & 1 deletion app/models/concerns/discourse_translator/translatable.rb
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ def set_detected_locale(locale)
# @param text [String] the translated text
def set_translation(locale, text)
locale = locale.to_s.gsub("_", "-")
text = DiscourseTranslator::TranslatedContentSanitizer.sanitize(text)
translations.find_or_initialize_by(locale: locale).update!(translation: text)
end

Expand Down
16 changes: 8 additions & 8 deletions app/services/discourse_ai/language_detector.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@
module DiscourseAi
class LanguageDetector
PROMPT_TEXT = <<~TEXT
I want you to act as a language expert, determining the locale for a set of text.
The locale is a language identifier, such as "en" for English, "de" for German, etc,
and can also include a region identifier, such as "en-GB" for British English, or "zh-Hans" for Simplified Chinese.
I will provide you with text, and you will determine the locale of the text.
Include your locale between <language></language> XML tags.
You are a language expert and will determine the locale for user-written content.
- the locale is a language identifier, such as "en" for English, "de" for German, or "zh-CN" for Simplified Chinese, etc.
- use the vocabulary and grammar of content to determine the locale
- do not use links or code to determine the locale
- do not write explanations
- only return the locale
TEXT

def initialize(text)
Expand All @@ -21,14 +22,13 @@ def detect
messages: [{ type: :user, content: @text, id: "user" }],
)

response =
locale =
DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_helper_model).generate(
prompt,
user: Discourse.system_user,
feature_name: "translator-language-detect",
)

(Nokogiri::HTML5.fragment(response).at("language")&.text || response)
locale&.strip
end
end
end
30 changes: 14 additions & 16 deletions app/services/discourse_ai/translator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@
module DiscourseAi
class Translator
PROMPT_TEMPLATE = <<~TEXT.freeze
You are a highly skilled linguist of many languages and have expert knowledge in HTML.
Your task is to identify the language of the text I provide and accurately translate it into this language locale "%{target_language}" while preserving the meaning, tone, and nuance of the original text.
The text may or may not contain html tags. If they do, preserve them.
Maintain proper grammar, spelling, and punctuation in the translated version.
You will find the text between <input></input> XML tags.
Include your translation between <output></output> XML tags.
Do not write explanations.
You are an expert translator specializing in converting Markdown content from any source language to target locale "%{target_language}". Your task is to:
1. Translate the content accurately while preserving all Markdown formatting elements
2. Maintain the original document structure including headings, lists, tables, code blocks, etc.
3. Preserve all links, images, and other media references without translation
4. Handle code snippets appropriately - don't translate variable names, functions, or syntax within code blocks (```), but translate comments
5. When encountering technical terminology, provide the accepted target language term if it exists, or transliterate if no equivalent exists, with the original term in parentheses
6. For ambiguous terms or phrases, choose the most contextually appropriate translation
7. You are being consumed via an API, only EVER return the translated text, do not return any other information
TEXT

def initialize(text, target_language)
Expand All @@ -21,17 +22,14 @@ def translate
prompt =
DiscourseAi::Completions::Prompt.new(
build_prompt(@target_language),
messages: [{ type: :user, content: "<input>#{@text}</input>", id: "user" }],
messages: [{ type: :user, content: "#{@text}", id: "user" }],
)

llm_translation =
DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_helper_model).generate(
prompt,
user: Discourse.system_user,
feature_name: "translator-translate",
)

(Nokogiri::HTML5.fragment(llm_translation).at("output")&.inner_html || "").strip
DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_helper_model).generate(
prompt,
user: Discourse.system_user,
feature_name: "translator-translate",
)
end

private
Expand Down
30 changes: 16 additions & 14 deletions app/services/discourse_translator/base.rb
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def self.translate(translatable, target_locale_sym = I18n.locale)
detected_lang = detect(translatable)

if translatable.locale_matches?(target_locale_sym)
return detected_lang, get_untranslated(translatable)
return detected_lang, get_untranslated_cooked(translatable)
end

translation = translatable.translation_for(target_locale_sym)
Expand All @@ -50,7 +50,9 @@ def self.translate(translatable, target_locale_sym = I18n.locale)
end

translated = translate!(translatable, target_locale_sym)
save_translation(translatable, target_locale_sym) { translated }
save_translation(translatable, target_locale_sym) do
TranslatedContentNormalizer.normalize(translatable, translated)
end
[detected_lang, translated]
end

Expand Down Expand Up @@ -122,25 +124,25 @@ def self.translate_supported?(detected_lang, target_lang)

private

def self.strip_tags_for_detection(detection_text)
html_doc = Nokogiri::HTML::DocumentFragment.parse(detection_text)
html_doc.css("img", "aside.quote", "div.lightbox-wrapper", "a.mention,a.lightbox").remove
html_doc.to_html
end

def self.text_for_detection(translatable)
strip_tags_for_detection(get_untranslated(translatable)).truncate(
DETECTION_CHAR_LIMIT,
omission: nil,
)
get_untranslated_raw(translatable).truncate(DETECTION_CHAR_LIMIT, omission: nil)
end

def self.text_for_translation(translatable)
max_char = SiteSetting.max_characters_per_translation
get_untranslated(translatable).truncate(max_char, omission: nil)
get_untranslated_raw(translatable).truncate(max_char, omission: nil)
end

# Returns the unprocessed source text of a translatable record.
#
# The record's class name selects the attribute: `raw` for "Post",
# `title` for "Topic". Any other class name yields nil.
#
# @param translatable [Object] record to read from
# @return [Object, nil] the selected attribute value, or nil
def self.get_untranslated_raw(translatable)
  model_name = translatable.class.name
  return translatable.raw if model_name == "Post"
  return translatable.title if model_name == "Topic"

  nil
end

def self.get_untranslated(translatable)
def self.get_untranslated_cooked(translatable)
case translatable.class.name
when "Post"
translatable.cooked
Expand Down
14 changes: 14 additions & 0 deletions lib/discourse_translator/translated_content_normalizer.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# frozen_string_literal: true

module DiscourseTranslator
  # Converts translated text into the form that gets stored for a record.
  class TranslatedContentNormalizer
    # Picks the PrettyText routine based on the record's class name.
    #
    # @param translatable [Object] record the translation belongs to
    # @param content [String] translated text to normalize
    # @return [Object, nil] result of PrettyText.cook for "Post",
    #   of PrettyText.cleanup for "Topic", nil for any other class name
    def self.normalize(translatable, content)
      model_name = translatable.class.name

      if model_name == "Post"
        PrettyText.cook(content)
      elsif model_name == "Topic"
        PrettyText.cleanup(content, {})
      end
    end
  end
end
9 changes: 0 additions & 9 deletions lib/discourse_translator/translated_content_sanitizer.rb

This file was deleted.

4 changes: 3 additions & 1 deletion lib/discourse_translator/translator_selection_validator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@ def valid_value?(val)
def error_message
return I18n.t("translator.discourse_ai.not_installed") if !defined?(::DiscourseAi)

I18n.t("translator.discourse_ai.ai_helper_required") if !SiteSetting.ai_helper_enabled
if !SiteSetting.ai_helper_enabled
I18n.t("translator.discourse_ai.ai_helper_required", { base_url: Discourse.base_url })
end
end
end
end
14 changes: 7 additions & 7 deletions spec/jobs/automatic_translation_backfill_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def expect_google_translate(text)
described_class.new.execute

expect(topic.translations.pluck(:locale, :translation)).to eq([%w[es hola]])
expect(post.translations.pluck(:locale, :translation)).to eq([%w[de hallo]])
expect(post.translations.pluck(:locale, :translation)).to eq([%w[de <p>hallo</p>]])
end
end

Expand All @@ -126,7 +126,7 @@ def expect_google_translate(text)

expect(topic.translations.pluck(:locale, :translation)).to eq([%w[de hallo]])
expect(posts.map { |p| p.translations.pluck(:locale, :translation).flatten }).to eq(
[%w[de hallo]] * 4,
[%w[de <p>hallo</p>]] * 4,
)
end
end
Expand Down Expand Up @@ -176,27 +176,27 @@ def expect_google_translate(text)
post_1.update!(updated_at: 2.days.ago)
post_2.update!(updated_at: 3.days.ago)

result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, "de")
result = described_class.new.fetch_untranslated_model_ids(Post, "raw", 50, "de")
expect(result).to include(post_6.id, post_1.id, post_2.id)
end

it "does not return posts that are deleted" do
post_1.trash!
result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, "de")
result = described_class.new.fetch_untranslated_model_ids(Post, "raw", 50, "de")
expect(result).not_to include(post_1.id)
end

it "does not return posts that are empty" do
post_1.cooked = ""
post_1.raw = ""
post_1.save!(validate: false)
result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, "de")
result = described_class.new.fetch_untranslated_model_ids(Post, "raw", 50, "de")
expect(result).not_to include(post_1.id)
end

it "does not return posts by bots" do
post_1.update(user: Discourse.system_user)

result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, "de")
result = described_class.new.fetch_untranslated_model_ids(Post, "raw", 50, "de")

expect(result).not_to include(post_1.id)
end
Expand Down
22 changes: 22 additions & 0 deletions spec/lib/translated_content_normalizer_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# frozen_string_literal: true

describe DiscourseTranslator::TranslatedContentNormalizer do
  fab!(:post)
  fab!(:topic)

  # Input carries a script tag and a heading nested inside a paragraph.
  let(:markup) { "<script>alert('test')</script><p> <h1>Testing</h1> This is a test post</p>" }
  # Expected output has no script tag and the heading hoisted out of the paragraph.
  let(:normalized) { "<p> </p><h1>Testing</h1> This is a test post<p></p>" }

  it "normalizes the content" do
    # Same expectation for the post first, then the topic.
    [post, topic].each do |translatable|
      expect(described_class.normalize(translatable, markup)).to eq(normalized)
    end
  end
end
12 changes: 0 additions & 12 deletions spec/lib/translated_content_sanitizer_spec.rb

This file was deleted.

2 changes: 1 addition & 1 deletion spec/lib/translator_selection_validator_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@
it "returns the ai_helper_required error message" do
SiteSetting.ai_helper_enabled = false
expect(described_class.new.error_message).to eq(
I18n.t("translator.discourse_ai.ai_helper_required"),
I18n.t("translator.discourse_ai.ai_helper_required", { base_url: Discourse.base_url }),
)
end
end
Expand Down
6 changes: 3 additions & 3 deletions spec/services/amazon_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
describe ".detect" do
let(:post) { Fabricate(:post) }
let!(:client) { Aws::Translate::Client.new(stub_responses: true) }
let(:text) { described_class.truncate(post.cooked) }
let(:text) { described_class.truncate(post.raw) }
let(:detected_lang) { "en" }

before do
Expand All @@ -40,8 +40,8 @@
expect(post.detected_locale).to eq(detected_lang)
end

it "should fail graciously when the cooked translated text is blank" do
post.cooked = ""
it "should fail graciously when the raw translated text is blank" do
post.raw = ""
expect(described_class.detect(post)).to be_nil
end
end
Expand Down
Loading
Loading