diff --git a/app/jobs/scheduled/automatic_translation_backfill.rb b/app/jobs/scheduled/automatic_translation_backfill.rb index d1d6d51f..eccd46a9 100644 --- a/app/jobs/scheduled/automatic_translation_backfill.rb +++ b/app/jobs/scheduled/automatic_translation_backfill.rb @@ -101,7 +101,7 @@ def process_batch backfill_locales.each_with_index do |target_locale, i| topic_ids = fetch_untranslated_model_ids(Topic, "title", records_to_translate, target_locale) - post_ids = fetch_untranslated_model_ids(Post, "cooked", records_to_translate, target_locale) + post_ids = fetch_untranslated_model_ids(Post, "raw", records_to_translate, target_locale) # if we end up translating fewer records than records_to_translate, # add to the value so that the next locales can have more quota diff --git a/app/models/concerns/discourse_translator/translatable.rb b/app/models/concerns/discourse_translator/translatable.rb index 1a2dbaf1..299bcd96 100644 --- a/app/models/concerns/discourse_translator/translatable.rb +++ b/app/models/concerns/discourse_translator/translatable.rb @@ -24,7 +24,6 @@ def set_detected_locale(locale) # @param text [String] the translated text def set_translation(locale, text) locale = locale.to_s.gsub("_", "-") - text = DiscourseTranslator::TranslatedContentSanitizer.sanitize(text) translations.find_or_initialize_by(locale: locale).update!(translation: text) end diff --git a/app/services/discourse_ai/language_detector.rb b/app/services/discourse_ai/language_detector.rb index 6357fd0c..ac558910 100644 --- a/app/services/discourse_ai/language_detector.rb +++ b/app/services/discourse_ai/language_detector.rb @@ -3,11 +3,12 @@ module DiscourseAi class LanguageDetector PROMPT_TEXT = <<~TEXT - I want you to act as a language expert, determining the locale for a set of text. - The locale is a language identifier, such as "en" for English, "de" for German, etc, - and can also include a region identifier, such as "en-GB" for British English, or "zh-Hans" for Simplified Chinese. 
- I will provide you with text, and you will determine the locale of the text. - Include your locale between <language></language> XML tags. + You are a language expert and will determine the locale for user-written content. + - the locale is a language identifier, such as "en" for English, "de" for German, or "zh-CN" for Simplified Chinese, etc. + - use the vocabulary and grammar of content to determine the locale + - do not use links or code to determine the locale + - do not write explanations + - only return the locale TEXT def initialize(text) @@ -21,14 +22,13 @@ def detect messages: [{ type: :user, content: @text, id: "user" }], ) - response = + locale = DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_helper_model).generate( prompt, user: Discourse.system_user, feature_name: "translator-language-detect", ) - - (Nokogiri::HTML5.fragment(response).at("language")&.text || response) + locale&.strip end end end diff --git a/app/services/discourse_ai/translator.rb b/app/services/discourse_ai/translator.rb index 53a2051b..d3f181ee 100644 --- a/app/services/discourse_ai/translator.rb +++ b/app/services/discourse_ai/translator.rb @@ -3,13 +3,14 @@ module DiscourseAi class Translator PROMPT_TEMPLATE = <<~TEXT.freeze - You are a highly skilled linguist of many languages and have expert knowledge in HTML. - Your task is to identify the language of the text I provide and accurately translate it into this language locale "%{target_language}" while preserving the meaning, tone, and nuance of the original text. - The text may or may not contain html tags. If they do, preserve them. - Maintain proper grammar, spelling, and punctuation in the translated version. - You will find the text between <input></input> XML tags. - Include your translation between <output></output> XML tags. - Do not write explanations. + You are an expert translator specializing in converting Markdown content from any source language to target locale "%{target_language}". Your task is to: + 1. 
Translate the content accurately while preserving all Markdown formatting elements + 2. Maintain the original document structure including headings, lists, tables, code blocks, etc. + 3. Preserve all links, images, and other media references without translation + 4. Handle code snippets appropriately - don't translate variable names, functions, or syntax within code blocks (```), but translate comments + 5. When encountering technical terminology, provide the accepted target language term if it exists, or transliterate if no equivalent exists, with the original term in parentheses + 6. For ambiguous terms or phrases, choose the most contextually appropriate translation + 7. You are being consumed via an API, only EVER return the translated text, do not return any other information TEXT def initialize(text, target_language) @@ -21,17 +22,14 @@ def translate prompt = DiscourseAi::Completions::Prompt.new( build_prompt(@target_language), - messages: [{ type: :user, content: "<input>#{@text}</input>", id: "user" }], + messages: [{ type: :user, content: "#{@text}", id: "user" }], ) - llm_translation = - DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_helper_model).generate( - prompt, - user: Discourse.system_user, - feature_name: "translator-translate", - ) - - (Nokogiri::HTML5.fragment(llm_translation).at("output")&.inner_html || "").strip + DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_helper_model).generate( + prompt, + user: Discourse.system_user, + feature_name: "translator-translate", + ) end private diff --git a/app/services/discourse_translator/base.rb b/app/services/discourse_translator/base.rb index eac1d435..3ab0595d 100644 --- a/app/services/discourse_translator/base.rb +++ b/app/services/discourse_translator/base.rb @@ -33,7 +33,7 @@ def self.translate(translatable, target_locale_sym = I18n.locale) detected_lang = detect(translatable) if translatable.locale_matches?(target_locale_sym) - return detected_lang, get_untranslated(translatable) + return detected_lang, 
get_untranslated_cooked(translatable) end translation = translatable.translation_for(target_locale_sym) @@ -50,7 +50,9 @@ def self.translate(translatable, target_locale_sym = I18n.locale) end translated = translate!(translatable, target_locale_sym) - save_translation(translatable, target_locale_sym) { translated } + save_translation(translatable, target_locale_sym) do + TranslatedContentNormalizer.normalize(translatable, translated) + end [detected_lang, translated] end @@ -122,25 +124,25 @@ def self.translate_supported?(detected_lang, target_lang) private - def self.strip_tags_for_detection(detection_text) - html_doc = Nokogiri::HTML::DocumentFragment.parse(detection_text) - html_doc.css("img", "aside.quote", "div.lightbox-wrapper", "a.mention,a.lightbox").remove - html_doc.to_html - end - def self.text_for_detection(translatable) - strip_tags_for_detection(get_untranslated(translatable)).truncate( - DETECTION_CHAR_LIMIT, - omission: nil, - ) + get_untranslated_raw(translatable).truncate(DETECTION_CHAR_LIMIT, omission: nil) end def self.text_for_translation(translatable) max_char = SiteSetting.max_characters_per_translation - get_untranslated(translatable).truncate(max_char, omission: nil) + get_untranslated_raw(translatable).truncate(max_char, omission: nil) + end + + def self.get_untranslated_raw(translatable) + case translatable.class.name + when "Post" + translatable.raw + when "Topic" + translatable.title + end end - def self.get_untranslated(translatable) + def self.get_untranslated_cooked(translatable) case translatable.class.name when "Post" translatable.cooked diff --git a/lib/discourse_translator/translated_content_normalizer.rb b/lib/discourse_translator/translated_content_normalizer.rb new file mode 100644 index 00000000..24884f55 --- /dev/null +++ b/lib/discourse_translator/translated_content_normalizer.rb @@ -0,0 +1,14 @@ +# frozen_string_literal: true + +module DiscourseTranslator + class TranslatedContentNormalizer + def 
self.normalize(translatable, content) + case translatable.class.name + when "Post" + PrettyText.cook(content) + when "Topic" + PrettyText.cleanup(content, {}) + end + end + end +end diff --git a/lib/discourse_translator/translated_content_sanitizer.rb b/lib/discourse_translator/translated_content_sanitizer.rb deleted file mode 100644 index 5e028697..00000000 --- a/lib/discourse_translator/translated_content_sanitizer.rb +++ /dev/null @@ -1,9 +0,0 @@ -# frozen_string_literal: true - -module DiscourseTranslator - class TranslatedContentSanitizer - def self.sanitize(content) - PrettyText.cleanup(content, {}) - end - end -end diff --git a/lib/discourse_translator/translator_selection_validator.rb b/lib/discourse_translator/translator_selection_validator.rb index 68835b05..f63a9aca 100644 --- a/lib/discourse_translator/translator_selection_validator.rb +++ b/lib/discourse_translator/translator_selection_validator.rb @@ -20,7 +20,9 @@ def valid_value?(val) def error_message return I18n.t("translator.discourse_ai.not_installed") if !defined?(::DiscourseAi) - I18n.t("translator.discourse_ai.ai_helper_required") if !SiteSetting.ai_helper_enabled + if !SiteSetting.ai_helper_enabled + I18n.t("translator.discourse_ai.ai_helper_required", { base_url: Discourse.base_url }) + end end end end diff --git a/spec/jobs/automatic_translation_backfill_spec.rb b/spec/jobs/automatic_translation_backfill_spec.rb index f91183b8..86efa1e4 100644 --- a/spec/jobs/automatic_translation_backfill_spec.rb +++ b/spec/jobs/automatic_translation_backfill_spec.rb @@ -102,7 +102,7 @@ def expect_google_translate(text) described_class.new.execute expect(topic.translations.pluck(:locale, :translation)).to eq([%w[es hola]]) - expect(post.translations.pluck(:locale, :translation)).to eq([%w[de hallo]]) + expect(post.translations.pluck(:locale, :translation)).to eq([%w[de

<p>hallo</p>

]]) end end @@ -126,7 +126,7 @@ def expect_google_translate(text) expect(topic.translations.pluck(:locale, :translation)).to eq([%w[de hallo]]) expect(posts.map { |p| p.translations.pluck(:locale, :translation).flatten }).to eq( - [%w[de hallo]] * 4, + [%w[de

<p>hallo</p>

]] * 4, ) end end @@ -176,27 +176,27 @@ def expect_google_translate(text) post_1.update!(updated_at: 2.days.ago) post_2.update!(updated_at: 3.days.ago) - result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, "de") + result = described_class.new.fetch_untranslated_model_ids(Post, "raw", 50, "de") expect(result).to include(post_6.id, post_1.id, post_2.id) end it "does not return posts that are deleted" do post_1.trash! - result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, "de") + result = described_class.new.fetch_untranslated_model_ids(Post, "raw", 50, "de") expect(result).not_to include(post_1.id) end it "does not return posts that are empty" do - post_1.cooked = "" + post_1.raw = "" post_1.save!(validate: false) - result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, "de") + result = described_class.new.fetch_untranslated_model_ids(Post, "raw", 50, "de") expect(result).not_to include(post_1.id) end it "does not return posts by bots" do post_1.update(user: Discourse.system_user) - result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, "de") + result = described_class.new.fetch_untranslated_model_ids(Post, "raw", 50, "de") expect(result).not_to include(post_1.id) end diff --git a/spec/lib/translated_content_normalizer_spec.rb b/spec/lib/translated_content_normalizer_spec.rb new file mode 100644 index 00000000..b22eec19 --- /dev/null +++ b/spec/lib/translated_content_normalizer_spec.rb @@ -0,0 +1,22 @@ +# frozen_string_literal: true + +describe DiscourseTranslator::TranslatedContentNormalizer do + fab!(:post) + fab!(:topic) + + it "normalizes the content" do + expect( + DiscourseTranslator::TranslatedContentNormalizer.normalize( + post, + "

Testing

This is a test post

", + ), + ).to eq("

Testing

This is a test post

") + + expect( + DiscourseTranslator::TranslatedContentNormalizer.normalize( + topic, + "

Testing

This is a test post

", + ), + ).to eq("

Testing

This is a test post

") + end +end diff --git a/spec/lib/translated_content_sanitizer_spec.rb b/spec/lib/translated_content_sanitizer_spec.rb deleted file mode 100644 index 8fb4e347..00000000 --- a/spec/lib/translated_content_sanitizer_spec.rb +++ /dev/null @@ -1,12 +0,0 @@ -# frozen_string_literal: true - -describe DiscourseTranslator::TranslatedContentSanitizer do - it "sanitizes the content" do - sanitized = - DiscourseTranslator::TranslatedContentSanitizer.sanitize( - "

Testing

This is a test post

", - ) - - expect(sanitized).to eq("

Testing

This is a test post

") - end -end diff --git a/spec/lib/translator_selection_validator_spec.rb b/spec/lib/translator_selection_validator_spec.rb index de7c898c..27dc29c5 100644 --- a/spec/lib/translator_selection_validator_spec.rb +++ b/spec/lib/translator_selection_validator_spec.rb @@ -61,7 +61,7 @@ it "returns the ai_helper_required error message" do SiteSetting.ai_helper_enabled = false expect(described_class.new.error_message).to eq( - I18n.t("translator.discourse_ai.ai_helper_required"), + I18n.t("translator.discourse_ai.ai_helper_required", { base_url: Discourse.base_url }), ) end end diff --git a/spec/services/amazon_spec.rb b/spec/services/amazon_spec.rb index d8b4e0fd..c7f1ac2a 100644 --- a/spec/services/amazon_spec.rb +++ b/spec/services/amazon_spec.rb @@ -19,7 +19,7 @@ describe ".detect" do let(:post) { Fabricate(:post) } let!(:client) { Aws::Translate::Client.new(stub_responses: true) } - let(:text) { described_class.truncate(post.cooked) } + let(:text) { described_class.truncate(post.raw) } let(:detected_lang) { "en" } before do @@ -40,8 +40,8 @@ expect(post.detected_locale).to eq(detected_lang) end - it "should fail graciously when the cooked translated text is blank" do - post.cooked = "" + it "should fail graciously when the raw translated text is blank" do + post.raw = "" expect(described_class.detect(post)).to be_nil end end diff --git a/spec/services/base_spec.rb b/spec/services/base_spec.rb index 22785d93..0db60829 100644 --- a/spec/services/base_spec.rb +++ b/spec/services/base_spec.rb @@ -36,64 +36,23 @@ class EmptyTranslator < DiscourseTranslator::Base describe ".text_for_detection" do fab!(:post) - it "strips img tags" do - post.cooked = "" - expect(DiscourseTranslator::Base.text_for_detection(post)).to eq("") - end - - it "strips @ mention anchor tags" do - post.cooked = "cat" - expect(DiscourseTranslator::Base.text_for_detection(post)).to eq("") - end - - it "strips lightbox anchor tags" do - post.cooked = "" - 
expect(DiscourseTranslator::Base.text_for_detection(post)).to eq("") - end - - it "strips lightboxes" do - post.cooked = "