diff --git a/lib/cdo/profanity_filter.rb b/lib/cdo/profanity_filter.rb
new file mode 100644
index 0000000000000..63789958bec8c
--- /dev/null
+++ b/lib/cdo/profanity_filter.rb
@@ -0,0 +1,33 @@
+require 'cdo/web_purify'
+
+class ProfanityFilter
+  # List of words that should be allowed only in specific languages.
+  #
+  # Entries are in the format
+  #   foobar: %w(en es)
+  # Where the key "foobar" is the word in question,
+  # and the value is a list of languages that should allow that word.
+  #
+  # Words in this list will be blocked for all languages _except_ for those
+  # listed along with the word.
+  # Words in this list should be _unblocked_ on WebPurify, since we are
+  # handling them with our custom code, here.
+  LANGUAGE_SPECIFIC_ALLOWLIST = {
+    fu: %w(it), # past-tense "to be" in Italian
+    fick: %w(sv) # "got" in Swedish
+  }
+
+  # Look for profanity in a given text, return the first expletive found
+  # or nil if no profanity is found.
+  #
+  # @param [String] text to check for profanity
+  # @param [String] language_code a two-character ISO 639-1 language code
+  def self.find_potential_profanity(text, language_code)
+    LANGUAGE_SPECIFIC_ALLOWLIST.each do |word, languages|
+      next if languages.include? language_code
+      r = Regexp.new "\\b#{word}\\b", Regexp::IGNORECASE
+      return word.to_s if r =~ text
+    end
+    WebPurify.find_potential_profanity(text, ['en', language_code])
+  end
+end
diff --git a/lib/cdo/share_filtering.rb b/lib/cdo/share_filtering.rb
index 9e6535b749514..a49b3a13e81d0 100644
--- a/lib/cdo/share_filtering.rb
+++ b/lib/cdo/share_filtering.rb
@@ -1,6 +1,6 @@
 require 'cdo/regexp'
 require 'cdo/geocoder'
-require 'cdo/web_purify'
+require 'cdo/profanity_filter'
 require 'dynamic_config/gatekeeper'
 
 USER_ENTERED_TEXT_INDICATORS = ['TITLE', 'TEXT', 'title name\=\"VAL\"'].freeze
@@ -23,6 +23,9 @@ module FailureType
   #
   # May throw OpenURI::HTTPError, IO::EAGAINWaitReadable depending on
   # service availability.
+  #
+  # @param [String] program the student's program text
+  # @param [String] locale a two-character ISO 639-1 language code
   def self.find_share_failure(program, locale)
     return nil unless should_filter_program(program)
 
@@ -38,7 +41,7 @@ def self.find_share_failure(program, locale)
     phone_number = RegexpUtils.find_potential_phone_number(program_tags_removed)
     return ShareFailure.new(FailureType::PHONE, phone_number) if phone_number
 
-    expletive = WebPurify.find_potential_profanity(program_tags_removed, ['en', locale])
+    expletive = ProfanityFilter.find_potential_profanity(program_tags_removed, locale)
     return ShareFailure.new(FailureType::PROFANITY, expletive) if expletive
 
     nil
diff --git a/lib/test/cdo/test_share_filtering.rb b/lib/test/cdo/test_share_filtering.rb
index 2f269ee371c68..c19cc10932881 100644
--- a/lib/test/cdo/test_share_filtering.rb
+++ b/lib/test/cdo/test_share_filtering.rb
@@ -66,6 +66,72 @@ def test_find_share_failure_with_profanity
     )
   end
 
+  def test_profanity_with_italian_edge_case
+    # "fu" is a past-tense "to be" in Italian, but should be blocked
+    # as profanity in English. WebPurify doesn't support this, so we
+    # have custom filtering that takes locale into account for this word.
+    program = generate_program('My Custom Profanity', 'fu')
+    innocent_program = generate_program('My Innocent Program', 'funny tofu')
+
+    # Stub WebPurify because we expect our custom blocking to handle this case.
+    WebPurify.stubs(:find_potential_profanity).returns(nil)
+
+    # Blocked in English
+    assert_equal(
+      ShareFailure.new(ShareFiltering::FailureType::PROFANITY, 'fu'),
+      ShareFiltering.find_share_failure(program, 'en')
+    )
+
+    # But the innocent program is fine
+    assert_nil(
+      ShareFiltering.find_share_failure(innocent_program, 'en')
+    )
+
+    # Blocked in Spanish
+    assert_equal(
+      ShareFailure.new(ShareFiltering::FailureType::PROFANITY, 'fu'),
+      ShareFiltering.find_share_failure(program, 'es')
+    )
+
+    # Allowed in Italian
+    assert_nil(
+      ShareFiltering.find_share_failure(program, 'it')
+    )
+  end
+
+  def test_profanity_with_swedish_edge_case
+    # "fick" means "got" in Swedish, but should be blocked
+    # as profanity in English. WebPurify doesn't support this, so we
+    # have custom filtering that takes locale into account for this word.
+    questionable_program = generate_program('My Custom Profanity', 'fick')
+    innocent_program = generate_program('My Innocent Program', 'fickle')
+
+    # Stub WebPurify because we expect our custom blocking to handle this case.
+    WebPurify.stubs(:find_potential_profanity).returns(nil)
+
+    # Blocked in English
+    assert_equal(
+      ShareFailure.new(ShareFiltering::FailureType::PROFANITY, 'fick'),
+      ShareFiltering.find_share_failure(questionable_program, 'en')
+    )
+
+    # But the innocent program is fine
+    assert_nil(
+      ShareFiltering.find_share_failure(innocent_program, 'en')
+    )
+
+    # Blocked in Italian
+    assert_equal(
+      ShareFailure.new(ShareFiltering::FailureType::PROFANITY, 'fick'),
+      ShareFiltering.find_share_failure(questionable_program, 'it')
+    )
+
+    # Allowed in Swedish
+    assert_nil(
+      ShareFiltering.find_share_failure(questionable_program, 'sv')
+    )
+  end
+
   def test_find_share_failure_for_non_playlab
     program = ''\
       '4'\