From 444f032f65c003ec9cea86e663a7ba6d773c6fc7 Mon Sep 17 00:00:00 2001 From: Brad Buchanan Date: Mon, 3 Jun 2019 13:06:54 -0700 Subject: [PATCH 1/4] Failing test --- lib/cdo/share_filtering.rb | 3 +++ lib/test/cdo/test_share_filtering.rb | 27 +++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/lib/cdo/share_filtering.rb b/lib/cdo/share_filtering.rb index 9e6535b749514..faacd2081bb9d 100644 --- a/lib/cdo/share_filtering.rb +++ b/lib/cdo/share_filtering.rb @@ -23,6 +23,9 @@ module FailureType # # May throw OpenURI::HTTPError, IO::EAGAINWaitReadable depending on # service availability. + # + # @param [String] program the student's program text + # @param [String] locale a two-character ISO 639-1 language code def self.find_share_failure(program, locale) return nil unless should_filter_program(program) diff --git a/lib/test/cdo/test_share_filtering.rb b/lib/test/cdo/test_share_filtering.rb index 2f269ee371c68..d1d0e9504ccad 100644 --- a/lib/test/cdo/test_share_filtering.rb +++ b/lib/test/cdo/test_share_filtering.rb @@ -66,6 +66,33 @@ def test_find_share_failure_with_profanity ) end + def test_find_share_failure_with_custom_profanity + # "fu" is a past-tense "to be" in Italian, but should be blocked + # as profanity in English. WebPurify doesn't support this, so we + # have custom filtering that takes locale into account for this word. + program = generate_program('My Custom Profanity', 'fu') + + # Stub WebPurify because we expect our custom blocking to handle this case. + WebPurify.stubs(:find_potential_profanity).returns(nil) + + # Block program in English + assert_equal( + ShareFailure.new(ShareFiltering::FailureType::PROFANITY, 'fu'), + ShareFiltering.find_share_failure(program, 'en') + ) + + # Block program in Spanish + assert_equal( + ShareFailure.new(ShareFiltering::FailureType::PROFANITY, 'fu'), + ShareFiltering.find_share_failure(program, 'es') + ) + + # Allow program in Italian + assert_nil( + ShareFiltering.find_share_failure(program, 'it') + ) + end + def test_find_share_failure_for_non_playlab program = ''\ '4'\ From d6a0a2896e7860b287ecc329a7c80d0db480fe54 Mon Sep 17 00:00:00 2001 From: Brad Buchanan Date: Mon, 3 Jun 2019 13:09:23 -0700 Subject: [PATCH 2/4] Another failing test --- lib/test/cdo/test_share_filtering.rb | 29 +++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/lib/test/cdo/test_share_filtering.rb b/lib/test/cdo/test_share_filtering.rb index d1d0e9504ccad..c24ece7066f39 100644 --- a/lib/test/cdo/test_share_filtering.rb +++ b/lib/test/cdo/test_share_filtering.rb @@ -66,7 +66,7 @@ def test_find_share_failure_with_profanity ) end - def test_find_share_failure_with_custom_profanity + def test_find_share_failure_with_italian_nonprofanity # "fu" is a past-tense "to be" in Italian, but should be blocked # as profanity in English. WebPurify doesn't support this, so we # have custom filtering that takes locale into account for this word. @@ -93,6 +93,33 @@ def test_find_share_failure_with_custom_profanity ) end + def test_find_share_failure_with_swedish_nonprofanity + # "fick" means "got" in Swedish, but should be blocked + # as profanity in English. WebPurify doesn't support this, so we + # have custom filtering that takes locale into account for this word. + program = generate_program('My Custom Profanity', 'fick') + + # Stub WebPurify because we expect our custom blocking to handle this case. + WebPurify.stubs(:find_potential_profanity).returns(nil) + + # Block program in English + assert_equal( + ShareFailure.new(ShareFiltering::FailureType::PROFANITY, 'fick'), + ShareFiltering.find_share_failure(program, 'en') + ) + + # Block program in Italian + assert_equal( + ShareFailure.new(ShareFiltering::FailureType::PROFANITY, 'fick'), + ShareFiltering.find_share_failure(program, 'it') + ) + + # Allow program in Italian + assert_nil( + ShareFiltering.find_share_failure(program, 'sv') + ) + end + def test_find_share_failure_for_non_playlab program = ''\ '4'\ From 07c21350c9941973bcda50b4833ec62fcb2518aa Mon Sep 17 00:00:00 2001 From: Brad Buchanan Date: Mon, 3 Jun 2019 13:22:18 -0700 Subject: [PATCH 3/4] Allow specific words in specific languages --- lib/cdo/profanity_filter.rb | 14 ++++++++++++++ lib/cdo/share_filtering.rb | 4 ++-- 2 files changed, 16 insertions(+), 2 deletions(-) create mode 100644 lib/cdo/profanity_filter.rb diff --git a/lib/cdo/profanity_filter.rb b/lib/cdo/profanity_filter.rb new file mode 100644 index 0000000000000..4eefc0c780c36 --- /dev/null +++ b/lib/cdo/profanity_filter.rb @@ -0,0 +1,14 @@ +require 'cdo/web_purify' + +class ProfanityFilter + # Look for profanity in a given text, return the first expletive found + # or nil if no profanity is found. + # + # @param [String] text to check for profanity + # @param [String] language_code a two-character ISO 639-1 language code + def self.find_potential_profanity(text, language_code) + return 'fu' if /\bfu\b/i =~ text && language_code != 'it' + return 'fick' if /\bfick\b/i =~ text && language_code != 'sv' + WebPurify.find_potential_profanity(text, ['en', language_code]) + end +end diff --git a/lib/cdo/share_filtering.rb b/lib/cdo/share_filtering.rb index faacd2081bb9d..a49b3a13e81d0 100644 --- a/lib/cdo/share_filtering.rb +++ b/lib/cdo/share_filtering.rb @@ -1,6 +1,6 @@ require 'cdo/regexp' require 'cdo/geocoder' -require 'cdo/web_purify' +require 'cdo/profanity_filter' require 'dynamic_config/gatekeeper' USER_ENTERED_TEXT_INDICATORS = ['TITLE', 'TEXT', 'title name\=\"VAL\"'].freeze @@ -41,7 +41,7 @@ def self.find_share_failure(program, locale) phone_number = RegexpUtils.find_potential_phone_number(program_tags_removed) return ShareFailure.new(FailureType::PHONE, phone_number) if phone_number - expletive = WebPurify.find_potential_profanity(program_tags_removed, ['en', locale]) + expletive = ProfanityFilter.find_potential_profanity(program_tags_removed, locale) return ShareFailure.new(FailureType::PROFANITY, expletive) if expletive nil From e6a8e1cfdba393a4a003b64a4497c215024abadb Mon Sep 17 00:00:00 2001 From: Brad Buchanan Date: Mon, 3 Jun 2019 15:50:34 -0700 Subject: [PATCH 4/4] Extract configuration --- lib/cdo/profanity_filter.rb | 23 ++++++++++++++++-- lib/test/cdo/test_share_filtering.rb | 36 ++++++++++++++++++---------- 2 files changed, 45 insertions(+), 14 deletions(-) diff --git a/lib/cdo/profanity_filter.rb b/lib/cdo/profanity_filter.rb index 4eefc0c780c36..63789958bec8c 100644 --- a/lib/cdo/profanity_filter.rb +++ b/lib/cdo/profanity_filter.rb @@ -1,14 +1,33 @@ require 'cdo/web_purify' class ProfanityFilter + # List of words that should be allowed only in specific languages. + # + # Entries are in the format + # foobar: %w(en es) + # Where the key "foobar" is the word in question, + # and the value is a list of languages that should allow that word. + # + # Words in this list will be blocked for all languages _except_ for those + # listed along with the word. + # Words in this list should be _unblocked_ on WebPurify, since we are + # handling them with our custom code, here. + LANGUAGE_SPECIFIC_ALLOWLIST = { + fu: %w(it), # past-tense "to be" in Italian + fick: %w(sv) # "got" in Swedish + } + # Look for profanity in a given text, return the first expletive found # or nil if no profanity is found. # # @param [String] text to check for profanity # @param [String] language_code a two-character ISO 639-1 language code def self.find_potential_profanity(text, language_code) - return 'fu' if /\bfu\b/i =~ text && language_code != 'it' - return 'fick' if /\bfick\b/i =~ text && language_code != 'sv' + LANGUAGE_SPECIFIC_ALLOWLIST.each do |word, languages| + next if languages.include? language_code + r = Regexp.new "\\b#{word}\\b", Regexp::IGNORECASE + return word.to_s if r =~ text + end WebPurify.find_potential_profanity(text, ['en', language_code]) end end diff --git a/lib/test/cdo/test_share_filtering.rb b/lib/test/cdo/test_share_filtering.rb index c24ece7066f39..c19cc10932881 100644 --- a/lib/test/cdo/test_share_filtering.rb +++ b/lib/test/cdo/test_share_filtering.rb @@ -66,57 +66,69 @@ def test_find_share_failure_with_profanity ) end - def test_find_share_failure_with_italian_nonprofanity + def test_profanity_with_italian_edge_case # "fu" is a past-tense "to be" in Italian, but should be blocked # as profanity in English. WebPurify doesn't support this, so we # have custom filtering that takes locale into account for this word. program = generate_program('My Custom Profanity', 'fu') + innocent_program = generate_program('My Innocent Program', 'funny tofu') # Stub WebPurify because we expect our custom blocking to handle this case. WebPurify.stubs(:find_potential_profanity).returns(nil) - # Block program in English + # Blocked in English assert_equal( ShareFailure.new(ShareFiltering::FailureType::PROFANITY, 'fu'), ShareFiltering.find_share_failure(program, 'en') ) - # Block program in Spanish + # But the innocent program is fine + assert_nil( + ShareFiltering.find_share_failure(innocent_program, 'en') + ) + + # Blocked in Spanish assert_equal( ShareFailure.new(ShareFiltering::FailureType::PROFANITY, 'fu'), ShareFiltering.find_share_failure(program, 'es') ) - # Allow program in Italian + # Allowed in Italian assert_nil( ShareFiltering.find_share_failure(program, 'it') ) end - def test_find_share_failure_with_swedish_nonprofanity + def test_profanity_with_swedish_edge_case # "fick" means "got" in Swedish, but should be blocked # as profanity in English. WebPurify doesn't support this, so we # have custom filtering that takes locale into account for this word. - program = generate_program('My Custom Profanity', 'fick') + questionable_program = generate_program('My Custom Profanity', 'fick') + innocent_program = generate_program('My Innocent Program', 'fickle') # Stub WebPurify because we expect our custom blocking to handle this case. WebPurify.stubs(:find_potential_profanity).returns(nil) - # Block program in English + # Blocked in English assert_equal( ShareFailure.new(ShareFiltering::FailureType::PROFANITY, 'fick'), - ShareFiltering.find_share_failure(program, 'en') + ShareFiltering.find_share_failure(questionable_program, 'en') ) - # Block program in Italian + # But the innocent program is fine + assert_nil( + ShareFiltering.find_share_failure(innocent_program, 'en') + ) + + # Blocked in Italian assert_equal( ShareFailure.new(ShareFiltering::FailureType::PROFANITY, 'fick'), - ShareFiltering.find_share_failure(program, 'it') + ShareFiltering.find_share_failure(questionable_program, 'it') ) - # Allow program in Italian + # Allowed in Swedish assert_nil( - ShareFiltering.find_share_failure(program, 'sv') + ShareFiltering.find_share_failure(questionable_program, 'sv') ) end