Skip to content

Commit

Permalink
Merge pull request #28900 from code-dot-org/custom-language-filter
Browse files Browse the repository at this point in the history
Language-specific profanity filtering
  • Loading branch information
islemaster committed Jun 5, 2019
2 parents 75246e3 + e6a8e1c commit c2df8d1
Show file tree
Hide file tree
Showing 3 changed files with 104 additions and 2 deletions.
33 changes: 33 additions & 0 deletions lib/cdo/profanity_filter.rb
@@ -0,0 +1,33 @@
require 'cdo/web_purify'

# Profanity check that layers a small, language-aware word list on top of
# WebPurify.
class ProfanityFilter
  # List of words that should be allowed only in specific languages.
  #
  # Entries are in the format
  #   foobar: %w(en es)
  # Where the key "foobar" is the word in question,
  # and the value is a list of languages that should allow that word.
  #
  # Words in this list will be blocked for all languages _except_ for those
  # listed along with the word.
  # Words in this list should be _unblocked_ on WebPurify, since we are
  # handling them with our custom code, here.
  #
  # The hash and its language lists are frozen so this shared constant cannot
  # be mutated at runtime.
  LANGUAGE_SPECIFIC_ALLOWLIST = {
    fu: %w(it).freeze, # past-tense "to be" in Italian
    fick: %w(sv).freeze # "got" in Swedish
  }.freeze

  # Look for profanity in a given text, return the first expletive found
  # or nil if no profanity is found.
  #
  # The language-specific list is checked first; only when none of its words
  # is blocked for this language and present in the text is the check
  # delegated to WebPurify (with English plus the given language).
  #
  # @param [String] text to check for profanity
  # @param [String] language_code a two-character ISO 639-1 language code
  # @return [String, nil] the matched word from the language-specific list,
  #   otherwise whatever WebPurify.find_potential_profanity returns
  def self.find_potential_profanity(text, language_code)
    blocked_word, = LANGUAGE_SPECIFIC_ALLOWLIST.find do |word, languages|
      next false if languages.include?(language_code)

      # Whole-word, case-insensitive match. The word is escaped so an entry
      # containing regexp metacharacters is still matched literally.
      /\b#{Regexp.escape(word.to_s)}\b/i.match?(text)
    end
    return blocked_word.to_s if blocked_word

    WebPurify.find_potential_profanity(text, ['en', language_code])
  end
end
7 changes: 5 additions & 2 deletions lib/cdo/share_filtering.rb
@@ -1,6 +1,6 @@
require 'cdo/regexp'
require 'cdo/geocoder'
require 'cdo/web_purify'
require 'cdo/profanity_filter'
require 'dynamic_config/gatekeeper'

USER_ENTERED_TEXT_INDICATORS = ['TITLE', 'TEXT', 'title name\=\"VAL\"'].freeze
Expand All @@ -23,6 +23,9 @@ module FailureType
#
# May raise OpenURI::HTTPError or IO::EAGAINWaitReadable, depending on
# service availability.
#
# @param [String] program the student's program text
# @param [String] locale a two-character ISO 639-1 language code
def self.find_share_failure(program, locale)
return nil unless should_filter_program(program)

Expand All @@ -38,7 +41,7 @@ def self.find_share_failure(program, locale)
phone_number = RegexpUtils.find_potential_phone_number(program_tags_removed)
return ShareFailure.new(FailureType::PHONE, phone_number) if phone_number

expletive = WebPurify.find_potential_profanity(program_tags_removed, ['en', locale])
expletive = ProfanityFilter.find_potential_profanity(program_tags_removed, locale)
return ShareFailure.new(FailureType::PROFANITY, expletive) if expletive

nil
Expand Down
66 changes: 66 additions & 0 deletions lib/test/cdo/test_share_filtering.rb
Expand Up @@ -66,6 +66,72 @@ def test_find_share_failure_with_profanity
)
end

def test_profanity_with_italian_edge_case
  # In Italian, "fu" is a past-tense form of "to be", but in English it
  # should be blocked as profanity. WebPurify doesn't support this, so our
  # custom filtering takes the locale into account for this word.
  flagged_program = generate_program('My Custom Profanity', 'fu')
  clean_program = generate_program('My Innocent Program', 'funny tofu')

  # WebPurify is stubbed out because the custom blocking is expected to
  # handle this case.
  WebPurify.stubs(:find_potential_profanity).returns(nil)

  profanity = ShareFiltering::FailureType::PROFANITY

  # English: the word is blocked...
  expected_in_english = ShareFailure.new(profanity, 'fu')
  assert_equal expected_in_english, ShareFiltering.find_share_failure(flagged_program, 'en')

  # ...while text that merely contains it inside other words is fine.
  assert_nil ShareFiltering.find_share_failure(clean_program, 'en')

  # Spanish: blocked as well.
  expected_in_spanish = ShareFailure.new(profanity, 'fu')
  assert_equal expected_in_spanish, ShareFiltering.find_share_failure(flagged_program, 'es')

  # Italian: allowed.
  assert_nil ShareFiltering.find_share_failure(flagged_program, 'it')
end

def test_profanity_with_swedish_edge_case
  # In Swedish, "fick" means "got", but in English it should be blocked as
  # profanity. WebPurify doesn't support this, so our custom filtering takes
  # the locale into account for this word.
  flagged_program = generate_program('My Custom Profanity', 'fick')
  clean_program = generate_program('My Innocent Program', 'fickle')

  # WebPurify is stubbed out because the custom blocking is expected to
  # handle this case.
  WebPurify.stubs(:find_potential_profanity).returns(nil)

  profanity = ShareFiltering::FailureType::PROFANITY

  # English: the word is blocked...
  expected_in_english = ShareFailure.new(profanity, 'fick')
  assert_equal expected_in_english, ShareFiltering.find_share_failure(flagged_program, 'en')

  # ...while a longer word that merely starts with it is fine.
  assert_nil ShareFiltering.find_share_failure(clean_program, 'en')

  # Italian: blocked as well.
  expected_in_italian = ShareFailure.new(profanity, 'fick')
  assert_equal expected_in_italian, ShareFiltering.find_share_failure(flagged_program, 'it')

  # Swedish: allowed.
  assert_nil ShareFiltering.find_share_failure(flagged_program, 'sv')
end

def test_find_share_failure_for_non_playlab
program = '<xml><block type=\"controls_repeat\">'\
'<title name=\"TIMES\">4</title><statement name=\"DO\">'\
Expand Down

0 comments on commit c2df8d1

Please sign in to comment.