Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
[redaction] Redaction engine throws an exception when the query contains non-ASCII characters (#1973)

* [redaction] Redaction engine throws an exception when the query contains non-ASCII characters (asnaik)
Co-authored-by: Akhil Naik <asnaik@cloudere.com>
  • Loading branch information
Akhilsnaik committed Apr 6, 2021
1 parent ab0114b commit 1f687ee
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 21 deletions.
3 changes: 2 additions & 1 deletion desktop/core/src/desktop/redaction/engine.py
Expand Up @@ -19,6 +19,7 @@
import json
import re

from django.utils.encoding import smart_str

class RedactionEngine(object):
"""
Expand Down Expand Up @@ -106,7 +107,7 @@ def redact(self, message):
"""

if message and (self.trigger is None or self.trigger.search(message)):
return self.regex.sub(self.replace, message)
return self.regex.sub(smart_str(self.replace), message)
else:
return message

Expand Down
58 changes: 38 additions & 20 deletions desktop/core/src/desktop/redaction/tests.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Licensed to Cloudera, Inc. under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
Expand All @@ -19,6 +20,8 @@
standard_library.install_aliases()
from builtins import zip, range, object

from django.utils.encoding import smart_str

import json
import logging
import os
Expand Down Expand Up @@ -332,6 +335,22 @@ def test_real_rules(self):
for message, redacted_message in messages:
assert_equal(redacted_message, policy.redact(message))

def test_unicode_strings(self):
  """
  Check that messages mixing non-ASCII text with sensitive numbers are redacted.

  Every input is converted with ``smart_str`` and run through the policy
  parsed from ``real-1.json``; the result must equal the expected text, in
  which only the numeric portion has been replaced.
  """
  path = get_path('real-1.json')
  policy = parse_redaction_policy_from_file(path)

  # (input message, expected redacted message)
  messages = [
    ("äöüß 123-45-6789", "äöüß XXX-XX-XXXX"),
    ("你好阿基尔 1234234534654576", "你好阿基尔 XXXXXXXXXXXXXXXX"),
    ("ã 你好 1234,2345,3456,4576", "ã 你好 XXXX-XXXX-XXXX-XXXX"),
  ]

  for message, redacted_message in messages:
    message_to_redact = smart_str(message)
    # Log the input before redacting so it is recorded even if redact() raises.
    self.logger.debug("Message to redact : %s ", message_to_redact)
    # Redact once and reuse the result for both the log line and the assertion.
    actual_message = policy.redact(message_to_redact)
    self.logger.debug("Message after redact : %s ", actual_message)
    assert_equal(redacted_message, actual_message)

def test_huge_rules(self):
path = get_path('huge-1.json')
policy = parse_redaction_policy_from_file(path)
Expand Down Expand Up @@ -421,28 +440,27 @@ def run(self):
assert_equal(errors, [])

def byte_range(first, last):
  """Return the list of integers from ``first`` to ``last``, both inclusive.

  An empty list is returned when ``last`` is smaller than ``first``.
  """
  return list(range(first, last + 1))

# Candidate lead bytes for random_utf8_char(): 0x00-0x7F followed by 0xC2-0xF4.
first_values = byte_range(0x00, 0x7F)
first_values += byte_range(0xC2, 0xF4)
# Candidate continuation bytes: 0x80-0xBF.
trailing_values = byte_range(0x80, 0xBF)

def random_utf8_char():
  """
  Build one random piece of text from a well-formed UTF-8 byte sequence.

  A lead byte is drawn from ``first_values``. Its value decides how many
  continuation bytes follow, and for the lead bytes 0xE0, 0xED, 0xF0 and 0xF4
  the second byte is drawn from a narrower range than ``trailing_values`` so
  that the sequence stays valid UTF-8 (see RFC 3629, section 4).

  Returns the result of decoding the assembled bytes as UTF-8.
  """
  first = random.choice(first_values)
  if first <= 0x7F:
    # Single-byte (ASCII) sequence.
    value = bytearray([first])
  elif first <= 0xDF:
    # Two-byte sequence.
    value = bytearray([first, random.choice(trailing_values)])
  elif first == 0xE0:
    # Three bytes; second byte limited to 0xA0-0xBF (lower values are overlong encodings).
    value = bytearray([first, random.choice(byte_range(0xA0, 0xBF)), random.choice(trailing_values)])
  elif first == 0xED:
    # Three bytes; second byte limited to 0x80-0x9F (higher values encode surrogates).
    value = bytearray([first, random.choice(byte_range(0x80, 0x9F)), random.choice(trailing_values)])
  elif first <= 0xEF:
    # Remaining three-byte lead bytes accept any continuation byte.
    value = bytearray([first, random.choice(trailing_values), random.choice(trailing_values)])
  elif first == 0xF0:
    # Four bytes; second byte limited to 0x90-0xBF (lower values are overlong encodings).
    value = bytearray([first, random.choice(byte_range(0x90, 0xBF)), random.choice(trailing_values), random.choice(trailing_values)])
  elif first <= 0xF3:
    # Four-byte lead bytes 0xF1-0xF3 accept any continuation byte.
    value = bytearray([first, random.choice(trailing_values), random.choice(trailing_values), random.choice(trailing_values)])
  elif first == 0xF4:
    # Four bytes; second byte limited to 0x80-0x8F (higher values exceed U+10FFFF).
    value = bytearray([first, random.choice(byte_range(0x80, 0x8F)), random.choice(trailing_values), random.choice(trailing_values)])

  return value.decode('utf8')

0 comments on commit 1f687ee

Please sign in to comment.