Skip to content

Commit 1f687ee

Browse files
Akhil S Naik and Akhil Naik authored
[redaction] redaction engine throws exception when the query is having non-ascii character (#1973)
* [redaction] redaction engine throws exception when the query is having non-utf8 character (asnaik)

Co-authored-by: Akhil Naik <asnaik@cloudere.com>
1 parent ab0114b commit 1f687ee

File tree

2 files changed

+40
-21
lines changed

2 files changed

+40
-21
lines changed

desktop/core/src/desktop/redaction/engine.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@
1919
import json
2020
import re
2121

22+
from django.utils.encoding import smart_str
2223

2324
class RedactionEngine(object):
2425
"""
@@ -106,7 +107,7 @@ def redact(self, message):
106107
"""
107108

108109
if message and (self.trigger is None or self.trigger.search(message)):
109-
return self.regex.sub(self.replace, message)
110+
return self.regex.sub(smart_str(self.replace), message)
110111
else:
111112
return message
112113

desktop/core/src/desktop/redaction/tests.py

Lines changed: 38 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
23
# Licensed to Cloudera, Inc. under one
34
# or more contributor license agreements. See the NOTICE file
45
# distributed with this work for additional information
@@ -19,6 +20,8 @@
1920
standard_library.install_aliases()
2021
from builtins import zip, range, object
2122

23+
from django.utils.encoding import smart_str
24+
2225
import json
2326
import logging
2427
import os
@@ -332,6 +335,22 @@ def test_real_rules(self):
332335
for message, redacted_message in messages:
333336
assert_equal(redacted_message, policy.redact(message))
334337

338+
def test_unicode_strings(self):
339+
path = get_path('real-1.json')
340+
policy = parse_redaction_policy_from_file(path)
341+
342+
messages = [
343+
("äöüß 123-45-6789", "äöüß XXX-XX-XXXX"),
344+
("你好阿基尔 1234234534654576", "你好阿基尔 XXXXXXXXXXXXXXXX"),
345+
("ã 你好 1234,2345,3456,4576", "ã 你好 XXXX-XXXX-XXXX-XXXX"),
346+
]
347+
348+
for message, redacted_message in messages:
349+
message_to_redact = smart_str(message)
350+
self.logger.debug("Message to redact : %s " % message_to_redact)
351+
self.logger.debug("Message after redact : %s " % policy.redact(message_to_redact))
352+
assert_equal(redacted_message, policy.redact(message_to_redact))
353+
335354
def test_huge_rules(self):
336355
path = get_path('huge-1.json')
337356
policy = parse_redaction_policy_from_file(path)
@@ -421,28 +440,27 @@ def run(self):
421440
assert_equal(errors, [])
422441

423442
def byte_range(first, last):
424-
return list(range(first, last+1))
443+
return list(range(first, last+1))
425444

426445
first_values = byte_range(0x00, 0x7F) + byte_range(0xC2, 0xF4)
427446
trailing_values = byte_range(0x80, 0xBF)
428447

429448
def random_utf8_char():
430-
first = random.choice(first_values)
431-
if first <= 0x7F:
432-
value = bytearray([first])
433-
elif first <= 0xDF:
434-
value = bytearray([first, random.choice(trailing_values)])
435-
elif first == 0xE0:
436-
value = bytearray([first, random.choice(byte_range(0xA0, 0xBF)), random.choice(trailing_values)])
437-
elif first == 0xED:
438-
value = bytearray([first, random.choice(byte_range(0x80, 0x9F)), random.choice(trailing_values)])
439-
elif first <= 0xEF:
440-
value = bytearray([first, random.choice(trailing_values), random.choice(trailing_values)])
441-
elif first == 0xF0:
442-
value = bytearray([first, random.choice(byte_range(0x90, 0xBF)), random.choice(trailing_values), random.choice(trailing_values)])
443-
elif first <= 0xF3:
444-
value = bytearray([first, random.choice(trailing_values), random.choice(trailing_values), random.choice(trailing_values)])
445-
elif first == 0xF4:
446-
value = bytearray([first, random.choice(byte_range(0x80, 0x8F)), random.choice(trailing_values), random.choice(trailing_values)])
447-
448-
return value.decode('utf8')
449+
first = random.choice(first_values)
450+
if first <= 0x7F:
451+
value = bytearray([first])
452+
elif first <= 0xDF:
453+
value = bytearray([first, random.choice(trailing_values)])
454+
elif first == 0xE0:
455+
value = bytearray([first, random.choice(byte_range(0xA0, 0xBF)), random.choice(trailing_values)])
456+
elif first == 0xED:
457+
value = bytearray([first, random.choice(byte_range(0x80, 0x9F)), random.choice(trailing_values)])
458+
elif first <= 0xEF:
459+
value = bytearray([first, random.choice(trailing_values), random.choice(trailing_values)])
460+
elif first == 0xF0:
461+
value = bytearray([first, random.choice(byte_range(0x90, 0xBF)), random.choice(trailing_values), random.choice(trailing_values)])
462+
elif first <= 0xF3:
463+
value = bytearray([first, random.choice(trailing_values), random.choice(trailing_values), random.choice(trailing_values)])
464+
elif first == 0xF4:
465+
value = bytearray([first, random.choice(byte_range(0x80, 0x8F)), random.choice(trailing_values), random.choice(trailing_values)])
466+
return value.decode('utf8')

0 commit comments

Comments
 (0)