|
1 | 1 | #!/usr/bin/env python |
| 2 | +# -*- coding: utf-8 -*- |
2 | 3 | # Licensed to Cloudera, Inc. under one |
3 | 4 | # or more contributor license agreements. See the NOTICE file |
4 | 5 | # distributed with this work for additional information |
|
19 | 20 | standard_library.install_aliases() |
20 | 21 | from builtins import zip, range, object |
21 | 22 |
|
| 23 | +from django.utils.encoding import smart_str |
| 24 | + |
22 | 25 | import json |
23 | 26 | import logging |
24 | 27 | import os |
@@ -332,6 +335,22 @@ def test_real_rules(self): |
332 | 335 | for message, redacted_message in messages: |
333 | 336 | assert_equal(redacted_message, policy.redact(message)) |
334 | 337 |
|
def test_unicode_strings(self):
  """Check that redaction still works on messages containing non-ASCII text.

  The policy is loaded from ``real-1.json``. Each entry in ``messages`` is an
  ``(input, expected)`` pair: the input is passed through ``smart_str``
  before being redacted, and the redacted result must equal ``expected``.
  """
  path = get_path('real-1.json')
  policy = parse_redaction_policy_from_file(path)

  messages = [
    ("äöüß 123-45-6789", "äöüß XXX-XX-XXXX"),
    ("你好阿基尔 1234234534654576", "你好阿基尔 XXXXXXXXXXXXXXXX"),
    ("ã 你好 1234,2345,3456,4576", "ã 你好 XXXX-XXXX-XXXX-XXXX"),
  ]

  for message, redacted_message in messages:
    message_to_redact = smart_str(message)
    # Redact once and reuse the result, so the logged value is exactly the
    # value that gets asserted on.
    actual_message = policy.redact(message_to_redact)

    # Pass values as logging arguments so the message is only formatted
    # when debug logging is actually enabled.
    self.logger.debug("Message to redact : %s ", message_to_redact)
    self.logger.debug("Message after redact : %s ", actual_message)

    assert_equal(redacted_message, actual_message)
| 353 | + |
335 | 354 | def test_huge_rules(self): |
336 | 355 | path = get_path('huge-1.json') |
337 | 356 | policy = parse_redaction_policy_from_file(path) |
@@ -421,28 +440,27 @@ def run(self): |
421 | 440 | assert_equal(errors, []) |
422 | 441 |
|
def byte_range(first, last):
  """Return the integers from ``first`` through ``last``, both included, as a list."""
  stop = last + 1  # range() excludes its upper bound
  return list(range(first, stop))
425 | 444 |
|
# Bytes allowed to start a sequence: 0x00-0x7F stand alone, while 0xC2-0xF4
# open a multi-byte sequence.
first_values = byte_range(0x00, 0x7F)
first_values.extend(byte_range(0xC2, 0xF4))

# Bytes allowed in the continuation positions of a multi-byte sequence.
trailing_values = byte_range(0x80, 0xBF)
428 | 447 |
|
def random_utf8_char():
  """Build a random well-formed UTF-8 byte sequence and return it decoded.

  A lead byte is drawn from ``first_values``; it determines how many bytes
  the sequence has and which values the second byte may take. Any further
  bytes are drawn from ``trailing_values``.
  """
  lead = random.choice(first_values)

  # The lead byte fixes the total length of the sequence.
  if lead <= 0x7F:
    length = 1
  elif lead <= 0xDF:
    length = 2
  elif lead <= 0xEF:
    length = 3
  else:
    # first_values holds nothing above 0xF4, so this covers 0xF0-0xF4.
    length = 4

  encoded = bytearray([lead])

  if length > 1:
    # A few lead bytes narrow the legal second byte; every other lead byte
    # accepts any continuation byte.
    if lead == 0xE0:
      second_choices = byte_range(0xA0, 0xBF)
    elif lead == 0xED:
      second_choices = byte_range(0x80, 0x9F)
    elif lead == 0xF0:
      second_choices = byte_range(0x90, 0xBF)
    elif lead == 0xF4:
      second_choices = byte_range(0x80, 0x8F)
    else:
      second_choices = trailing_values
    encoded.append(random.choice(second_choices))

    # Remaining positions are unrestricted continuation bytes.
    for _ in range(length - 2):
      encoded.append(random.choice(trailing_values))

  return encoded.decode('utf8')
0 commit comments