From b74ee4dd6b65efe94579d7dd6c7812b1fe0826a2 Mon Sep 17 00:00:00 2001 From: roll Date: Tue, 24 Sep 2019 10:53:31 +0300 Subject: [PATCH 01/18] Added a test showing the problem --- tests/test_lib.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tests/test_lib.py b/tests/test_lib.py index d082723..488b19c 100644 --- a/tests/test_lib.py +++ b/tests/test_lib.py @@ -379,6 +379,27 @@ def test_sort_reverse_many_rows(): assert results[998:1000] == [{'a': 100, 'b': 0}, {'a': 0, 'b': 0}] +def test_sort_rows_numbers(): + from dataflows import sort_rows + + f = Flow( + [ + {'a': 0.1}, + {'a': 10}, + {'a': 8}, + {'a': 1.1}, + ], + sort_rows(key='{a}'), + ) + results, _, _ = f.results() + assert list(results[0]) == [ + {'a': 0.1}, + {'a': 1.1}, + {'a': 8}, + {'a': 10}, + ] + + def test_duplicate(): from dataflows import duplicate From e5b28a44ec20aa3c1789a9299b3a8b09ca9690b0 Mon Sep 17 00:00:00 2001 From: roll Date: Tue, 24 Sep 2019 16:28:37 +0300 Subject: [PATCH 02/18] Simplified _sorter for debugging --- dataflows/processors/sort_rows.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/dataflows/processors/sort_rows.py b/dataflows/processors/sort_rows.py index c75b94d..13397d7 100644 --- a/dataflows/processors/sort_rows.py +++ b/dataflows/processors/sort_rows.py @@ -13,9 +13,11 @@ def __call__(self, row): def _sorter(rows, key_calc, reverse, batch_size): db = KVFile() - db.insert(((key_calc(row) + "{:08x}".format(row_num), row) for row_num, row in enumerate(rows)), - batch_size=batch_size) - + def process(rows): + for row_num, row in enumerate(rows): + key = key_calc(row) + "{:08x}".format(row_num) + yield (key, row) + db.insert(process(rows), batch_size=batch_size) for _, value in db.items(reverse=reverse): yield value From 91146b1e45a9e90df5f8b664d33588ec5bf9e02b Mon Sep 17 00:00:00 2001 From: roll Date: Tue, 24 Sep 2019 17:06:27 +0300 Subject: [PATCH 03/18] Improved sorting of numbers --- dataflows/processors/sort_rows.py | 15 +++++++++++++-- tests/test_lib.py | 2 +- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/dataflows/processors/sort_rows.py b/dataflows/processors/sort_rows.py index 13397d7..14877e4 100644 --- a/dataflows/processors/sort_rows.py +++ b/dataflows/processors/sort_rows.py @@ -1,14 +1,25 @@ +import re from kvfile import KVFile - from ..helpers.resource_matcher import ResourceMatcher class KeyCalc(object): def __init__(self, key_spec): self.key_spec = key_spec + self.key_list = re.findall(r'\{(.*?)\}', key_spec) def __call__(self, row): - return self.key_spec.format(**row) + context = {} + for key, value in row.items(): + # We need to strinfify some values to make them properly comparable + if key in self.key_list: + # numbers + # 1000 -> +1.000000e+03 -> p03ep1.000000 + if isinstance(value, (int, float)): + value = 'e'.join(reversed('{:+e}'.format(value).split('e'))) + value = value.replace('+', 'p').replace('-', 'm') + context[key] = value + return self.key_spec.format(**context) def _sorter(rows, key_calc, reverse, batch_size): diff --git a/tests/test_lib.py b/tests/test_lib.py index 488b19c..3294b11 100644 --- a/tests/test_lib.py +++ b/tests/test_lib.py @@ -376,7 +376,7 @@ def test_sort_reverse_many_rows(): results, _, _ = f.results() results = results[0] assert results[0:2] == [{'a': 999, 'b': 4}, {'a': 994, 'b': 4}] - assert results[998:1000] == [{'a': 100, 'b': 0}, {'a': 0, 'b': 0}] + assert results[998:1000] == [{'a': 5, 'b': 0}, {'a': 0, 'b': 0}] def test_sort_rows_numbers(): From f8a8b46beb643a8c23756407d28c256c88e04c74 Mon Sep 17 00:00:00 2001 From: roll Date: Tue, 24 Sep 2019 17:27:10 +0300 Subject: [PATCH 04/18] Improved sorting date/time --- dataflows/processors/sort_rows.py | 9 +++++++-- tests/test_lib.py | 24 +++++++++++++++++++++--- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/dataflows/processors/sort_rows.py b/dataflows/processors/sort_rows.py index 14877e4..549d95f 100644 --- a/dataflows/processors/sort_rows.py +++ b/dataflows/processors/sort_rows.py @@ -1,4 +1,5 @@ import re +import datetime from kvfile import KVFile from ..helpers.resource_matcher import ResourceMatcher @@ -9,15 +10,19 @@ def __init__(self, key_spec): self.key_list = re.findall(r'\{(.*?)\}', key_spec) def __call__(self, row): - context = {} + context = row.copy() for key, value in row.items(): - # We need to strinfify some values to make them properly comparable + # We need to stringify some values to make them properly comparable if key in self.key_list: # numbers # 1000 -> +1.000000e+03 -> p03ep1.000000 if isinstance(value, (int, float)): value = 'e'.join(reversed('{:+e}'.format(value).split('e'))) value = value.replace('+', 'p').replace('-', 'm') + # date/time + # **** -> iso format + elif isinstance(value, (datetime.datetime, datetime.date, datetime.time)): + value = value.isoformat() context[key] = value return self.key_spec.format(**context) diff --git a/tests/test_lib.py b/tests/test_lib.py index 3294b11..29e0d7b 100644 --- a/tests/test_lib.py +++ b/tests/test_lib.py @@ -379,7 +379,7 @@ def test_sort_reverse_many_rows(): assert results[998:1000] == [{'a': 5, 'b': 0}, {'a': 0, 'b': 0}] -def test_sort_rows_numbers(): +def test_sort_rows_number(): from dataflows import sort_rows f = Flow( @@ -387,19 +387,37 @@ def test_sort_rows_numbers(): {'a': 0.1}, {'a': 10}, {'a': 8}, - {'a': 1.1}, ], sort_rows(key='{a}'), ) results, _, _ = f.results() assert list(results[0]) == [ {'a': 0.1}, - {'a': 1.1}, {'a': 8}, {'a': 10}, ] +def test_sort_rows_datetime(): + import datetime + from dataflows import sort_rows + + f = Flow( + [ + {'a': datetime.date(2000, 1, 3)}, + {'a': datetime.date(2010, 1, 2)}, + {'a': datetime.date(2020, 1, 1)}, + ], + sort_rows(key='{a}'), + ) + results, _, _ = f.results() + assert list(results[0]) == [ + {'a': datetime.date(2000, 1, 3)}, + {'a': datetime.date(2010, 1, 2)}, + {'a': datetime.date(2020, 1, 1)}, + ] + + def test_duplicate(): from dataflows import duplicate From c040435374d3be396dd1a9b8c991705dd47db5f0 Mon Sep 17 00:00:00 2001 From: roll Date: Tue, 24 Sep 2019 17:30:48 +0300 Subject: [PATCH 05/18] Fixed linting --- dataflows/processors/sort_rows.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dataflows/processors/sort_rows.py b/dataflows/processors/sort_rows.py index 549d95f..08da076 100644 --- a/dataflows/processors/sort_rows.py +++ b/dataflows/processors/sort_rows.py @@ -29,10 +29,12 @@ def __call__(self, row): def _sorter(rows, key_calc, reverse, batch_size): db = KVFile() + def process(rows): for row_num, row in enumerate(rows): key = key_calc(row) + "{:08x}".format(row_num) yield (key, row) + db.insert(process(rows), batch_size=batch_size) for _, value in db.items(reverse=reverse): yield value From b72ff0a7c14d6037773791d14cca93e6b5644215 Mon Sep 17 00:00:00 2001 From: roll Date: Wed, 25 Sep 2019 09:08:21 +0300 Subject: [PATCH 06/18] Drop datetime stringification because it'ok by default --- dataflows/processors/sort_rows.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/dataflows/processors/sort_rows.py b/dataflows/processors/sort_rows.py index 08da076..0469013 100644 --- a/dataflows/processors/sort_rows.py +++ b/dataflows/processors/sort_rows.py @@ -12,17 +12,13 @@ def __init__(self, key_spec): def __call__(self, row): context = row.copy() for key, value in row.items(): - # We need to stringify some values to make them properly comparable + # We need to stringify some types to make them properly comparable if key in self.key_list: # numbers # 1000 -> +1.000000e+03 -> p03ep1.000000 if isinstance(value, (int, float)): value = 'e'.join(reversed('{:+e}'.format(value).split('e'))) value = value.replace('+', 'p').replace('-', 'm') - # date/time - # **** -> iso format - elif isinstance(value, (datetime.datetime, datetime.date, datetime.time)): - value = value.isoformat() context[key] = value return self.key_spec.format(**context) From b82e5bfc3817e7613304a9ea4a8dbffe96cd3579 Mon Sep 17 00:00:00 2001 From: roll Date: Wed, 25 Sep 2019 09:14:28 +0300 Subject: [PATCH 07/18] Fixed implementation for numbers --- dataflows/processors/sort_rows.py | 8 +++++--- tests/test_lib.py | 2 ++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/dataflows/processors/sort_rows.py b/dataflows/processors/sort_rows.py index 0469013..bbf8002 100644 --- a/dataflows/processors/sort_rows.py +++ b/dataflows/processors/sort_rows.py @@ -1,4 +1,5 @@ import re +import decimal import datetime from kvfile import KVFile from ..helpers.resource_matcher import ResourceMatcher @@ -15,9 +16,10 @@ def __call__(self, row): # We need to stringify some types to make them properly comparable if key in self.key_list: # numbers - # 1000 -> +1.000000e+03 -> p03ep1.000000 - if isinstance(value, (int, float)): - value = 'e'.join(reversed('{:+e}'.format(value).split('e'))) + # 1000 -> +1.000000e+03 -> pp03e1.000000 + if isinstance(value, (int, float, decimal.Decimal)): + parts = '{:+e}'.format(value).split('e') + value = '{}{}e{}'.format(parts[0][0], parts[1], parts[0][1:]) value = value.replace('+', 'p').replace('-', 'm') context[key] = value return self.key_spec.format(**context) diff --git a/tests/test_lib.py b/tests/test_lib.py index 29e0d7b..d516d4d 100644 --- a/tests/test_lib.py +++ b/tests/test_lib.py @@ -385,6 +385,7 @@ def test_sort_rows_number(): f = Flow( [ {'a': 0.1}, + {'a': -3}, {'a': 10}, {'a': 8}, ], @@ -392,6 +393,7 @@ def test_sort_rows_number(): ) results, _, _ = f.results() assert list(results[0]) == [ + {'a': -3}, {'a': 0.1}, {'a': 8}, {'a': 10}, From 06ddfa5eff690fb308b4b173012aa91ab95236d5 Mon Sep 17 00:00:00 2001 From: roll Date: Wed, 25 Sep 2019 09:20:08 +0300 Subject: [PATCH 08/18] Added test breaking current approach --- tests/test_lib.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_lib.py b/tests/test_lib.py index d516d4d..48d8cca 100644 --- a/tests/test_lib.py +++ b/tests/test_lib.py @@ -386,6 +386,7 @@ def test_sort_rows_number(): [ {'a': 0.1}, {'a': -3}, + {'a': -4}, {'a': 10}, {'a': 8}, ], @@ -393,6 +394,7 @@ def test_sort_rows_number(): ) results, _, _ = f.results() assert list(results[0]) == [ + {'a': -4}, {'a': -3}, {'a': 0.1}, {'a': 8}, From 796b5f8021aedf904afed08c76fe6c3d9793594f Mon Sep 17 00:00:00 2001 From: roll Date: Wed, 25 Sep 2019 10:56:21 +0300 Subject: [PATCH 09/18] Fixed implementation --- dataflows/processors/sort_rows.py | 11 ++++++++--- tests/test_lib.py | 9 +++++++++ 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/dataflows/processors/sort_rows.py b/dataflows/processors/sort_rows.py index bbf8002..075b5da 100644 --- a/dataflows/processors/sort_rows.py +++ b/dataflows/processors/sort_rows.py @@ -17,10 +17,15 @@ def __call__(self, row): if key in self.key_list: # numbers # 1000 -> +1.000000e+03 -> pp03e1.000000 + # -1000 -> -1/1000 -> -1.000000e-03 -> mm96e1.000000 + # 0 -> o if isinstance(value, (int, float, decimal.Decimal)): - parts = '{:+e}'.format(value).split('e') - value = '{}{}e{}'.format(parts[0][0], parts[1], parts[0][1:]) - value = value.replace('+', 'p').replace('-', 'm') + if value == 0: + value = 'o' + else: + parts = '{:+e}'.format(value if value >= 0 else 1/value).split('e') + value = '{}{}e{}'.format(parts[0][0], (parts[1] if int(parts[1]) >= 0 else str(-99 - int(parts[1]))), parts[0][1:]) + value = value.replace('+', 'p').replace('-', 'm') context[key] = value return self.key_spec.format(**context) diff --git a/tests/test_lib.py b/tests/test_lib.py index 48d8cca..e1ffb89 100644 --- a/tests/test_lib.py +++ b/tests/test_lib.py @@ -389,16 +389,25 @@ def test_sort_rows_number(): {'a': -4}, {'a': 10}, {'a': 8}, + {'a': 0}, + {'a': -1000000}, + {'a': 1000000}, + {'a': -0.1}, ], sort_rows(key='{a}'), ) results, _, _ = f.results() + print(results) assert list(results[0]) == [ + {'a': -1000000}, {'a': -4}, {'a': -3}, + {'a': -0.1}, + {'a': 0}, {'a': 0.1}, {'a': 8}, {'a': 10}, + {'a': 1000000}, ] From 5738c395e25691f683a9c86201a50b7b4ee4accb Mon Sep 17 00:00:00 2001 From: roll Date: Wed, 25 Sep 2019 11:07:36 +0300 Subject: [PATCH 10/18] Fixed linting --- dataflows/processors/sort_rows.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dataflows/processors/sort_rows.py b/dataflows/processors/sort_rows.py index 075b5da..f6937b9 100644 --- a/dataflows/processors/sort_rows.py +++ b/dataflows/processors/sort_rows.py @@ -1,6 +1,5 @@ import re import decimal -import datetime from kvfile import KVFile from ..helpers.resource_matcher import ResourceMatcher @@ -24,7 +23,8 @@ def __call__(self, row): value = 'o' else: parts = '{:+e}'.format(value if value >= 0 else 1/value).split('e') - value = '{}{}e{}'.format(parts[0][0], (parts[1] if int(parts[1]) >= 0 else str(-99 - int(parts[1]))), parts[0][1:]) + value = '{}{}e{}'.format( + parts[0][0], (parts[1] if int(parts[1]) >= 0 else str(-99 - int(parts[1]))), parts[0][1:]) value = value.replace('+', 'p').replace('-', 'm') context[key] = value return self.key_spec.format(**context) From 0e0e04544ec83ed04e6cb1c1db3da6907dbe8a98 Mon Sep 17 00:00:00 2001 From: roll Date: Wed, 25 Sep 2019 11:10:30 +0300 Subject: [PATCH 11/18] Improved readability --- dataflows/processors/sort_rows.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dataflows/processors/sort_rows.py b/dataflows/processors/sort_rows.py index f6937b9..a6db35a 100644 --- a/dataflows/processors/sort_rows.py +++ b/dataflows/processors/sort_rows.py @@ -24,9 +24,11 @@ def __call__(self, row): else: parts = '{:+e}'.format(value if value >= 0 else 1/value).split('e') value = '{}{}e{}'.format( - parts[0][0], (parts[1] if int(parts[1]) >= 0 else str(-99 - int(parts[1]))), parts[0][1:]) + parts[0][0], + parts[1] if int(parts[1]) >= 0 else str(-99 - int(parts[1])), + parts[0][1:]) value = value.replace('+', 'p').replace('-', 'm') - context[key] = value + context[key] = value return self.key_spec.format(**context) From e1d1cafcbdac567f6c070e9ec460f7b1555ae6f0 Mon Sep 17 00:00:00 2001 From: roll Date: Wed, 25 Sep 2019 11:11:18 +0300 Subject: [PATCH 12/18] Improved readability --- dataflows/processors/sort_rows.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dataflows/processors/sort_rows.py b/dataflows/processors/sort_rows.py index a6db35a..570baa6 100644 --- a/dataflows/processors/sort_rows.py +++ b/dataflows/processors/sort_rows.py @@ -19,15 +19,15 @@ def __call__(self, row): # -1000 -> -1/1000 -> -1.000000e-03 -> mm96e1.000000 # 0 -> o if isinstance(value, (int, float, decimal.Decimal)): - if value == 0: - value = 'o' - else: + if value: parts = '{:+e}'.format(value if value >= 0 else 1/value).split('e') value = '{}{}e{}'.format( parts[0][0], parts[1] if int(parts[1]) >= 0 else str(-99 - int(parts[1])), parts[0][1:]) value = value.replace('+', 'p').replace('-', 'm') + else: + value = 'o' context[key] = value return self.key_spec.format(**context) From 7f41b3be012549480dff93986b9d9c0b5cbc7346 Mon Sep 17 00:00:00 2001 From: roll Date: Wed, 25 Sep 2019 11:18:02 +0300 Subject: [PATCH 13/18] Removed debug print --- tests/test_lib.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_lib.py b/tests/test_lib.py index e1ffb89..d243624 100644 --- a/tests/test_lib.py +++ b/tests/test_lib.py @@ -397,7 +397,6 @@ def test_sort_rows_number(): sort_rows(key='{a}'), ) results, _, _ = f.results() - print(results) assert list(results[0]) == [ {'a': -1000000}, {'a': -4}, From 06fc5f518d8b3a13fc595bd6a2fb9fc12def936d Mon Sep 17 00:00:00 2001 From: roll Date: Wed, 25 Sep 2019 11:45:56 +0300 Subject: [PATCH 14/18] Added more tests --- tests/test_lib.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/test_lib.py b/tests/test_lib.py index d243624..0e2596d 100644 --- a/tests/test_lib.py +++ b/tests/test_lib.py @@ -393,20 +393,36 @@ def test_sort_rows_number(): {'a': -1000000}, {'a': 1000000}, {'a': -0.1}, + {'a': -0.2}, + {'a': 0.2}, + {'a': -1000001}, + {'a': 1000001}, + {'a': 6}, + {'a': -10}, + {'a': -0.001}, + {'a': 0.001}, ], sort_rows(key='{a}'), ) results, _, _ = f.results() assert list(results[0]) == [ + {'a': -1000001}, {'a': -1000000}, + {'a': -10}, {'a': -4}, {'a': -3}, + {'a': -0.2}, {'a': -0.1}, + {'a': -0.001}, {'a': 0}, + {'a': 0.001}, {'a': 0.1}, + {'a': 0.2}, + {'a': 6}, {'a': 8}, {'a': 10}, {'a': 1000000}, + {'a': 1000001}, ] From 94a3275055f1fe8f4f506091b94276cc8fd7f7b9 Mon Sep 17 00:00:00 2001 From: roll Date: Thu, 26 Sep 2019 09:08:26 +0300 Subject: [PATCH 15/18] Minor optimization --- dataflows/processors/sort_rows.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dataflows/processors/sort_rows.py b/dataflows/processors/sort_rows.py index 570baa6..6c26020 100644 --- a/dataflows/processors/sort_rows.py +++ b/dataflows/processors/sort_rows.py @@ -21,9 +21,10 @@ def __call__(self, row): if isinstance(value, (int, float, decimal.Decimal)): if value: parts = '{:+e}'.format(value if value >= 0 else 1/value).split('e') + power = int(parts[1]) value = '{}{}e{}'.format( parts[0][0], - parts[1] if int(parts[1]) >= 0 else str(-99 - int(parts[1])), + parts[1] if power >= 0 else str(-99 - power), parts[0][1:]) value = value.replace('+', 'p').replace('-', 'm') else: From 4fb96dcf9f31a743b70d9908db9aa89e9418fc88 Mon Sep 17 00:00:00 2001 From: roll Date: Thu, 26 Sep 2019 09:10:00 +0300 Subject: [PATCH 16/18] Added edge case to tests --- tests/test_lib.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_lib.py b/tests/test_lib.py index 0e2596d..a9febb7 100644 --- a/tests/test_lib.py +++ b/tests/test_lib.py @@ -401,6 +401,8 @@ def test_sort_rows_number(): {'a': -10}, {'a': -0.001}, {'a': 0.001}, + {'a': 1}, + {'a': -1}, ], sort_rows(key='{a}'), ) @@ -411,6 +413,7 @@ def test_sort_rows_number(): {'a': -10}, {'a': -4}, {'a': -3}, + {'a': -1}, {'a': -0.2}, {'a': -0.1}, {'a': -0.001}, @@ -418,6 +421,7 @@ def test_sort_rows_number(): {'a': 0.001}, {'a': 0.1}, {'a': 0.2}, + {'a': 1}, {'a': 6}, {'a': 8}, {'a': 10}, From d962a1a27829ff7f174b320186d7c4662c23b803 Mon Sep 17 00:00:00 2001 From: roll Date: Fri, 4 Oct 2019 17:23:18 +0300 Subject: [PATCH 17/18] Rebase number sort on bitstring --- dataflows/processors/sort_rows.py | 18 ++++------------ setup.py | 1 + tests/test_lib.py | 36 +++++++++++++++---------------- 3 files changed, 23 insertions(+), 32 deletions(-) diff --git a/dataflows/processors/sort_rows.py b/dataflows/processors/sort_rows.py index 6c26020..ac13adc 100644 --- a/dataflows/processors/sort_rows.py +++ b/dataflows/processors/sort_rows.py @@ -1,6 +1,7 @@ import re import decimal from kvfile import KVFile +from bitstring import BitArray from ..helpers.resource_matcher import ResourceMatcher @@ -15,21 +16,10 @@ def __call__(self, row): # We need to stringify some types to make them properly comparable if key in self.key_list: # numbers - # 1000 -> +1.000000e+03 -> pp03e1.000000 - # -1000 -> -1/1000 -> -1.000000e-03 -> mm96e1.000000 - # 0 -> o if isinstance(value, (int, float, decimal.Decimal)): - if value: - parts = '{:+e}'.format(value if value >= 0 else 1/value).split('e') - power = int(parts[1]) - value = '{}{}e{}'.format( - parts[0][0], - parts[1] if power >= 0 else str(-99 - power), - parts[0][1:]) - value = value.replace('+', 'p').replace('-', 'm') - else: - value = 'o' - context[key] = value + bits = BitArray(float=value, length=32) + bits.invert(0) + context[key] = bits.bin return self.key_spec.format(**context) diff --git a/setup.py b/setup.py index e920cf6..a3c752a 100644 --- a/setup.py +++ b/setup.py @@ -32,6 +32,7 @@ def read(*paths): 'tabulate', 'tableschema-sql', 'xmljson', + 'bitstring', ] SPEEDUP_REQUIRES = [ 'plyvel', diff --git a/tests/test_lib.py b/tests/test_lib.py index bf574a7..85a0777 100644 --- a/tests/test_lib.py +++ b/tests/test_lib.py @@ -385,38 +385,38 @@ def test_sort_rows_number(): f = Flow( [ {'a': 0.1}, - {'a': -3}, - {'a': -4}, + # {'a': -3}, + # {'a': -4}, {'a': 10}, {'a': 8}, {'a': 0}, - {'a': -1000000}, + # {'a': -1000000}, {'a': 1000000}, - {'a': -0.1}, - {'a': -0.2}, + # {'a': -0.1}, + # {'a': -0.2}, {'a': 0.2}, - {'a': -1000001}, + # {'a': -1000001}, {'a': 1000001}, {'a': 6}, - {'a': -10}, - {'a': -0.001}, + # {'a': -10}, + # {'a': -0.001}, {'a': 0.001}, {'a': 1}, - {'a': -1}, + # {'a': -1}, ], sort_rows(key='{a}'), ) results, _, _ = f.results() assert list(results[0]) == [ - {'a': -1000001}, - {'a': -1000000}, - {'a': -10}, - {'a': -4}, - {'a': -3}, - {'a': -1}, - {'a': -0.2}, - {'a': -0.1}, - {'a': -0.001}, + # {'a': -1000001}, + # {'a': -1000000}, + # {'a': -10}, + # {'a': -4}, + # {'a': -3}, + # {'a': -1}, + # {'a': -0.2}, + # {'a': -0.1}, + # {'a': -0.001}, {'a': 0}, {'a': 0.001}, {'a': 0.1}, From bec525f3f7b6ad7ca463ce41159e56c03fc22cb9 Mon Sep 17 00:00:00 2001 From: roll Date: Fri, 4 Oct 2019 17:34:00 +0300 Subject: [PATCH 18/18] Added support for negative numbers --- dataflows/processors/sort_rows.py | 9 ++++++-- setup.py | 2 +- tests/test_lib.py | 36 +++++++++++++++---------------- 3 files changed, 26 insertions(+), 21 deletions(-) diff --git a/dataflows/processors/sort_rows.py b/dataflows/processors/sort_rows.py index ac13adc..0d4da71 100644 --- a/dataflows/processors/sort_rows.py +++ b/dataflows/processors/sort_rows.py @@ -16,10 +16,15 @@ def __call__(self, row): # We need to stringify some types to make them properly comparable if key in self.key_list: # numbers + # https://www.h-schmidt.net/FloatConverter/IEEE754.html if isinstance(value, (int, float, decimal.Decimal)): - bits = BitArray(float=value, length=32) + bits = BitArray(float=value, length=64) + # invert the sign bit bits.invert(0) - context[key] = bits.bin + # invert negative numbers + if value < 0: + bits.invert(range(1, 64)) + context[key] = bits.hex return self.key_spec.format(**context) diff --git a/setup.py b/setup.py index a3c752a..fed2df4 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def read(*paths): 'tabulate', 'tableschema-sql', 'xmljson', - 'bitstring', + 'bitstring>=3', ] SPEEDUP_REQUIRES = [ 'plyvel', diff --git a/tests/test_lib.py b/tests/test_lib.py index 85a0777..bf574a7 100644 --- a/tests/test_lib.py +++ b/tests/test_lib.py @@ -385,38 +385,38 @@ def test_sort_rows_number(): f = Flow( [ {'a': 0.1}, - # {'a': -3}, - # {'a': -4}, + {'a': -3}, + {'a': -4}, {'a': 10}, {'a': 8}, {'a': 0}, - # {'a': -1000000}, + {'a': -1000000}, {'a': 1000000}, - # {'a': -0.1}, - # {'a': -0.2}, + {'a': -0.1}, + {'a': -0.2}, {'a': 0.2}, - # {'a': -1000001}, + {'a': -1000001}, {'a': 1000001}, {'a': 6}, - # {'a': -10}, - # {'a': -0.001}, + {'a': -10}, + {'a': -0.001}, {'a': 0.001}, {'a': 1}, - # {'a': -1}, + {'a': -1}, ], sort_rows(key='{a}'), ) results, _, _ = f.results() assert list(results[0]) == [ - # {'a': -1000001}, - # {'a': -1000000}, - # {'a': -10}, - # {'a': -4}, - # {'a': -3}, - # {'a': -1}, - # {'a': -0.2}, - # {'a': -0.1}, - # {'a': -0.001}, + {'a': -1000001}, + {'a': -1000000}, + {'a': -10}, + {'a': -4}, + {'a': -3}, + {'a': -1}, + {'a': -0.2}, + {'a': -0.1}, + {'a': -0.001}, {'a': 0}, {'a': 0.001}, {'a': 0.1},