From 4e0b82edb4f0c3deeebdf9a227ad88dd13c04fec Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 6 Nov 2020 14:06:43 -0600 Subject: [PATCH 1/4] BUG: use greater precision when serializing floating points This allows the exact binary representation to be transferred correctly, round-trip. --- docs/source/changelog.rst | 11 ++++++++ pandas_gbq/load.py | 2 +- conftest.py => tests/conftest.py | 0 tests/system/test_to_gbq.py | 47 ++++++++++++++++++++++++++++++++ tests/unit/test_load.py | 29 ++++++++++++++------ 5 files changed, 80 insertions(+), 9 deletions(-) rename conftest.py => tests/conftest.py (100%) create mode 100644 tests/system/test_to_gbq.py diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 46570643..d4c52044 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -1,6 +1,17 @@ Changelog ========= +.. _changelog-0.14.1: + +0.14.1 / TBD +------------ + +Bug fixes +~~~~~~~~~ + +- Encode floating point values with greater precision. (:issue:`326`) + + .. _changelog-0.14.0: 0.14.0 / 2020-10-05 diff --git a/pandas_gbq/load.py b/pandas_gbq/load.py index 04b32efa..ec00d4a1 100644 --- a/pandas_gbq/load.py +++ b/pandas_gbq/load.py @@ -19,7 +19,7 @@ def encode_chunk(dataframe): index=False, header=False, encoding="utf-8", - float_format="%.15g", + float_format="%.17g", date_format="%Y-%m-%d %H:%M:%S.%f", ) diff --git a/conftest.py b/tests/conftest.py similarity index 100% rename from conftest.py rename to tests/conftest.py diff --git a/tests/system/test_to_gbq.py b/tests/system/test_to_gbq.py new file mode 100644 index 00000000..8e8bac9f --- /dev/null +++ b/tests/system/test_to_gbq.py @@ -0,0 +1,47 @@ +import functools +import pandas +import pandas.testing + +import pytest + + +pytest.importorskip("google.cloud.bigquery", minversion="1.24.0") + + +@pytest.fixture +def method_under_test(credentials): + import pandas_gbq + + return functools.partial(pandas_gbq.to_gbq, credentials=credentials) + + +def test_float_round_trip( + method_under_test, random_dataset_id, bigquery_client +): + """Ensure that 64-bit floating point numbers are unchanged. + + See: https://github.com/pydata/pandas-gbq/issues/326 + """ + + table_id = f"{random_dataset_id}.float_round_trip" + input_floats = pandas.Series( + [ + 0.14285714285714285, + 0.4406779661016949, + 1.05148, + 1.05153, + 1.8571428571428572, + 2.718281828459045, + 3.141592653589793, + 2.0988936657440586e43, + ], + name="float_col", + ) + df = pandas.DataFrame({"float_col": input_floats}) + method_under_test(df, table_id) + + round_trip = bigquery_client.list_rows(table_id).to_dataframe() + round_trip_floats = round_trip["float_col"].sort_values() + pandas.testing.assert_series_equal( + round_trip_floats, input_floats, check_exact=True, + ) diff --git a/tests/unit/test_load.py b/tests/unit/test_load.py index d2b5860e..f19ff604 100644 --- a/tests/unit/test_load.py +++ b/tests/unit/test_load.py @@ -24,17 +24,30 @@ def test_encode_chunk_with_unicode(): def test_encode_chunk_with_floats(): - """Test that floats in a dataframe are encoded with at most 15 significant + """Test that floats in a dataframe are encoded with at most 17 significant figures. - See: https://github.com/pydata/pandas-gbq/issues/192 + See: https://github.com/pydata/pandas-gbq/issues/192 and + https://github.com/pydata/pandas-gbq/issues/326 """ - input_csv = StringIO(u"01/01/17 23:00,1.05148,1.05153,1.05148,1.05153,4") - df = pandas.read_csv(input_csv, header=None) - csv_buffer = load.encode_chunk(df) - csv_bytes = csv_buffer.read() - csv_string = csv_bytes.decode("utf-8") - assert "1.05153" in csv_string + input_csv = StringIO( + """01/01/17 23:00,0.14285714285714285,4 + 01/02/17 22:00,1.05148,3 + 01/03/17 21:00,1.05153,2 + 01/04/17 20:00,3.141592653589793,1 + 01/05/17 19:00,2.0988936657440586e+43,0 + """ + ) + input_df = pandas.read_csv( + input_csv, header=None, float_precision="round_trip" + ) + csv_buffer = load.encode_chunk(input_df) + round_trip = pandas.read_csv( + csv_buffer, header=None, float_precision="round_trip" + ) + pandas.testing.assert_frame_equal( + round_trip, input_df, check_exact=True, + ) def test_encode_chunk_with_newlines(): From e205680984a6c970943ba5408c6708d4b8b93020 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 6 Nov 2020 14:30:25 -0600 Subject: [PATCH 2/4] blacken --- tests/system/test_to_gbq.py | 4 +++- tests/unit/test_load.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/system/test_to_gbq.py b/tests/system/test_to_gbq.py index 8e8bac9f..c3a6344f 100644 --- a/tests/system/test_to_gbq.py +++ b/tests/system/test_to_gbq.py @@ -43,5 +43,7 @@ def test_float_round_trip( round_trip = bigquery_client.list_rows(table_id).to_dataframe() round_trip_floats = round_trip["float_col"].sort_values() pandas.testing.assert_series_equal( - round_trip_floats, input_floats, check_exact=True, + round_trip_floats, + input_floats, + check_exact=True, ) diff --git a/tests/unit/test_load.py b/tests/unit/test_load.py index fed4287a..06b13a5f 100644 --- a/tests/unit/test_load.py +++ b/tests/unit/test_load.py @@ -46,7 +46,9 @@ def test_encode_chunk_with_floats(): csv_buffer, header=None, float_precision="round_trip" ) pandas.testing.assert_frame_equal( - round_trip, input_df, check_exact=True, + round_trip, + input_df, + check_exact=True, ) From 4e5319f9ca6a27e66d1b85040b302e62e09a5536 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 6 Nov 2020 14:39:54 -0600 Subject: [PATCH 3/4] remove f-string --- tests/conftest.py => conftest.py | 2 +- tests/system/test_to_gbq.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename tests/conftest.py => conftest.py (96%) diff --git a/tests/conftest.py b/conftest.py similarity index 96% rename from tests/conftest.py rename to conftest.py index 7f9a6721..b5803f37 100644 --- a/tests/conftest.py +++ b/conftest.py @@ -1,4 +1,4 @@ -"""Shared pytest fixtures for system tests.""" +"""Shared pytest fixtures for `tests/system` and `samples/tests` tests.""" import os import os.path diff --git a/tests/system/test_to_gbq.py b/tests/system/test_to_gbq.py index c3a6344f..ca5e406a 100644 --- a/tests/system/test_to_gbq.py +++ b/tests/system/test_to_gbq.py @@ -23,7 +23,7 @@ def test_float_round_trip( See: https://github.com/pydata/pandas-gbq/issues/326 """ - table_id = f"{random_dataset_id}.float_round_trip" + table_id = "{}.float_round_trip".format(random_dataset_id) input_floats = pandas.Series( [ 0.14285714285714285, From d64d2406595ad5341b12499c0aac591400645635 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 6 Nov 2020 14:44:25 -0600 Subject: [PATCH 4/4] adjust string formatting --- tests/unit/test_load.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/unit/test_load.py b/tests/unit/test_load.py index 06b13a5f..7ed463c1 100644 --- a/tests/unit/test_load.py +++ b/tests/unit/test_load.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- +import textwrap from io import StringIO import numpy @@ -30,16 +31,16 @@ def test_encode_chunk_with_floats(): See: https://github.com/pydata/pandas-gbq/issues/192 and https://github.com/pydata/pandas-gbq/issues/326 """ - input_csv = StringIO( + input_csv = textwrap.dedent( """01/01/17 23:00,0.14285714285714285,4 - 01/02/17 22:00,1.05148,3 - 01/03/17 21:00,1.05153,2 - 01/04/17 20:00,3.141592653589793,1 - 01/05/17 19:00,2.0988936657440586e+43,0 - """ + 01/02/17 22:00,1.05148,3 + 01/03/17 21:00,1.05153,2 + 01/04/17 20:00,3.141592653589793,1 + 01/05/17 19:00,2.0988936657440586e+43,0 + """ ) input_df = pandas.read_csv( - input_csv, header=None, float_precision="round_trip" + StringIO(input_csv), header=None, float_precision="round_trip" ) csv_buffer = load.encode_chunk(input_df) round_trip = pandas.read_csv(