# Exploring Python, NumPy, DuckDB, and Arrow floating point precision

This notebook explores how floating point number precision is captured within Python, NumPy, DuckDB, and Arrow to better understand the implications of storing data within these formats.
This work was originally inspired by [CytoTable issue #187](https://github.com/cytomining/CytoTable/issues/187).

In [1]:
import decimal
import sys

import duckdb
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.csv as pacsv
import pyarrow.parquet as parquet

# make modules in the directory above this one importable (e.g. `utilities` below).
# the entry is appended in place and guarded so that re-running this cell
# does not keep adding duplicate ".." entries to sys.path
if ".." not in sys.path:
    sys.path.append("..")

from utilities import get_system_info

In [2]:
# show the system information so the results below can be tied to the
# platform they were produced on; the return value is discarded via `_`
# (show_output=True presumably triggers the printed report — see utilities module)
_ = get_system_info(show_output=True)


System Information:
Operating System: Darwin
Machine Type: arm64
Processor: arm
CPU Cores (Logical): 12
CPU Cores (Physical): 12
Total RAM (GB): 48.0
Python Version: 3.10.16


In [3]:
# default interpreted value in Python: the literal is typed with 17 significant
# digits, while the float's default repr (see output) shows 16 of them
float_value = 3.5215257120407011
float_value

3.521525712040701

In [4]:
# as a formatted string: ".16f" renders 16 digits after the decimal point,
# which reproduces the literal exactly as it was typed (see output)
"{0:.16f}".format(3.5215257120407011)

'3.5215257120407011'

In [5]:
# as a python Decimal built from the *float* (not from a string): the output
# exposes the full decimal expansion of the binary value the float actually holds
decimal.Decimal(3.5215257120407011)

Decimal('3.521525712040701083793692305334843695163726806640625')

In [6]:
# store the number in a one-element NumPy array with an explicit float64 dtype,
# then display the single element
values = [3.5215257120407011]
arr = np.array(values, dtype=np.float64)
arr[0]

3.521525712040701

In [7]:
# as numpy value within np.longdouble array
# The element is parsed from a string rather than a Python float literal: a float
# literal is already rounded to 64-bit before NumPy converts it, so any additional
# precision np.longdouble offers on a given platform could never show up here.
# NOTE: np.longdouble's width is platform-dependent; the saved output below matches
# the float64 result, presumably because longdouble is 64-bit on the arm64 macOS
# machine reported above.
arr = np.array([np.longdouble("3.5215257120407011")], dtype=np.longdouble)
arr[0]

3.521525712040701

In [8]:
# build a one-column pandas DataFrame directly from the Python float
# (nothing is read from disk in this cell)
# NOTE: the 3.521526 displayed below is presumably pandas' display rounding rather
# than a change to the stored value — casting the same frame to a decimal type in
# the following cell shows all digits
pd.DataFrame({"col_a": [3.5215257120407011]})

Unnamed: 0,col_a
0,3.521526


In [9]:
# build the same one-column frame, then cast the column to an Arrow-backed
# decimal128(17, 16) dtype; precision and scale mirror what DuckDB infers for
# the bare literal further below
decimal_dtype = pd.ArrowDtype(pa.decimal128(17, 16))
frame = pd.DataFrame({"col_a": [3.5215257120407011]})
frame["col_a"].astype(decimal_dtype)

0    3.5215257120407011
Name: col_a, dtype: decimal128(17, 16)[pyarrow]

In [10]:
# show results from a pyarrow decimal128(17, 16) array; the element is a Decimal
# created from the *string* form of the number, so no float conversion is involved
pa.array([decimal.Decimal("3.5215257120407011")], type=pa.decimal128(17, 16))

<pyarrow.lib.Decimal128Array object at 0x15eeac580>
[
  3.5215257120407011
]

In [11]:
# cast the whole frame to Arrow decimal128(17, 16) and persist it as
# example.parquet so later cells can inspect how the value is retained
decimal_frame = pd.DataFrame({"col_a": [3.5215257120407011]}).astype(
    pd.ArrowDtype(pa.decimal128(17, 16))
)
decimal_frame.to_parquet("example.parquet")

In [12]:
# print the parquet file three ways, in order: as a pandas frame, as a pyarrow
# table, and as the pyarrow schema (each followed by a blank separator line)
for reader in (pd.read_parquet, parquet.read_table, parquet.read_schema):
    print(reader("example.parquet"), "\n")

                col_a
0  3.5215257120407011 

pyarrow.Table
col_a: decimal128(17, 16)
----
col_a: [[3.5215257120407011]] 

col_a: decimal128(17, 16)
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 395 



In [13]:
# select the bare numeric literal in DuckDB (no cast) and fetch it as an Arrow
# table to see which type DuckDB assigns to it
query = """
  SELECT 3.5215257120407011;
  """
with duckdb.connect() as ddb:
    result = ddb.execute(query).arrow()
result

pyarrow.Table
3.5215257120407011: decimal128(17, 16)
----
3.5215257120407011: [[3.5215257120407011]]

In [14]:
# cast the literal to DOUBLE in DuckDB and fetch it as an Arrow table to see
# how the value is represented as a double
query = """
  SELECT CAST(3.5215257120407011 AS DOUBLE);
  """
with duckdb.connect() as ddb:
    result = ddb.execute(query).arrow()
result

pyarrow.Table
CAST(3.5215257120407011 AS DOUBLE): double
----
CAST(3.5215257120407011 AS DOUBLE): [[3.521525712040701]]

In [15]:
# cast the literal to DECIMAL without giving a width/scale; as the output shows,
# DuckDB resolves this to DECIMAL(18,3) rather than sizing it from the literal,
# so the value is rounded to three decimal places
query = """
  SELECT CAST(3.5215257120407011 AS DECIMAL);
  """
with duckdb.connect() as ddb:
    result = ddb.execute(query).arrow()
result

pyarrow.Table
CAST(3.5215257120407011 AS DECIMAL(18,3)): decimal128(18, 3)
----
CAST(3.5215257120407011 AS DECIMAL(18,3)): [[3.522]]

In [16]:
# read example.parquet back through DuckDB and fetch it as an Arrow table to
# check that the decimal column type and value survive the round trip
query = """
  SELECT *
  FROM read_parquet('example.parquet');
  """
with duckdb.connect() as ddb:
    result = ddb.execute(query).arrow()
result

pyarrow.Table
col_a: decimal128(17, 16)
----
col_a: [[3.5215257120407011]]

In [17]:
# create example.csv with a header row and one row holding the floating point
# number as text (no newline is written after the value)
with open(file="example.csv", mode="w", encoding="utf-8") as csv_handle:
    csv_handle.writelines(["col_a\n", "3.5215257120407011"])

In [18]:
# read the value from the CSV with pandas using default settings
# NOTE: the 3.521526 displayed below is presumably pandas' display rounding, as
# with the DataFrame shown earlier — it does not by itself show what was parsed
pd.read_csv("example.csv")

Unnamed: 0,col_a
0,3.521526


In [19]:
# try to read the value from pyarrow's csv reader with default options;
# per the output, the column type is inferred as double
pacsv.read_csv(input_file="example.csv")

pyarrow.Table
col_a: double
----
col_a: [[3.521525712040701]]

In [20]:
# read example.csv through DuckDB's read_csv with its automatic settings and
# fetch the result as an Arrow table to see which column type is detected
query = """
  SELECT *
  FROM read_csv('example.csv');
  """
with duckdb.connect() as ddb:
    result = ddb.execute(query).arrow()

result

pyarrow.Table
col_a: double
----
col_a: [[3.521525712040701]]

In [21]:
# read example.csv again, but restrict DuckDB's auto_type_candidates to a decimal
# type so the column is not detected as double; note DECIMAL(18,16) is one digit
# wider than the DECIMAL(17,16) DuckDB assigned to the bare literal earlier
query = """
  SELECT *
  FROM read_csv('example.csv', auto_type_candidates=['DECIMAL(18,16)']);
  """
with duckdb.connect() as ddb:
    result = ddb.execute(query).arrow()

result

pyarrow.Table
col_a: decimal128(18, 16)
----
col_a: [[3.5215257120407011]]