In [None]:
# /// script
# requires-python = ">=3.13"
# dependencies = [
#     "pandas",
#     "pyarrow",
# ]
# ///

In [10]:
import pyarrow as pa
import pyarrow.ipc as ipc

# Input datasets; both are relative paths that the reader helpers below pass straight to open().
fileA = "gaia-sample.arrow" # downloaded from: https://observablehq.com/@uwdata/peeking-into-the-gaia-star-catalog
fileB = "stevens.arrow" # from dvdkouril/spatial-genome-data

def openArrowFileReader(file_path):
    """Load a whole Arrow IPC file-format dataset into a single table.

    Opens ``file_path`` in binary mode, wraps the handle in
    ``ipc.RecordBatchFileReader`` and returns the result of ``read_all()``.
    The file handle is closed when the function returns.
    """
    with open(file_path, "rb") as source:
        return ipc.RecordBatchFileReader(source).read_all()

def openArrowStreamReader(file_path):
    """Load a whole Arrow IPC stream-format dataset into a single table.

    Opens ``file_path`` in binary mode, wraps the handle in
    ``ipc.RecordBatchStreamReader`` and returns the result of ``read_all()``.
    The file handle is closed when the function returns.
    """
    with open(file_path, "rb") as source:
        return ipc.RecordBatchStreamReader(source).read_all()

In [11]:
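# gaia-sample.arrow (fileA) with the file reader — left disabled; this file is read with the stream reader in a later cell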
# openArrowFileReader(fileA).to_pandas().head()

In [12]:
# stevens.arrow (fileB): read with the IPC file-format reader and preview the first rows
openArrowFileReader(fileB).to_pandas().head()

Unnamed: 0,x,y,z,chr,coord
0,9.925,-6.58,-1.571,chr a,3000000
1,9.318,-7.277,-0.979,chr a,3100000
2,8.566,-7.13,-0.231,chr a,3200000
3,7.593,-6.825,-0.851,chr a,3300000
4,7.034,-5.991,-0.617,chr a,3400000


In [13]:
# gaia-sample.arrow (fileA): read with the IPC stream-format reader and preview the first rows
openArrowStreamReader(fileA).to_pandas().head()

Unnamed: 0,source_id,ra,dec,parallax,parallax_error,phot_g_mean_mag,bp_rp,dr2_radial_velocity,dr2_radial_velocity_error
0,5.972832e+18,259.859885,-38.126187,,,20.746231,,,
1,6.498936e+18,350.558203,-55.654576,,,21.128216,1.293613,,
2,4.073224e+18,281.893218,-25.34871,0.305901,0.318304,19.32138,1.123394,,
3,4.070236e+18,268.673323,-22.517255,0.353762,2.043955,20.65499,,,
4,5.208927e+18,121.165957,-78.968019,3.175011,0.013755,11.387156,0.953646,32.508114,2.188383


In [9]:
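# stevens.arrow (fileB) with the stream reader — left disabled; this file is read with the file reader in an earlier cell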
# openArrowStreamReader(fileB).to_pandas().head()

In [14]:
# chatgpt version of my js code
import numpy as np
import pyarrow as pa
import pyarrow.compute as pc

def compute_normalization_factor(positions: list[np.ndarray]) -> float:
    """Return the uniform scale that shrinks the points' longest extent to 1.

    The axis-aligned bounding box of ``positions`` is computed and the factor
    ``1 / longest_side`` is returned, so multiplying every coordinate by it
    makes the longest bounding-box side exactly one unit long. The points are
    not re-centered: the bounding-box center is only printed.

    Args:
        positions: Sequence of points (or a 2-D array with one point per row).

    Returns:
        The scale factor. When the bounding box has zero extent (a single
        point, or all points identical) there is nothing to scale, so ``1.0``
        is returned instead of dividing by zero.

    Raises:
        ValueError: if ``positions`` is empty.
    """
    points = np.asarray(positions)
    if points.size == 0:
        raise ValueError(
            "cannot compute a normalization factor for an empty set of positions"
        )

    bb_max = np.max(points, axis=0)
    bb_min = np.min(points, axis=0)

    # Diagnostic output kept from the original: shows where the data is centered.
    bb_center = (bb_max + bb_min) * 0.5
    print(bb_center)

    bb_sides = bb_max - bb_min
    max_dim = np.max(bb_sides)

    # A degenerate box would otherwise yield 1 / 0 == inf and turn every
    # scaled coordinate into inf/NaN.
    if max_dim == 0:
        return 1.0

    return 1 / max_dim

def normalize_xyz_columns(table: pa.Table) -> pa.Table:
    """Return a copy of ``table`` with its x/y/z columns uniformly rescaled.

    The three coordinate columns are gathered into an (n, 3) array, multiplied
    by the single factor from ``compute_normalization_factor`` and written
    back in place of the originals. All other columns are carried over
    untouched, in the same order.

    Args:
        table: Arrow table containing numeric ``x``, ``y`` and ``z`` columns.

    Returns:
        A new table; the input table is not modified.

    Raises:
        AssertionError: if the ``x``, ``y`` or ``z`` column is missing.
    """
    column_names = table.schema.names

    assert "x" in column_names, "x column is missing"
    assert "y" in column_names, "y column is missing"
    assert "z" in column_names, "z column is missing"

    axes = ("x", "y", "z")

    # Assemble the coordinates into an array of vec3s (one row per point)
    positions = np.stack([table[axis].to_numpy() for axis in axes], axis=1)

    # Compute the normalization factor and scale all three axes by it
    scale = compute_normalization_factor(positions)
    normalized_positions = positions * scale

    # Replace only x/y/z. Rebuilding the whole table from NumPy arrays would
    # round-trip every other column too, which can alter their types (e.g.
    # integer columns containing nulls come back as floats).
    result = table
    for axis_index, axis in enumerate(axes):
        column_index = column_names.index(axis)
        scaled = pa.array(np.ascontiguousarray(normalized_positions[:, axis_index]))
        # Keep the original field definition, adjusting only the value type
        # in case scaling changed it (e.g. integer input becomes floating point).
        field = table.schema.field(column_index).with_type(scaled.type)
        result = result.set_column(column_index, field, scaled)

    return result

In [17]:
# Load the original table and derive its normalized counterpart
stevens_table = openArrowFileReader(fileB)
stevens_table_normalized = normalize_xyz_columns(stevens_table)

# Re-write both tables in the Stream format, one output file per table
stream_outputs = (
    ("stevens-stream.arrow", stevens_table),
    ("stevens-normalized-stream.arrow", stevens_table_normalized),
)
for out_path, out_table in stream_outputs:
    with open(out_path, "wb") as sink, ipc.RecordBatchStreamWriter(sink, schema=out_table.schema) as writer:
        writer.write_table(out_table)

# stevens_table
stevens_table_normalized

[ 2.6435 -0.609  -0.0825]


pyarrow.Table
x: double
y: double
z: double
chr: string
coord: int64
----
x: [[0.17405868013538872,0.16341347924448887,0.15022535557075464,0.13316146682801072,0.12335806106522158,...,-0.14340330755335753,-0.1541011206397643,-0.1509970011048561,-0.16425527437259957,-0.18186282246891494]]
y: [[-0.11539608214517459,-0.127619648901282,-0.125041651321443,-0.1196927447782396,-0.10506655442731624,...,-0.24952210589081217,-0.23517651391592573,-0.23517651391592573,-0.22954700899668545,-0.24175303835429052]]
z: [[-0.02755125304712299,-0.017169113133757737,-0.004051139054032725,-0.014924326125462548,-0.010820574875922905,...,-0.23172164641097137,-0.23705301555567246,-0.2280212553269848,-0.21357043896108452,-0.21960330404587783]]
chr: [["chr a","chr a","chr a","chr a","chr a",...,"chr t","chr t","chr t","chr t","chr t"]]
coord: [[3000000,3100000,3200000,3300000,3400000,...,169600000,169700000,169800000,169900000,170000000]]