In [2]:
import pyarrow as pa

In [4]:
# Three equal-length columns, each given an explicit narrow integer type.
days = pa.array([1, 12, 17, 23, 28], type=pa.int8())
months = pa.array([1, 3, 5, 7, 1], type=pa.int8())
years = pa.array([1990, 2000, 1995, 2000, 1995], type=pa.int16())

# Assemble the columns into a table; the dict's insertion order fixes the
# column order (days, months, years).
birthdays_table = pa.table({"days": days, "months": months, "years": years})

In [6]:
# Echo the table so the notebook renders its schema and column values.
birthdays_table

pyarrow.Table
days: int8
months: int8
years: int16
----
days: [[1,12,17,23,28]]
months: [[1,3,5,7,1]]
years: [[1990,2000,1995,2000,1995]]

In [7]:
import pyarrow.parquet as pq

# Persist the in-memory table as a single Parquet file in the working directory.
pq.write_table(birthdays_table, where='birthdays.parquet')

In [9]:
# Read the Parquet file back into a new table for a round-trip comparison.
reloaded_birthdays = pq.read_table(source='birthdays.parquet')

In [10]:
# Display the reloaded table so it can be checked against what was written.
reloaded_birthdays

pyarrow.Table
days: int8
months: int8
years: int16
----
days: [[1,12,17,23,28]]
months: [[1,3,5,7,1]]
years: [[1990,2000,1995,2000,1995]]

In [11]:
import pyarrow.compute as pc

In [12]:
# Count how many rows share each birth year; the result is a struct array
# pairing every distinct value with its number of occurrences.
pc.value_counts(birthdays_table.column("years"))

<pyarrow.lib.StructArray object at 0x10f52ed00>
-- is_valid: all not null
-- child 0 type: int16
  [
    1990,
    2000,
    1995
  ]
-- child 1 type: int64
  [
    1,
    2,
    2
  ]

In [13]:
import pyarrow.dataset as ds

# Write the table as a Parquet dataset under "savedir", split into one
# sub-directory per distinct value of the "years" column.
# `existing_data_behavior="overwrite_or_ignore"` keeps this cell re-runnable:
# the default ("error") raises as soon as "savedir" already holds data, which
# happens on any second execution of the notebook.
ds.write_dataset(
    birthdays_table,
    "savedir",
    format="parquet",
    partitioning=ds.partitioning(
        pa.schema([birthdays_table.schema.field("years")])
    ),
    existing_data_behavior="overwrite_or_ignore",
)

In [14]:
# Open the partitioned directory as a dataset; the directory level beneath
# "savedir" is mapped to a column named "years".
birthdays_dataset = ds.dataset(
    "savedir",
    format="parquet",
    partitioning=["years"],
)

In [16]:
# List the paths of the files that make up the dataset.
birthdays_dataset.files

['savedir/1990/part-0.parquet',
 'savedir/1995/part-0.parquet',
 'savedir/2000/part-0.parquet']

In [18]:
import datetime

# `datetime.utcnow()` is deprecated as of Python 3.12; request a
# timezone-aware UTC timestamp instead. Only the year is used, so the
# resulting value is the same.
current_year = datetime.datetime.now(datetime.timezone.utc).year

# Stream the dataset batch by batch and print each batch's ages.
# NOTE: the printed ages depend on the year in which this cell is executed.
for table_chunk in birthdays_dataset.to_batches():
    print("AGES", pc.subtract(current_year, table_chunk["years"]))

AGES [
  33
]
AGES [
  28,
  28
]
AGES [
  23,
  23
]
