<a href="https://colab.research.google.com/github/cloudhood/learn-pyarrow/blob/main/notebooks/getting_started.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyarrow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pyarrow as pa
import pyarrow.parquet as pq

In [5]:
# A pyarrow array holds values of one uniform type, much like a numpy array.
# Each column gets an explicit integer width instead of relying on inference.
days = pa.array([1, 12, 17, 23, 28], type=pa.int8())
months = pa.array([1, 3, 5, 7, 1], type=pa.int8())
years = pa.array([1990, 2000, 1995, 2000, 1995], type=pa.int16())

# Assemble the columns into a table; the mapping's key order is the column order.
birthdays_table = pa.table({"days": days, "months": months, "years": years})
birthdays_table

pyarrow.Table
days: int8
months: int8
years: int16
----
days: [[1,12,17,23,28]]
months: [[1,3,5,7,1]]
years: [[1990,2000,1995,2000,1995]]

In [9]:
# Round-trip the table through a Parquet file in the working directory:
# write it out, then read it straight back into a new table.
parquet_path = 'birthdays.parquet'
pq.write_table(birthdays_table, parquet_path)
reloaded_birthdays = pq.read_table(parquet_path)

# Display the reloaded table so it can be compared with the original above.
reloaded_birthdays

pyarrow.Table
days: int8
months: int8
years: int16
----
days: [[1,12,17,23,28]]
months: [[1,3,5,7,1]]
years: [[1990,2000,1995,2000,1995]]

In [10]:
## Compute functions
import pyarrow.compute as pc

# Count how often each distinct birth year occurs.
# The result is a struct array pairing each year with its count.
year_counts = pc.value_counts(birthdays_table['years'])
year_counts

<pyarrow.lib.StructArray object at 0x7f6cb5185220>
-- is_valid: all not null
-- child 0 type: int16
  [
    1990,
    2000,
    1995
  ]
-- child 1 type: int64
  [
    1,
    2,
    2
  ]

In [18]:
## Working with large data - partition into smaller chunks
import pyarrow.dataset as ds

# Partition on the "years" column, reusing the field (name and type)
# from the table's own schema so the two cannot drift apart.
years_partitioning = ds.partitioning(
    pa.schema([birthdays_table.schema.field("years")])
)

# Write one directory of Parquet files per distinct year under "savedir".
# 'delete_matching' is presumably here so the cell can be re-run over an
# existing output directory.
ds.write_dataset(
    birthdays_table,
    "savedir",
    format="parquet",
    existing_data_behavior='delete_matching',
    partitioning=years_partitioning,
)

# Re-open the directory as a dataset; the "years" value is recovered from the path.
birthdays_dataset = ds.dataset("savedir", format="parquet", partitioning=["years"])
print(birthdays_dataset.files)

## Lazy load on each iteration
import datetime

# datetime.utcnow() is deprecated (Python 3.12+) and returns a naive datetime;
# request a timezone-aware UTC "now" instead. Only the year is used.
current_year = datetime.datetime.now(datetime.timezone.utc).year

# to_batches() yields the dataset one record batch at a time, so the whole
# dataset never has to be materialised at once.
for table_chunk in birthdays_dataset.to_batches():
    # current year minus birth year, computed element-wise for the batch
    print("AGES", pc.subtract(current_year, table_chunk["years"]))

['savedir/1990/part-0.parquet', 'savedir/1995/part-0.parquet', 'savedir/2000/part-0.parquet']
AGES [
  33
]
AGES [
  28,
  28
]
AGES [
  23,
  23
]
