In [3]:
import pyarrow as pa

In [4]:
import pyarrow.compute as pc

In [5]:
# Small integer array used as input for the aggregation example.
a = pa.array([1, 1, 2, 3])

In [6]:
# Aggregate every element of `a`; the result is a pyarrow scalar, not a Python int.
pc.sum(a)

<pyarrow.Int64Scalar: 7>

In [7]:
# Left-hand operand for the element-wise comparison.
a = pa.array([1, 1, 2, 3])

In [8]:
# Right-hand operand; same length as `a`.
b = pa.array([4, 1, 2, 8])

In [9]:
# Position-by-position equality; yields a boolean array with one entry per element.
pc.equal(a, b)

<pyarrow.lib.BooleanArray object at 0x107d24d00>
[
  false,
  true,
  true,
  false
]

In [10]:
# Two floating-point pyarrow scalars to show that compute functions accept scalars too.
x = pa.scalar(7.8)
y = pa.scalar(9.3)

In [11]:
# Scalar-by-scalar product; the result is again a pyarrow scalar.
pc.multiply(x, y)

<pyarrow.DoubleScalar: 72.54>

In [12]:
# Two-column table; `y` is deliberately in descending order for the sort example.
t = pa.table(
    {
        "x": [1, 2, 3],
        "y": [3, 2, 1],
    }
)

In [13]:
# Display the table's schema and column contents.
t

pyarrow.Table
x: int64
y: int64
----
x: [[1,2,3]]
y: [[3,2,1]]

In [14]:
# Row positions that would order `t` by column `y`, smallest first.
# This returns indices only; the table itself is not reordered.
i = pc.sort_indices(
    t,
    sort_keys=[("y", "ascending")],
)

In [15]:
# Display the computed sort indices.
i

<pyarrow.lib.UInt64Array object at 0x107ce9640>
[
  2,
  1,
  0
]

# Grouped Aggregations

In [16]:
# Table with a string grouping column and an integer value column.
# NOTE: this rebinds `t`, replacing the table used in the sorting example.
t = pa.table(
    {
        "keys": pa.array(["a", "a", "b", "b", "c"]),
        "values": pa.array([1, 2, 3, 4, 5]),
    }
)

In [17]:
# Sum `values` within each distinct `keys` group.
# The aggregate column is named "<column>_<function>".
t.group_by("keys").aggregate(
    [("values", "sum")]
)

pyarrow.Table
values_sum: int64
keys: string
----
values_sum: [[3,7,5]]
keys: [["a","b","c"]]

In [18]:
# Several aggregations in one pass: per-group sum of `values` and row count of `keys`.
t.group_by("keys").aggregate(
    [
        ("values", "sum"),
        ("keys", "count"),
    ]
)

pyarrow.Table
values_sum: int64
keys_count: int64
keys: string
----
values_sum: [[3,7,5]]
keys_count: [[2,2,1]]
keys: [["a","b","c"]]

In [19]:
# Single group "a" whose value column contains two nulls, used to show
# how aggregation options control null handling.
table_with_nulls = pa.table(
    {
        "keys": pa.array(["a", "a", "a"]),
        "values": pa.array([1, None, None]),
    }
)

In [21]:
# The third tuple element carries per-aggregation options.
# mode="all" counts every row in the group, null entries included.
table_with_nulls.group_by(["keys"]).aggregate(
    [("values", "count", pc.CountOptions(mode="all"))]
)

pyarrow.Table
values_count: int64
keys: string
----
values_count: [[3]]
keys: [["a"]]

# Joins

In [22]:
# Left side of the join examples: one row per id with a year.
table1 = pa.table(
    {
        "id": [1, 2, 3],
        "year": [2020, 2022, 2019],
    }
)

In [23]:
# Right side of the join examples; only id 3 also appears in `table1`.
table2 = pa.table(
    {
        "id": [3, 4],
        "n_legs": [5, 100],
        "animal": ["Brittle stars", "Centipede"],
    }
)

In [24]:
# Join on `id`. "left outer" is the default join type and is spelled out here:
# every row of `table1` is kept, and unmatched right-hand columns become null.
joined_table = table1.join(
    table2,
    keys="id",
    join_type="left outer",
)

In [25]:
# Display the join result; rows without a match in `table2` show null for its columns.
joined_table

pyarrow.Table
id: int64
year: int64
n_legs: int64
animal: string
----
id: [[3,1,2]]
year: [[2019,2020,2022]]
n_legs: [[5,null,null]]
animal: [["Brittle stars",null,null]]

In [26]:
# Full outer join: keep unmatched rows from both sides, filling the gaps with null.
table1.join(
    table2,
    keys="id",
    join_type="full outer",
)

pyarrow.Table
id: int64
year: int64
n_legs: int64
animal: string
----
id: [[3,1,2,4]]
year: [[2019,2020,2022,null]]
n_legs: [[5,null,null,100]]
animal: [["Brittle stars",null,null,"Centipede"]]

In [27]:
# Extend `table2` with a `year` column so a two-column join key can be used.
# `append_column` returns a new table; `table2` itself is left unchanged.
table2_withyear = table2.append_column(
    "year",
    pa.array([2019, 2022]),
)

In [28]:
# Join on the composite key (id, year); a row matches only when both columns agree.
# "left outer" is the default join type, written out for clarity.
table1.join(
    table2_withyear,
    keys=["id", "year"],
    join_type="left outer",
)

pyarrow.Table
id: int64
year: int64
n_legs: int64
animal: string
----
id: [[3,1,2]]
year: [[2019,2020,2022]]
n_legs: [[5,null,null]]
animal: [["Brittle stars",null,null]]

In [29]:
# Re-display the left-hand input to inspect its id/year pairs.
table1

pyarrow.Table
id: int64
year: int64
----
id: [[1,2,3]]
year: [[2020,2022,2019]]

In [30]:
# Re-display the right-hand input, including the appended `year` column.
table2_withyear

pyarrow.Table
id: int64
n_legs: int64
animal: string
year: int64
----
id: [[3,4]]
n_legs: [[5,100]]
animal: [["Brittle stars","Centipede"]]
year: [[2019,2022]]

In [31]:
# Same composite-key join as shown earlier, repeated after inspecting both inputs.
# "left outer" is the default join type, written out for clarity.
table1.join(
    table2_withyear,
    keys=["id", "year"],
    join_type="left outer",
)

pyarrow.Table
id: int64
year: int64
n_legs: int64
animal: string
----
id: [[3,1,2]]
year: [[2019,2020,2022]]
n_legs: [[5,null,null]]
animal: [["Brittle stars",null,null]]

In [32]:
import pyarrow.dataset as ds

In [33]:
# Wrap the in-memory tables as datasets so the dataset-level join API can be shown.
ds1, ds2 = ds.dataset(table1), ds.dataset(table2)

In [38]:
# Dataset-level join on `id`. The result is a dataset, not a table.
# "left outer" is the default join type, written out for clarity.
joined_ds = ds1.join(
    ds2,
    keys="id",
    join_type="left outer",
)

In [40]:
# Materialize up to the first 5 rows of the joined dataset as a table.
joined_ds.head(5)

pyarrow.Table
id: int64
year: int64
n_legs: int64
animal: string
----
id: [[3,1,2]]
year: [[2019,2020,2022]]
n_legs: [[5,null,null]]
animal: [["Brittle stars",null,null]]

# Filter

In [41]:
# Expression over the field "nums": true when the lowest bit is 0, i.e. the value is even.
# Only the expression is built here; no table is referenced yet.
even_filter = (
    pc.bit_wise_and(pc.field("nums"), pc.scalar(1))
    == pc.scalar(0)
)

In [42]:
# Display the expression's repr to confirm what was built.
even_filter

<pyarrow.compute.Expression (bit_wise_and(nums, 1) == 0)>

In [43]:
# Ten-row table whose `nums` column is the target of the filter expressions.
table = pa.table(
    {
        "nums": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        "chars": ["a", "b", "c", "d", "e", "f", "g", "h", "i", "l"],
    }
)

In [44]:
# Display the unfiltered table for comparison with the filtered results.
table

pyarrow.Table
nums: int64
chars: string
----
nums: [[1,2,3,4,5,6,7,8,9,10]]
chars: [["a","b","c","d","e","f","g","h","i","l"]]

In [45]:
# Keep only the rows where the even-number expression evaluates to true.
table.filter(even_filter)

pyarrow.Table
nums: int64
chars: string
----
nums: [[2,4,6,8,10]]
chars: [["b","d","f","h","l"]]

In [46]:
# `~` negates the expression, so this keeps the odd rows instead.
table.filter(~even_filter)

pyarrow.Table
nums: int64
chars: string
----
nums: [[1,3,5,7,9]]
chars: [["a","c","e","g","i"]]

In [47]:
# Expressions compose with `&`: keep rows that are even AND greater than 5.
# The parentheses around the comparison are required because `&` binds tighter than `>`.
table.filter(
    even_filter & (pc.field("nums") > 5)
)

pyarrow.Table
nums: int64
chars: string
----
nums: [[6,8,10]]
chars: [["f","h","l"]]