In [None]:
import pyarrow as pa

In [4]:
# Primitive Arrow data type: 32-bit integer.
t1 = pa.int32()

In [5]:
# Arrow string data type.
t2 = pa.string()

In [6]:
# Binary data type created without a length argument (compare with t4, which passes one).
# NOTE(review): t3 is not used again in the visible cells.
t3 = pa.binary()

In [7]:
# Passing a length yields a fixed-size binary type (prints as fixed_size_binary[10]).
t4 = pa.binary(10)

In [8]:
# Timestamp data type with millisecond ("ms") unit.
t5 = pa.timestamp("ms")

In [9]:
# A bare expression displays the repr, which wraps the type name: DataType(int32).
t1

DataType(int32)

In [10]:
# print() shows the str form, which is only the type name (contrast with the repr above).
print(t1)

int32


In [11]:
# The fixed-size binary type includes its byte width in the printed name.
print(t4)

fixed_size_binary[10]


In [12]:
# The timestamp type includes its unit in the printed name.
print(t5)

timestamp[ms]


In [13]:
# A field pairs a name with a data type.
f0 = pa.field("int32_field",t1)

In [14]:
# The field repr shows both its name and its type.
f0

pyarrow.Field<int32_field: int32>

In [15]:
# The field's name, as a plain Python string.
f0.name

'int32_field'

In [16]:
# The field's data type (the same int32 type it was created with).
f0.type

DataType(int32)

In [17]:
# Nested type: a list whose items are int32 values.
t6 = pa.list_(t1)

In [18]:
# The repr shows the list type with its child field named "item".
t6

ListType(list<item: int32>)

In [21]:
# One named field per struct member, built from (name, type) pairs.
fields = [
    pa.field(member_name, member_type)
    for member_name, member_type in (
        ("s0", t1),
        ("s1", t2),
        ("s2", t4),
        ("s3", t6),
    )
]

In [22]:
# Struct type assembled from the list of Field objects.
t7 = pa.struct(fields)

In [23]:
# The repr wraps the struct signature in StructType(...).
t7

StructType(struct<s0: int32, s1: string, s2: fixed_size_binary[10], s3: list<item: int32>>)

In [24]:
# The str form lists each member as "name: type".
print(t7)

struct<s0: int32, s1: string, s2: fixed_size_binary[10], s3: list<item: int32>>


In [25]:
# A struct type can also be built straight from (name, type) tuples,
# without creating Field objects first.
t8 = pa.struct(
    [
        ("s0", t1),
        ("s1", t2),
        ("s2", t4),
        ("s3", t6),
    ]
)

In [26]:
# Prints the same struct signature as t7, which was built from Field objects.
print(t8)

struct<s0: int32, s1: string, s2: fixed_size_binary[10], s3: list<item: int32>>


In [28]:
# A schema is an ordered collection of named, typed fields; here it is
# built from (name, type) tuples.
myschema = pa.schema(
    [
        ("field0", t1),
        ("field1", t2),
        ("field2", t4),
        ("field3", t6),
    ]
)

In [29]:
# Displays one "name: type" line per field; nested types also list their children.
myschema

field0: int32
field1: string
field2: fixed_size_binary[10]
field3: list<item: int32>
  child 0, item: int32

In [30]:
# Build an array from Python values: the type is inferred (int64 here)
# and None becomes a null entry.
arr = pa.array([1,2,None,3])

In [31]:
# Display the array; the None input is shown as null.
arr

<pyarrow.lib.Int64Array object at 0x10323ea60>
[
  1,
  2,
  null,
  3
]

In [34]:
# An explicit `type` overrides inference: this produces a UInt16Array instead of int64.
pa.array([1,2],type=pa.uint16())

<pyarrow.lib.UInt16Array object at 0x104c7e820>
[
  1,
  2
]

In [35]:
# The inferred data type of the array.
arr.type

DataType(int64)

In [36]:
# The length counts every slot, including the null one.
len(arr)

4

In [37]:
# Number of null entries in the array.
arr.null_count

1

In [38]:
# Indexing returns an Arrow scalar (Int64Scalar), not a plain Python int.
arr[0]

<pyarrow.Int64Scalar: 1>

In [40]:
# Indexing a null slot still returns an Int64Scalar; it wraps None.
arr[2]

<pyarrow.Int64Scalar: None>

In [41]:
# Slicing returns another Int64Array; null entries are preserved.
arr[1:3]

<pyarrow.lib.Int64Array object at 0x104c4cb20>
[
  2,
  null
]

In [42]:
# Nested Python lists produce a list array: the outer None is a null list,
# while the None inside a list is a null element.
nested_ar = pa.array([[],None,[1,2],[None,1]])

In [43]:
# Print the nested array: an empty list, a null, and two populated lists.
print(nested_ar)

[
  [],
  null,
  [
    1,
    2
  ],
  [
    null,
    1
  ]
]


In [44]:
# Struct type with an int8 member "x" and a boolean member "y".
ty = pa.struct(
    [
        ("x", pa.int8()),
        ("y", pa.bool_()),
    ]
)

In [45]:
# Struct array from dicts: keys are matched to the struct's field names.
pa.array([{'x':1,'y':True},{'x':2,'y':False}],type=ty)

<pyarrow.lib.StructArray object at 0x104c76460>
-- is_valid: all not null
-- child 0 type: int8
  [
    1,
    2
  ]
-- child 1 type: bool
  [
    true,
    false
  ]

In [46]:
# Struct array from tuples: values are assigned to the struct's fields by position.
pa.array([(3,True),(4,False)],type=ty)

<pyarrow.lib.StructArray object at 0x104c7ebe0>
-- is_valid: all not null
-- child 0 type: int8
  [
    3,
    4
  ]
-- child 1 type: bool
  [
    true,
    false
  ]

In [47]:
# Two kinds of missing data: a None row is flagged invalid at the struct level
# (is_valid is false for it), while a missing key or a None value becomes a
# null in that child array only. The invalid row still occupies a slot in
# each child array (shown as 0 / false in the output).
pa.array([{'x':1},None,{'y':None}],type=ty)

<pyarrow.lib.StructArray object at 0x104c7eac0>
-- is_valid:
  [
    true,
    false,
    true
  ]
-- child 0 type: int8
  [
    1,
    0,
    null
  ]
-- child 1 type: bool
  [
    null,
    false,
    null
  ]

In [48]:
# int16 child array that will become the struct's "x" member.
xs = pa.array([5,6,7],type=pa.int16())

In [49]:
# Boolean child array (type inferred) that will become the struct's "y" member.
ys = pa.array([False,True,True])

In [50]:
# Assemble a struct array from existing child arrays rather than from
# Python rows.
# NOTE(review): this rebinds `arr`, which previously held the Int64Array
# created earlier; re-running the earlier `arr` inspection cells after this
# one will show the struct array instead.
arr = pa.StructArray.from_arrays(
    (xs, ys),
    names=("x", "y"),
)

In [51]:
# The struct type is derived from the child arrays and the supplied names.
arr.type

StructType(struct<x: int16, y: bool>)

In [52]:
# Display the struct array: validity info followed by each child array.
arr

<pyarrow.lib.StructArray object at 0x104c7ea60>
-- is_valid: all not null
-- child 0 type: int16
  [
    5,
    6,
    7
  ]
-- child 1 type: bool
  [
    false,
    true,
    true
  ]

# Record Batches

In [53]:
# Three equal-length columns (int64, string, bool - all inferred), each
# allowed to contain nulls; they become the columns of a record batch.
data = [
    pa.array(column_values)
    for column_values in (
        [1, 2, 3, 4],
        ["foo", "bar", "baz", None],
        [True, None, False, True],
    )
]

In [54]:
# A plain Python list holding the three Arrow arrays.
data

[<pyarrow.lib.Int64Array object at 0x104c7e700>
 [
   1,
   2,
   3,
   4
 ],
 <pyarrow.lib.StringArray object at 0x104c7e9a0>
 [
   "foo",
   "bar",
   "baz",
   null
 ],
 <pyarrow.lib.BooleanArray object at 0x104c7eee0>
 [
   true,
   null,
   false,
   true
 ]]

In [55]:
# A record batch groups equal-length arrays under column names.
batch = pa.RecordBatch.from_arrays(
    data,
    names=["f0", "f1", "f2"],
)

In [57]:
# Number of columns in the batch.
batch.num_columns

3

In [58]:
# Number of rows, shared by every column in the batch.
batch.num_rows

4

In [59]:
# The schema pairs the supplied names with the types inferred for each array.
batch.schema

f0: int64
f1: string
f2: bool

In [60]:
# Integer indexing selects a column: index 1 is the string column "f1".
batch[1]

<pyarrow.lib.StringArray object at 0x104c07be0>
[
  "foo",
  "bar",
  "baz",
  null
]

In [62]:
# Five references to the same record batch (the batch object is not copied).
batches = [batch for _ in range(5)]

In [63]:
# Combine the record batches into one Table; each column then consists of
# one chunk per input batch.
table = pa.Table.from_batches(batches)

In [64]:
# The Table display shows the schema followed by an abbreviated view of each column's chunks.
table

pyarrow.Table
f0: int64
f1: string
f2: bool
----
f0: [[1,2,3,4],[1,2,3,4],...,[1,2,3,4],[1,2,3,4]]
f1: [["foo","bar","baz",null],["foo","bar","baz",null],...,["foo","bar","baz",null],["foo","bar","baz",null]]
f2: [[true,null,false,true],[true,null,false,true],...,[true,null,false,true],[true,null,false,true]]

In [65]:
# Integer indexing a Table selects a column, returned as a ChunkedArray.
c = table[0]

In [66]:
# The ChunkedArray is displayed as a list of its underlying arrays.
c

<pyarrow.lib.ChunkedArray object at 0x104c7b4f0>
[
  [
    1,
    2,
    3,
    4
  ],
  [
    1,
    2,
    3,
    4
  ],
...,
  [
    1,
    2,
    3,
    4
  ],
  [
    1,
    2,
    3,
    4
  ]
]

In [67]:
# One chunk per record batch that went into the table.
c.num_chunks

5

In [68]:
# An individual chunk is an ordinary Int64Array.
c.chunk(0)

<pyarrow.lib.Int64Array object at 0x104c07fa0>
[
  1,
  2,
  3,
  4
]

In [69]:
# Single-column schema (x: int64) used by the batch generator and the reader below.
schema = pa.schema([('x',pa.int64())])

In [71]:
def iter_record_batches(num_batches=2):
    """Lazily yield identical single-column record batches.

    Each batch has one column built from ``[1, 2, 3]`` and is created
    against the notebook-level ``schema`` variable, which must be defined
    before the generator is consumed.

    Parameters
    ----------
    num_batches : int, default 2
        How many record batches to yield.

    Yields
    ------
    pyarrow.RecordBatch
        A batch created with ``schema=schema``.
    """
    # The loop index is not needed; only the repetition count matters.
    for _ in range(num_batches):
        yield pa.RecordBatch.from_arrays([pa.array([1, 2, 3])], schema=schema)

In [73]:
# Wrap the batch generator in a RecordBatchReader; the schema is supplied
# explicitly, separately from the batches themselves.
reader = pa.RecordBatchReader.from_batches(schema,iter_record_batches())

In [74]:
# The reader's repr shows only the object itself, not its batches.
reader

<pyarrow.lib.RecordBatchReader at 0x104c59cc0>

In [75]:
# The reader exposes the schema it was created with.
print(reader.schema)

x: int64


In [76]:
# Iterate the reader with a dedicated loop variable so the notebook-level
# `batch` (the three-column record batch built earlier) is not overwritten;
# reusing that name would make a later re-run of `batches = [batch] * 5`
# silently pick up the wrong batch.
# NOTE(review): `reader` is fed by a generator, so it is presumably exhausted
# after this loop - re-create it before iterating again.
for streamed_batch in reader:
    print(streamed_batch)

pyarrow.RecordBatch
x: int64
pyarrow.RecordBatch
x: int64
