In [2]:
import pyarrow as pa
import random

In [None]:
# Describe each column as a field. The "name" field also carries a piece of
# field-level metadata; the metadata value is encoded to bytes before use.
note_text = f"Random note {random.randint(1, 100)}"
field1 = pa.field("id", pa.int64())
field2 = pa.field("name", pa.string(), metadata={"note": note_text.encode()})
field3 = pa.field("score", pa.float64())
schema = pa.schema([field1, field2, field3])
print("Schema:", schema)

# One Arrow array per column, listed in the same order as the schema fields.
id_column = pa.array([1, 2, 3])
name_column = pa.array(["Alice", "Bob", "Charlie"])
score_column = pa.array([95.5, 87.0, 92.3])
data = [id_column, name_column, score_column]

# A RecordBatch groups equal-length column arrays under one schema;
# conceptually it is a slice of a table.
record_batch = pa.RecordBatch.from_arrays(data, schema=schema)
print("\nRecordBatch with custom schema:")
print(record_batch)

# The schema used to build the batch can be read back from the batch itself.
print("\nAccessing the schema from the record batch:")
print(record_batch.schema)


Schema: id: int64
name: string
  -- field metadata --
  note: 'Random note 61'
score: double

RecordBatch with custom schema:
pyarrow.RecordBatch
id: int64
name: string
score: double
----
id: [1,2,3]
name: ["Alice","Bob","Charlie"]
score: [95.5,87,92.3]

Accessing the schema from the record batch:
id: int64
name: string
  -- field metadata --
  note: 'Random note 61'
score: double


In [3]:
import numpy as np

# Sample 10 integers from [0, 100) with NumPy, then convert them to a plain
# Python list before handing them to Arrow.
sampled_ints = np.random.randint(0, 100, size=10)
random_ints = sampled_ints.tolist()
arr = pa.array(random_ints)
print("Array:", arr)

# Round-trip: the Arrow array back to a Python list.
arr_as_list = arr.to_pylist()
print("Array as Python list:", arr_as_list)


Array: [
  56,
  92,
  85,
  34,
  68,
  42,
  1,
  61,
  64,
  72
]
Array as Python list: [56, 92, 85, 34, 68, 42, 1, 61, 64, 72]


In [5]:
# Start from five random floats, then overwrite two slots: one with None
# (a missing value, stored as an Arrow null) and one with NaN (a regular
# floating-point value that is still considered valid).
random_floats = np.random.rand(5).tolist()
random_floats[1] = None   # A missing value.
random_floats[3] = np.nan  # Not-a-number value.
arr_with_nulls = pa.array(random_floats)
print("Array with nulls and NaN:", arr_with_nulls)

# Array.is_valid() computes the validity mask for the whole array in one call,
# replacing the per-element Python loop over scalars.
print("Validity (is valid):", arr_with_nulls.is_valid().to_pylist())

# Convert to list to inspect the values: the null comes back as None, NaN as nan.
print("Converted list:", arr_with_nulls.to_pylist())


Array with nulls and NaN: [
  0.27179593319600037,
  null,
  0.9732419288204855,
  nan,
  0.3529178782684954
]
Validity (is valid): [True, False, True, True, True]
Converted list: [0.27179593319600037, None, 0.9732419288204855, nan, 0.3529178782684954]


In [6]:
# Build four sublists; each one first draws its length (1-5) and then that
# many random integers in the range 0-10.
list_data = []
for row in range(4):
    length = random.randint(1, 5)
    list_data.append([random.randint(0, 10) for _ in range(length)])

# Replace the third sublist with None so the list array contains a null entry.
list_data[2] = None
list_arr = pa.array(list_data)
print("List Array:", list_arr)

# Indexing returns a scalar; as_py() converts it to a regular Python list.
first_sublist = list_arr[0].as_py()
print("First element (a sublist):", first_sublist)


List Array: [
  [
    10,
    4,
    1,
    8,
    8
  ],
  [
    8
  ],
  null,
  [
    1,
    0
  ]
]
First element (a sublist): [10, 4, 1, 8, 8]


In [7]:
# Show the two pieces the list array exposes: the flat child values and the
# offsets; offsets[i]:offsets[i+1] delimits entry i within the flat values.
for label, component in (
    ("Underlying flat values array:", list_arr.values),
    ("Offsets array:", list_arr.offsets),
):
    print(label, component)


Underlying flat values array: [
  10,
  4,
  1,
  8,
  8,
  8,
  1,
  0
]
Offsets array: [
  0,
  5,
  6,
  6,
  8
]


In [8]:
# Rebuild the Python sublists by hand from the flat values and the offsets.
offsets = list_arr.offsets.to_pylist()
flat_values = list_arr.values.to_pylist()

# Pair every entry with its [start, stop) window in the flat values; null
# entries are emitted as None instead of a slice.
sublists = [
    flat_values[start:stop] if entry.is_valid else None
    for entry, start, stop in zip(list_arr, offsets, offsets[1:])
]
print("Reconstructed sublists:", sublists)

Reconstructed sublists: [[10, 4, 1, 8, 8], [8], None, [1, 0]]


In [20]:
# Same reconstruction written as an explicit loop. It reuses `offsets` and
# `flat_values` computed in the previous cell.
sublists = []
for i in range(len(offsets) - 1):
    start, stop = offsets[i], offsets[i + 1]
    # A null entry has no sublist of its own, so it is represented as None.
    sublist = flat_values[start:stop] if list_arr[i].is_valid else None
    sublists.append(sublist)
print("Reconstructed sublists:", sublists)

Reconstructed sublists: [[10, 4, 1, 8, 8], [8], None, [1, 0]]


In [10]:
import pyarrow as pa
import numpy as np
import random

# Two equal-length children: five random integers in [0, 100) and five random
# uppercase ASCII letters (code points 65-90).
num_arr = pa.array(np.random.randint(0, 100, size=5).tolist())
letters = [chr(random.randint(65, 90)) for _ in range(5)]
char_arr = pa.array(letters)

# Name each child and reuse the type Arrow inferred for it.
child_fields = [
    pa.field("num", num_arr.type),
    pa.field("char", char_arr.type),
]
struct_arr = pa.StructArray.from_arrays([num_arr, char_arr], fields=child_fields)
print("Struct Array:", struct_arr)


Struct Array: -- is_valid: all not null
-- child 0 type: int64
  [
    45,
    8,
    94,
    49,
    88
  ]
-- child 1 type: string
  [
    "P",
    "L",
    "J",
    "X",
    "E"
  ]


In [22]:
# Indexing the struct array returns a StructScalar that pairs each field name
# ("num", "char") with that row's value.
struct_arr[0]

<pyarrow.StructScalar: [('num', 45), ('char', 'P')]>

In [11]:
# Map type with string keys and 64-bit integer values.
ty = pa.map_(pa.string(), pa.int64())

# Each map entry is written as a list of (key, value) tuples.
data = [
    [('x', 1), ('y', 0)],
    [('a', 2), ('b', 45)],
]

pa.array(data, type=ty)

<pyarrow.lib.MapArray object at 0x117a5f8e0>
[
  keys:
  [
    "x",
    "y"
  ]
  values:
  [
    1,
    0
  ],
  keys:
  [
    "a",
    "b"
  ]
  values:
  [
    2,
    45
  ]
]

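At a high level, a union array lets you store different kinds of data in a single column without forcing everything into a common type. Imagine you have several boxes—one for numbers, one for text, one for booleans—and you want one list whose items can come from any of those boxes. In Arrow, each box is a child array, and a separate array of type ids records, for every position in the list, which child that item is read from.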

In [59]:
# Two children with the same length as the union itself.
xs = pa.array([5, 6, 7])
ys = pa.array([False, False, True])

# One type id per slot: every id is 0 here, so each slot reads from the
# first child (xs).
types = pa.array([0, 0, 0], type=pa.int8())
children = [xs, ys]
union_arr = pa.UnionArray.from_sparse(types, children)

print(union_arr.type)
print("------")
print(union_arr)
print('#############')
print(*(union_arr[i] for i in range(3)))

sparse_union<0: int64=0, 1: bool=1>
------
-- is_valid: all not null
-- type_ids:   [
    0,
    0,
    0
  ]
-- child 0 type: int64
  [
    5,
    6,
    7
  ]
-- child 1 type: bool
  [
    false,
    false,
    true
  ]
#############
5 6 7


In [3]:
# Children of the union: an integer array and a boolean array.
xs = pa.array([5, 6, 7])
ys = pa.array([False, False, True])

# Type ids 0, 1, 0: slots 0 and 2 read from xs, slot 1 reads from ys.
types = pa.array([0, 1, 0], type=pa.int8())
union_arr = pa.UnionArray.from_sparse(types, [xs, ys])

print(union_arr.type, "------", union_arr, '#############', sep="\n")

# Pull out each slot individually to see which child supplied it.
first, second, third = union_arr[0], union_arr[1], union_arr[2]
print(first, second, third)

sparse_union<0: int64=0, 1: bool=1>
------
-- is_valid: all not null
-- type_ids:   [
    0,
    1,
    0
  ]
-- child 0 type: int64
  [
    5,
    6,
    7
  ]
-- child 1 type: bool
  [
    false,
    false,
    true
  ]
#############
5 False 7


In [39]:
import pyarrow as pa

# Three equal-length children, one per value type the union can hold.
xs = pa.array([5, 6, 7])
ys = pa.array([False, False, True])
zs = pa.array(["foo", "bar", "baz"])  # string child

# Slot i takes its value from the child whose index is types[i].
types = pa.array([0, 1, 2], type=pa.int8())

# Assemble the sparse union from the type ids and all three children.
children = [xs, ys, zs]
union_arr2 = pa.UnionArray.from_sparse(types, children)

print(union_arr2.type, "------", union_arr2, sep="\n")


sparse_union<0: int64=0, 1: bool=1, 2: string=2>
------
-- is_valid: all not null
-- type_ids:   [
    0,
    1,
    2
  ]
-- child 0 type: int64
  [
    5,
    6,
    7
  ]
-- child 1 type: bool
  [
    false,
    false,
    true
  ]
-- child 2 type: string
  [
    "foo",
    "bar",
    "baz"
  ]


In [4]:
import pyarrow as pa

# Child arrays for the union: integers, booleans and strings.
xs = pa.array([5, 6, 7])
ys = pa.array([False, False, True])
zs = pa.array(["foo", "bar", "baz"])  # third child holds strings

# One type id per slot, selecting child 0, 1 and 2 in turn.
types = pa.array([0, 1, 2], type=pa.int8())

# Build the sparse union over all three children.
union_arr2 = pa.UnionArray.from_sparse(types, [xs, ys, zs])

print(union_arr2.type)
print("------")
print(union_arr2)
print('#############')

# Each slot shows the value from the child its type id selects.
selected = [union_arr2[position] for position in range(3)]
print(*selected)

sparse_union<0: int64=0, 1: bool=1, 2: string=2>
------
-- is_valid: all not null
-- type_ids:   [
    0,
    1,
    2
  ]
-- child 0 type: int64
  [
    5,
    6,
    7
  ]
-- child 1 type: bool
  [
    false,
    false,
    true
  ]
-- child 2 type: string
  [
    "foo",
    "bar",
    "baz"
  ]
#############
5 False baz


In [14]:
# Three equal-length columns (int, string, bool); None marks a missing entry.
data = [
    pa.array(column_values)
    for column_values in (
        [1, 2, 3, 4],
        ['foo', 'bar', 'baz', None],
        [True, None, False, True],
    )
]


In [16]:
# Build the batch from the column arrays, naming the columns explicitly.
batch = pa.RecordBatch.from_arrays(data, names=['f0', 'f1', 'f2'])

# Print, one per line: the column count, the row count, the schema, and the
# second column (index 1).
print(batch.num_columns, batch.num_rows, batch.schema, batch[1], sep="\n")

3
4
f0: int64
f1: string
f2: bool
[
  "foo",
  "bar",
  "baz",
  null
]


In [17]:
# Two random columns of five rows each: integer ids drawn from [1, 50) and
# labels picked from "x", "y", "z".
ids = pa.array(np.random.randint(1, 50, size=5).tolist())
label_choices = ["x", "y", "z"]
values = pa.array([random.choice(label_choices) for _ in range(5)])

# Combine the columns into a Table with explicit column names.
table = pa.Table.from_arrays([ids, values], names=["id", "value"])
print("Table:")
print(table)

# Inspect the schema, then read each column back as a Python list.
print("\nTable Schema:", table.schema)
id_values = table.column("id").to_pylist()
value_values = table.column("value").to_pylist()
print("Column 'id' data:", id_values)
print("Column 'value' data:", value_values)


Table:
pyarrow.Table
id: int64
value: string
----
id: [[10,44,46,34,4]]
value: [["y","z","y","y","z"]]

Table Schema: id: int64
value: string
Column 'id' data: [10, 44, 46, 34, 4]
Column 'value' data: ['y', 'z', 'y', 'y', 'z']


In [18]:
# Fields with field-level metadata (values encoded to bytes), plus
# schema-level metadata attached to the schema as a whole.
field1 = pa.field("username", pa.string(), metadata={"description": f"User login {random.randint(1,100)}".encode()})
field2 = pa.field("age", pa.int32(), metadata={"description": f"Age {random.randint(18,80)}".encode()})
custom_schema = pa.schema([field1, field2], metadata={"source": b"generated data"})
print("Custom Schema:", custom_schema)

# Retrieve metadata from a field. The key is looked up as bytes, and the
# value is decoded back to text for display.
if field1.metadata and b"description" in field1.metadata:
    print("Field 'username' metadata:", field1.metadata[b"description"].decode())

# Use the custom schema in a RecordBatch. The arrays are built with the exact
# types the schema declares: left to inference, the Python ints would become
# int64, which does not match the int32 "age" field.
usernames = pa.array(["alice", "bob", "charlie"], type=pa.string())
ages = pa.array([25, 30, 22], type=pa.int32())
user_batch = pa.RecordBatch.from_arrays([usernames, ages], schema=custom_schema)
print("\nRecord Batch using custom schema:")
print(user_batch)


Custom Schema: username: string
  -- field metadata --
  description: 'User login 40'
age: int32
  -- field metadata --
  description: 'Age 45'
-- schema metadata --
source: 'generated data'
Field 'username' metadata: User login 40

Record Batch using custom schema:
pyarrow.RecordBatch
username: string
age: int32
----
username: ["alice","bob","charlie"]
age: [25,30,22]
