In [14]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.dataset as ds

In [6]:
# Build a small integer array to demonstrate aggregate and element-wise kernels.
arr = pa.array([10, 20, 30, 40, 50])

# Aggregations are returned as Arrow scalars; as_py() unwraps them for printing.
sum_val = pc.sum(arr)
mean_val = pc.mean(arr)

# The comparison yields a Boolean array, which is then used as a mask to keep
# only the elements greater than 25.
greater_than_25 = pc.greater(arr, 25)
filtered_arr = pc.filter(arr, greater_than_25)

print("Original array:", arr)
print("Sum:", sum_val.as_py())
print("Mean:", mean_val.as_py())
print("Elements > 25 (as Boolean array):", greater_than_25)
print("Elements > 25:", filtered_arr)

Original array: [
  10,
  20,
  30,
  40,
  50
]
Sum: 150
Mean: 30.0
Elements > 25 (as Boolean array): [
  false,
  false,
  true,
  true,
  true
]
Elements > 25: [
  30,
  40,
  50
]


In [5]:
# Small pandas frame: a grouping key ('category') and a numeric measure ('value').
df = pd.DataFrame(
    {
        'category': ['A', 'A', 'B', 'B', 'B', 'C'],
        'value': [10, 20, 30, 40, 50, 60],
    }
)

# Hand the frame over to Arrow as a Table.
table = pa.Table.from_pandas(df)
print("Original Table:")
print(table)

# Sum 'value' within each 'category'; the aggregated column comes back
# named 'value_sum'.
by_category = table.group_by('category')
grouped = by_category.aggregate([('value', 'sum')])
print("\nGrouped Aggregation (Sum of value by category):")
print(grouped)

Original Table:
pyarrow.Table
category: string
value: int64
----
category: [["A","A","B","B","B","C"]]
value: [[10,20,30,40,50,60]]

Grouped Aggregation (Sum of value by category):
pyarrow.Table
category: string
value_sum: int64
----
category: [["A","B","C"]]
value_sum: [[30,120,60]]


In [7]:
# Two frames whose 'id' columns overlap only on 3 and 4.
df1 = pd.DataFrame({'id': [1, 2, 3, 4], 'value1': [10, 20, 30, 40]})
df2 = pd.DataFrame({'id': [3, 4, 5, 6], 'value2': [300, 400, 500, 600]})

# The join below is called on Arrow Tables, so convert both frames first.
table1, table2 = (pa.Table.from_pandas(frame) for frame in (df1, df2))

# Inner join on 'id': only ids present on both sides survive.
joined = table1.join(table2, keys='id', join_type='inner')
print("Joined Table (Inner Join on 'id'):")
print(joined)

Joined Table (Inner Join on 'id'):
pyarrow.Table
id: int64
value1: int64
value2: int64
----
id: [[3,4]]
value1: [[30,40]]
value2: [[300,400]]


In [8]:
# Rebuild the two input frames so this cell runs on its own.
df1 = pd.DataFrame({'id': [1, 2, 3, 4], 'value1': [10, 20, 30, 40]})
df2 = pd.DataFrame({'id': [3, 4, 5, 6], 'value2': [300, 400, 500, 600]})

# This time start from RecordBatches rather than Tables.
rb1 = pa.RecordBatch.from_pandas(df1)
rb2 = pa.RecordBatch.from_pandas(df2)

# Wrap each batch in a single-batch Table; the join is called on Tables.
table1_rb = pa.Table.from_batches([rb1])
table2_rb = pa.Table.from_batches([rb2])

# Left outer join: every row of the left table is kept, and ids with no match
# on the right show up with a missing 'value2' (NaN once converted to pandas).
rb_left_join = table1_rb.join(table2_rb, keys='id', join_type='left outer')
left_join_frame = rb_left_join.to_pandas()
print("\nLeft Outer Join (from RecordBatches):")
print(left_join_frame)


Left Outer Join (from RecordBatches):
   id  value1  value2
0   3      30   300.0
1   4      40   400.0
2   1      10     NaN
3   2      20     NaN


In [9]:
# Five-row frame converted to an Arrow Table for the filtering example.
df = pd.DataFrame({'id': [1, 2, 3, 4, 5], 'value': [10, 20, 30, 40, 50]})
table = pa.Table.from_pandas(df)
print("Original Table:")
print(table)

# pc.greater is evaluated immediately on the 'value' column, so `expr` holds a
# Boolean mask (one entry per row) rather than a deferred expression.
value_column = table['value']
expr = pc.greater(value_column, 25)

# Keep only the rows where the mask is true (value > 25).
filtered_table = table.filter(expr)
print("\nFiltered Table (value > 25):")
print(filtered_table)

Original Table:
pyarrow.Table
id: int64
value: int64
----
id: [[1,2,3,4,5]]
value: [[10,20,30,40,50]]

Filtered Table (value > 25):
pyarrow.Table
id: int64
value: int64
----
id: [[3,4,5]]
value: [[30,40,50]]


In [21]:
def to_np(val):
    """Convert an Arrow value into something NumPy operations can consume.

    Parameters
    ----------
    val : pa.Scalar or array-like
        The value handed to a UDF. Anything that is not a ``pa.Scalar`` is
        passed to ``np.array``.

    Returns
    -------
    object or numpy.ndarray
        ``val.as_py()`` (a plain Python value) when ``val`` is a ``pa.Scalar``;
        otherwise ``np.array(val)``.
    """
    if isinstance(val, pa.Scalar):
        return val.as_py()
    return np.array(val)

In [17]:
# Metadata describing the UDF to Arrow's compute function registry.
function_name = "add_two2"
function_docs = {
    "summary": "Adds two numbers",
    "description": "Performs elementwise addition of two int64 values."
}
input_types = {"x": pa.int64(), "y": pa.int64()}
output_type = pa.int64()


def add_two_udf(ctx, x, y):
    """Add the two inputs and return the result as an Arrow array.

    Parameters
    ----------
    ctx
        Context object passed in by Arrow when the function is invoked
        (not used here).
    x, y
        The two inputs; each is converted with ``to_np`` before adding.

    Returns
    -------
    pa.Array
        ``pa.array`` built from the NumPy sum of the converted inputs.
    """
    np_x = to_np(x)
    np_y = to_np(y)
    return pa.array(np_x + np_y)


# Registering a name that is already in the registry raises an error, which
# makes the cell fail when it is re-run in the same kernel. Skip registration
# if the name is already present.
# NOTE: after editing add_two_udf, restart the kernel (or pick a new name) for
# the change to take effect, since the earlier registration is kept.
if function_name not in pc.list_functions():
    pc.register_scalar_function(add_two_udf,
                                function_name,
                                function_docs,
                                input_types,
                                output_type)



In [22]:
# Two equal-length int64 arrays to feed to the registered "add_two2" function.
int64_type = pa.int64()
a = pa.array([10, 20, 30], type=int64_type)
b = pa.array([1, 2, 3], type=int64_type)

# Invoke the UDF through the compute registry by its registered name.
result_add = pc.call_function("add_two2", [a, b])
print("Addition result:", result_add)


Addition result: [
  11,
  22,
  33
]


In [23]:
# Metadata describing the UDF to Arrow's compute function registry.
function_name = "categorize"
function_docs = {
    "summary": "Categorizes int64 values",
    "description": "Returns 'Low' for values <25, 'Medium' for values <75, otherwise 'High'."
}
input_types = {"x": pa.int64()}
output_type = pa.string()


def categorize_udf(ctx, x):
    """Label each input value as "Low", "Medium" or "High".

    Parameters
    ----------
    ctx
        Context object passed in by Arrow when the function is invoked
        (not used here).
    x
        The input values; converted with ``to_np`` before comparison.

    Returns
    -------
    pa.Array
        String array holding "Low" where the value is < 25, "Medium" where it
        is < 75, and "High" otherwise.
    """
    np_x = to_np(x)
    # Nested np.where keeps the categorization vectorized: the inner call
    # decides between "Medium" and "High" for everything that is not "Low".
    result = np.where(np_x < 25, "Low",
             np.where(np_x < 75, "Medium", "High"))
    return pa.array(result, type=pa.string())


# Registering a name that is already in the registry raises an error, which
# makes the cell fail when it is re-run in the same kernel. Skip registration
# if the name is already present.
# NOTE: after editing categorize_udf, restart the kernel (or pick a new name)
# for the change to take effect, since the earlier registration is kept.
if function_name not in pc.list_functions():
    pc.register_scalar_function(categorize_udf,
                                function_name,
                                function_docs,
                                input_types,
                                output_type)


In [24]:
# Sample int64 values on both sides of the 25 and 75 thresholds.
numbers = pa.array([10, 30, 80, 50, 5], type=pa.int64())

# Look the UDF up by its registered name and apply it to the array.
result_category = pc.call_function("categorize", [numbers])
print("Categorized values:", result_category)


Categorized values: [
  "Low",
  "Medium",
  "High",
  "Medium",
  "Low"
]


In [None]:
# Source columns for a small in-memory table.
values = pa.array([90, 630, 1827, 2709], type=pa.int64())
categories = pa.array(["A", "B", "C", "D"], type=pa.string())
data_table = pa.Table.from_arrays([values, categories], names=["value", "category"])

# Wrap the table as a dataset so columns can be projected through expressions.
dataset = ds.dataset(data_table)

# Expression that calls the registered "categorize" function on the 'value' field.
value_field = ds.field("value")
expr = value_field._call("categorize", [value_field])

# Project the two original columns plus the derived 'value_category' column.
projection = {
    'value': ds.field("value"),
    'category': ds.field("category"),
    'value_category': expr,
}
projected_table = dataset.to_table(columns=projection)

print("Dataset Table with Categorized Value Column (using UDF expression):")
print(projected_table)

Dataset Table with Categorized Value Column (using UDF expression):
pyarrow.Table
value: int64
category: string
value_category: string
----
value: [[90,630,1827,2709]]
category: [["A","B","C","D"]]
value_category: [["High","High","High","High"]]
