# The Pandas-like Table API in GreenplumPython

GreenplumPython is a Python library that enables the user to interact with Greenplum in a Pythonic way.

In the coming release, GreenplumPython will provide a pandas-like table API that
1. looks familiar and intuitive to Python users
2. is powerful enough for complex analytics, such as statistical analysis, with user-defined functions (UDFs) and user-defined aggregates (UDAs)
3. encapsulates common best practices and avoids common pitfalls in Greenplum, compared to writing SQL directly

# Selecting the Database of Your Data

To begin with, we need to select the database that contains the data we want:

In [1]:
import greenplumpython as gp


db = gp.database(host="localhost", dbname="gpadmin")
print(db)

<greenplumpython.db.Database object at 0x7f9011ab5610>


# Accessing a Table in the Database

After selecting the database, we can access a table in the database by specifying its name:

In [3]:
from tabulate import tabulate


def format(t: gp.Table):
    """Fetch the contents of ``t`` and render them as an HTML table.

    The fetched rows are handed to ``tabulate`` with ``headers="keys"`` and
    ``tablefmt="html"``; the resulting markup is returned.
    """
    # NOTE: this name shadows the builtin `format` for the rest of the notebook.
    fetched = t.fetch()
    html = tabulate(fetched, headers="keys", tablefmt="html")
    return html

In [3]:
t = gp.table("demo", db=db)
format(t)

i,n
1,2
2,3
6,7
10,11
3,4
4,5
7,8
8,9
5,6
9,10


# Basic Data Manipulation

Now we have a table. We can do basic data manipulation on it, just like in SQL.

For example, I can `SELECT` a subset of its columns:

In [5]:
t_ij = t[["i", "j"]]
format(t_ij)

WITH demo AS (TABLE demo)
                SELECT i,j 
                FROM demo
            


i,j
3,3
10,10
1,1
4,4
8,8
2,2
5,5
6,6
7,7
9,9


And I can also `SELECT` a subset of its rows. Say I want all the even numbers:

In [6]:
t_even = t_ij[t_ij["i"] % 2 == 0]
format(t_even)

WITH demo AS (TABLE demo),cte_1ce15e76cd294d73b91f9f7d712162aa AS (
                SELECT i,j 
                FROM demo
            )SELECT * FROM cte_1ce15e76cd294d73b91f9f7d712162aa WHERE cte_1ce15e76cd294d73b91f9f7d712162aa.i %% 2 = 0


i,j
10,10
4,4
8,8
2,2
6,6


For a quick glance, I can `SELECT` the first N rows of a table, like this:

In [7]:
t_n = t_even[:3]
format(t_n)

WITH demo AS (TABLE demo),cte_1ce15e76cd294d73b91f9f7d712162aa AS (
                SELECT i,j 
                FROM demo
            ),cte_30ef2fb1aaee4624a867b1af8b7724d4 AS (SELECT * FROM cte_1ce15e76cd294d73b91f9f7d712162aa WHERE cte_1ce15e76cd294d73b91f9f7d712162aa.i %% 2 = 0)SELECT * FROM cte_30ef2fb1aaee4624a867b1af8b7724d4 LIMIT 3 


i,j
10,10
2,2
6,6


Finally, when I am done, I can save the resulting table to the database, either temporarily or persistently:

In [8]:
t_n.save_as(table_name="t_n", temp=True)

WITH demo AS (TABLE demo),cte_1ce15e76cd294d73b91f9f7d712162aa AS (
                SELECT i,j 
                FROM demo
            ),cte_30ef2fb1aaee4624a867b1af8b7724d4 AS (SELECT * FROM cte_1ce15e76cd294d73b91f9f7d712162aa WHERE cte_1ce15e76cd294d73b91f9f7d712162aa.i %% 2 = 0)SELECT * FROM cte_30ef2fb1aaee4624a867b1af8b7724d4 LIMIT 3 

            CREATE TEMP TABLE t_n (i,j) 
            AS WITH demo AS (TABLE demo),cte_1ce15e76cd294d73b91f9f7d712162aa AS (
                SELECT i,j 
                FROM demo
            ),cte_30ef2fb1aaee4624a867b1af8b7724d4 AS (SELECT * FROM cte_1ce15e76cd294d73b91f9f7d712162aa WHERE cte_1ce15e76cd294d73b91f9f7d712162aa.i %% 2 = 0)SELECT * FROM cte_30ef2fb1aaee4624a867b1af8b7724d4 LIMIT 3 
            


<greenplumpython.table.Table at 0x7f74300e9a20>

# `JOIN`-ing Two Tables

We can also `JOIN` two tables with GreenplumPython. For example, suppose we have two tables like this:

In [9]:
# String values carry their own single quotes because they appear verbatim in
# the generated VALUES clause.
rows = [(1, "'a'"), (2, "'b'"), (3, "'c'"), (4, "'d'")]
# One list entry per column name, rather than a single "id, val" string that
# only worked by being pasted as-is into the SQL column list.
t1 = gp.values(rows, db=db, column_names=["id", "val"])
format(t1)

SELECT * FROM (VALUES (1,'a'),(2,'b'),(3,'c'),(4,'d')) AS vals (id, val)


id,val
1,a
2,b
3,c
4,d


In [10]:
# String values carry their own single quotes because they appear verbatim in
# the generated VALUES clause.
rows = [(1, "'a'"), (2, "'b'"), (3, "'a'"), (4, "'b'")]
# One list entry per column name, rather than a single "id, val" string that
# only worked by being pasted as-is into the SQL column list.
t2 = gp.values(rows, db=db, column_names=["id", "val"])
format(t2)

SELECT * FROM (VALUES (1,'a'),(2,'b'),(3,'a'),(4,'b')) AS vals (id, val)


id,val
1,a
2,b
3,a
4,b


We can `JOIN` the two tables like this:

In [11]:
t_join = t1.join(
    t2,
    cond=t1["val"] == t2["val"],
    targets=[
        t1["id"].rename("t1_id"),
        t1["val"].rename("t1_val"),
        t2["id"].rename("t2_id"),
        t2["val"].rename("t2_val"),
    ],
)
format(t_join)


WITH cte_14860c4cfd414f0fa123dcd3f5af85a6 AS (SELECT * FROM (VALUES (1,'a'),(2,'b'),(3,'a'),(4,'b')) AS vals (id, val)),cte_e8860a621e9b4ebbb91343b1124794c2 AS (SELECT * FROM (VALUES (1,'a'),(2,'b'),(3,'c'),(4,'d')) AS vals (id, val))
                SELECT cte_e8860a621e9b4ebbb91343b1124794c2.id AS t1_id,cte_e8860a621e9b4ebbb91343b1124794c2.val AS t1_val,cte_14860c4cfd414f0fa123dcd3f5af85a6.id AS t2_id,cte_14860c4cfd414f0fa123dcd3f5af85a6.val AS t2_val 
                FROM cte_e8860a621e9b4ebbb91343b1124794c2 INNER JOIN cte_14860c4cfd414f0fa123dcd3f5af85a6
                ON cte_e8860a621e9b4ebbb91343b1124794c2.val = cte_14860c4cfd414f0fa123dcd3f5af85a6.val  
            


t1_id,t1_val,t2_id,t2_val
1,a,3,a
1,a,1,a
2,b,4,b
2,b,2,b


# Creating and Calling Functions

Calling functions is essential for data analytics. GreenplumPython supports creating Greenplum UDFs and UDAs from Python functions and calling them in Python.

Suppose I have a table of numbers:

In [12]:
rows = [(i,) for i in range(10)]
numbers = gp.values(rows, db=db, column_names=["val"])
format(numbers)

SELECT * FROM (VALUES (0),(1),(2),(3),(4),(5),(6),(7),(8),(9)) AS vals (val)


val
0
1
2
3
4
5
6
7
8
9


If I want to get the square of each number, I can write a function to do that:

In [13]:
# Create a database function from `square` so it can be applied to a column.
# `sig` declares a single integer argument and an integer result, matching the
# Python signature below and the one-argument call that follows.
# The function body is deliberately left without a docstring so that the
# `# container:` line remains its first line.
# NOTE(review): `language_handler="plcontainer"` together with the
# `# container:` line presumably selects the PL/Container runtime — confirm.
@gp.create_function(sig="(int)->int", language_handler="plcontainer")
def square(a: int) -> int:
    # container: plc_python_shared
    return a ** 2

format(square(numbers["val"], as_name="result", db=db).to_table())

CREATE  FUNCTION pg_temp.square (a integer) RETURNS integer LANGUAGE plpython3u AS $$
return a ** 2 $$
WITH cte_c3797751b29b4058b13e3db881f4d297 AS (SELECT * FROM (VALUES (0),(1),(2),(3),(4),(5),(6),(7),(8),(9)) AS vals (val)),cte_78874cfa108b40788b9e6c759c986e05 AS (SELECT pg_temp.square(cte_c3797751b29b4058b13e3db881f4d297.val) AS result  FROM cte_c3797751b29b4058b13e3db881f4d297 )SELECT * FROM cte_78874cfa108b40788b9e6c759c986e05


result
0
1
4
9
16
25
36
49
64
81


Note that this function is called in exactly the same way as ordinary Python functions.

If I also want to get the sum of these numbers, what I need is to write an aggregate function like this:

In [14]:
# Aggregate that adds up the values of a column.
# The decorated function is the state-transition step: `result` is the running
# state and `val` is the next input value. While no state exists yet
# (`result is None`) the first value becomes the state; afterwards each value
# is added to it. In the generated SQL this function is registered as the
# SFUNC of the aggregate `pg_temp.my_sum`, with an integer state type.
@gp.create_aggregate
def my_sum(result: int, val: int) -> int:
    if result is None:
        return val
    return result + val

format(my_sum(numbers["val"], as_name="result", db=db).to_table())


# `List` is used in the annotations below, which are evaluated when the
# function is defined, so it has to be imported before the definition.
from typing import List


# Array-based variant of the sum: per the SQL sketch in this cell, each
# argument receives the `array_agg` of one column, and a single integer is
# returned.
# NOTE(review): `val2_array` is accepted but not used by the body — confirm
# whether it should contribute to the result.
@gp.create_array_function
def my_sum_arr(val_array: List[int], val2_array: List[int]) -> int:
    return sum(val_array)

# -> SELECT my_sum_arr(array_agg(val), array_agg(val2)) FROM numbers;
format(my_sum_arr(numbers["val"], numbers["val2"], as_name="result", db=db).to_table())    


CREATE  FUNCTION pg_temp.func_78d8e83b0058440aa48d13d6d573a720 (result integer,val integer) RETURNS integer LANGUAGE plpython3u AS $$
if result is None:
    return val
return result + val $$

            CREATE  AGGREGATE pg_temp.my_sum (val integer) (
                SFUNC = pg_temp.func_78d8e83b0058440aa48d13d6d573a720,
                STYPE = integer
            )
            
WITH cte_c3797751b29b4058b13e3db881f4d297 AS (SELECT * FROM (VALUES (0),(1),(2),(3),(4),(5),(6),(7),(8),(9)) AS vals (val)),cte_cff579bcb5b14b0f85ccaff8cf38d62b AS (SELECT pg_temp.my_sum(cte_c3797751b29b4058b13e3db881f4d297.val) AS result  FROM cte_c3797751b29b4058b13e3db881f4d297 )SELECT * FROM cte_cff579bcb5b14b0f85ccaff8cf38d62b


result
45


In [16]:
rows = [(i, i % 2 == 0) for i in range(10)]
numbers = gp.values(rows, db=db, column_names=["val", "is_even"])
format(numbers)

SELECT * FROM (VALUES (0,True),(1,False),(2,True),(3,False),(4,True),(5,False),(6,True),(7,False),(8,True),(9,False)) AS vals (val,is_even)


val,is_even
0,True
1,False
2,True
3,False
4,True
5,False
6,True
7,False
8,True
9,False


In [17]:
count = gp.aggregate("count", db=db)

results = count(numbers["val"], group_by=["is_even"]).to_table()
format(results)

WITH cte_1cf9bda316de4934850da7e26e225ead AS (SELECT * FROM (VALUES (0,True),(1,False),(2,True),(3,False),(4,True),(5,False),(6,True),(7,False),(8,True),(9,False)) AS vals (val,is_even)),cte_407083b0934f4ab0b9f957e9050a4004 AS (SELECT count(cte_1cf9bda316de4934850da7e26e225ead.val)  ,is_even FROM cte_1cf9bda316de4934850da7e26e225ead GROUP BY is_even)SELECT * FROM cte_407083b0934f4ab0b9f957e9050a4004


count,is_even
5,False
5,True
