In [2]:
# BigQuery table data on which notebook should be run
TABLE='bigquery-public-data.stackoverflow.comments'

# Change this up to test the scale, down to run the notebook faster
MAX_ROWS=100000

In [3]:
%%time

##############################
#       Pandas World         #
##############################

import pandas as pd
df = pd.read_gbq(TABLE, max_results=MAX_ROWS)[['id', 'text', 'score']]
df.head(10)



CPU times: user 25.4 s, sys: 2.5 s, total: 27.9 s
Wall time: 2min 31s


Unnamed: 0,id,text,score
0,10,It will help if you give some details of which...,6
1,25,infact it does. Look a the first lines of your...,10
2,27,"""Currently + is implemented using StringBuffer...",7
3,41,I don't think that's the magic number he was r...,18
4,59,It's still very useful to know that magic numb...,12
5,96,This implementation is also nice if you wish t...,9
6,108,"That's not full text searching, it's searching...",6
7,109,"That's not full text searching, it's searching...",6
8,137,In vim you can open > 1 buffer. :e filename. T...,9
9,154,"Sure, but what about a solution using O(1) mem...",8


In [4]:
# User defined function
# https://www.codespeedy.com/find-nth-prime-number-in-python/
def nth_prime(n):
    prime_numbers = [2,3]
    i=3
    if(0<n<=2):
        return prime_numbers[n-1]
    elif(n>2):
        while (True):
            i+=1
            status = True
            for j in range(2,int(i/2)+1):
                if(i%j==0):
                    status = False
                    break
            if(status==True):
                prime_numbers.append(i)
            if(len(prime_numbers)==n):
                break
        return prime_numbers[n-1]
    else:
        return -1

In [5]:
%%time

df = df.assign(n_prime=df['score'].apply(nth_prime))
df.head(10)

CPU times: user 4.22 s, sys: 18.2 ms, total: 4.24 s
Wall time: 4.26 s


Unnamed: 0,id,text,score,n_prime
0,10,It will help if you give some details of which...,6,13
1,25,infact it does. Look a the first lines of your...,10,29
2,27,"""Currently + is implemented using StringBuffer...",7,17
3,41,I don't think that's the magic number he was r...,18,61
4,59,It's still very useful to know that magic numb...,12,37
5,96,This implementation is also nice if you wish t...,9,23
6,108,"That's not full text searching, it's searching...",6,13
7,109,"That's not full text searching, it's searching...",6,13
8,137,In vim you can open > 1 buffer. :e filename. T...,9,23
9,154,"Sure, but what about a solution using O(1) mem...",8,19


In [6]:
%%time

##############################
#     BigQuery DataFrames     #
##############################

import bigframes.pandas as pd

df = pd.read_gbq(TABLE).head(MAX_ROWS)[['id', 'text', 'score']]
df.head(10)



HTML(value='Query job 6b0a39de-40a0-4dd4-be88-248bd8ebcd77 is RUNNING. <a target="_blank" href="https://consol…

CPU times: user 695 ms, sys: 94.3 ms, total: 790 ms
Wall time: 20.3 s


HTML(value='Query job 635f0a21-4f43-4159-bdf2-f167b14e60cc is RUNNING. <a target="_blank" href="https://consol…

HTML(value='Query job ecaf079e-76ef-47bb-828d-a27e9552b597 is RUNNING. <a target="_blank" href="https://consol…

HTML(value='Query job 4c1d9d3e-be25-4818-b74d-6214164d99ab is DONE. 0 Bytes processed. <a target="_blank" href…

Unnamed: 0,id,text,score
0,11012908,you're welcome! according to the docs it shoul...,0
1,11013760,You *should* be concerned with the disk being ...,0
2,11013784,have you looked at `Integrate` or `NIntegrate`?,0
3,11015512,"sorry, is a typo. The variable name is dist. (...",0
4,11016238,"Pfff, I'm having trouble with that formula too...",0
5,11016276,Thanks thinksteep! Does this mean that by usin...,0
6,11016551,"Jason, thanks for the reply. I've been workin...",0
7,11017973,I assume an `off` of 0.5 would put be exactly ...,0
8,11018225,Thank you very much. I do worry too much abou...,0
9,11018370,"@IanClelland, I edited my question a bit. The ...",0


In [7]:
# Tell the user what needs to be done offline before using BigQuery DataFrame
# remote functions
help(pd.remote_function)

Help on function remote_function in module bigframes.pandas:

remote_function(input_types: 'List[type]', output_type: 'type', dataset: 'Optional[str]' = None, bigquery_connection: 'Optional[str]' = None, reuse: 'bool' = True)
    Decorator to turn a user defined function into a BigQuery remote function.
    
    .. note::
        Please make sure following is setup before using this API:
    
    1. Have the below APIs enabled for your project:
    
        * BigQuery Connection API
        * Cloud Functions API
        * Cloud Run API
        * Cloud Build API
        * Artifact Registry API
        * Cloud Resource Manager API
    
       This can be done from the cloud console (change `PROJECT_ID` to yours):
       https://console.cloud.google.com/apis/enableflow?apiid=bigqueryconnection.googleapis.com,cloudfunctions.googleapis.com,run.googleapis.com,cloudbuild.googleapis.com,artifactregistry.googleapis.com,cloudresourcemanager.googleapis.com&project=PROJECT_ID
    
       Or from t

In [8]:
# BigQuery DataFrames user is a data scientist and may not have privileges to
# create a BQ connector and set it up for invoking a cloud function. They
# should get such a connector created from their cloud admin and use it with
# BigQuery DataFrames remote functions. If the provided connection name does not
# exist, BigQuery DataFrames will try to create it on the fly assuming the user
# has sufficient privileges.
bq_connection_name = 'bigframes-rf-conn'

In [9]:
from time import perf_counter
start = perf_counter()
#-------------------------------------------------------------------------------------

# User defined function
# https://www.codespeedy.com/find-nth-prime-number-in-python/
@pd.remote_function([int], int, bigquery_connection=bq_connection_name)
def nth_prime(n):
    prime_numbers = [2,3]
    i=3
    if(0<n<=2):
        return prime_numbers[n-1]
    elif(n>2):
        while (True):
            i+=1
            status = True
            for j in range(2,int(i/2)+1):
                if(i%j==0):
                    status = False
                    break
            if(status==True):
                prime_numbers.append(i)
            if(len(prime_numbers)==n):
                break
        return prime_numbers[n-1]
    else:
        return -1

#-------------------------------------------------------------------------------------
print(f"\nWall time: {(perf_counter()-start):.4f} s")

[INFO][2023-08-18 21:23:29,687][bigframes.remote_function] Creating new cloud function: gcloud functions deploy bigframes-b0feb1fbaf8188b64d7e70118d93c5d4 --gen2 --runtime=python310 --project=bigframes-dev --region=us-central1 --source=/tmp/tmpl2ewfnue --entry-point=udf_http --trigger-http --no-allow-unauthenticated
[INFO][2023-08-18 21:24:43,689][bigframes.remote_function] Successfully created cloud function bigframes-b0feb1fbaf8188b64d7e70118d93c5d4 with uri (https://bigframes-b0feb1fbaf8188b64d7e70118d93c5d4-7krlje3eoq-uc.a.run.app)
[INFO][2023-08-18 21:24:57,348][bigframes.remote_function] Connector bigframes-rf-conn already exists
[INFO][2023-08-18 21:24:57,351][bigframes.remote_function] Creating BQ remote function: 
            CREATE OR REPLACE FUNCTION `bigframes-dev.bigframes_temp_us`.bigframes_b0feb1fbaf8188b64d7e70118d93c5d4(n INT64)
            RETURNS INT64
            REMOTE WITH CONNECTION `bigframes-dev.us.bigframes-rf-conn`
            OPTIONS (
              endpoint


Wall time: 89.0601 s


In [10]:
%%time

# Let's apply the function to the dataframe
df = df.assign(n_prime=df['score'].apply(nth_prime))
df.head(10)

CPU times: user 16.8 ms, sys: 61 µs, total: 16.8 ms
Wall time: 17 ms


HTML(value='Query job 0f421233-9d02-4746-bb39-86a3b0880aba is RUNNING. <a target="_blank" href="https://consol…

HTML(value='Query job 4f8d5734-8070-4630-8a59-c05a31d60476 is RUNNING. <a target="_blank" href="https://consol…

HTML(value='Query job ec057f9e-726b-44f0-a5c0-24c05c7ecfeb is RUNNING. <a target="_blank" href="https://consol…

Unnamed: 0,id,text,score,n_prime
0,11012908,you're welcome! according to the docs it shoul...,0,-1
1,11013760,You *should* be concerned with the disk being ...,0,-1
2,11013784,have you looked at `Integrate` or `NIntegrate`?,0,-1
3,11015512,"sorry, is a typo. The variable name is dist. (...",0,-1
4,11016238,"Pfff, I'm having trouble with that formula too...",0,-1
5,11016276,Thanks thinksteep! Does this mean that by usin...,0,-1
6,11016551,"Jason, thanks for the reply. I've been workin...",0,-1
7,11017973,I assume an `off` of 0.5 would put be exactly ...,0,-1
8,11018225,Thank you very much. I do worry too much abou...,0,-1
9,11018370,"@IanClelland, I edited my question a bit. The ...",0,-1


In [11]:
# We can see the path to the BQ remote function and the google cloud function
# that was created under the hood
print(nth_prime.bigframes_remote_function)
print(nth_prime.bigframes_cloud_function)

bigframes-dev.bigframes_temp_us.bigframes_b0feb1fbaf8188b64d7e70118d93c5d4
projects/bigframes-dev/locations/us-central1/functions/bigframes-b0feb1fbaf8188b64d7e70118d93c5d4


In [12]:
# Let's try to simulate a scenario in which user shares this remote funciton to
# their colleague who simply wants to reuse it. BigFrames provides an API to do
# so via `read_gbq_function`. Usage details are available via `help` command.
help(pd.read_gbq_function)

Help on function read_gbq_function in module bigframes.pandas:

read_gbq_function(function_name: 'str')
    Loads a BigQuery function from BigQuery.
    
    Then it can be applied to a DataFrame or Series.
    
    Args:
        function_name (str):
            the function's name in BigQuery in the format
            `project_id.dataset_id.function_name`, or
            `dataset_id.function_name` to load from the default project, or
            `function_name` to load from the default project and the dataset
            associated with the current session.
    
    Returns:
        callable: A function object pointing to the BigQuery function read
        from BigQuery.
    
        The object is similar to the one created by the `remote_function`
        decorator, including the `bigframes_remote_function` property, but
        not including the `bigframes_cloud_function` property.



In [14]:
EXISTING_REMOTE_FUNCTION=nth_prime.bigframes_remote_function

# Let's read the existing remote function in bigframes
nth_prime_existing = pd.read_gbq_function(EXISTING_REMOTE_FUNCTION)

In [15]:
%%time

# Let's apply the existing function to the dataframe
df = df.assign(n_prime_again=df['score'].apply(nth_prime_existing))
df.head(10)

CPU times: user 10.9 ms, sys: 0 ns, total: 10.9 ms
Wall time: 11.4 ms


HTML(value='Query job bec5f7d1-3df1-4292-8c68-c396bce7dc5d is RUNNING. <a target="_blank" href="https://consol…

HTML(value='Query job 02e3bf43-a387-41c7-85c7-4a5366251de7 is RUNNING. <a target="_blank" href="https://consol…

HTML(value='Query job fa4329e8-2918-44c4-96c5-d8591364abc9 is RUNNING. <a target="_blank" href="https://consol…

Unnamed: 0,id,text,score,n_prime,n_prime_again
0,11012908,you're welcome! according to the docs it shoul...,0,-1,-1
1,11013760,You *should* be concerned with the disk being ...,0,-1,-1
2,11013784,have you looked at `Integrate` or `NIntegrate`?,0,-1,-1
3,11015512,"sorry, is a typo. The variable name is dist. (...",0,-1,-1
4,11016238,"Pfff, I'm having trouble with that formula too...",0,-1,-1
5,11016276,Thanks thinksteep! Does this mean that by usin...,0,-1,-1
6,11016551,"Jason, thanks for the reply. I've been workin...",0,-1,-1
7,11017973,I assume an `off` of 0.5 would put be exactly ...,0,-1,-1
8,11018225,Thank you very much. I do worry too much abou...,0,-1,-1
9,11018370,"@IanClelland, I edited my question a bit. The ...",0,-1,-1
