In [1]:
# Fully qualified BigQuery table that both the pandas and BigFrames cells read.
TABLE = 'bigquery-public-data.stackoverflow.comments'
# Upper bound on the number of rows pulled into each dataframe (one million).
MAX_ROWS = 1_000_000

In [2]:
%%time

##############################
#       Pandas World         #
##############################

import pandas as pd

# Read at most MAX_ROWS rows of TABLE, keeping only the columns used later on.
df = pd.read_gbq(TABLE, max_results=MAX_ROWS).loc[:, ['id', 'text', 'score']]
# Preview the first ten rows.
print(df.head(10))

    id                                               text  score
0   10  It will help if you give some details of which...      6
1   25  infact it does. Look a the first lines of your...     10
2   27  "Currently + is implemented using StringBuffer...      7
3   41  I don't think that's the magic number he was r...     18
4   59  It's still very useful to know that magic numb...     12
5   96  This implementation is also nice if you wish t...      9
6  108  That's not full text searching, it's searching...      6
7  109  That's not full text searching, it's searching...      6
8  137  In vim you can open > 1 buffer. :e filename. T...      9
9  154  Sure, but what about a solution using O(1) mem...      8
CPU times: user 30.7 s, sys: 4.99 s, total: 35.7 s
Wall time: 1min 39s


In [3]:
# User defined function
# https://www.codespeedy.com/find-nth-prime-number-in-python/
def nth_prime(n):
    """Return the n-th prime number (1-indexed), or -1 when n is not positive.

    Args:
        n: 1-based position in the sequence of primes, expected to be an
            integer (``nth_prime(1) == 2``, ``nth_prime(2) == 3``).

    Returns:
        The n-th prime for n >= 1, and -1 for n <= 0 (or any value, such as
        NaN, for which ``n > 0`` is false).
    """
    # `not (n > 0)` instead of `n <= 0` so that NaN also yields -1.
    if not (n > 0):
        return -1

    primes = [2, 3]
    candidate = 3
    # `<` rather than `==` so the loop terminates even if n is not a whole number.
    while len(primes) < n:
        candidate += 2  # even numbers above 2 are never prime
        is_prime = True
        for p in primes:
            # No divisor found up to sqrt(candidate): candidate is prime.
            if p * p > candidate:
                break
            if candidate % p == 0:
                is_prime = False
                break
        if is_prime:
            primes.append(candidate)
    return primes[n - 1]

In [4]:
%%time

# Add an n_prime column by running the UDF over every score, then preview it.
df = df.assign(n_prime=lambda frame: frame['score'].apply(nth_prime))
print(df.head(10))

    id                                               text  score  n_prime
0   10  It will help if you give some details of which...      6       13
1   25  infact it does. Look a the first lines of your...     10       29
2   27  "Currently + is implemented using StringBuffer...      7       17
3   41  I don't think that's the magic number he was r...     18       61
4   59  It's still very useful to know that magic numb...     12       37
5   96  This implementation is also nice if you wish t...      9       23
6  108  That's not full text searching, it's searching...      6       13
7  109  That's not full text searching, it's searching...      6       13
8  137  In vim you can open > 1 buffer. :e filename. T...      9       23
9  154  Sure, but what about a solution using O(1) mem...      8       19
CPU times: user 5.91 s, sys: 42.4 ms, total: 5.95 s
Wall time: 5.94 s


In [5]:
%%time

##############################
#     BigFrames World        #
##############################

import bigframes
# NOTE: this rebinds `pd` (the pandas module imported earlier in this notebook)
# to the object returned by bigframes.connect(), so the pandas-style calls
# below go through BigFrames instead. The same object is later passed as the
# `session` argument of remote_function.
pd = bigframes.connect()

# Unlike the pandas cell, the row cap is applied with .head(MAX_ROWS) rather
# than read_gbq(max_results=...).
df = pd.read_gbq(TABLE).head(MAX_ROWS)[['id', 'text', 'score']]
print(df.head(10))

         id                                               text  score
0  41014003        Can you provide more details ?how exactly ?      0
1  41015675  this is tricky. some employer likes to ask thi...      0
2  41016624  Sorry was just being picky. int columns can be...      0
3  41017045  In the code above, you check ddLoc.SelectedVal...      0
4  41017803  I have not used myself, but every CCNode , hen...      0
5  41018325  @DanielStanley I didn't know this was a thing!...      0
6  41019358                           Thanks for theese infos!      0
7  41020018                              but yes you are right      0
8  41025209  Were you able to find a solution here?  What w...      0
9  41027340  As far as I know, `text_pattern_ops` should no...      0

[10 rows x 3 columns]
CPU times: user 1.17 s, sys: 119 ms, total: 1.29 s
Wall time: 1min 38s


In [6]:
# The user only needs to import the remote_function decorator and apply it to their UDF
from bigframes import remote_function

# Print the decorator's documentation, which tells the user what has to be set up
# offline before BigFrames remote functions can be used
help(remote_function)

Help on function remote_function in module bigframes.remote_function:

remote_function(input_types: 'typing.Sequence[type]', output_type: 'type', session: 'typing.Optional[Session]' = None, bigquery_client: 'typing.Optional[bigquery.Client]' = None, dataset: 'typing.Optional[str]' = None, bigquery_connection: 'typing.Optional[str]' = None, reuse: 'bool' = True)
    Decorator to turn a user defined function into a BigQuery remote function.
    
    Args:
        input_types : list(type)
            List of input data types in the user defined function.
        output_type : type
            Data type of the output in the user defined function.
        session : bigframes.Session, Optional
            BigFrames session to use for getting default project, dataset and
            bigquery connection.
        bigquery_client : google.cloud.bigquery.Client, Optional
            Client to use for BigQuery operations. If this param is not provided
            then bigquery client from the sess

In [7]:
# Dataset in which BigFrames persists the BigQuery function it creates.
# Whether this requirement can be dropped is being discussed in
# go/bigframes-transient-data.
dataset_id = "my-first-project-331618.bigframes_dev"

# Name of the BigQuery connector used to invoke the cloud function.
# A BigFrames user is a data scientist who may lack the privileges to create
# and configure such a connector; it should be provisioned by their cloud admin
# and then referenced here.
bq_connector_name = "bigframes-rf-conn"

In [8]:
from time import perf_counter
# Manual stopwatch for this cell: the elapsed time since `start` is printed by
# the last statement of the cell.
start = perf_counter()
#-------------------------------------------------------------------------------------

# User defined function
# https://www.codespeedy.com/find-nth-prime-number-in-python/
@remote_function([int], int, session=pd, dataset=dataset_id, bigquery_connection=bq_connector_name)
def nth_prime(n):
    """Return the n-th prime number (1-indexed), or -1 when n is not positive.

    Args:
        n: 1-based position in the sequence of primes, expected to be an
            integer (``nth_prime(1) == 2``, ``nth_prime(2) == 3``).

    Returns:
        The n-th prime for n >= 1, and -1 for n <= 0 (or any value, such as
        NaN, for which ``n > 0`` is false).
    """
    # NOTE(review): deliberately self-contained (no helpers, no imports) because
    # the decorator deploys this function as a cloud function; presumably only
    # this function's own code is shipped - confirm before factoring anything out.

    # `not (n > 0)` instead of `n <= 0` so that NaN also yields -1.
    if not (n > 0):
        return -1

    primes = [2, 3]
    candidate = 3
    # `<` rather than `==` so the loop terminates even if n is not a whole number.
    while len(primes) < n:
        candidate += 2  # even numbers above 2 are never prime
        is_prime = True
        for p in primes:
            # No divisor found up to sqrt(candidate): candidate is prime.
            if p * p > candidate:
                break
            if candidate % p == 0:
                is_prime = False
                break
        if is_prime:
            primes.append(candidate)
    return primes[n - 1]

#-------------------------------------------------------------------------------------
print(f"\nWall time: {perf_counter() - start:.4f} s")

[INFO][2023-06-23 06:35:15,552][bigframes.remote_function] Creating new cloud function: gcloud functions deploy bigframes-f5d95bee351fbf2cc6386f60f4d193ba --gen2 --runtime=python310 --project=my-first-project-331618 --region=us-central1 --source=/tmp/tmp80eo8lva --entry-point=udf_http --trigger-http --no-allow-unauthenticated
Preparing function...
.done.
Deploying function...
[Build].................................................................................................................................................................................................................................................................................................................................................................................................................done
[Service]..........................................................................................................................................................................................done
Done.
Yo

buildConfig:
  build: projects/371300922916/locations/us-central1/builds/7dcc3ada-d010-4e5b-bd71-bcb9d95e6f69
  entryPoint: udf_http
  runtime: python310
  source:
    storageSource:
      bucket: gcf-v2-sources-371300922916-us-central1
      object: bigframes-f5d95bee351fbf2cc6386f60f4d193ba/function-source.zip
  sourceProvenance:
    resolvedStorageSource:
      bucket: gcf-v2-sources-371300922916-us-central1
      generation: '1687502118787781'
      object: bigframes-f5d95bee351fbf2cc6386f60f4d193ba/function-source.zip
environment: GEN_2
labels:
  deployment-tool: cli-gcloud
name: projects/my-first-project-331618/locations/us-central1/functions/bigframes-f5d95bee351fbf2cc6386f60f4d193ba
serviceConfig:
  allTrafficOnLatestRevision: true
  availableCpu: '0.1666'
  availableMemory: 256M
  ingressSettings: ALLOW_ALL
  maxInstanceCount: 100
  maxInstanceRequestConcurrency: 1
  revision: bigframes-f5d95bee351fbf2cc6386f60f4d193ba-00001-nav
  service: projects/my-first-project-331618/loca

[INFO][2023-06-23 06:36:20,572][bigframes.remote_function] Successfully created cloud function bigframes-f5d95bee351fbf2cc6386f60f4d193ba with uri (https://bigframes-f5d95bee351fbf2cc6386f60f4d193ba-76sbkhjuiq-uc.a.run.app)
[INFO][2023-06-23 06:36:22,241][bigframes.remote_function] Connector bigframes-rf-conn already exists
[INFO][2023-06-23 06:36:22,242][bigframes.remote_function] Creating BQ remote function: 
    CREATE OR REPLACE FUNCTION `my-first-project-331618.bigframes_dev`.bigframes_f5d95bee351fbf2cc6386f60f4d193ba(n INT64)
    RETURNS INT64
    REMOTE WITH CONNECTION `my-first-project-331618.us.bigframes-rf-conn`
    OPTIONS (
      endpoint = "https://bigframes-f5d95bee351fbf2cc6386f60f4d193ba-76sbkhjuiq-uc.a.run.app"
    )
[INFO][2023-06-23 06:36:23,224][bigframes.remote_function] Created remote function my-first-project-331618.bigframes_dev.bigframes_f5d95bee351fbf2cc6386f60f4d193ba



Wall time: 70.7487 s


In [9]:
%%time

# NOTE(review): presumably `nth_prime` here is the remote_function-decorated
# version defined in the previous cell - this relies on the cells being run in order.
df = df.assign(n_prime=df['score'].apply(nth_prime))
# Every previewed row has score 0, for which nth_prime returns -1.
print(df.head(10))

         id                                               text  score  n_prime
0  41014003        Can you provide more details ?how exactly ?      0       -1
1  41015675  this is tricky. some employer likes to ask thi...      0       -1
2  41016624  Sorry was just being picky. int columns can be...      0       -1
3  41017045  In the code above, you check ddLoc.SelectedVal...      0       -1
4  41017803  I have not used myself, but every CCNode , hen...      0       -1
5  41018325  @DanielStanley I didn't know this was a thing!...      0       -1
6  41019358                           Thanks for theese infos!      0       -1
7  41020018                              but yes you are right      0       -1
8  41025209  Were you able to find a solution here?  What w...      0       -1
9  41027340  As far as I know, `text_pattern_ops` should no...      0       -1

[10 rows x 4 columns]
CPU times: user 302 ms, sys: 12.1 ms, total: 314 ms
Wall time: 1min 34s
