In [12]:
TABLE='bigquery-public-data.stackoverflow.comments'
MAX_ROWS=1000000 # 1 Million

In [13]:
%%time

##############################
#       Pandas World         #
##############################

import pandas as pd
df = pd.read_gbq(TABLE, max_results=MAX_ROWS)[['id', 'text', 'score']]
print(df.head(10))

    id                                               text  score
0   10  It will help if you give some details of which...      6
1   25  infact it does. Look a the first lines of your...     10
2   27  "Currently + is implemented using StringBuffer...      7
3   41  I don't think that's the magic number he was r...     18
4   59  It's still very useful to know that magic numb...     12
5   96  This implementation is also nice if you wish t...      9
6  108  That's not full text searching, it's searching...      6
7  109  That's not full text searching, it's searching...      6
8  137  In vim you can open > 1 buffer. :e filename. T...      9
9  154  Sure, but what about a solution using O(1) mem...      8
CPU times: user 24.8 s, sys: 2.02 s, total: 26.9 s
Wall time: 1min 30s


In [14]:
# User defined function
# https://www.codespeedy.com/find-nth-prime-number-in-python/
def nth_prime(n):
    prime_numbers = [2,3]
    i=3
    if(0<n<=2):
        return prime_numbers[n-1]
    elif(n>2):
        while (True):
            i+=1
            status = True
            for j in range(2,int(i/2)+1):
                if(i%j==0):
                    status = False
                    break
            if(status==True):
                prime_numbers.append(i)
            if(len(prime_numbers)==n):
                break
        return prime_numbers[n-1]
    else:
        return -1

In [15]:
%%time

df = df.assign(n_prime=df['score'].apply(nth_prime))
print(df.head(10))

    id                                               text  score  n_prime
0   10  It will help if you give some details of which...      6       13
1   25  infact it does. Look a the first lines of your...     10       29
2   27  "Currently + is implemented using StringBuffer...      7       17
3   41  I don't think that's the magic number he was r...     18       61
4   59  It's still very useful to know that magic numb...     12       37
5   96  This implementation is also nice if you wish t...      9       23
6  108  That's not full text searching, it's searching...      6       13
7  109  That's not full text searching, it's searching...      6       13
8  137  In vim you can open > 1 buffer. :e filename. T...      9       23
9  154  Sure, but what about a solution using O(1) mem...      8       19
CPU times: user 5.12 s, sys: 6.33 ms, total: 5.12 s
Wall time: 5.13 s


In [16]:
%%time

##############################
#     BigFrames World        #
##############################

import bigframes
pd = bigframes.connect()

df = pd.read_gbq(TABLE).head(MAX_ROWS)[['id', 'text', 'score']]
print(df.head(10))

        id                                               text  score
0  5397100  Is there any proof that "F# is faster than C#"...      8
1  5404973  Times change, and this question's answer is no...     10
2  5406997  And while we're at it: Rename it to `xs`, the ...      8
3  5413268  Let me know if you get any leads on the "elimi...      7
4  5421911  Correct, but how does this relate to the quest...      5
5  5422815  @user477399: Do those input documents have a p...      5
6  5422907  Also, parens are not needed (and not recommend...      6
7  5432203  @Riduidel At least you get to know if the cat ...      6

[10 rows x 3 columns]
CPU times: user 151 ms, sys: 39.5 ms, total: 191 ms
Wall time: 2.73 s


In [17]:
# User simply imports the remote_function decorator and uses it on their udf
from bigframes import remote_function

# Tell the user what needs to be done offline before using BigFrames remote functions
help(remote_function)

Help on function remote_function in module bigframes.remote_function:

remote_function(input_type, output_type, bigquery_client: google.cloud.bigquery.client.Client, dataset: str, bigquery_connection: str)
    Decorator to turn a user defined function into a BigQuery remote function.
    
    Parameters
    ----------
    input_type : list(ibis.expr.datatypes)
        List of input data types in the user defined function.
    output_type : ibis.expr.datatypes
        Data type of the output in the user defined function.
    bigquery_client : google.cloud.bigquery.client.Client
        Client to use for BigQuery operations.
    dataset : str
        Dataset to use to create a BigQuery function. It should be in
        `project_id.dataset_name` format
    bigquery_connection : str
        Name of the BigQuery connection. It should be pre created in the same
        location as the `bigquery_client.location`.
    
    Prerequisites
    -------------
    Please make sure following is setup

In [18]:
# BigFrames requires user to annotate the udf with input and output data types
# Here we are using ibis datatypes, but ideally we would adapt to go/bigframes-dtypes
import ibis.expr.datatypes as dt

# BigFrames requires user to specify a dataset where the BQ function would be persisted
# We are discussing if this can be avoided in go/bigframes-transient-data
dataset_id = 'my-first-project-331618.bigframes_dev'

# BigFrames user is a data scientist and may not have privileges to create a BQ connector
# and set it up for invoking a cloud function. They should get such a connector created
# from their cloud admin and use it with BigFrames remote functions.
bq_connector_name = 'bigframes-rf-conn'


In [19]:
from time import perf_counter
start = perf_counter()
#-------------------------------------------------------------------------------------

# User defined function
# https://www.codespeedy.com/find-nth-prime-number-in-python/
@remote_function([dt.int64()], dt.int64(), pd.bqclient, dataset_id, bq_connector_name)
def nth_prime(n):
    prime_numbers = [2,3]
    i=3
    if(0<n<=2):
        return prime_numbers[n-1]
    elif(n>2):
        while (True):
            i+=1
            status = True
            for j in range(2,int(i/2)+1):
                if(i%j==0):
                    status = False
                    break
            if(status==True):
                prime_numbers.append(i)
            if(len(prime_numbers)==n):
                break
        return prime_numbers[n-1]
    else:
        return -1

#-------------------------------------------------------------------------------------
print(f"\nWall time: {(perf_counter()-start):.4f} s")


===> Remote function nth_prime already exists, reusing ... <===


Wall time: 0.1849 s


In [21]:
%%time

df = df.assign(n_prime=df['score'].apply(nth_prime))
print(df.head(10))

        id                                               text  score  n_prime
0  5397100  Is there any proof that "F# is faster than C#"...      8       19
1  5404973  Times change, and this question's answer is no...     10       29
2  5406997  And while we're at it: Rename it to `xs`, the ...      8       19
3  5413268  Let me know if you get any leads on the "elimi...      7       17
4  5421911  Correct, but how does this relate to the quest...      5       11
5  5422815  @user477399: Do those input documents have a p...      5       11
6  5422907  Also, parens are not needed (and not recommend...      6       13
7  5432203  @Riduidel At least you get to know if the cat ...      6       13

[10 rows x 4 columns]
CPU times: user 82.8 ms, sys: 308 µs, total: 83.1 ms
Wall time: 13.4 s
