In [20]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Set Up

In [21]:
import bigframes.pandas as bpd

In [22]:
# Read the public baseball schedules table and keep only the three columns
# that the examples in this notebook use.
df = bpd.read_gbq("bigquery-public-data.baseball.schedules")[
    ["homeTeamName", "awayTeamName", "duration_minutes"]
]

# Preview a few rows of the selected columns.
df.peek()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,homeTeamName,awayTeamName,duration_minutes
36,Reds,Cubs,159
358,Dodgers,Diamondbacks,223
416,Yankees,White Sox,216
523,Rays,Athletics,187
594,Pirates,Brewers,169


# Notes

* The API reference documentation for the `remote_function` can be found at
  https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.session.Session#bigframes_session_Session_remote_function

* More code samples for `remote_function` can be found in the BigQuery
  DataFrames API reference documentation, e.g.
  * https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.series.Series#bigframes_series_Series_apply
  * https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.dataframe.DataFrame#bigframes_dataframe_DataFrame_map
  * https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.dataframe.DataFrame#bigframes_dataframe_DataFrame_apply

* The following examples are only for the purpose of demonstrating
  `remote_function` usage. They are not necessarily the best way to achieve the
  end result.

* In the examples in this notebook we are using `reuse=False` just as a caution
  to avoid concurrent runs of this notebook in the same Google Cloud project
  stepping over each other's remote function deployment. It may not be necessary
  in a simple use case.

# Self-contained function

Let's consider a scenario where we want to categorize the matches as short,
medium or long duration based on the `duration_minutes` column.

In [23]:
@bpd.remote_function(reuse=False)
def duration_category(duration_minutes: int) -> str:
    """Label a match duration as "short", "medium" or "long".

    Args:
        duration_minutes: Length of the match in minutes.

    Returns:
        "short" for durations under 90 minutes, "medium" for durations under
        180 minutes, and "long" otherwise.
    """
    # Guard clauses, checked from the smallest threshold upwards.
    if duration_minutes < 90:
        return "short"
    if duration_minutes < 180:
        return "medium"
    return "long"


# Report the names of the deployed cloud function and BQ remote function.
print(
    f"Created cloud function '{duration_category.bigframes_cloud_function}' "
    f"and BQ remote function '{duration_category.bigframes_remote_function}'."
)

Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-session54c8b0-e22dbecc9ec0374bda36bc23df3775b0-g8zp' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_session54c8b0_e22dbecc9ec0374bda36bc23df3775b0_g8zp'.


In [24]:
# Apply the remote function to every `duration_minutes` value and attach the
# result as a new `duration_cat` column.
df1 = df.assign(
    duration_cat=df["duration_minutes"].apply(duration_category),
)
df1.peek()

Unnamed: 0,homeTeamName,awayTeamName,duration_minutes,duration_cat
1911,Dodgers,Angels,132,medium
2365,Athletics,Angels,134,medium
1977,Athletics,Angels,139,medium
554,Cubs,Angels,142,medium
654,Astros,Angels,143,medium


# Function referring to variables outside the function body

Let's consider a slight variation of the earlier example where the labels for
the short, medium and long duration matches are defined outside the function
body. They would be captured at the time of `remote_function` deployment and
any change in their values in the notebook after the deployment will not
automatically propagate to the `remote_function`.

In [25]:
# Labels returned by the `duration_category` function defined in the next
# cell. They live at notebook (module) level, outside the function body, so
# that the example exercises a function that reads outside variables.
DURATION_CATEGORY_SHORT = "S"    # duration under 90 minutes
DURATION_CATEGORY_MEDIUM = "M"   # duration under 180 minutes
DURATION_CATEGORY_LONG = "L"     # everything else

In [26]:
@bpd.remote_function(reuse=False)
def duration_category(duration_minutes: int) -> str:
    """Label a match duration using the notebook-level category constants.

    Args:
        duration_minutes: Length of the match in minutes.

    Returns:
        `DURATION_CATEGORY_SHORT` for durations under 90 minutes,
        `DURATION_CATEGORY_MEDIUM` for durations under 180 minutes, and
        `DURATION_CATEGORY_LONG` otherwise.
    """
    # The labels are read from variables defined outside this function body.
    if duration_minutes < 90:
        return DURATION_CATEGORY_SHORT
    if duration_minutes < 180:
        return DURATION_CATEGORY_MEDIUM
    return DURATION_CATEGORY_LONG


# Report the names of the deployed cloud function and BQ remote function.
print(
    f"Created cloud function '{duration_category.bigframes_cloud_function}' "
    f"and BQ remote function '{duration_category.bigframes_remote_function}'."
)

Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-session54c8b0-4191f0fce98d46cc09359de47e203236-e009' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_session54c8b0_4191f0fce98d46cc09359de47e203236_e009'.


In [27]:
# Apply the remote function to every `duration_minutes` value and attach the
# result as a new `duration_cat` column.
df1 = df.assign(
    duration_cat=df["duration_minutes"].apply(duration_category),
)
df1.peek()

Unnamed: 0,homeTeamName,awayTeamName,duration_minutes,duration_cat
1911,Dodgers,Angels,132,M
2365,Athletics,Angels,134,M
1977,Athletics,Angels,139,M
554,Cubs,Angels,142,M
654,Astros,Angels,143,M


# Function referring to imports (built-in) outside the function body

Let's consider a scenario in which we want to categorize the matches in terms of
hour buckets. E.g. a match finishing in 1-60 minutes would be in the 1h category,
61-120 minutes in the 2h category and so on. The function itself makes use of the
`math` module (a built-in module in a standard Python installation) which
happens to be imported outside the function body, let's say in one of the
previous cells. For demo purposes we have aliased the import to `mymath`, but
it is not necessary.

Later in the notebook we will see another example with a third-party module.

In [28]:
import math as mymath

In [29]:
@bpd.remote_function(reuse=False)
def duration_category(duration_minutes: int) -> str:
    """Bucket a match duration into whole hours, rounding up.

    Args:
        duration_minutes: Length of the match in minutes.

    Returns:
        The hour bucket followed by "h", e.g. "3h" for 132 minutes.
    """
    # `mymath` is the `math` module, imported under an alias outside this
    # function body.
    return f"{mymath.ceil(duration_minutes / 60)}h"


# Report the names of the deployed cloud function and BQ remote function.
print(
    f"Created cloud function '{duration_category.bigframes_cloud_function}' "
    f"and BQ remote function '{duration_category.bigframes_remote_function}'."
)

Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-session54c8b0-cf31fc2d2c7fe111afa5526f5a9cdf06-gmmo' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_session54c8b0_cf31fc2d2c7fe111afa5526f5a9cdf06_gmmo'.


In [30]:
# Apply the remote function to every `duration_minutes` value and attach the
# result as a new `duration_cat` column.
df1 = df.assign(
    duration_cat=df["duration_minutes"].apply(duration_category),
)
df1.peek()

Unnamed: 0,homeTeamName,awayTeamName,duration_minutes,duration_cat
1911,Dodgers,Angels,132,3h
2365,Athletics,Angels,134,3h
1977,Athletics,Angels,139,3h
554,Cubs,Angels,142,3h
654,Astros,Angels,143,3h


# Function referring to another function outside the function body

In this example let's create a `remote_function` from a function
`duration_category` which depends upon another function `get_hour_ceiling`,
which further depends on another function `get_minutes_in_hour`. This dependency
chain could be even longer in a real world example. The behaviors of the
dependencies would be captured at the time of the remote function
deployment.

Please note that any changes in those functions in the notebook after the
deployment would not automatically propagate to the remote function.

In [31]:
import math

def get_minutes_in_hour():
    """Return the number of minutes in one hour."""
    minutes_per_hour = 60
    return minutes_per_hour


def get_hour_ceiling(minutes):
    """Convert a duration in minutes to hours, rounded up to a whole hour.

    Args:
        minutes: Duration in minutes.

    Returns:
        The smallest integer number of hours that is not less than `minutes`.
    """
    hours = minutes / get_minutes_in_hour()
    return math.ceil(hours)

In [32]:
@bpd.remote_function(reuse=False)
def duration_category(duration_minutes: int) -> str:
    """Bucket a match duration into whole hours via `get_hour_ceiling`.

    Args:
        duration_minutes: Length of the match in minutes.

    Returns:
        The hour bucket followed by " hrs", e.g. "3 hrs" for 132 minutes.
    """
    # `get_hour_ceiling` is defined outside this function body and itself
    # calls `get_minutes_in_hour`.
    return f"{get_hour_ceiling(duration_minutes)} hrs"


# Report the names of the deployed cloud function and BQ remote function.
print(
    f"Created cloud function '{duration_category.bigframes_cloud_function}' "
    f"and BQ remote function '{duration_category.bigframes_remote_function}'."
)

Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-session54c8b0-3c03836c2044bf625d02e25ccdbfe101-k1m4' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_session54c8b0_3c03836c2044bf625d02e25ccdbfe101_k1m4'.


In [33]:
# Apply the remote function to every `duration_minutes` value and attach the
# result as a new `duration_cat` column.
df1 = df.assign(
    duration_cat=df["duration_minutes"].apply(duration_category),
)
df1.peek()

Unnamed: 0,homeTeamName,awayTeamName,duration_minutes,duration_cat
1911,Dodgers,Angels,132,3 hrs
2365,Athletics,Angels,134,3 hrs
1977,Athletics,Angels,139,3 hrs
554,Cubs,Angels,142,3 hrs
654,Astros,Angels,143,3 hrs


# Function requiring external packages

In this example let's say we want to redact the `homeTeamName` values, and we
choose to use a third party library `cryptography`. Any third party dependencies
can be specified in [pip format](https://pip.pypa.io/en/stable/reference/requirements-file-format/)
(with or without version number) as a list via the `packages` parameter.

In [34]:
@bpd.remote_function(reuse=False, packages=["cryptography"])
def get_hash(input: str) -> str:
    """Redact a string by encrypting it with a freshly generated Fernet key.

    A new key is generated on every call and is neither stored nor returned.

    Args:
        input: The text to redact. `None` is treated as the empty string.

    Returns:
        The Fernet token for `input`, decoded to `str`.
    """
    # Imported inside the function body; the `cryptography` package itself is
    # requested through `packages` in the decorator.
    from cryptography.fernet import Fernet

    # A missing value is redacted as if it were the empty string.
    plaintext = "" if input is None else input

    cipher = Fernet(Fernet.generate_key())
    token = cipher.encrypt(plaintext.encode())
    return token.decode()

In [35]:
# Apply the remote function to every `homeTeamName` value and attach the
# result as a new `homeTeamNameRedacted` column.
df1 = df.assign(
    homeTeamNameRedacted=df["homeTeamName"].apply(get_hash),
)
df1.peek()

Unnamed: 0,homeTeamName,awayTeamName,duration_minutes,homeTeamNameRedacted
641,American League,National League,185,gAAAAABmo0n2I391cbYwIYeg8lyJq1MSFZatrtpvuUD5v-...
349,Angels,Astros,187,gAAAAABmo0n2pX-siRwl2tIZA4m--swndC_b7vgGXrqSNM...
2349,Angels,Astros,160,gAAAAABmo0n28Q9RwH62HvYRhTDpQ9lo8c6G8F5bnn7wgF...
557,Angels,Astros,166,gAAAAABmo0n2YlwHlSGQ0_XvXd-QVBtB_Lq2zUifu7vKhg...
220,Angels,Astros,162,gAAAAABmo0n2l8HMSGKYizxfEmRvGQy96mrjwx734-Rl_Z...


# Function referring to imports (third-party) outside the function body

In this scenario the function depends on a third party library and the module
from the third party library used in the function is imported outside the
function body in a previous cell. Below is such an example where the third-party
dependency is `humanize` and its module of the same name is imported outside the
function body.

In [36]:
import datetime as dt
import humanize

In [37]:
@bpd.remote_function(reuse=False, packages=["humanize"])
def duration_category(duration_minutes: int) -> str:
    """Describe a match duration in words using `humanize`.

    Args:
        duration_minutes: Length of the match in minutes.

    Returns:
        The result of `humanize.naturaldelta` for a timedelta of
        `duration_minutes` minutes, e.g. "2 hours" for 132 minutes.
    """
    # Both `dt` and `humanize` are imported outside this function body; the
    # `humanize` package is requested through `packages` in the decorator.
    return humanize.naturaldelta(dt.timedelta(minutes=duration_minutes))


# Report the names of the deployed cloud function and BQ remote function.
print(
    f"Created cloud function '{duration_category.bigframes_cloud_function}' "
    f"and BQ remote function '{duration_category.bigframes_remote_function}'."
)

Created cloud function 'projects/bigframes-dev/locations/us-central1/functions/bigframes-session54c8b0-a5e21a4ad488ce8b90de19c3c8cd33b6-0ab2' and BQ remote function 'bigframes-dev._1b6c31ff1bcd5d2f6d86833cf8268317f1b12d57.bigframes_session54c8b0_a5e21a4ad488ce8b90de19c3c8cd33b6_0ab2'.


In [38]:
# Apply the remote function to every `duration_minutes` value and attach the
# result as a new `duration_cat` column.
df1 = df.assign(
    duration_cat=df["duration_minutes"].apply(duration_category),
)
df1.peek()

Unnamed: 0,homeTeamName,awayTeamName,duration_minutes,duration_cat
1911,Dodgers,Angels,132,2 hours
2365,Athletics,Angels,134,2 hours
1977,Athletics,Angels,139,2 hours
554,Cubs,Angels,142,2 hours
654,Astros,Angels,143,2 hours


# Clean Up

In [None]:
# Close the BigQuery DataFrames session used throughout this notebook.
# NOTE(review): presumably this also cleans up the session's remote function
# deployments created above with `reuse=False` — confirm against the
# bigframes documentation.
bpd.close_session()