# Using the dataframe API

### Start a BigFrames session

In [2]:
import bigframes
# Open the session that every later cell goes through (session.read_gbq,
# session.remote_function); the Context fixes the project to 'bigframes-dev'.
session = bigframes.connect(bigframes.Context(project='bigframes-dev'))

### Initialize a dataframe for a BigQuery table

In [3]:
# Load the penguins table, addressed by its dotted table ID. The resulting
# `df` is the dataframe used by the cells that follow.
df = session.read_gbq("bigquery-public-data.ml_datasets.penguins")

### View the dataframe

In [4]:
# A bare expression on the last line of a cell is rendered as the cell output.
df

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
27,Chinstrap penguin (Pygoscelis antarctica),Dream,50.5,19.6,201.0,4050.0,MALE
44,Chinstrap penguin (Pygoscelis antarctica),Dream,50.8,19.0,210.0,4100.0,MALE
77,Adelie Penguin (Pygoscelis adeliae),Dream,39.2,21.1,196.0,4150.0,MALE
97,Adelie Penguin (Pygoscelis adeliae),Dream,36.0,17.9,190.0,3450.0,FEMALE
133,Gentoo penguin (Pygoscelis papua),Biscoe,44.0,13.6,208.0,4350.0,FEMALE
273,Adelie Penguin (Pygoscelis adeliae),Biscoe,45.6,20.3,191.0,4600.0,MALE
281,Gentoo penguin (Pygoscelis papua),Biscoe,46.8,16.1,215.0,5500.0,MALE
285,Gentoo penguin (Pygoscelis papua),Biscoe,42.9,13.1,215.0,5000.0,FEMALE
286,Gentoo penguin (Pygoscelis papua),Biscoe,46.1,15.1,215.0,5100.0,MALE
295,Adelie Penguin (Pygoscelis adeliae),Torgersen,37.2,19.4,184.0,3900.0,MALE


### View the column names in the dataframe (aka column names in the table)

In [5]:
# Lists the data columns only; the row labels shown in the first column of the
# dataframe display are not part of `columns`.
df.columns

Index(['species', 'island', 'culmen_length_mm', 'culmen_depth_mm',
       'flipper_length_mm', 'body_mass_g', 'sex'],
      dtype='object')

### View the table schema

In [6]:
# One dtype per column, listed in the same order as df.columns.
df.dtypes

species              string[pyarrow]
island               string[pyarrow]
culmen_length_mm             Float64
culmen_depth_mm              Float64
flipper_length_mm            Float64
body_mass_g                  Float64
sex                  string[pyarrow]
dtype: object

### Select a subset of columns

In [7]:
# Narrow `df` down to three columns. The name is rebound here, so the cells
# that follow only see these columns.
df = df[["species", "island", "body_mass_g"]]
df

Unnamed: 0,species,island,body_mass_g
27,Chinstrap penguin (Pygoscelis antarctica),Dream,4050.0
44,Chinstrap penguin (Pygoscelis antarctica),Dream,4100.0
77,Adelie Penguin (Pygoscelis adeliae),Dream,4150.0
97,Adelie Penguin (Pygoscelis adeliae),Dream,3450.0
133,Gentoo penguin (Pygoscelis papua),Biscoe,4350.0
273,Adelie Penguin (Pygoscelis adeliae),Biscoe,4600.0
281,Gentoo penguin (Pygoscelis papua),Biscoe,5500.0
285,Gentoo penguin (Pygoscelis papua),Biscoe,5000.0
286,Gentoo penguin (Pygoscelis papua),Biscoe,5100.0
295,Adelie Penguin (Pygoscelis adeliae),Torgersen,3900.0


### View the first ten values of a series

In [8]:
# Indexing with a single column label yields a series; head(10) keeps its
# first ten rows.
df['body_mass_g'].head(10)

27     4050.0
44     4100.0
77     4150.0
97     3450.0
133    4350.0
273    4600.0
281    5500.0
285    5000.0
286    5100.0
295    3900.0
Name: body_mass_g, dtype: Float64

### Compute the mean of a series

In [9]:
# The result is a single scalar (see the output below), not a series.
df['body_mass_g'].mean()

4201.754385964914

### Filter the dataframe

In [10]:
# The comparison produces a boolean series that is used as a row mask. The
# result is only displayed; `df` itself is not reassigned.
df[df['body_mass_g'] >= 4000.0]

Unnamed: 0,species,island,body_mass_g
27,Chinstrap penguin (Pygoscelis antarctica),Dream,4050.0
44,Chinstrap penguin (Pygoscelis antarctica),Dream,4100.0
77,Adelie Penguin (Pygoscelis adeliae),Dream,4150.0
133,Gentoo penguin (Pygoscelis papua),Biscoe,4350.0
273,Adelie Penguin (Pygoscelis adeliae),Biscoe,4600.0
281,Gentoo penguin (Pygoscelis papua),Biscoe,5500.0
285,Gentoo penguin (Pygoscelis papua),Biscoe,5000.0
286,Gentoo penguin (Pygoscelis papua),Biscoe,5100.0
218,Gentoo penguin (Pygoscelis papua),Biscoe,5050.0
234,Gentoo penguin (Pygoscelis papua),Biscoe,5100.0


# Using the Remote Functions

### BigFrames gives you the ability to turn your custom scalar functions into BigQuery remote functions. This requires the GCP project to be set up appropriately and the user to have sufficient privileges to use them. More details are available via the `help` command.

In [11]:
# Print the signature and docstring of the remote_function decorator.
help(bigframes.remote_function)

Help on function remote_function in module bigframes.remote_function:

remote_function(input_types: 'typing.Sequence[type]', output_type: 'type', session: 'typing.Optional[Session]' = None, bigquery_client: 'typing.Optional[bigquery.Client]' = None, dataset: 'typing.Optional[str]' = None, bigquery_connection: 'typing.Optional[str]' = None, reuse: 'bool' = True)
    Decorator to turn a user defined function into a BigQuery remote function.
    
    Args:
        input_types : list(type)
            List of input data types in the user defined function.
        output_type : type
            Data type of the output in the user defined function.
        session : bigframes.Session, Optional
            BigFrames session to use for getting default project, dataset and
            bigquery connection.
        bigquery_client : google.cloud.bigquery.Client, Optional
            Client to use for BigQuery operations. If this param is not provided
            then bigquery client from the sess

### Define a custom function and specify the intent to turn it into a remote function. This requires a BigQuery connection. If the connection has not already been created, BigFrames will attempt to create one, assuming the necessary APIs and IAM permissions are set up in the project.

In [12]:
@session.remote_function([float], str, bigquery_connection='bigframes-rf-conn')
def get_bucket(num):
    """Label a numeric value as at/above or below the 4000 boundary.

    Args:
        num: The value to classify (declared as ``float`` in the decorator),
            or ``None`` when the value is missing.

    Returns:
        "NA" when ``num`` is ``None``, "at_or_above_4000" when
        ``num >= 4000``, and "below_4000" otherwise.
    """
    # Compare against None explicitly: a truthiness check (`not num`) would
    # also report a legitimate 0 / 0.0 as missing, because zero is falsy.
    if num is None:
        return "NA"
    boundary = 4000
    return "at_or_above_4000" if num >= boundary else "below_4000"

[INFO][2023-05-22 09:34:30,928][bigframes.remote_function] Existing cloud functions


NAME                                                 STATE    TRIGGER       REGION       ENVIRONMENT
bigframes-01419d4cb41f1fdd1412f928fe2dc9a6           ACTIVE   HTTP Trigger  us-central1  2nd gen
bigframes-03520f5affb33f215ddeb2e655cf64bf-c8psy512  ACTIVE   HTTP Trigger  us-central1  2nd gen
bigframes-05e9ae2b1262e8c49b1ed1f54c77cb17           ACTIVE   HTTP Trigger  us-central1  2nd gen
bigframes-065885eacaa24a6446fef6e2369697cd           ACTIVE   HTTP Trigger  us-central1  2nd gen
bigframes-08be0bda1fb6d0c220a0bb81e4838eba           UNKNOWN  HTTP Trigger  us-central1  2nd gen
bigframes-09841edf0f7beafb531bc507541d6c5a           ACTIVE   HTTP Trigger  us-central1  2nd gen
bigframes-0b522e7221ebb826b2bfebbb3d5587ca           ACTIVE   HTTP Trigger  us-central1  2nd gen
bigframes-0ebf54645abef743485844a5e84a118b           ACTIVE   HTTP Trigger  us-central1  2nd gen
bigframes-1517f3edbba21fce76152d93000989d9           ACTIVE   HTTP Trigger  us-central1  2nd gen
bigframes-1629854f0f8f5b2c

[INFO][2023-05-22 09:34:44,713][bigframes.remote_function] Creating new cloud function: gcloud functions deploy bigframes-1f5a6c9751d09766f0c886aaa1ef0f8b --gen2 --runtime=python310 --project=bigframes-dev --region=us-central1 --source=/tmp/tmpuh0qk5l4 --entry-point=udf_http --trigger-http --no-allow-unauthenticated
Preparing function...
.done.
Deploying function...
[Build]........................................................................................................................................................................................................................................................................................................................................................................................................done
[Service]...........................................................................................................................done
Done.
You can view your function in the Cloud Console here: https://console.cloud.google.c

buildConfig:
  build: projects/1084210331973/locations/us-central1/builds/33ff2a40-2a6b-40ea-871d-10b111aca2aa
  entryPoint: udf_http
  runtime: python310
  source:
    storageSource:
      bucket: gcf-v2-sources-1084210331973-us-central1
      object: bigframes-1f5a6c9751d09766f0c886aaa1ef0f8b/function-source.zip
  sourceProvenance:
    resolvedStorageSource:
      bucket: gcf-v2-sources-1084210331973-us-central1
      generation: '1684748088116699'
      object: bigframes-1f5a6c9751d09766f0c886aaa1ef0f8b/function-source.zip
environment: GEN_2
labels:
  deployment-tool: cli-gcloud
name: projects/bigframes-dev/locations/us-central1/functions/bigframes-1f5a6c9751d09766f0c886aaa1ef0f8b
serviceConfig:
  allTrafficOnLatestRevision: true
  availableCpu: '0.1666'
  availableMemory: 256M
  ingressSettings: ALLOW_ALL
  maxInstanceCount: 100
  maxInstanceRequestConcurrency: 1
  revision: bigframes-1f5a6c9751d09766f0c886aaa1ef0f8b-00001-wes
  service: projects/bigframes-dev/locations/us-central1

[INFO][2023-05-22 09:35:41,996][bigframes.remote_function] Existing cloud functions


NAME                                                 STATE      TRIGGER       REGION       ENVIRONMENT
bigframes-01419d4cb41f1fdd1412f928fe2dc9a6           ACTIVE     HTTP Trigger  us-central1  2nd gen
bigframes-03520f5affb33f215ddeb2e655cf64bf-c8psy512  ACTIVE     HTTP Trigger  us-central1  2nd gen
bigframes-05e9ae2b1262e8c49b1ed1f54c77cb17           ACTIVE     HTTP Trigger  us-central1  2nd gen
bigframes-065885eacaa24a6446fef6e2369697cd           ACTIVE     HTTP Trigger  us-central1  2nd gen
bigframes-08be0bda1fb6d0c220a0bb81e4838eba           UNKNOWN    HTTP Trigger  us-central1  2nd gen
bigframes-0b522e7221ebb826b2bfebbb3d5587ca           ACTIVE     HTTP Trigger  us-central1  2nd gen
bigframes-0ebf54645abef743485844a5e84a118b           ACTIVE     HTTP Trigger  us-central1  2nd gen
bigframes-1517f3edbba21fce76152d93000989d9           ACTIVE     HTTP Trigger  us-central1  2nd gen
bigframes-1629854f0f8f5b2cb78cadeaae6b4306           ACTIVE     HTTP Trigger  us-central1  2nd gen
bigfra

[INFO][2023-05-22 09:35:53,263][bigframes.remote_function] Successfully created cloud function bigframes-1f5a6c9751d09766f0c886aaa1ef0f8b with uri (https://bigframes-1f5a6c9751d09766f0c886aaa1ef0f8b-7krlje3eoq-uc.a.run.app)
[INFO][2023-05-22 09:35:54,835][bigframes.remote_function] Connector bigframes-rf-conn already exists
[INFO][2023-05-22 09:35:54,837][bigframes.remote_function] Creating BQ remote function: 
    CREATE OR REPLACE FUNCTION `bigframes-dev.bigframes_temp_us`.bigframes_1f5a6c9751d09766f0c886aaa1ef0f8b(num FLOAT64)
    RETURNS STRING
    REMOTE WITH CONNECTION `bigframes-dev.us.bigframes-rf-conn`
    OPTIONS (
      endpoint = "https://bigframes-1f5a6c9751d09766f0c886aaa1ef0f8b-7krlje3eoq-uc.a.run.app"
    )
[INFO][2023-05-22 09:35:55,737][bigframes.remote_function] Created remote function bigframes-dev.bigframes_temp_us.bigframes_1f5a6c9751d09766f0c886aaa1ef0f8b


### Run the custom function on the BigFrames dataframe

In [13]:
# Add a `body_mass_bucket` column by applying the remote function defined above
# to every `body_mass_g` value, then show the two columns side by side.
# NOTE(review): presumably get_bucket is evaluated inside BigQuery rather than
# in the local kernel — confirm against the remote_function documentation.
df = df.assign(body_mass_bucket=df['body_mass_g'].apply(get_bucket))
df[['body_mass_g', 'body_mass_bucket']].head(10)

Unnamed: 0,body_mass_g,body_mass_bucket
27,4050.0,at_or_above_4000
44,4100.0,at_or_above_4000
77,4150.0,at_or_above_4000
97,3450.0,below_4000
133,4350.0,at_or_above_4000
273,4600.0,at_or_above_4000
281,5500.0,at_or_above_4000
285,5000.0,at_or_above_4000
286,5100.0,at_or_above_4000
295,3900.0,below_4000


# Using the ML API

### Initialize a dataframe for a BigQuery table, reusing the existing session

In [14]:
# Re-read the full penguins table. Rebinding `df` discards the column subset
# and the `body_mass_bucket` column built in earlier cells.
df = session.read_gbq("bigquery-public-data.ml_datasets.penguins")
df

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
37,Chinstrap penguin (Pygoscelis antarctica),Dream,51.7,20.3,194.0,3775.0,MALE
60,Chinstrap penguin (Pygoscelis antarctica),Dream,46.7,17.9,195.0,3300.0,FEMALE
193,Adelie Penguin (Pygoscelis adeliae),Biscoe,35.5,16.2,195.0,3350.0,FEMALE
229,Adelie Penguin (Pygoscelis adeliae),Biscoe,36.5,16.6,181.0,2850.0,FEMALE
46,Adelie Penguin (Pygoscelis adeliae),Dream,37.5,18.9,179.0,2975.0,
57,Chinstrap penguin (Pygoscelis antarctica),Dream,49.2,18.2,195.0,4400.0,MALE
154,Adelie Penguin (Pygoscelis adeliae),Biscoe,37.9,18.6,193.0,2925.0,FEMALE
161,Gentoo penguin (Pygoscelis papua),Biscoe,49.3,15.7,217.0,5850.0,MALE
185,Gentoo penguin (Pygoscelis papua),Biscoe,51.3,14.2,218.0,5300.0,MALE
209,Adelie Penguin (Pygoscelis adeliae),Biscoe,42.7,18.3,196.0,4075.0,MALE


### Clean and prepare the data

In [15]:
# Keep only the Adelie rows; `species` is then the same on every row, so that
# column is dropped in the same step.
adelie_data = df[df["species"] == "Adelie Penguin (Pygoscelis adeliae)"].drop(
    columns=["species"]
)

# Rows that contain no nulls make up the training data.
training_data = adelie_data.dropna()

# Show the training data as the cell output.
training_data

Unnamed: 0,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
193,Biscoe,35.5,16.2,195.0,3350.0,FEMALE
229,Biscoe,36.5,16.6,181.0,2850.0,FEMALE
154,Biscoe,37.9,18.6,193.0,2925.0,FEMALE
209,Biscoe,42.7,18.3,196.0,4075.0,MALE
228,Biscoe,38.1,17.0,181.0,3175.0,FEMALE
275,Biscoe,38.6,17.2,199.0,3750.0,FEMALE
339,Torgersen,38.8,17.6,191.0,3275.0,FEMALE
71,Dream,32.1,15.5,188.0,3050.0,FEMALE
272,Biscoe,41.4,18.6,191.0,3700.0,MALE
54,Dream,40.6,17.2,187.0,3475.0,MALE


In [16]:
# Pick the feature columns and the label column.
feature_columns = training_data[['island', 'culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'sex']]
label_columns = training_data[['body_mass_g']]

# Also get the rows that we want to make predictions for, i.e. where the
# label column (body_mass_g) is null. These are taken from adelie_data rather
# than training_data because dropna() already removed them from the latter.
missing_body_mass = adelie_data[adelie_data.body_mass_g.isnull()]

### Train and evaluate a linear regression model using the ML API

In [17]:
from bigframes.ml.linear_model import LinearRegression

# As in scikit-learn, a newly created model is just a bundle of parameters.
# The default parameters are fine here.
model = LinearRegression()

# This trains a temporary model in BigQuery Machine Learning.
model.fit(feature_columns, label_columns)

# Check how the model performed, using the automatic test/training data split
# chosen by BQML. As the last expression, the metrics table is the cell output.
model.score()

Unnamed: 0,mean_absolute_error,mean_squared_error,mean_squared_log_error,median_absolute_error,r2_score,explained_variance
0,223.878763,78553.601634,0.005614,181.330911,0.623951,0.623951


### Make predictions using the model

In [20]:
# TODO(shobs): The following expression currently fails with the error:
#   ValueError: Column `index_5` of `index_cols` not found in this table.
# which originates from Session.read_gbq().
# Disabled for now; re-enable once that is fixed.
#model.predict(missing_body_mass)

### Save the trained model to BigQuery, so we can load it later

In [21]:
# Save the fitted model under the given name.
# NOTE(review): the name looks like `<dataset>.<model>`, and replace=True
# presumably overwrites an existing model with that name — confirm against the
# to_gbq documentation.
model.to_gbq("bqml_tutorial.penguins_model", replace=True)

LinearRegression()