# Using the BigQuery DataFrames API

### Set BigQuery DataFrames options

In [1]:
import bigframes.pandas

# Set the BigQuery project and location on the bigframes options object
# before any data is read.
# NOTE(review): "bigframes-dev" looks like the notebook authors' own project;
# presumably readers need to substitute a project they have access to.
bigframes.pandas.options.bigquery.project = "bigframes-dev"
bigframes.pandas.options.bigquery.location = "us"

### Initialize a DataFrame for a BigQuery table

In [2]:
df = bigframes.pandas.read_gbq("bigquery-public-data.ml_datasets.penguins")

### View the DataFrame

In [3]:
df

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie Penguin (Pygoscelis adeliae),Dream,36.6,18.4,184.0,3475.0,FEMALE
1,Adelie Penguin (Pygoscelis adeliae),Dream,39.8,19.1,184.0,4650.0,MALE
2,Adelie Penguin (Pygoscelis adeliae),Dream,40.9,18.9,184.0,3900.0,MALE
3,Chinstrap penguin (Pygoscelis antarctica),Dream,46.5,17.9,192.0,3500.0,FEMALE
4,Adelie Penguin (Pygoscelis adeliae),Dream,37.3,16.8,192.0,3000.0,FEMALE
5,Adelie Penguin (Pygoscelis adeliae),Dream,43.2,18.5,192.0,4100.0,MALE
6,Chinstrap penguin (Pygoscelis antarctica),Dream,46.9,16.6,192.0,2700.0,FEMALE
7,Chinstrap penguin (Pygoscelis antarctica),Dream,50.5,18.4,200.0,3400.0,FEMALE
8,Chinstrap penguin (Pygoscelis antarctica),Dream,49.5,19.0,200.0,3800.0,MALE
9,Adelie Penguin (Pygoscelis adeliae),Dream,40.2,20.1,200.0,3975.0,MALE


### View the column names in the DataFrame (i.e., the column names in the table)

In [4]:
df.columns

Index(['species', 'island', 'culmen_length_mm', 'culmen_depth_mm',
       'flipper_length_mm', 'body_mass_g', 'sex'],
      dtype='object')

### View the table schema

In [5]:
df.dtypes

species              string[pyarrow]
island               string[pyarrow]
culmen_length_mm             Float64
culmen_depth_mm              Float64
flipper_length_mm            Float64
body_mass_g                  Float64
sex                  string[pyarrow]
dtype: object

### Select a subset of columns

In [6]:
df = df[[
    "species",
    "island",
    "body_mass_g",
]]
df

Unnamed: 0,species,island,body_mass_g
0,Adelie Penguin (Pygoscelis adeliae),Dream,3475.0
1,Adelie Penguin (Pygoscelis adeliae),Dream,4650.0
2,Adelie Penguin (Pygoscelis adeliae),Dream,3900.0
3,Chinstrap penguin (Pygoscelis antarctica),Dream,3500.0
4,Adelie Penguin (Pygoscelis adeliae),Dream,3000.0
5,Adelie Penguin (Pygoscelis adeliae),Dream,4100.0
6,Chinstrap penguin (Pygoscelis antarctica),Dream,2700.0
7,Chinstrap penguin (Pygoscelis antarctica),Dream,3400.0
8,Chinstrap penguin (Pygoscelis antarctica),Dream,3800.0
9,Adelie Penguin (Pygoscelis adeliae),Dream,3975.0


### View the first ten values of a series

In [7]:
df['body_mass_g'].head(10)

0    3475.0
1    4650.0
2    3900.0
3    3500.0
4    3000.0
5    4100.0
6    2700.0
7    3400.0
8    3800.0
9    3975.0
Name: body_mass_g, dtype: Float64

### Compute the mean of a series

In [8]:
df['body_mass_g'].mean()

4201.7543859649095

### Filter the DataFrame

In [9]:
df[df['body_mass_g'] >= 4000.0]

Unnamed: 0,species,island,body_mass_g
1,Adelie Penguin (Pygoscelis adeliae),Dream,4650.0
5,Adelie Penguin (Pygoscelis adeliae),Dream,4100.0
10,Adelie Penguin (Pygoscelis adeliae),Dream,4300.0
18,Adelie Penguin (Pygoscelis adeliae),Dream,4250.0
25,Chinstrap penguin (Pygoscelis antarctica),Dream,4050.0
26,Adelie Penguin (Pygoscelis adeliae),Dream,4000.0
27,Chinstrap penguin (Pygoscelis antarctica),Dream,4050.0
28,Chinstrap penguin (Pygoscelis antarctica),Dream,4300.0
30,Chinstrap penguin (Pygoscelis antarctica),Dream,4450.0
36,Adelie Penguin (Pygoscelis adeliae),Dream,4450.0


# Using Remote Functions

### BigQuery DataFrames gives you the ability to turn your custom scalar functions into BigQuery remote functions.

This requires the GCP project to be set up appropriately and the user to have sufficient privileges to use remote functions. More details are available via the `help` command.

In [10]:
# NOTE: `pd` here is an alias for bigframes.pandas, not the pandas library.
import bigframes.pandas as pd
# Print the docstring of the remote_function decorator (its arguments are
# listed in the output below).
help(pd.remote_function)

Help on function remote_function in module bigframes.pandas:

remote_function(input_types: 'List[type]', output_type: 'type', dataset: 'Optional[str]' = None, bigquery_connection: 'Optional[str]' = None, reuse: 'bool' = True)
    Decorator to turn a user defined function into a BigQuery remote function.
    
    Args:
        input_types (list(type)):
            List of input data types in the user defined function.
        output_type (type):
            Data type of the output in the user defined function.
        dataset (str, Optional):
            Dataset to use to create a BigQuery function. It should be in
            `<project_id>.<dataset_name>` or `<dataset_name>` format. If this
            param is not provided then session dataset id would be used.
        bigquery_connection (str, Optional):
            Name of the BigQuery connection. If it is pre created in the same
            location as the `bigquery_client.location` then it would be used,
            otherwise it w

### Define a custom function, and specify the intent to turn it into a remote function.

It requires a BigQuery connection. If the connection is not already created,
the BigQuery DataFrames package attempts to create one assuming the necessary
APIs and IAM permissions are set up in the project.

In [11]:
@pd.remote_function([float], str, bigquery_connection='bigframes-rf-conn')
def get_bucket(num):
    """Classify a numeric value relative to the 4000 boundary.

    Args:
        num: The value to classify, or None when the input is missing.

    Returns:
        "NA" when `num` is None, "at_or_above_4000" when `num` is greater
        than or equal to 4000, and "below_4000" otherwise.
    """
    # Test for a missing value explicitly: a plain truthiness check would
    # also classify a legitimate 0.0 as "NA".
    if num is None:
        return "NA"
    boundary = 4000
    return "at_or_above_4000" if num >= boundary else "below_4000"

[INFO][2023-06-28 23:31:49,355][bigframes.remote_function] Creating new cloud function: gcloud functions deploy bigframes-b3fab64f5997ad6a516379defe8d4202 --gen2 --runtime=python310 --project=bigframes-dev --region=us-central1 --source=/tmp/tmp9w5e89lh --entry-point=udf_http --trigger-http --no-allow-unauthenticated
Preparing function...
.done.
Deploying function...
[Build]..........................................................................................................................................................................................................................................................................................................................................................................................................................done
[Service]........................................................................................................................................................................................................

buildConfig:
  build: projects/1084210331973/locations/us-central1/builds/780b1780-9b38-4515-ae60-89d05454ef83
  entryPoint: udf_http
  runtime: python310
  source:
    storageSource:
      bucket: gcf-v2-sources-1084210331973-us-central1
      object: bigframes-b3fab64f5997ad6a516379defe8d4202/function-source.zip
  sourceProvenance:
    resolvedStorageSource:
      bucket: gcf-v2-sources-1084210331973-us-central1
      generation: '1687995112300727'
      object: bigframes-b3fab64f5997ad6a516379defe8d4202/function-source.zip
environment: GEN_2
labels:
  deployment-tool: cli-gcloud
name: projects/bigframes-dev/locations/us-central1/functions/bigframes-b3fab64f5997ad6a516379defe8d4202
serviceConfig:
  allTrafficOnLatestRevision: true
  availableCpu: '0.1666'
  availableMemory: 256M
  ingressSettings: ALLOW_ALL
  maxInstanceCount: 100
  maxInstanceRequestConcurrency: 1
  revision: bigframes-b3fab64f5997ad6a516379defe8d4202-00001-tut
  service: projects/bigframes-dev/locations/us-central1

[INFO][2023-06-28 23:32:55,330][bigframes.remote_function] Successfully created cloud function bigframes-b3fab64f5997ad6a516379defe8d4202 with uri (https://bigframes-b3fab64f5997ad6a516379defe8d4202-7krlje3eoq-uc.a.run.app)
[INFO][2023-06-28 23:32:59,378][bigframes.remote_function] Connector bigframes-rf-conn already exists
[INFO][2023-06-28 23:32:59,379][bigframes.remote_function] Creating BQ remote function: 
    CREATE OR REPLACE FUNCTION `bigframes-dev.bigframes_temp_us`.bigframes_b3fab64f5997ad6a516379defe8d4202(num FLOAT64)
    RETURNS STRING
    REMOTE WITH CONNECTION `bigframes-dev.us.bigframes-rf-conn`
    OPTIONS (
      endpoint = "https://bigframes-b3fab64f5997ad6a516379defe8d4202-7krlje3eoq-uc.a.run.app"
    )
[INFO][2023-06-28 23:33:00,338][bigframes.remote_function] Created remote function bigframes-dev.bigframes_temp_us.bigframes_b3fab64f5997ad6a516379defe8d4202


### Run the custom function on the BigQuery-backed dataframe

In [12]:
# Apply get_bucket to the body_mass_g series and attach the result as a new
# body_mass_bucket column.
df = df.assign(body_mass_bucket=df['body_mass_g'].apply(get_bucket))
# Show the input column next to the derived column for the first ten rows.
df[['body_mass_g', 'body_mass_bucket']].head(10)

Unnamed: 0,body_mass_g,body_mass_bucket
0,3475.0,below_4000
1,4650.0,at_or_above_4000
2,3900.0,below_4000
3,3500.0,below_4000
4,3000.0,below_4000
5,4100.0,at_or_above_4000
6,2700.0,below_4000
7,3400.0,below_4000
8,3800.0,below_4000
9,3975.0,below_4000


# Using the ML API

### Initialize a DataFrame from a BigQuery table

In [13]:
# Read the table again: in the earlier sections `df` was reduced to a subset
# of columns and given an extra body_mass_bucket column.
df = bigframes.pandas.read_gbq("bigquery-public-data.ml_datasets.penguins")
df

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie Penguin (Pygoscelis adeliae),Dream,36.6,18.4,184.0,3475.0,FEMALE
1,Adelie Penguin (Pygoscelis adeliae),Dream,39.8,19.1,184.0,4650.0,MALE
2,Adelie Penguin (Pygoscelis adeliae),Dream,40.9,18.9,184.0,3900.0,MALE
3,Chinstrap penguin (Pygoscelis antarctica),Dream,46.5,17.9,192.0,3500.0,FEMALE
4,Adelie Penguin (Pygoscelis adeliae),Dream,37.3,16.8,192.0,3000.0,FEMALE
5,Adelie Penguin (Pygoscelis adeliae),Dream,43.2,18.5,192.0,4100.0,MALE
6,Chinstrap penguin (Pygoscelis antarctica),Dream,46.9,16.6,192.0,2700.0,FEMALE
7,Chinstrap penguin (Pygoscelis antarctica),Dream,50.5,18.4,200.0,3400.0,FEMALE
8,Chinstrap penguin (Pygoscelis antarctica),Dream,49.5,19.0,200.0,3800.0,MALE
9,Adelie Penguin (Pygoscelis adeliae),Dream,40.2,20.1,200.0,3975.0,MALE


### Clean and prepare the data

In [14]:
# filter down to the data we want to analyze: rows for the Adelie species only
adelie_data = df[df.species == "Adelie Penguin (Pygoscelis adeliae)"]

# species is now the same value in every row, so drop the column
adelie_data = adelie_data.drop(columns=["species"])

# drop rows with nulls to get our training data
# (adelie_data itself is kept, since the rows with a null body_mass_g are
# selected from it later for prediction)
training_data = adelie_data.dropna()

# take a peek at the training data
training_data

Unnamed: 0,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Dream,36.6,18.4,184.0,3475.0,FEMALE
1,Dream,39.8,19.1,184.0,4650.0,MALE
2,Dream,40.9,18.9,184.0,3900.0,MALE
4,Dream,37.3,16.8,192.0,3000.0,FEMALE
5,Dream,43.2,18.5,192.0,4100.0,MALE
9,Dream,40.2,20.1,200.0,3975.0,MALE
10,Dream,40.8,18.9,208.0,4300.0,MALE
11,Dream,39.0,18.7,185.0,3650.0,MALE
12,Dream,37.0,16.9,185.0,3000.0,FEMALE
14,Dream,34.0,17.1,185.0,3400.0,FEMALE


In [15]:
# pick feature columns and label column
feature_columns = training_data[['island', 'culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'sex']]
label_columns = training_data[['body_mass_g']]

# also get the rows that we want to make predictions for (i.e. where the label column, body_mass_g, is null)
missing_body_mass = adelie_data[adelie_data.body_mass_g.isnull()]

### Train and evaluate a linear regression model using the ML API

In [16]:
from bigframes.ml.linear_model import LinearRegression

# as in scikit-learn, a newly created model is just a bundle of parameters
# default parameters are fine here
model = LinearRegression()

# this will train a temporary model in BigQuery Machine Learning
model.fit(feature_columns, label_columns)

# check how the model performed
# note: the model is scored on the same rows it was trained on, so these
# metrics describe the fit to the training data rather than held-out data
model.score(feature_columns, label_columns)

Unnamed: 0,mean_absolute_error,mean_squared_error,mean_squared_log_error,median_absolute_error,r2_score,explained_variance
0,223.878763,78553.601634,0.005614,181.330911,0.623951,0.623951


### Make predictions using the model

In [17]:
model.predict(missing_body_mass)

Unnamed: 0,predicted_body_mass_g
292,3603.735118


### Save the trained model to BigQuery, so we can load it later

In [18]:
model.to_gbq("bqml_tutorial.penguins_model", replace=True)

LinearRegression()