# README

This notebook runs differently depending on the following environment variable:
1. BIGQUERY_LOCATION - can take values as per https://cloud.google.com/bigquery/docs/locations, e.g. `us`, `asia-east1`

### Infer location and set up data in that location if needed

In [1]:
# Location-aware setup for the rest of the notebook.
#
# Defines BQ_LOCATION, PROJECT, DATASET and PENGUINS_TABLE, optionally overrides
# the location from the BIGQUERY_LOCATION environment variable, and (for any
# location other than "us") makes sure a copy of the penguins table exists in a
# dataset located there.

import os
import google.api_core.exceptions
from google.cloud import bigquery
import bigframes

# Take multi-region US as the default BQ location, where most of the BQ data lies including the BQ public datasets
BQ_LOCATION = "us"
PROJECT = "bigframes-dev"
DATASET = "bigframes_testing"
PENGUINS_TABLE = "bigquery-public-data.ml_datasets.penguins"


# Check for a location set in the environment and do location-specific setup if needed.
# Normalize (strip + lower-case) BEFORE deciding anything, so that values such
# as "US" or " us " are recognized as the default instead of triggering the
# location-specific branch below with a malformed dataset name.
env_bq_location = (os.getenv("BIGQUERY_LOCATION") or "").strip().lower()
if env_bq_location:
    BQ_LOCATION = env_bq_location

client = bigquery.Client()

if BQ_LOCATION != "us":
    # Dataset ids cannot contain '-', so e.g. "asia-east1" becomes "asia_east1"
    bq_location_normalized = BQ_LOCATION.replace('-', '_')

    # Nominate a local penguins table
    penguins_table_ref = bigquery.TableReference.from_string(PENGUINS_TABLE)
    penguins_local_dataset_name = f"{DATASET}_{bq_location_normalized}"
    penguins_local_dataset_ref = bigquery.DatasetReference(project=PROJECT, dataset_id=penguins_local_dataset_name)
    penguins_local_dataset = bigquery.Dataset(penguins_local_dataset_ref)
    penguins_local_dataset.location = BQ_LOCATION
    # TableReference is documented to take a DatasetReference, so pass the
    # reference rather than the Dataset object built from it
    penguins_local_table_ref = bigquery.TableReference(penguins_local_dataset_ref, penguins_table_ref.table_id)
    penguins_local_table = str(penguins_local_table_ref)
    try:
        client.get_table(penguins_local_table_ref)
    except google.api_core.exceptions.NotFound:
        client.create_dataset(penguins_local_dataset, exists_ok=True)

        # Read the public table as an in-memory dataframe and then write to the local table.
        # The load job location is passed explicitly so it matches the
        # location of the dataset created just above.
        session_us = bigframes.connect()
        penguins_pandas_df = session_us.read_gbq(PENGUINS_TABLE).to_pandas()
        penguins_pandas_df.to_gbq(penguins_local_table, location=BQ_LOCATION)

    # Finally point the penguins table to the local table
    PENGUINS_TABLE = penguins_local_table

    # Also update the dataset name used for test artifacts (same
    # location-suffixed dataset that holds the local penguins table)
    DATASET = penguins_local_dataset_name

# Create the dataset to store the model if it doesn't exist
model_local_dataset = bigquery.Dataset(bigquery.DatasetReference(project=PROJECT, dataset_id=DATASET))
model_local_dataset.location = BQ_LOCATION
model_dataset = client.create_dataset(model_local_dataset, exists_ok=True)

# Finally log the variables driving the core notebook execution
log = ('\n'.join(f"{name}: {str(value)}" for name, value in {
    "BigQuery project" : PROJECT,
    "BigQuery location" : BQ_LOCATION,
    "Penguins Table" : PENGUINS_TABLE,
    "ML Model Dataset" : model_dataset.reference
}.items()))
print(log)



BigQuery project: bigframes-dev
BigQuery location: us
Penguins Table: bigquery-public-data.ml_datasets.penguins
ML Model Dataset: bigframes-dev.bigframes_testing


# Using the BigQuery DataFrames API

### Set BigQuery DataFrames options

In [2]:
import bigframes.pandas

# Configure the BigQuery DataFrames options with the project and location
# held in PROJECT and BQ_LOCATION
bigquery_options = bigframes.pandas.options.bigquery
bigquery_options.project = PROJECT
bigquery_options.location = BQ_LOCATION

### Initialize a dataframe for a BigQuery table

In [3]:
# Create a BigQuery DataFrames frame from the table named by PENGUINS_TABLE
df = bigframes.pandas.read_gbq(PENGUINS_TABLE)



HTML(value='Query job e1a62d56-8cab-4bc1-9ad3-457f48b71d9c is RUNNING. <a target="_blank" href="https://consol…

## View the DataFrame

In [4]:
# A bare name as the last line of a cell makes the notebook display it
df

HTML(value='Query job 88571869-2c86-4605-942d-21f11909f16b is DONE. 0 Bytes processed. <a target="_blank" href…

HTML(value='Query job ba0deab9-f576-46ad-adb8-ec22f6a88029 is DONE. 31.7 kB processed. <a target="_blank" href…

HTML(value='Query job a4ec65ff-4672-4b28-8300-3269bbbc516a is DONE. 0 Bytes processed. <a target="_blank" href…

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie Penguin (Pygoscelis adeliae),Dream,36.6,18.4,184.0,3475.0,FEMALE
1,Adelie Penguin (Pygoscelis adeliae),Dream,39.8,19.1,184.0,4650.0,MALE
2,Adelie Penguin (Pygoscelis adeliae),Dream,40.9,18.9,184.0,3900.0,MALE
3,Chinstrap penguin (Pygoscelis antarctica),Dream,46.5,17.9,192.0,3500.0,FEMALE
4,Adelie Penguin (Pygoscelis adeliae),Dream,37.3,16.8,192.0,3000.0,FEMALE
5,Adelie Penguin (Pygoscelis adeliae),Dream,43.2,18.5,192.0,4100.0,MALE
6,Chinstrap penguin (Pygoscelis antarctica),Dream,46.9,16.6,192.0,2700.0,FEMALE
7,Chinstrap penguin (Pygoscelis antarctica),Dream,50.5,18.4,200.0,3400.0,FEMALE
8,Chinstrap penguin (Pygoscelis antarctica),Dream,49.5,19.0,200.0,3800.0,MALE
9,Adelie Penguin (Pygoscelis adeliae),Dream,40.2,20.1,200.0,3975.0,MALE


### View the column names in the dataframe (aka column names in the table)

In [5]:
# Column labels of the frame
df.columns

Index(['species', 'island', 'culmen_length_mm', 'culmen_depth_mm',
       'flipper_length_mm', 'body_mass_g', 'sex'],
      dtype='object')

### View the table schema

In [6]:
# Per-column data types of the frame
df.dtypes

species              string[pyarrow]
island               string[pyarrow]
culmen_length_mm             Float64
culmen_depth_mm              Float64
flipper_length_mm            Float64
body_mass_g                  Float64
sex                  string[pyarrow]
dtype: object

### Select a subset of columns

In [7]:
# Narrow the frame to the three columns used in the following cells.
# NOTE: this rebinds `df`, so later cells see only these columns.
selected_columns = ["species", "island", "body_mass_g"]
df = df[selected_columns]
df

HTML(value='Query job ee2bae68-7c4b-49f4-b5c2-8818cec33a74 is DONE. 0 Bytes processed. <a target="_blank" href…

HTML(value='Query job 2568d815-1e92-407c-8c58-27aed7093a6c is DONE. 21.2 kB processed. <a target="_blank" href…

HTML(value='Query job 01d36b39-7d27-4b25-850e-d362cb705c1f is DONE. 0 Bytes processed. <a target="_blank" href…

Unnamed: 0,species,island,body_mass_g
0,Adelie Penguin (Pygoscelis adeliae),Dream,3475.0
1,Adelie Penguin (Pygoscelis adeliae),Dream,4650.0
2,Adelie Penguin (Pygoscelis adeliae),Dream,3900.0
3,Chinstrap penguin (Pygoscelis antarctica),Dream,3500.0
4,Adelie Penguin (Pygoscelis adeliae),Dream,3000.0
5,Adelie Penguin (Pygoscelis adeliae),Dream,4100.0
6,Chinstrap penguin (Pygoscelis antarctica),Dream,2700.0
7,Chinstrap penguin (Pygoscelis antarctica),Dream,3400.0
8,Chinstrap penguin (Pygoscelis antarctica),Dream,3800.0
9,Adelie Penguin (Pygoscelis adeliae),Dream,3975.0


### View the first ten values of a series

In [8]:
# Select the body_mass_g series and show its first ten values
df['body_mass_g'].head(10)

HTML(value='Query job 31a62db3-8550-43e9-8707-ed80de1382d3 is DONE. 2.8 kB processed. <a target="_blank" href=…

HTML(value='Query job 9cf7ee94-1ff3-4362-a30f-a7cc732717e1 is DONE. 5.5 kB processed. <a target="_blank" href=…

0    3475.0
1    4650.0
2    3900.0
3    3500.0
4    3000.0
5    4100.0
6    2700.0
7    3400.0
8    3800.0
9    3975.0
Name: body_mass_g, dtype: Float64

### Compute the mean of a series

In [9]:
# Mean of the body_mass_g series
df['body_mass_g'].mean()

HTML(value='Query job 792dfdf3-0c5d-4acf-b809-2247ff897bb2 is DONE. 2.7 kB processed. <a target="_blank" href=…

4201.754385964912

### Filter the DataFrame

In [10]:
# Boolean mask selecting rows whose body mass is at least 4000.0
at_least_4000 = df['body_mass_g'] >= 4000.0

# Index with the mask to keep only the matching rows
df[at_least_4000]

HTML(value='Query job a5a4fb7a-5570-4ca3-93e5-7dcefee3d137 is DONE. 2.7 kB processed. <a target="_blank" href=…

HTML(value='Query job 06f79d18-0bd6-48cd-b7a7-465efb48f651 is DONE. 21.2 kB processed. <a target="_blank" href…

HTML(value='Query job a6917951-c692-4de6-b03e-67250d8fedc9 is RUNNING. <a target="_blank" href="https://consol…

Unnamed: 0,species,island,body_mass_g
1,Adelie Penguin (Pygoscelis adeliae),Dream,4650.0
5,Adelie Penguin (Pygoscelis adeliae),Dream,4100.0
10,Adelie Penguin (Pygoscelis adeliae),Dream,4300.0
18,Adelie Penguin (Pygoscelis adeliae),Dream,4250.0
25,Chinstrap penguin (Pygoscelis antarctica),Dream,4050.0
26,Adelie Penguin (Pygoscelis adeliae),Dream,4000.0
27,Chinstrap penguin (Pygoscelis antarctica),Dream,4050.0
28,Chinstrap penguin (Pygoscelis antarctica),Dream,4300.0
30,Chinstrap penguin (Pygoscelis antarctica),Dream,4450.0
36,Adelie Penguin (Pygoscelis adeliae),Dream,4450.0


# Using the Remote Functions

### BigQuery DataFrames gives you the ability to turn your custom scalar functions into a BigQuery remote function.

This requires the GCP project to be set up appropriately and the user to have sufficient privileges to use remote functions. More details are available via the `help` command.

In [11]:
# NOTE: `pd` is bound to bigframes.pandas here, not to the pandas package
import bigframes.pandas as pd
# Print the documentation of the remote_function decorator
help(pd.remote_function)

Help on function remote_function in module bigframes.pandas:

remote_function(input_types: 'List[type]', output_type: 'type', dataset: 'Optional[str]' = None, bigquery_connection: 'Optional[str]' = None, reuse: 'bool' = True)
    Decorator to turn a user defined function into a BigQuery remote function.
    
    .. note::
        Please make sure following is setup before using this API:
    
        1. Have the below APIs enabled for your project:
              a. BigQuery Connection API
              b. Cloud Functions API
              c. Cloud Run API
              d. Cloud Build API
              e. Artifact Registry API
              f. Cloud Resource Manager API
    
          This can be done from the cloud console (change PROJECT_ID to yours):
              https://console.cloud.google.com/apis/enableflow?apiid=bigqueryconnection.googleapis.com,cloudfunctions.googleapis.com,run.googleapis.com,cloudbuild.googleapis.com,artifactregistry.googleapis.com,cloudresourcemanager.google

### Define a custom function, and specify the intent to turn it into a remote function.

It requires a BigQuery connection. If the connection is not already created,
the BigQuery DataFrames package attempts to create one, assuming the necessary
APIs and IAM permissions are set up in the project.

In [12]:
@pd.remote_function([float], str, bigquery_connection='bigframes-rf-conn')
def get_bucket(num):
    """Label a numeric value by its position relative to the 4000 boundary.

    Args:
        num: The value to classify, or None when the value is missing.

    Returns:
        "NA" when ``num`` is None, "at_or_above_4000" when ``num`` is at
        least 4000, and "below_4000" otherwise.
    """
    # Test for None explicitly: a truthiness check (`not num`) would also
    # classify a legitimate 0.0 as missing.
    if num is None:
        return "NA"
    boundary = 4000
    return "at_or_above_4000" if num >= boundary else "below_4000"

[INFO][2023-08-05 23:12:12,870][bigframes.remote_function] Creating new cloud function: gcloud functions deploy bigframes-f9320ad496b5aeca2d7f343cbab03e2f --gen2 --runtime=python310 --project=bigframes-dev --region=us-central1 --source=/tmp/tmps5m0qu4z --entry-point=udf_http --trigger-http --no-allow-unauthenticated
[INFO][2023-08-05 23:13:20,660][bigframes.remote_function] Successfully created cloud function bigframes-f9320ad496b5aeca2d7f343cbab03e2f with uri (https://bigframes-f9320ad496b5aeca2d7f343cbab03e2f-7krlje3eoq-uc.a.run.app)
[INFO][2023-08-05 23:13:32,717][bigframes.remote_function] Connector bigframes-rf-conn already exists
[INFO][2023-08-05 23:13:32,719][bigframes.remote_function] Creating BQ remote function: 
    CREATE OR REPLACE FUNCTION `bigframes-dev.bigframes_temp_us`.bigframes_f9320ad496b5aeca2d7f343cbab03e2f(num FLOAT64)
    RETURNS STRING
    REMOTE WITH CONNECTION `bigframes-dev.us.bigframes-rf-conn`
    OPTIONS (
      endpoint = "https://bigframes-f9320ad496b5a

### Run the custom function on the BigQuery-backed dataframe

In [13]:
# Run get_bucket over the body_mass_g series and attach the result as a new column
body_mass_bucket = df['body_mass_g'].apply(get_bucket)
df = df.assign(body_mass_bucket=body_mass_bucket)

# Show each body mass next to its bucket for the first ten rows
df[['body_mass_g', 'body_mass_bucket']].head(10)

HTML(value='Query job 5f30816f-f4d0-4063-bb9e-2905b89f717d is DONE. 2.8 kB processed. <a target="_blank" href=…

HTML(value='Query job 447748be-7eaa-486a-a67c-65a595f413cb is RUNNING. <a target="_blank" href="https://consol…

HTML(value='Query job 0ea806ba-d42d-4464-897b-f60210695d69 is DONE. 5.5 kB processed. <a target="_blank" href=…

Unnamed: 0,body_mass_g,body_mass_bucket
0,3475.0,below_4000
1,4650.0,at_or_above_4000
2,3900.0,below_4000
3,3500.0,below_4000
4,3000.0,below_4000
5,4100.0,at_or_above_4000
6,2700.0,below_4000
7,3400.0,below_4000
8,3800.0,below_4000
9,3975.0,below_4000


# Using the ML API

### Initialize a DataFrame from a BigQuery table

In [14]:
# Re-read the table named by PENGUINS_TABLE; this rebinds `df`, replacing the
# frame built in the earlier sections
df = bigframes.pandas.read_gbq(PENGUINS_TABLE)
# Display the frame
df

HTML(value='Query job acd770bb-5ccb-463f-beec-2386132ded6b is RUNNING. <a target="_blank" href="https://consol…

HTML(value='Query job a4529f0b-cc46-498b-8c70-74e0ba660674 is DONE. 0 Bytes processed. <a target="_blank" href…

HTML(value='Query job 5b7b38a5-0720-4797-8401-179668134038 is DONE. 31.7 kB processed. <a target="_blank" href…

HTML(value='Query job c8370cb9-4f79-4766-a43c-e24db7ba6b50 is DONE. 0 Bytes processed. <a target="_blank" href…

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie Penguin (Pygoscelis adeliae),Dream,36.6,18.4,184.0,3475.0,FEMALE
1,Adelie Penguin (Pygoscelis adeliae),Dream,39.8,19.1,184.0,4650.0,MALE
2,Adelie Penguin (Pygoscelis adeliae),Dream,40.9,18.9,184.0,3900.0,MALE
3,Chinstrap penguin (Pygoscelis antarctica),Dream,46.5,17.9,192.0,3500.0,FEMALE
4,Adelie Penguin (Pygoscelis adeliae),Dream,37.3,16.8,192.0,3000.0,FEMALE
5,Adelie Penguin (Pygoscelis adeliae),Dream,43.2,18.5,192.0,4100.0,MALE
6,Chinstrap penguin (Pygoscelis antarctica),Dream,46.9,16.6,192.0,2700.0,FEMALE
7,Chinstrap penguin (Pygoscelis antarctica),Dream,50.5,18.4,200.0,3400.0,FEMALE
8,Chinstrap penguin (Pygoscelis antarctica),Dream,49.5,19.0,200.0,3800.0,MALE
9,Adelie Penguin (Pygoscelis adeliae),Dream,40.2,20.1,200.0,3975.0,MALE


### Clean and prepare the data

In [15]:
# filter down to the data we want to analyze
adelie_data = df[df.species == "Adelie Penguin (Pygoscelis adeliae)"]

# drop the columns we don't care about
adelie_data = adelie_data.drop(columns=["species"])

# drop rows with nulls to get our training data
training_data = adelie_data.dropna()

# take a peek at the training data
training_data

HTML(value='Query job 6cb59d7e-091b-4c08-833d-3e189972c28e is DONE. 28.9 kB processed. <a target="_blank" href…

HTML(value='Query job b42f2b25-a040-414f-afc0-dfdfde0e358b is DONE. 31.7 kB processed. <a target="_blank" href…

HTML(value='Query job 9a68cc3f-eade-4c5c-8d18-1eb3277678b6 is DONE. 0 Bytes processed. <a target="_blank" href…

Unnamed: 0,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Dream,36.6,18.4,184.0,3475.0,FEMALE
1,Dream,39.8,19.1,184.0,4650.0,MALE
2,Dream,40.9,18.9,184.0,3900.0,MALE
4,Dream,37.3,16.8,192.0,3000.0,FEMALE
5,Dream,43.2,18.5,192.0,4100.0,MALE
9,Dream,40.2,20.1,200.0,3975.0,MALE
10,Dream,40.8,18.9,208.0,4300.0,MALE
11,Dream,39.0,18.7,185.0,3650.0,MALE
12,Dream,37.0,16.9,185.0,3000.0,FEMALE
14,Dream,34.0,17.1,185.0,3400.0,FEMALE


In [16]:
# pick feature columns and label column
feature_columns = training_data[['island', 'culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'sex']]
label_columns = training_data[['body_mass_g']]

# also get the rows that we want to make predictions for (i.e. where the feature column is null)
missing_body_mass = adelie_data[adelie_data.body_mass_g.isnull()]

### Train and evaluate a linear regression model using the ML API

In [17]:
from bigframes.ml.linear_model import LinearRegression

# as in scikit-learn, a newly created model is just a bundle of parameters
# default parameters are fine here
model = LinearRegression()

# this will train a temporary model in BigQuery Machine Learning
model.fit(feature_columns, label_columns)

# check how the model performed
# NOTE: scoring uses the same feature_columns/label_columns that were passed
# to fit() above, so these metrics describe the fit on the training data
model.score(feature_columns, label_columns)

HTML(value='Query job 4b0c58e4-4752-4b96-b490-a95e3ae326c0 is DONE. 31.9 kB processed. <a target="_blank" href…

HTML(value='Query job 741394fb-1599-44d2-9ba2-7e179c906dc5 is RUNNING. <a target="_blank" href="https://consol…

HTML(value='Query job 015ccf47-1d52-465f-8201-2a24122d4006 is DONE. 0 Bytes processed. <a target="_blank" href…

HTML(value='Query job eaefd749-80bf-49e0-a0f9-93ea5315462f is DONE. 56 Bytes processed. <a target="_blank" hre…

HTML(value='Query job e931ec39-9c02-4a04-bd47-fe82d299780b is DONE. 0 Bytes processed. <a target="_blank" href…

Unnamed: 0,mean_absolute_error,mean_squared_error,mean_squared_log_error,median_absolute_error,r2_score,explained_variance
0,223.878763,78553.601634,0.005614,181.330911,0.623951,0.623951


### Make predictions using the model

In [18]:
# Predict for the rows collected in missing_body_mass (those with a null body_mass_g)
model.predict(missing_body_mass)

HTML(value='Query job fe2fe252-8433-4d20-861c-681a8dfbf2c4 is RUNNING. <a target="_blank" href="https://consol…

HTML(value='Query job 929db8ef-fc3b-41ae-a646-102e6ef94c5f is DONE. 8 Bytes processed. <a target="_blank" href…

HTML(value='Query job 95a08810-311e-4cb9-aad5-ba867b384ad7 is DONE. 0 Bytes processed. <a target="_blank" href…

HTML(value='Query job ff0d54a9-3e5f-4f45-a511-17f05c2a5101 is DONE. 16 Bytes processed. <a target="_blank" hre…

HTML(value='Query job f56133e3-e97f-4413-a909-5ff150b70b9a is DONE. 0 Bytes processed. <a target="_blank" href…

Unnamed: 0,predicted_body_mass_g
292,3459.735118


### Save the trained model to BigQuery, so we can load it later

In [19]:
# Save the model as "penguins_model" in the dataset named by DATASET.
# replace=True presumably overwrites an existing model of that name — confirm
model.to_gbq(f"{DATASET}.penguins_model", replace=True)

LinearRegression()