# Using ML - SKLearn linear regression

This demo shows how we can implement a linear regression in BigQuery DataFrames ML, with API that is exactly compatible with Scikit-Learn.

## 1. Init & load data

In [12]:
# Initialize BigQuery DataFrame
import bigframes.pandas

# read a BigQuery table to a BigQuery DataFrame
df = bigframes.pandas.read_gbq("bigframes-dev.bqml_tutorial.penguins")

# take a peek at the dataframe
df

HTML(value='Query job d47c23df-1830-4451-9016-7747c1420abd is RUNNING. <a target="_blank" href="https://consol…

HTML(value='Query job 7ada8963-c6e6-4d46-b42d-8a8cc6d6ead2 is DONE. 0 Bytes processed. <a target="_blank" href…

HTML(value='Query job 31a5b656-000e-4238-9fd9-c6e644ca298f is DONE. 31.7 kB processed. <a target="_blank" href…

HTML(value='Query job d8eed0ca-7ce9-4ed8-a592-e16af9f9db8d is DONE. 0 Bytes processed. <a target="_blank" href…

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie Penguin (Pygoscelis adeliae),Dream,36.6,18.4,184.0,3475.0,FEMALE
1,Adelie Penguin (Pygoscelis adeliae),Dream,39.8,19.1,184.0,4650.0,MALE
2,Adelie Penguin (Pygoscelis adeliae),Dream,40.9,18.9,184.0,3900.0,MALE
3,Chinstrap penguin (Pygoscelis antarctica),Dream,46.5,17.9,192.0,3500.0,FEMALE
4,Adelie Penguin (Pygoscelis adeliae),Dream,37.3,16.8,192.0,3000.0,FEMALE
5,Adelie Penguin (Pygoscelis adeliae),Dream,43.2,18.5,192.0,4100.0,MALE
6,Chinstrap penguin (Pygoscelis antarctica),Dream,46.9,16.6,192.0,2700.0,FEMALE
7,Chinstrap penguin (Pygoscelis antarctica),Dream,50.5,18.4,200.0,3400.0,FEMALE
8,Chinstrap penguin (Pygoscelis antarctica),Dream,49.5,19.0,200.0,3800.0,MALE
9,Adelie Penguin (Pygoscelis adeliae),Dream,40.2,20.1,200.0,3975.0,MALE


## 2. Data cleaning / prep

In [13]:
# filter down to the data we want to analyze
adelie_data = df[df.species == "Adelie Penguin (Pygoscelis adeliae)"]

# drop the columns we don't care about
adelie_data = adelie_data.drop(columns=["species"])

# drop rows with nulls to get our training data
training_data = adelie_data.dropna()

# take a peek at the training data
training_data

HTML(value='Query job 3537a10a-641a-4d40-ae47-449c641b1bc5 is DONE. 28.9 kB processed. <a target="_blank" href…

HTML(value='Query job 34101409-7c65-4045-ad52-c6ba24dc9cbb is DONE. 31.7 kB processed. <a target="_blank" href…

HTML(value='Query job 74190ac2-21a2-47b0-bc21-ef5373565f17 is DONE. 0 Bytes processed. <a target="_blank" href…

Unnamed: 0,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Dream,36.6,18.4,184.0,3475.0,FEMALE
1,Dream,39.8,19.1,184.0,4650.0,MALE
2,Dream,40.9,18.9,184.0,3900.0,MALE
4,Dream,37.3,16.8,192.0,3000.0,FEMALE
5,Dream,43.2,18.5,192.0,4100.0,MALE
9,Dream,40.2,20.1,200.0,3975.0,MALE
10,Dream,40.8,18.9,208.0,4300.0,MALE
11,Dream,39.0,18.7,185.0,3650.0,MALE
12,Dream,37.0,16.9,185.0,3000.0,FEMALE
14,Dream,34.0,17.1,185.0,3400.0,FEMALE


## 3. Use `model_selection.train_test_split` to prepare training data

In [14]:
from bigframes.ml.model_selection import train_test_split

feature_columns = training_data[['island', 'culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'sex']]
label_columns = training_data[['body_mass_g']] 

train_X, test_X, train_y, test_y = train_test_split(
  feature_columns, label_columns, test_size=0.2)

HTML(value='Query job 288f0daa-a51e-45b4-86bf-d054467c4a99 is DONE. 28.9 kB processed. <a target="_blank" href…

## 4. Configure a linear regression pipeline with preprocessing

In [15]:
from bigframes.ml.linear_model import LinearRegression
from bigframes.ml.pipeline import Pipeline
from bigframes.ml.compose import ColumnTransformer
from bigframes.ml.preprocessing import StandardScaler, OneHotEncoder

preprocessing = ColumnTransformer([
  ("onehot", OneHotEncoder(), ["island", "species", "sex"]),
  ("scaler", StandardScaler(), ["culmen_depth_mm", "culmen_length_mm", "flipper_length_mm"]),
])

model = LinearRegression(fit_intercept=False)

pipeline = Pipeline([
  ('preproc', preprocessing),
  ('linreg', model)
])

# TODO(bmil): pretty printing for pipelines
pipeline

Pipeline(steps=[('preproc',
                 ColumnTransformer(transformers=[('onehot', OneHotEncoder(),
                                                  ['island', 'species', 'sex']),
                                                 ('scaler', StandardScaler(),
                                                  ['culmen_depth_mm',
                                                   'culmen_length_mm',
                                                   'flipper_length_mm'])])),
                ('linreg', LinearRegression(fit_intercept=False))])

## 5. Fit the pipeline to the training data

This will create a temporary BQML model in BigQuery

In [16]:
pipeline.fit(train_X, train_y)

## 6. Score the pipeline on the test data with `metrics.r2_score`

In [17]:
from bigframes.ml.metrics import r2_score

pred_y = pipeline.predict(test_X)

r2_score(test_y, pred_y)

HTML(value='Query job 81196f97-304b-4d77-bb0f-8fc8adb8fe75 is RUNNING. <a target="_blank" href="https://consol…

HTML(value='Query job b417f27a-387d-4eb2-8d6d-287327ef0471 is DONE. 232 Bytes processed. <a target="_blank" hr…

HTML(value='Query job b7f89a61-d76a-47be-8b83-917d69f255a2 is DONE. 31.7 kB processed. <a target="_blank" href…

HTML(value='Query job 9619c393-90b3-4fea-a197-d09389e9486c is DONE. 31.7 kB processed. <a target="_blank" href…

HTML(value='Query job e5854451-ffb4-4a28-a25f-3bdd68e9edae is DONE. 32.2 kB processed. <a target="_blank" href…

0.6757452736197735

## 5. Inference the model on new data

In [18]:
import pandas

new_penguins = bigframes.pandas.read_pandas(
        pandas.DataFrame(
            {
                "tag_number": [1633, 1672, 1690],
                "species": [
                    "Adelie Penguin (Pygoscelis adeliae)",
                    "Adelie Penguin (Pygoscelis adeliae)",
                    "Adelie Penguin (Pygoscelis adeliae)",
                ],
                "island": ["Torgersen", "Torgersen", "Dream"],
                "culmen_length_mm": [39.5, 38.5, 37.9],
                "culmen_depth_mm": [18.8, 17.2, 18.1],
                "flipper_length_mm": [196.0, 181.0, 188.0],
                "sex": ["MALE", "FEMALE", "FEMALE"],
            }
        ).set_index("tag_number")
    )

HTML(value='Load job d4c2f933-3514-4901-bcd7-888ee66eba82 is RUNNING. <a target="_blank" href="https://console…

In [19]:
pipeline.predict(new_penguins)

HTML(value='Query job e4ffd919-6f69-4382-a7e5-db37c7c1fefa is RUNNING. <a target="_blank" href="https://consol…

HTML(value='Query job 6b3e3285-79e9-4137-bf3b-7b7185ef76a5 is DONE. 24 Bytes processed. <a target="_blank" hre…

HTML(value='Query job 173c4194-e194-43d2-8359-7bec83d3c861 is DONE. 0 Bytes processed. <a target="_blank" href…

HTML(value='Query job 53ba2332-590c-488d-9505-23aebaaad9cb is DONE. 48 Bytes processed. <a target="_blank" hre…

HTML(value='Query job 66e4a8e0-4cae-4e9d-86e0-17dc24f6cfbb is DONE. 0 Bytes processed. <a target="_blank" href…

Unnamed: 0_level_0,predicted_body_mass_g
tag_number,Unnamed: 1_level_1
1633,3965.994361
1672,3246.312058
1690,3456.404062


## 4. Save in BigQuery

In [20]:
pipeline.to_gbq("bigframes-dev.bigframes_demo_us.penguin_model", replace=True)

Pipeline(steps=[('preproc',
                 ColumnTransformer(transformers=[('onehot', OneHotEncoder(),
                                                  ['island', 'species', 'sex']),
                                                 ('scaler', StandardScaler(),
                                                  ['culmen_depth_mm',
                                                   'culmen_length_mm',
                                                   'flipper_length_mm'])])),
                ('linreg', LinearRegression(fit_intercept=False))])