# Using ML - SKLearn linear regression

This demo shows how we can implement a linear regression in BigQuery DataFrames ML, with API that is exactly compatible with Scikit-Learn.

## 1. Init & load data

In [1]:
# Initialize BigQuery DataFrame
import bigframes.pandas

# read a BigQuery table to a BigQuery DataFrame
df = bigframes.pandas.read_gbq("bigframes-dev.bqml_tutorial.penguins")

# take a peek at the dataframe
df

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie Penguin (Pygoscelis adeliae),Biscoe,40.1,18.9,188.0,4300.0,MALE
1,Adelie Penguin (Pygoscelis adeliae),Torgersen,39.1,18.7,181.0,3750.0,MALE
2,Gentoo penguin (Pygoscelis papua),Biscoe,47.4,14.6,212.0,4725.0,FEMALE
3,Chinstrap penguin (Pygoscelis antarctica),Dream,42.5,16.7,187.0,3350.0,FEMALE
4,Adelie Penguin (Pygoscelis adeliae),Biscoe,43.2,19.0,197.0,4775.0,MALE
5,Gentoo penguin (Pygoscelis papua),Biscoe,46.7,15.3,219.0,5200.0,MALE
6,Adelie Penguin (Pygoscelis adeliae),Biscoe,41.3,21.1,195.0,4400.0,MALE
7,Gentoo penguin (Pygoscelis papua),Biscoe,45.2,13.8,215.0,4750.0,FEMALE
8,Gentoo penguin (Pygoscelis papua),Biscoe,46.5,13.5,210.0,4550.0,FEMALE
9,Gentoo penguin (Pygoscelis papua),Biscoe,50.5,15.2,216.0,5000.0,FEMALE


## 2. Data cleaning / prep

In [2]:
# filter down to the data we want to analyze
adelie_data = df[df.species == "Adelie Penguin (Pygoscelis adeliae)"]

# drop the columns we don't care about
adelie_data = adelie_data.drop(columns=["species"])

# drop rows with nulls to get our training data
training_data = adelie_data.dropna()

# take a peek at the training data
training_data

Unnamed: 0,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Biscoe,40.1,18.9,188.0,4300.0,MALE
1,Torgersen,39.1,18.7,181.0,3750.0,MALE
4,Biscoe,43.2,19.0,197.0,4775.0,MALE
6,Biscoe,41.3,21.1,195.0,4400.0,MALE
11,Dream,38.1,18.6,190.0,3700.0,FEMALE
13,Biscoe,37.8,20.0,190.0,4250.0,MALE
14,Biscoe,35.0,17.9,190.0,3450.0,FEMALE
16,Torgersen,34.6,21.1,198.0,4400.0,MALE
19,Dream,37.2,18.1,178.0,3900.0,MALE
21,Biscoe,40.5,17.9,187.0,3200.0,FEMALE


## 3. Use `model_selection.train_test_split` to prepare training data

In [3]:
from bigframes.ml.model_selection import train_test_split

feature_columns = training_data[['island', 'culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'sex']]
label_columns = training_data[['body_mass_g']] 

X_train, X_test, y_train, y_test = train_test_split(
  feature_columns, label_columns, test_size=0.2)

## 4. Configure a linear regression pipeline with preprocessing

In [4]:
from bigframes.ml.linear_model import LinearRegression
from bigframes.ml.pipeline import Pipeline
from bigframes.ml.compose import ColumnTransformer
from bigframes.ml.preprocessing import StandardScaler, OneHotEncoder

preprocessing = ColumnTransformer([
  ("onehot", OneHotEncoder(), ["island", "species", "sex"]),
  ("scaler", StandardScaler(), ["culmen_depth_mm", "culmen_length_mm", "flipper_length_mm"]),
])

model = LinearRegression(fit_intercept=False)

pipeline = Pipeline([
  ('preproc', preprocessing),
  ('linreg', model)
])

# TODO(bmil): pretty printing for pipelines
pipeline

Pipeline(steps=[('preproc',
                 ColumnTransformer(transformers=[('onehot', OneHotEncoder(),
                                                  ['island', 'species', 'sex']),
                                                 ('scaler', StandardScaler(),
                                                  ['culmen_depth_mm',
                                                   'culmen_length_mm',
                                                   'flipper_length_mm'])])),
                ('linreg', LinearRegression(fit_intercept=False))])

## 5. Fit the pipeline to the training data

This will create a temporary BQML model in BigQuery

In [5]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('preproc',
                 ColumnTransformer(transformers=[('onehot', OneHotEncoder(),
                                                  ['island', 'species', 'sex']),
                                                 ('scaler', StandardScaler(),
                                                  ['culmen_depth_mm',
                                                   'culmen_length_mm',
                                                   'flipper_length_mm'])])),
                ('linreg', LinearRegression(fit_intercept=False))])

## 6. Score the pipeline on the test data with `metrics.r2_score`

In [6]:
from bigframes.ml.metrics import r2_score

y_pred = pipeline.predict(X_test)["predicted_body_mass_g"]

r2_score(y_test, y_pred)

0.2655729213572775

## 5. Inference the model on new data

In [7]:
import pandas

new_penguins = bigframes.pandas.read_pandas(
        pandas.DataFrame(
            {
                "tag_number": [1633, 1672, 1690],
                "species": [
                    "Adelie Penguin (Pygoscelis adeliae)",
                    "Adelie Penguin (Pygoscelis adeliae)",
                    "Adelie Penguin (Pygoscelis adeliae)",
                ],
                "island": ["Torgersen", "Torgersen", "Dream"],
                "culmen_length_mm": [39.5, 38.5, 37.9],
                "culmen_depth_mm": [18.8, 17.2, 18.1],
                "flipper_length_mm": [196.0, 181.0, 188.0],
                "sex": ["MALE", "FEMALE", "FEMALE"],
            }
        ).set_index("tag_number")
    )

In [8]:
pipeline.predict(new_penguins)

Unnamed: 0_level_0,predicted_body_mass_g,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,sex
tag_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1633,4017.203152,Adelie Penguin (Pygoscelis adeliae),Torgersen,39.5,18.8,196.0,MALE
1672,3127.601519,Adelie Penguin (Pygoscelis adeliae),Torgersen,38.5,17.2,181.0,FEMALE
1690,3386.101231,Adelie Penguin (Pygoscelis adeliae),Dream,37.9,18.1,188.0,FEMALE


## 6. Save in BigQuery

In [9]:
pipeline.to_gbq("bigframes-dev.bigframes_demo_us.penguin_model", replace=True)

Pipeline(steps=[('transform',
                 ColumnTransformer(transformers=[('ont_hot_encoder',
                                                  OneHotEncoder(max_categories=1000001,
                                                                min_frequency=0),
                                                  'island'),
                                                 ('standard_scaler',
                                                  StandardScaler(),
                                                  'culmen_length_mm'),
                                                 ('standard_scaler',
                                                  StandardScaler(),
                                                  'culmen_depth_mm'),
                                                 ('standard_scaler',
                                                  StandardScaler(),
                                                  'flipper_length_mm'),
                                              