# Using ML - SKLearn linear regression

This demo shows how we can implement a linear regression in BigQuery DataFrames ML, with API that is exactly compatible with Scikit-Learn.

## 1. Init & load data

In [1]:
# Initialize BigQuery DataFrame
import bigframes.pandas

# read a BigQuery table to a BigQuery DataFrame
df = bigframes.pandas.read_gbq("bigframes-dev.bqml_tutorial.penguins")

# take a peek at the dataframe
df

Unnamed: 0,tag_number,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,1225,Gentoo penguin (Pygoscelis papua),Biscoe,,,,,
1,1278,Gentoo penguin (Pygoscelis papua),Biscoe,42.0,13.5,210.0,4150.0,FEMALE
2,1275,Gentoo penguin (Pygoscelis papua),Biscoe,46.5,13.5,210.0,4550.0,FEMALE
3,1233,Gentoo penguin (Pygoscelis papua),Biscoe,43.3,14.0,208.0,4575.0,FEMALE
4,1311,Gentoo penguin (Pygoscelis papua),Biscoe,47.5,14.0,212.0,4875.0,FEMALE
5,1316,Gentoo penguin (Pygoscelis papua),Biscoe,49.1,14.5,212.0,4625.0,FEMALE
6,1313,Gentoo penguin (Pygoscelis papua),Biscoe,45.5,14.5,212.0,4750.0,FEMALE
7,1381,Gentoo penguin (Pygoscelis papua),Biscoe,47.6,14.5,215.0,5400.0,MALE
8,1377,Gentoo penguin (Pygoscelis papua),Biscoe,45.1,14.5,207.0,5050.0,FEMALE
9,1380,Gentoo penguin (Pygoscelis papua),Biscoe,45.1,14.5,215.0,5000.0,FEMALE


## 2. Data cleaning / prep

In [2]:
# set a friendlier index to uniquely identify the rows
df = df.set_index("tag_number")

# filter down to the data we want to analyze
adelie_data = df[df.species == "Adelie Penguin (Pygoscelis adeliae)"]

# drop the columns we don't care about
adelie_data = adelie_data.drop(columns=["species"])

# drop rows with nulls to get our training data
training_data = adelie_data.dropna()

# take a peek at the training data
training_data

Unnamed: 0_level_0,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
tag_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1172,Dream,32.1,15.5,188.0,3050.0,FEMALE
1371,Biscoe,37.7,16.0,183.0,3075.0,FEMALE
1417,Torgersen,38.6,17.0,188.0,2900.0,FEMALE
1204,Dream,40.7,17.0,190.0,3725.0,MALE
1251,Biscoe,37.6,17.0,185.0,3600.0,FEMALE
1422,Torgersen,35.7,17.0,189.0,3350.0,FEMALE
1394,Torgersen,40.2,17.0,176.0,3450.0,FEMALE
1163,Dream,36.4,17.0,195.0,3325.0,FEMALE
1329,Biscoe,38.1,17.0,181.0,3175.0,FEMALE
1406,Torgersen,44.1,18.0,210.0,4000.0,MALE


## 3. Use `model_selection.train_test_split` to prepare training data

In [3]:
from bigframes.ml.model_selection import train_test_split

feature_columns = training_data[['island', 'culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'sex']]
label_columns = training_data[['body_mass_g']] 

train_X, test_X, train_y, test_y = train_test_split(
  feature_columns, label_columns, test_size=0.2)

## 4. Configure a linear regression pipeline with preprocessing

In [4]:
from bigframes.ml.linear_model import LinearRegression
from bigframes.ml.pipeline import Pipeline
from bigframes.ml.compose import ColumnTransformer
from bigframes.ml.preprocessing import StandardScaler, OneHotEncoder

preprocessing = ColumnTransformer([
  ("onehot", OneHotEncoder(), ["island", "species", "sex"]),
  ("scaler", StandardScaler(), ["culmen_depth_mm", "culmen_length_mm", "flipper_length_mm"]),
])

model = LinearRegression(fit_intercept=False)

pipeline = Pipeline([
  ('preproc', preprocessing),
  ('linreg', model)
])

# TODO(bmil): pretty printing for pipelines
pipeline

Pipeline(steps=[('preproc',
                 ColumnTransformer(transformers=[('onehot', OneHotEncoder(),
                                                  ['island', 'species', 'sex']),
                                                 ('scaler', StandardScaler(),
                                                  ['culmen_depth_mm',
                                                   'culmen_length_mm',
                                                   'flipper_length_mm'])])),
                ('linreg', LinearRegression(fit_intercept=False))])

## 5. Fit the pipeline to the training data

This will create a temporary BQML model in BigQuery

In [5]:
pipeline.fit(train_X, train_y)

## 6. Score the pipeline on the test data with `metrics.r2_score`

In [6]:
from bigframes.ml.metrics import r2_score

pred_y = pipeline.predict(test_X)

r2_score(test_y, pred_y)

0.4597467714807377

## 5. Inference the model on new data

In [7]:
import pandas

new_penguins = bigframes.pandas.read_pandas(
        pandas.DataFrame(
            {
                "tag_number": [1633, 1672, 1690],
                "species": [
                    "Adelie Penguin (Pygoscelis adeliae)",
                    "Adelie Penguin (Pygoscelis adeliae)",
                    "Adelie Penguin (Pygoscelis adeliae)",
                ],
                "island": ["Torgersen", "Torgersen", "Dream"],
                "culmen_length_mm": [39.5, 38.5, 37.9],
                "culmen_depth_mm": [18.8, 17.2, 18.1],
                "flipper_length_mm": [196.0, 181.0, 188.0],
                "sex": ["MALE", "FEMALE", "FEMALE"],
            }
        ).set_index("tag_number")
    )

In [8]:
pipeline.predict(new_penguins)

Unnamed: 0_level_0,predicted_body_mass_g
tag_number,Unnamed: 1_level_1
1633,4034.682043
1672,3267.196667
1690,3445.920252


## 4. Save in BigQuery

In [9]:
pipeline.to_gbq("bigframes-dev.bigframes_demo_us.penguin_model", replace=True)

Pipeline(steps=[('preproc',
                 ColumnTransformer(transformers=[('onehot', OneHotEncoder(),
                                                  ['island', 'species', 'sex']),
                                                 ('scaler', StandardScaler(),
                                                  ['culmen_depth_mm',
                                                   'culmen_length_mm',
                                                   'flipper_length_mm'])])),
                ('linreg', LinearRegression(fit_intercept=False))])