# Scikit-Learn API example

This demo shows how to implement the same Linear Regression demo as in `03 - Linear Regression.ipynb` using a Scikit-Learn-compatible API.

## 1. Init & load data

In [4]:
# Open a BigFrames session, used below to read from BigQuery.
import bigframes

session = bigframes.connect()

# Fully-qualified id of the BigQuery table to load.
penguins_table = "bigframes-dev.bqml_tutorial.penguins"

# Load the table into a BigFrames dataframe.
df = session.read_gbq(penguins_table)

# Display the dataframe as this cell's output.
df

     tag_number                                    species     island  \
41         1256          Gentoo penguin (Pygoscelis papua)     Biscoe   
73         1289          Gentoo penguin (Pygoscelis papua)     Biscoe   
75         1382          Gentoo penguin (Pygoscelis papua)     Biscoe   
93         1318          Gentoo penguin (Pygoscelis papua)     Biscoe   
125        1172        Adelie Penguin (Pygoscelis adeliae)      Dream   
236        1411        Adelie Penguin (Pygoscelis adeliae)  Torgersen   
294        1189  Chinstrap penguin (Pygoscelis antarctica)      Dream   
299        1151  Chinstrap penguin (Pygoscelis antarctica)      Dream   
312        1161  Chinstrap penguin (Pygoscelis antarctica)      Dream   
37         1297          Gentoo penguin (Pygoscelis papua)     Biscoe   

     culmen_length_mm  culmen_depth_mm  flipper_length_mm  body_mass_g     sex  
41               48.1             15.1              209.0       5500.0    MALE  
73               50.8             

## 2. Data cleaning / prep

In [5]:
# Use tag_number as the index so each row is uniquely identified.
# NOTE(review): this rebinds `df`, so re-running this cell without first
# re-running the load cell will presumably fail (tag_number is no longer a
# column) - confirm.
df = df.set_index("tag_number")

# Keep only the Adelie penguins, the rows this demo analyzes.
adelie_data = df[df.species == "Adelie Penguin (Pygoscelis adeliae)"]

# Drop the species column: it holds a single value after the filter above.
# `columns=` is passed explicitly because, under pandas semantics, a bare
# positional label list is matched against the row index, not the columns.
adelie_data = adelie_data.drop(columns=["species"])

# Remove rows containing nulls to obtain the training data.
training_data = adelie_data.dropna()

# Display the training data as this cell's output.
training_data

           island  culmen_length_mm  culmen_depth_mm  flipper_length_mm  \
tag_number                                                                
1101        Dream              36.6             18.4              184.0   
1102        Dream              39.8             19.1              184.0   
1103        Dream              40.9             18.9              184.0   
1105        Dream              37.3             16.8              192.0   
1106        Dream              43.2             18.5              192.0   
1110        Dream              40.2             20.1              200.0   
1111        Dream              40.8             18.9              208.0   
1112        Dream              39.0             18.7              185.0   
1113        Dream              37.0             16.9              185.0   
1115        Dream              34.0             17.1              185.0   

            body_mass_g     sex  
tag_number                       
1101             3475.0  FEMALE

In [6]:
# Names of the model input columns and of the column to predict.
feature_names = [
    "island",
    "culmen_length_mm",
    "culmen_depth_mm",
    "flipper_length_mm",
    "sex",
]
label_names = ["body_mass_g"]

# Slice the training data into a feature dataframe and a label dataframe.
feature_columns = training_data[feature_names]
label_columns = training_data[label_names]

# Rows whose label (body_mass_g) is null are the ones we want predictions for.
body_mass_is_missing = adelie_data["body_mass_g"].isnull()
missing_body_mass = adelie_data[body_mass_is_missing]

## 3. Create, fit, score and predict with the model - this time with manual preprocessing and data split

In the original version of this demo, we used BQML's automatic preprocessing and automatic test/train data split.

Here, we instead specify these steps manually, as is the norm in Scikit-Learn.

In [9]:
from bigframes.ml.linear_model import LinearRegression
from bigframes.ml.pipeline import Pipeline
from bigframes.ml.compose import ColumnTransformer
from bigframes.ml.model_selection import train_test_split
from bigframes.ml.preprocessing import StandardScaler, OneHotEncoder
# from bigframes.ml.metrics import r2_score

# Manual preprocessing: one-hot encode the categorical features and
# standardize the numeric ones. Only columns present in feature_columns are
# listed; "species" was dropped during data prep, so it must not appear here.
preprocessing = ColumnTransformer([
  ("onehot", OneHotEncoder(), ["island", "sex"]),
  ("scaler", StandardScaler(), ["culmen_depth_mm", "culmen_length_mm", "flipper_length_mm"]),
])

# TODO(bmil): need to disable auto preproc and auto test split by default
model = LinearRegression()

# Chain the preprocessing step and the regressor into a single estimator.
pipeline = Pipeline([
  ('preproc', preprocessing),
  ('linreg', model)
])

# Hold out 20% of the rows as a test set.
train_X, test_X, train_y, test_y = train_test_split(
  feature_columns, label_columns, test_size=0.2)

# this will train a temporary model in BQML
pipeline.fit(train_X, train_y)

# score test data
# TODO(bmil): Implement r2_score
# pred_y = model.predict(test_X)
# r2_score(test_y, pred_y)

# use the fitted pipeline to predict body mass for the rows where it is null
pipeline.predict(missing_body_mass)


                predicted_body_mass_g
tag_number_z_z                       
1525                      3853.343683
1393                      2435.813817
1524                      4310.513425
1523                      3480.749595

[4 rows x 1 columns]

## 4. Save in BigQuery

In [None]:
# TODO(bmil): implement Pipeline.to_gbq
# Placeholder cell: saving the pipeline to BigQuery is not implemented yet,
# so this section intentionally contains no executable code.