# BigFrames ML Cross-Validation

This demo shows how to perform cross-validation with `bigframes.ml`.

## 1. Prepare Data

In [1]:
import bigframes.pandas as bpd

In [2]:
# Load the penguins table from BigQuery and drop the rows with missing values
# in a single chained expression, then display the cleaned frame.
df = bpd.read_gbq("bigframes-dev.bqml_tutorial.penguins").dropna()
df

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Gentoo penguin (Pygoscelis papua),Biscoe,45.2,16.4,223.0,5950.0,MALE
1,Gentoo penguin (Pygoscelis papua),Biscoe,46.5,14.5,213.0,4400.0,FEMALE
2,Adelie Penguin (Pygoscelis adeliae),Biscoe,37.7,16.0,183.0,3075.0,FEMALE
3,Gentoo penguin (Pygoscelis papua),Biscoe,46.4,15.6,221.0,5000.0,MALE
4,Gentoo penguin (Pygoscelis papua),Biscoe,46.1,13.2,211.0,4500.0,FEMALE
5,Adelie Penguin (Pygoscelis adeliae),Torgersen,43.1,19.2,197.0,3500.0,MALE
6,Gentoo penguin (Pygoscelis papua),Biscoe,45.2,15.8,215.0,5300.0,MALE
7,Adelie Penguin (Pygoscelis adeliae),Dream,36.2,17.3,187.0,3300.0,FEMALE
8,Chinstrap penguin (Pygoscelis antarctica),Dream,46.0,18.9,195.0,4150.0,FEMALE
9,Gentoo penguin (Pygoscelis papua),Biscoe,54.3,15.7,231.0,5650.0,MALE


In [3]:
# Pick the feature columns (X) and the target column (y) from the dataset.
X = df[["species", "island", "culmen_length_mm"]]
y = df["body_mass_g"]

## 2.1 Create a KFold Instance and Train/Test on Each Fold (Manual Approach)

In [4]:
from bigframes.ml import model_selection, linear_model

In [5]:
# Create a KFold instance. n_splits sets how many folds the data is split into.
# For example, n_splits=5 splits the entire dataset into 5 pieces; in each fold,
# 4 pieces are used for training and the remaining piece is used for evaluation.
kf = model_selection.KFold(n_splits=5)

In [6]:
# Manual cross validation: kf.split(X, y) yields one
# (X_train, X_test, y_train, y_test) tuple per fold.
for X_train, X_test, y_train, y_test in kf.split(X, y):
    # Train a fresh linear regression model on this fold's training split.
    model = linear_model.LinearRegression()
    model.fit(X_train, y_train)
    # Evaluate on the held-out split of the same fold.
    score = model.score(X_test, y_test)

    # Print the evaluation metrics for this fold.
    print(score)

   mean_absolute_error  mean_squared_error  mean_squared_log_error  \
0            297.36838       148892.914876                0.009057   

   median_absolute_error  r2_score  explained_variance  
0             238.424052  0.814613            0.816053  

[1 rows x 6 columns]


   mean_absolute_error  mean_squared_error  mean_squared_log_error  \
0             307.6149       139013.303482                0.007907   

   median_absolute_error  r2_score  explained_variance  
0             266.589811  0.782835            0.794297  

[1 rows x 6 columns]


   mean_absolute_error  mean_squared_error  mean_squared_log_error  \
0           348.412701       180661.063512                 0.01125   

   median_absolute_error  r2_score  explained_variance  
0              313.29406  0.744053             0.74537  

[1 rows x 6 columns]


   mean_absolute_error  mean_squared_error  mean_squared_log_error  \
0           309.991882       151820.705254                0.008898   

   median_absolute_error  r2_score  explained_variance  
0             212.758708  0.694001            0.694287  

[1 rows x 6 columns]


   mean_absolute_error  mean_squared_error  mean_squared_log_error  \
0           256.569216       103495.042886                0.006605   

   median_absolute_error  r2_score  explained_variance  
0             222.940815  0.818589            0.832344  

[1 rows x 6 columns]


## 2.2 Use cross_validate Function to Do Cross Validation (Automatic Approach)

In [7]:
# model_selection.cross_validate automates the manual process from section 2.1.
# The returned scores contain the evaluation results for each fold
# (see the 'test_score' entry in the output below).
model = linear_model.LinearRegression()
scores = model_selection.cross_validate(model, X, y, cv=5)
scores

{'test_score': [   mean_absolute_error  mean_squared_error  mean_squared_log_error  \
  0           322.341485       157616.627179                0.009137   
  
     median_absolute_error  r2_score  explained_variance  
  0             269.412639  0.705594            0.724882  
  
  [1 rows x 6 columns],
     mean_absolute_error  mean_squared_error  mean_squared_log_error  \
  0           289.682121       136550.318797                 0.00878   
  
     median_absolute_error  r2_score  explained_variance  
  0             212.874686  0.799363             0.81416  
  
  [1 rows x 6 columns],
     mean_absolute_error  mean_squared_error  mean_squared_log_error  \
  0           325.358522       155218.752974                0.009606   
  
     median_absolute_error  r2_score  explained_variance  
  0             267.301671  0.777174              0.7782  
  
  [1 rows x 6 columns],
     mean_absolute_error  mean_squared_error  mean_squared_log_error  \
  0           286.874056       120586.