# BigFrames ML Cross-Validation

This demo shows how to perform cross-validation with `bigframes.ml`.

## 1. Prepare Data

In [1]:
import bigframes.pandas as bpd

In [4]:
# Load the penguins table from BigQuery and drop every row that has a missing value.
df = bpd.read_gbq("bigframes-dev.bqml_tutorial.penguins").dropna()
df

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Gentoo penguin (Pygoscelis papua),Biscoe,50.5,15.9,225.0,5400.0,MALE
1,Gentoo penguin (Pygoscelis papua),Biscoe,45.1,14.5,215.0,5000.0,FEMALE
2,Adelie Penguin (Pygoscelis adeliae),Torgersen,41.4,18.5,202.0,3875.0,MALE
3,Adelie Penguin (Pygoscelis adeliae),Torgersen,38.6,17.0,188.0,2900.0,FEMALE
4,Gentoo penguin (Pygoscelis papua),Biscoe,46.5,14.8,217.0,5200.0,FEMALE
...,...,...,...,...,...,...,...
339,Adelie Penguin (Pygoscelis adeliae),Dream,38.1,17.6,187.0,3425.0,FEMALE
340,Adelie Penguin (Pygoscelis adeliae),Biscoe,36.4,17.1,184.0,2850.0,FEMALE
341,Chinstrap penguin (Pygoscelis antarctica),Dream,40.9,16.6,187.0,3200.0,FEMALE
342,Adelie Penguin (Pygoscelis adeliae),Biscoe,41.3,21.1,195.0,4400.0,MALE


In [5]:
# Features (X): the columns the model is trained on.
X = df[["species", "island", "culmen_length_mm"]]
# Target (y): the column the model learns to predict.
y = df["body_mass_g"]

## 2.1 Create a KFold Instance and Train/Test on Each Fold (Manual Approach)

In [6]:
from bigframes.ml import model_selection, linear_model

In [8]:
# Create a KFold instance. n_splits defines how many folds the data is split into.
# For example, n_splits=5 splits the entire dataset into 5 pieces; in each fold,
# 4 pieces are used for training and the remaining piece is used for evaluation.
kf = model_selection.KFold(n_splits=5)

In [9]:
# For each fold: fit a fresh linear regression on the training split,
# score it on the held-out split, and print that fold's metrics.
for X_train, X_test, y_train, y_test in kf.split(X, y):
    model = linear_model.LinearRegression().fit(X_train, y_train)
    score = model.score(X_test, y_test)
    print(score)

   mean_absolute_error  mean_squared_error  mean_squared_log_error  \
0           318.358226       151689.571141                0.009814   

   median_absolute_error  r2_score  explained_variance  
0             255.095561  0.780659            0.783304  

[1 rows x 6 columns]


   mean_absolute_error  mean_squared_error  mean_squared_log_error  \
0           306.435423        151573.84019                0.008539   

   median_absolute_error  r2_score  explained_variance  
0               244.2899  0.737623            0.742859  

[1 rows x 6 columns]


   mean_absolute_error  mean_squared_error  mean_squared_log_error  \
0           253.349578       112039.741164                0.007153   

   median_absolute_error  r2_score  explained_variance  
0             185.916761  0.823381            0.823456  

[1 rows x 6 columns]


   mean_absolute_error  mean_squared_error  mean_squared_log_error  \
0           320.381386       155234.800349                0.008638   

   median_absolute_error  r2_score  explained_variance  
0             306.281263  0.793405            0.794504  

[1 rows x 6 columns]


   mean_absolute_error  mean_squared_error  mean_squared_log_error  \
0           303.855563       141869.030392                0.008989   

   median_absolute_error  r2_score  explained_variance  
0             245.102301  0.731737            0.732793  

[1 rows x 6 columns]


## 2.2 Use cross_validate Function to Do Cross Validation (Automatic Approach)

In [10]:
# model_selection.cross_validate automates the manual process from section 2.1.
# The returned scores contain the evaluation results for each fold
# (in the output below they appear as a list under the 'test_score' key).
model = linear_model.LinearRegression()
scores = model_selection.cross_validate(model, X, y, cv=5)
scores

{'test_score': [   mean_absolute_error  mean_squared_error  mean_squared_log_error  \
  0           237.154735         97636.17064                0.005571   
  
     median_absolute_error  r2_score  explained_variance  
  0             187.883888  0.842018            0.846816  
  
  [1 rows x 6 columns],
     mean_absolute_error  mean_squared_error  mean_squared_log_error  \
  0           304.281635       141966.045867                0.008064   
  
     median_absolute_error  r2_score  explained_variance  
  0             236.096453  0.762979            0.764008  
  
  [1 rows x 6 columns],
     mean_absolute_error  mean_squared_error  mean_squared_log_error  \
  0           316.380322       157332.146085                0.009699   
  
     median_absolute_error  r2_score  explained_variance  
  0             222.824496  0.764607            0.765369  
  
  [1 rows x 6 columns],
     mean_absolute_error  mean_squared_error  mean_squared_log_error  \
  0           309.609657       152421.