From b67616fd571107fd03e30b10a92b5959d521f980 Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 11 Nov 2024 15:41:11 +0000 Subject: [PATCH 1/4] docs: create boosted tree model --- .../classification_boosted_tree_model_test.py | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 samples/snippets/classification_boosted_tree_model_test.py diff --git a/samples/snippets/classification_boosted_tree_model_test.py b/samples/snippets/classification_boosted_tree_model_test.py new file mode 100644 index 0000000000..464199ad94 --- /dev/null +++ b/samples/snippets/classification_boosted_tree_model_test.py @@ -0,0 +1,64 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def test_boosted_tree_model(random_model_id: str) -> None: + your_model_id = random_model_id + # [START bigquery_dataframes_bqml_boosted_tree_prepare] + import bigframes.pandas as bpd + + input_data = bpd.read_gbq( + "bigquery-public-data.ml_datasets.census_adult_income", + columns=( + "age", + "workclass", + "marital_status", + "education_num", + "occupation", + "hours_per_week", + "income_bracket", + "functional_weight", + ), + ) + input_data["dataframe"] = bpd.Series("training", index=input_data.index,).case_when( + [ + (((input_data["functional_weight"] % 10) == 8), "evaluation"), + (((input_data["functional_weight"] % 10) == 9), "prediction"), + ] + ) + del input_data["functional_weight"] + # [END bigquery_dataframes_bqml_boosted_tree_prepare] + # [START bigquery_dataframes_bqml_boosted_tree_create] + import bigframes.ml.linear_model + + # input_data is defined in an earlier step. + training_data = input_data[input_data["dataframe"] == "training"] + X = training_data.drop(columns=["income_bracket", "dataframe"]) + y = training_data["income_bracket"] + + # create and train the model + census_model = bigframes.ml.linear_model.LogisticRegression( + # model_type="BOOSTED_TREE_CLASSIFIER", + # booster_type="gbtree", + max_iterations=50, + ) + census_model.fit(X, y) + + census_model.to_gbq( + your_model_id, # For example: "your-project.census.census_model" + replace=True, + ) + # [END bigquery_dataframes_bqml_boosted_tree_create] + assert input_data is not None + assert census_model is not None From 669bc745acb0eeb2cb7956e2e347b0a268f98632 Mon Sep 17 00:00:00 2001 From: Daniela Date: Thu, 14 Nov 2024 16:08:07 +0000 Subject: [PATCH 2/4] merge main --- samples/snippets/classification_boosted_tree_model_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/samples/snippets/classification_boosted_tree_model_test.py b/samples/snippets/classification_boosted_tree_model_test.py index 464199ad94..3ea5f85eba 100644 --- a/samples/snippets/classification_boosted_tree_model_test.py +++ b/samples/snippets/classification_boosted_tree_model_test.py @@ -40,6 +40,7 @@ def test_boosted_tree_model(random_model_id: str) -> None: del input_data["functional_weight"] # [END bigquery_dataframes_bqml_boosted_tree_prepare] # [START bigquery_dataframes_bqml_boosted_tree_create] + # from sklearn.ensemble import GradientBoostingClassifier import bigframes.ml.linear_model # input_data is defined in an earlier step. From b113a574891c72f5952bd3e0aa67c1a01ee54cb7 Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 19 Nov 2024 20:30:28 +0000 Subject: [PATCH 3/4] update model --- .../classification_boosted_tree_model_test.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/samples/snippets/classification_boosted_tree_model_test.py b/samples/snippets/classification_boosted_tree_model_test.py index 3ea5f85eba..e9dd3f444d 100644 --- a/samples/snippets/classification_boosted_tree_model_test.py +++ b/samples/snippets/classification_boosted_tree_model_test.py @@ -40,8 +40,7 @@ def test_boosted_tree_model(random_model_id: str) -> None: del input_data["functional_weight"] # [END bigquery_dataframes_bqml_boosted_tree_prepare] # [START bigquery_dataframes_bqml_boosted_tree_create] - # from sklearn.ensemble import GradientBoostingClassifier - import bigframes.ml.linear_model + from bigframes.ml import ensemble # input_data is defined in an earlier step. training_data = input_data[input_data["dataframe"] == "training"] @@ -49,10 +48,12 @@ def test_boosted_tree_model(random_model_id: str) -> None: y = training_data["income_bracket"] # create and train the model - census_model = bigframes.ml.linear_model.LogisticRegression( - # model_type="BOOSTED_TREE_CLASSIFIER", - # booster_type="gbtree", - max_iterations=50, + census_model = ensemble.XGBClassifier( + n_estimators=1, + booster="gbtree", + tree_method="hist", + max_iterations=5, # For a more accurate model, try 50 iterations. + subsample=0.85, ) census_model.fit(X, y) From a8ac72d903ddc26ef854693dc8d9fda5e2390435 Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 19 Nov 2024 20:40:13 +0000 Subject: [PATCH 4/4] update test --- samples/snippets/classification_boosted_tree_model_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/snippets/classification_boosted_tree_model_test.py b/samples/snippets/classification_boosted_tree_model_test.py index e9dd3f444d..707ce16279 100644 --- a/samples/snippets/classification_boosted_tree_model_test.py +++ b/samples/snippets/classification_boosted_tree_model_test.py @@ -52,7 +52,7 @@ def test_boosted_tree_model(random_model_id: str) -> None: n_estimators=1, booster="gbtree", tree_method="hist", - max_iterations=5, # For a more accurate model, try 50 iterations. + max_iterations=1, # For a more accurate model, try 50 iterations. subsample=0.85, ) census_model.fit(X, y)