From e3aa06857699ff05a873a893d4a44451a0a97242 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 24 Jul 2024 23:23:53 +0000 Subject: [PATCH 1/4] feat: Allow DataFrame.join for self-join on Null index --- bigframes/core/blocks.py | 10 +++++----- tests/system/small/test_null_index.py | 14 ++++++++++++++ 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 2d7c543678..29cbbb3b3b 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -2292,11 +2292,11 @@ def join( f"Only how='outer','left','right','inner' currently supported. {constants.FEEDBACK_LINK}" ) # Handle null index, which only supports row join - if (self.index.nlevels == other.index.nlevels == 0) and not block_identity_join: - if not block_identity_join: - result = try_row_join(self, other, how=how) - if result is not None: - return result + # This is the canonical way of aligning on null index, so always allow (ignore block_identity_join) + if self.index.nlevels == other.index.nlevels == 0: + result = try_row_join(self, other, how=how) + if result is not None: + return result raise bigframes.exceptions.NullIndexError( "Cannot implicitly align objects. Set an explicit index using set_index." ) diff --git a/tests/system/small/test_null_index.py b/tests/system/small/test_null_index.py index 27a3d8dffe..a1e360f73d 100644 --- a/tests/system/small/test_null_index.py +++ b/tests/system/small/test_null_index.py @@ -201,6 +201,20 @@ def test_null_index_stack(scalars_df_null_index, scalars_pandas_df_default_index ) +def test_null_index_series_self_join( + scalars_df_null_index, scalars_pandas_df_default_index +): + bf_result = scalars_df_null_index[["int64_col"]].join( + scalars_df_null_index[["int64_too"]] + ) + pd_result = scalars_pandas_df_default_index[["int64_col"]].join( + scalars_pandas_df_default_index[["int64_too"]] + ) + pd.testing.assert_frame_equal( + bf_result.to_pandas(), pd_result.reset_index(drop=True), check_dtype=False + ) + + def test_null_index_series_self_aligns( scalars_df_null_index, scalars_pandas_df_default_index ): From a35eded1387b08c5d4d3c0855a3f2370edbadd20 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 25 Jul 2024 17:58:05 +0000 Subject: [PATCH 2/4] fix ml caching to apply post-join, add test --- bigframes/ml/core.py | 8 ++--- tests/system/large/ml/test_linear_model.py | 38 ++++++++++++++++++++++ 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index ee4d8a8c27..a419a1bf86 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -83,7 +83,7 @@ def distance( """ assert len(x.columns) == 1 and len(y.columns) == 1 - input_data = x.cache().join(y.cache(), how="outer") + input_data = x.join(y, how="outer").cache() x_column_id, y_column_id = x._block.value_columns[0], y._block.value_columns[0] return self._apply_sql( @@ -326,7 +326,7 @@ def create_model( if y_train is None: input_data = X_train.cache() else: - input_data = X_train.cache().join(y_train.cache(), how="outer") + input_data = X_train.join(y_train.cache(), how="outer").cache() options.update({"INPUT_LABEL_COLS": y_train.columns.tolist()}) session = X_train._session @@ -366,7 +366,7 @@ def create_llm_remote_model( options = dict(options) # Cache dataframes to make sure base table is not a snapshot # cached dataframe creates a full copy, never uses snapshot - input_data = X_train.cache().join(y_train.cache(), how="outer") + input_data = X_train.join(y_train, how="outer").cache() options.update({"INPUT_LABEL_COLS": y_train.columns.tolist()}) session = X_train._session @@ -399,7 +399,7 @@ def create_time_series_model( options = dict(options) # Cache dataframes to make sure base table is not a snapshot # cached dataframe creates a full copy, never uses snapshot - input_data = X_train.cache().join(y_train.cache(), how="outer") + input_data = X_train.join(y_train, how="outer").cache() options.update({"TIME_SERIES_TIMESTAMP_COL": X_train.columns.tolist()[0]}) options.update({"TIME_SERIES_DATA_COL": y_train.columns.tolist()[0]}) diff --git a/tests/system/large/ml/test_linear_model.py b/tests/system/large/ml/test_linear_model.py index 0cc9fc5353..cf4ec83fd5 100644 --- a/tests/system/large/ml/test_linear_model.py +++ b/tests/system/large/ml/test_linear_model.py @@ -150,6 +150,44 @@ def test_logistic_regression_configure_fit_score(penguins_df_default_index, data assert reloaded_model.class_weight is None +def test_unordered_mode_logistic_regression_configure_fit_score( + unordered_session, penguins_table_id, dataset_id +): + model = bigframes.ml.linear_model.LogisticRegression() + + df = unordered_session.read_gbq(penguins_table_id).dropna() + X_train = df[ + [ + "species", + "island", + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + "body_mass_g", + ] + ] + y_train = df[["sex"]] + model.fit(X_train, y_train) + + # Check score to ensure the model was fitted + result = model.score(X_train, y_train).to_pandas() + utils.check_pandas_df_schema_and_index( + result, columns=utils.ML_CLASSFICATION_METRICS, index=1 + ) + + # save, load, check parameters to ensure configuration was kept + reloaded_model = model.to_gbq( + f"{dataset_id}.temp_configured_logistic_reg_model", replace=True + ) + assert reloaded_model._bqml_model is not None + assert ( + f"{dataset_id}.temp_configured_logistic_reg_model" + in reloaded_model._bqml_model.model_name + ) + assert reloaded_model.fit_intercept is True + assert reloaded_model.class_weight is None + + def test_logistic_regression_customized_params_fit_score( penguins_df_default_index, dataset_id ): From 3ff908fbc42d21d02831347390a5d664bbc894e3 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 25 Jul 2024 18:10:54 +0000 Subject: [PATCH 3/4] fix ml golden sql test --- tests/unit/ml/test_golden_sql.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index 48fb7011ea..aa7e919b24 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -78,6 +78,7 @@ def mock_X(mock_y, mock_session): ["index_column_label"], ) mock_X.join(mock_y).sql = "input_X_y_sql" + mock_X.join(mock_y).cache.return_value = mock_X.join(mock_y) mock_X.join(mock_y)._to_sql_query.return_value = ( "input_X_y_sql", ["index_column_id"], From 3da452a867a8bf8081d95976d31dc5b5f57f1a72 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 30 Jul 2024 00:39:35 +0000 Subject: [PATCH 4/4] change unordered test to use linear regression --- bigframes/ml/core.py | 2 +- tests/system/large/ml/test_linear_model.py | 44 ++++++++++++---------- 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index a419a1bf86..f1b36651f4 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -326,7 +326,7 @@ def create_model( if y_train is None: input_data = X_train.cache() else: - input_data = X_train.join(y_train.cache(), how="outer").cache() + input_data = X_train.join(y_train, how="outer").cache() options.update({"INPUT_LABEL_COLS": y_train.columns.tolist()}) session = X_train._session diff --git a/tests/system/large/ml/test_linear_model.py b/tests/system/large/ml/test_linear_model.py index cf4ec83fd5..2f4c07fa28 100644 --- a/tests/system/large/ml/test_linear_model.py +++ b/tests/system/large/ml/test_linear_model.py @@ -111,13 +111,12 @@ def test_linear_regression_customized_params_fit_score( assert reloaded_model.learning_rate == 0.2 -# TODO(garrettwu): add tests for param warm_start. Requires a trained model. - - -def test_logistic_regression_configure_fit_score(penguins_df_default_index, dataset_id): - model = bigframes.ml.linear_model.LogisticRegression() +def test_unordered_mode_regression_configure_fit_score( + unordered_session, penguins_table_id, dataset_id +): + model = bigframes.ml.linear_model.LinearRegression() - df = penguins_df_default_index.dropna() + df = unordered_session.read_gbq(penguins_table_id).dropna() X_train = df[ [ "species", @@ -125,37 +124,44 @@ def test_logistic_regression_configure_fit_score(penguins_df_default_index, data "culmen_length_mm", "culmen_depth_mm", "flipper_length_mm", - "body_mass_g", + "sex", ] ] - y_train = df[["sex"]] + y_train = df[["body_mass_g"]] model.fit(X_train, y_train) # Check score to ensure the model was fitted result = model.score(X_train, y_train).to_pandas() utils.check_pandas_df_schema_and_index( - result, columns=utils.ML_CLASSFICATION_METRICS, index=1 + result, columns=utils.ML_REGRESSION_METRICS, index=1 ) # save, load, check parameters to ensure configuration was kept - reloaded_model = model.to_gbq( - f"{dataset_id}.temp_configured_logistic_reg_model", replace=True - ) + reloaded_model = model.to_gbq(f"{dataset_id}.temp_configured_model", replace=True) assert reloaded_model._bqml_model is not None assert ( - f"{dataset_id}.temp_configured_logistic_reg_model" - in reloaded_model._bqml_model.model_name + f"{dataset_id}.temp_configured_model" in reloaded_model._bqml_model.model_name ) + assert reloaded_model.optimize_strategy == "NORMAL_EQUATION" assert reloaded_model.fit_intercept is True - assert reloaded_model.class_weight is None + assert reloaded_model.calculate_p_values is False + assert reloaded_model.enable_global_explain is False + assert reloaded_model.l1_reg is None + assert reloaded_model.l2_reg == 0.0 + assert reloaded_model.learning_rate is None + assert reloaded_model.learning_rate_strategy == "line_search" + assert reloaded_model.ls_init_learning_rate is None + assert reloaded_model.max_iterations == 20 + assert reloaded_model.tol == 0.01 -def test_unordered_mode_logistic_regression_configure_fit_score( - unordered_session, penguins_table_id, dataset_id -): +# TODO(garrettwu): add tests for param warm_start. Requires a trained model. + + +def test_logistic_regression_configure_fit_score(penguins_df_default_index, dataset_id): model = bigframes.ml.linear_model.LogisticRegression() - df = unordered_session.read_gbq(penguins_table_id).dropna() + df = penguins_df_default_index.dropna() X_train = df[ [ "species",