From 18c8b4ace9a9cae30e6aa239ace8adce19259489 Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Wed, 18 Jun 2025 00:00:25 +0000 Subject: [PATCH 1/3] feat: add required param 'engine' to multimodal functions --- bigframes/operations/blob.py | 33 +++++++++++++++++ .../multimodal/multimodal_dataframe.ipynb | 9 ++--- samples/snippets/multimodal_test.py | 9 ++--- tests/system/large/blob/test_function.py | 36 +++++++++++++------ 4 files changed, 69 insertions(+), 18 deletions(-) diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py index e143cfc519..63875ded99 100644 --- a/bigframes/operations/blob.py +++ b/bigframes/operations/blob.py @@ -303,6 +303,7 @@ def get_runtime_json_str( def exif( self, *, + engine: Literal[None, "pillow"] = None, connection: Optional[str] = None, max_batching_rows: int = 8192, container_cpu: Union[float, int] = 0.33, @@ -311,6 +312,7 @@ def exif( """Extract EXIF data. Now only support image types. Args: + engine ('pillow' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified. connection (str or None, default None): BQ connection used for function internet transactions, and the output blob if "dst" is str. If None, uses default connection of the session. max_batching_rows (int, default 8,192): Max number of rows per batch send to cloud run to execute the function. container_cpu (int or float, default 0.33): number of container CPUs. Possible values are [0.33, 8]. Floats larger than 1 are cast to intergers. @@ -319,6 +321,8 @@ def exif( Returns: bigframes.series.Series: JSON series of key-value pairs. """ + if engine is None or engine.casefold() != "pillow": + raise ValueError("Must specify the engine, supported value is 'pillow'.") import bigframes.bigquery as bbq import bigframes.blob._functions as blob_func @@ -344,6 +348,7 @@ def image_blur( self, ksize: tuple[int, int], *, + engine: Literal[None, "opencv"] = None, dst: Optional[Union[str, bigframes.series.Series]] = None, connection: Optional[str] = None, max_batching_rows: int = 8192, @@ -354,6 +359,7 @@ def image_blur( Args: ksize (tuple(int, int)): Kernel size. + engine ('opencv' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified. dst (str or bigframes.series.Series or None, default None): Output destination. Can be one of: str: GCS folder str. The output filenames are the same as the input files. blob Series: The output file paths are determined by the uris of the blob Series. @@ -367,6 +373,9 @@ def image_blur( Returns: bigframes.series.Series: blob Series if destination is GCS. Or bytes Series if destination is BQ. """ + if engine is None or engine.casefold() != "opencv": + raise ValueError("Must specify the engine, supported value is 'opencv'.") + import bigframes.blob._functions as blob_func connection = self._resolve_connection(connection) @@ -424,6 +433,7 @@ def image_resize( self, dsize: tuple[int, int] = (0, 0), *, + engine: Literal[None, "opencv"] = None, fx: float = 0.0, fy: float = 0.0, dst: Optional[Union[str, bigframes.series.Series]] = None, @@ -436,6 +446,7 @@ def image_resize( Args: dsize (tuple(int, int), default (0, 0)): Destination size. If set to 0, fx and fy parameters determine the size. + engine ('opencv' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified. fx (float, default 0.0): scale factor along the horizontal axis. If set to 0.0, dsize parameter determines the output size. fy (float, defalut 0.0): scale factor along the vertical axis. If set to 0.0, dsize parameter determines the output size. dst (str or bigframes.series.Series or None, default None): Output destination. Can be one of: @@ -451,6 +462,9 @@ def image_resize( Returns: bigframes.series.Series: blob Series if destination is GCS. Or bytes Series if destination is BQ. """ + if engine is None or engine.casefold() != "opencv": + raise ValueError("Must specify the engine, supported value is 'opencv'.") + dsize_set = dsize[0] > 0 and dsize[1] > 0 fsize_set = fx > 0.0 and fy > 0.0 if not dsize_set ^ fsize_set: @@ -516,6 +530,7 @@ def image_resize( def image_normalize( self, *, + engine: Literal[None, "opencv"] = None, alpha: float = 1.0, beta: float = 0.0, norm_type: str = "l2", @@ -528,6 +543,7 @@ def image_normalize( """Normalize images. Args: + engine ('opencv' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified. alpha (float, default 1.0): Norm value to normalize to or the lower range boundary in case of the range normalization. beta (float, default 0.0): Upper range boundary in case of the range normalization; it is not used for the norm normalization. norm_type (str, default "l2"): Normalization type. Accepted values are "inf", "l1", "l2" and "minmax". @@ -544,6 +560,9 @@ def image_normalize( Returns: bigframes.series.Series: blob Series if destination is GCS. Or bytes Series if destination is BQ. """ + if engine is None or engine.casefold() != "opencv": + raise ValueError("Must specify the engine, supported value is 'opencv'.") + import bigframes.blob._functions as blob_func connection = self._resolve_connection(connection) @@ -604,6 +623,7 @@ def image_normalize( def pdf_extract( self, *, + engine: Literal[None, "pypdf"] = None, connection: Optional[str] = None, max_batching_rows: int = 1, container_cpu: Union[float, int] = 2, @@ -613,6 +633,7 @@ def pdf_extract( """Extracts text from PDF URLs and saves the text as string. Args: + engine ('pypdf' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified. connection (str or None, default None): BQ connection used for function internet transactions, and the output blob if "dst" is str. If None, uses default connection of the session. @@ -631,6 +652,9 @@ def pdf_extract( Contains the extracted text from the PDF file. Includes error messages if verbosity is enabled. """ + if engine is None or engine.casefold() != "pypdf": + raise ValueError("Must specify the engine, supported value is 'pypdf'.") + import bigframes.bigquery as bbq import bigframes.blob._functions as blob_func import bigframes.pandas as bpd @@ -663,6 +687,7 @@ def pdf_extract( def pdf_chunk( self, *, + engine: Literal[None, "pypdf"] = None, connection: Optional[str] = None, chunk_size: int = 2000, overlap_size: int = 200, @@ -675,6 +700,7 @@ def pdf_chunk( arrays of strings. Args: + engine ('pypdf' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified. connection (str or None, default None): BQ connection used for function internet transactions, and the output blob if "dst" is str. If None, uses default connection of the session. @@ -698,6 +724,8 @@ def pdf_chunk( where each string is a chunk of text extracted from PDF. Includes error messages if verbosity is enabled. """ + if engine is None or engine.casefold() != "pypdf": + raise ValueError("Must specify the engine, supported value is 'pypdf'.") import bigframes.bigquery as bbq import bigframes.blob._functions as blob_func @@ -740,6 +768,7 @@ def pdf_chunk( def audio_transcribe( self, *, + engine: Literal["bigquery"] = "bigquery", connection: Optional[str] = None, model_name: Optional[ Literal[ @@ -753,6 +782,7 @@ def audio_transcribe( Transcribe audio content using a Gemini multimodal model. Args: + engine ('bigquery'): The engine (bigquery or third party library) used for the function. connection (str or None, default None): BQ connection used for function internet transactions, and the output blob if "dst" is str. If None, uses default connection of the session. @@ -770,6 +800,9 @@ def audio_transcribe( Contains the transcribed text from the audio file. Includes error messages if verbosity is enabled. """ + if engine.casefold() != "bigquery": + raise ValueError("Must specify the engine, supported value is 'bigquery'.") + import bigframes.bigquery as bbq import bigframes.ml.llm as llm import bigframes.pandas as bpd diff --git a/notebooks/multimodal/multimodal_dataframe.ipynb b/notebooks/multimodal/multimodal_dataframe.ipynb index 3f36c2908a..fbe074b0d0 100644 --- a/notebooks/multimodal/multimodal_dataframe.ipynb +++ b/notebooks/multimodal/multimodal_dataframe.ipynb @@ -254,16 +254,17 @@ "outputs": [], "source": [ "df_image[\"blurred\"] = df_image[\"image\"].blob.image_blur(\n", - " (20, 20), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_transformed/\"\n", + " (20, 20), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_transformed/\", engine=\"opencv\"\n", ")\n", "df_image[\"resized\"] = df_image[\"image\"].blob.image_resize(\n", - " (300, 200), dst=f\"gs://{OUTPUT_BUCKET}/image_resize_transformed/\"\n", + " (300, 200), dst=f\"gs://{OUTPUT_BUCKET}/image_resize_transformed/\", engine=\"opencv\"\n", ")\n", "df_image[\"normalized\"] = df_image[\"image\"].blob.image_normalize(\n", " alpha=50.0,\n", " beta=150.0,\n", " norm_type=\"minmax\",\n", " dst=f\"gs://{OUTPUT_BUCKET}/image_normalize_transformed/\",\n", + " engine=\"opencv\",\n", ")" ] }, @@ -280,7 +281,7 @@ "outputs": [], "source": [ "# You can also chain functions together\n", - "df_image[\"blur_resized\"] = df_image[\"blurred\"].blob.image_resize((300, 200), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_resize_transformed/\")" + "df_image[\"blur_resized\"] = df_image[\"blurred\"].blob.image_resize((300, 200), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_resize_transformed/\", engine=\"opencv\")" ] }, { @@ -419,7 +420,7 @@ }, "outputs": [], "source": [ - "df_pdf[\"chunked\"] = df_pdf[\"pdf\"].blob.pdf_chunk()" + "df_pdf[\"chunked\"] = df_pdf[\"pdf\"].blob.pdf_chunk(engine=\"pypdf\")" ] }, { diff --git a/samples/snippets/multimodal_test.py b/samples/snippets/multimodal_test.py index 7f8e13cd7b..087299aa0a 100644 --- a/samples/snippets/multimodal_test.py +++ b/samples/snippets/multimodal_test.py @@ -56,21 +56,22 @@ def test_multimodal_dataframe(gcs_dst_bucket: str) -> None: # [START bigquery_dataframes_multimodal_dataframe_image_transform] df_image["blurred"] = df_image["image"].blob.image_blur( - (20, 20), dst=f"{dst_bucket}/image_blur_transformed/" + (20, 20), dst=f"{dst_bucket}/image_blur_transformed/", engine="opencv" ) df_image["resized"] = df_image["image"].blob.image_resize( - (300, 200), dst=f"{dst_bucket}/image_resize_transformed/" + (300, 200), dst=f"{dst_bucket}/image_resize_transformed/", engine="opencv" ) df_image["normalized"] = df_image["image"].blob.image_normalize( alpha=50.0, beta=150.0, norm_type="minmax", dst=f"{dst_bucket}/image_normalize_transformed/", + engine="opencv", ) # You can also chain functions together df_image["blur_resized"] = df_image["blurred"].blob.image_resize( - (300, 200), dst=f"{dst_bucket}/image_blur_resize_transformed/" + (300, 200), dst=f"{dst_bucket}/image_blur_resize_transformed/", engine="opencv" ) df_image # [END bigquery_dataframes_multimodal_dataframe_image_transform] @@ -113,7 +114,7 @@ def test_multimodal_dataframe(gcs_dst_bucket: str) -> None: df_pdf = bpd.from_glob_path( "gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/documents/*", name="pdf" ) - df_pdf["chunked"] = df_pdf["pdf"].blob.pdf_chunk() + df_pdf["chunked"] = df_pdf["pdf"].blob.pdf_chunk(engine="pypdf") chunked = df_pdf["chunked"].explode() chunked # [END bigquery_dataframes_multimodal_dataframe_pdf_chunk] diff --git a/tests/system/large/blob/test_function.py b/tests/system/large/blob/test_function.py index 4a95e4c6d1..e88b7e0f26 100644 --- a/tests/system/large/blob/test_function.py +++ b/tests/system/large/blob/test_function.py @@ -86,7 +86,7 @@ def test_blob_image_blur_to_series( ) actual = images_mm_df["blob_col"].blob.image_blur( - (8, 8), dst=series, connection=bq_connection + (8, 8), dst=series, connection=bq_connection, engine="opencv" ) expected_df = pd.DataFrame( { @@ -114,7 +114,7 @@ def test_blob_image_blur_to_folder( images_output_uris: list[str], ): actual = images_mm_df["blob_col"].blob.image_blur( - (8, 8), dst=images_output_folder, connection=bq_connection + (8, 8), dst=images_output_folder, connection=bq_connection, engine="opencv" ) expected_df = pd.DataFrame( { @@ -136,7 +136,9 @@ def test_blob_image_blur_to_folder( def test_blob_image_blur_to_bq(images_mm_df: bpd.DataFrame, bq_connection: str): - actual = images_mm_df["blob_col"].blob.image_blur((8, 8), connection=bq_connection) + actual = images_mm_df["blob_col"].blob.image_blur( + (8, 8), connection=bq_connection, engine="opencv" + ) assert isinstance(actual, bpd.Series) assert len(actual) == 2 @@ -154,7 +156,7 @@ def test_blob_image_resize_to_series( ) actual = images_mm_df["blob_col"].blob.image_resize( - (200, 300), dst=series, connection=bq_connection + (200, 300), dst=series, connection=bq_connection, engine="opencv" ) expected_df = pd.DataFrame( { @@ -182,7 +184,7 @@ def test_blob_image_resize_to_folder( images_output_uris: list[str], ): actual = images_mm_df["blob_col"].blob.image_resize( - (200, 300), dst=images_output_folder, connection=bq_connection + (200, 300), dst=images_output_folder, connection=bq_connection, engine="opencv" ) expected_df = pd.DataFrame( { @@ -205,7 +207,7 @@ def test_blob_image_resize_to_folder( def test_blob_image_resize_to_bq(images_mm_df: bpd.DataFrame, bq_connection: str): actual = images_mm_df["blob_col"].blob.image_resize( - (200, 300), connection=bq_connection + (200, 300), connection=bq_connection, engine="opencv" ) assert isinstance(actual, bpd.Series) @@ -224,7 +226,12 @@ def test_blob_image_normalize_to_series( ) actual = images_mm_df["blob_col"].blob.image_normalize( - alpha=50.0, beta=150.0, norm_type="minmax", dst=series, connection=bq_connection + alpha=50.0, + beta=150.0, + norm_type="minmax", + dst=series, + connection=bq_connection, + engine="opencv", ) expected_df = pd.DataFrame( { @@ -257,6 +264,7 @@ def test_blob_image_normalize_to_folder( norm_type="minmax", dst=images_output_folder, connection=bq_connection, + engine="opencv", ) expected_df = pd.DataFrame( { @@ -279,7 +287,11 @@ def test_blob_image_normalize_to_folder( def test_blob_image_normalize_to_bq(images_mm_df: bpd.DataFrame, bq_connection: str): actual = images_mm_df["blob_col"].blob.image_normalize( - alpha=50.0, beta=150.0, norm_type="minmax", connection=bq_connection + alpha=50.0, + beta=150.0, + norm_type="minmax", + connection=bq_connection, + engine="opencv", ) assert isinstance(actual, bpd.Series) @@ -322,7 +334,7 @@ def test_blob_pdf_extract( ): actual = ( pdf_mm_df["pdf"] - .blob.pdf_extract(connection=bq_connection, verbose=verbose) + .blob.pdf_extract(connection=bq_connection, verbose=verbose, engine="pypdf") .explode() .to_pandas() ) @@ -373,7 +385,11 @@ def test_blob_pdf_chunk( actual = ( pdf_mm_df["pdf"] .blob.pdf_chunk( - connection=bq_connection, chunk_size=50, overlap_size=10, verbose=verbose + connection=bq_connection, + chunk_size=50, + overlap_size=10, + verbose=verbose, + engine="pypdf", ) .explode() .to_pandas() From aeef2a8264f86c94c0cfd98969c60aad03da750f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Wed, 18 Jun 2025 09:17:02 -0500 Subject: [PATCH 2/3] add missing engine to exif test --- tests/system/large/blob/test_function.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system/large/blob/test_function.py b/tests/system/large/blob/test_function.py index e88b7e0f26..0f956dc668 100644 --- a/tests/system/large/blob/test_function.py +++ b/tests/system/large/blob/test_function.py @@ -61,7 +61,7 @@ def test_blob_exif( connection=bq_connection, ) - actual = exif_image_df["blob_col"].blob.exif(connection=bq_connection) + actual = exif_image_df["blob_col"].blob.exif(engine="pillow", connection=bq_connection) expected = bpd.Series( ['{"ExifOffset": 47, "Make": "MyCamera"}'], session=session, From a7aabe9fb829b8862a3a23fce25ee9187cf6a307 Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Wed, 18 Jun 2025 14:19:25 +0000 Subject: [PATCH 3/3] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20po?= =?UTF-8?q?st-processor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- tests/system/large/blob/test_function.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/system/large/blob/test_function.py b/tests/system/large/blob/test_function.py index 0f956dc668..a594b144f5 100644 --- a/tests/system/large/blob/test_function.py +++ b/tests/system/large/blob/test_function.py @@ -61,7 +61,9 @@ def test_blob_exif( connection=bq_connection, ) - actual = exif_image_df["blob_col"].blob.exif(engine="pillow", connection=bq_connection) + actual = exif_image_df["blob_col"].blob.exif( + engine="pillow", connection=bq_connection + ) expected = bpd.Series( ['{"ExifOffset": 47, "Make": "MyCamera"}'], session=session,