Skip to content

Commit

Permalink
Merge branch 'main' into documentation/fix_typos_in_user_guide
Browse files Browse the repository at this point in the history
  • Loading branch information
tkilias committed May 15, 2024
2 parents ea1c54e + 14e7007 commit 7ee6208
Show file tree
Hide file tree
Showing 9 changed files with 597 additions and 580 deletions.
1 change: 1 addition & 0 deletions doc/changes/changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Changelog

* [1.0.1](changes_1.0.1.md)
* [1.0.0](changes_1.0.0.md)
* [0.10.0](changes_0.10.0.md)
* [0.9.2](changes_0.9.2.md)
Expand Down
24 changes: 24 additions & 0 deletions doc/changes/changes_1.0.1.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Transformers Extension 1.0.1, 2024-04-25

Code name: Fixed the directory structure bug


## Summary

Fixed the directory structure created by the model upload UDF.

### Bugs

- #221: The directory structure created by the model upload UDF differed from what the prediction UDFs expect.

### Features

N/A

### Refactorings

N/A

### Security

N/A
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def _download_model(self, ctx) -> Tuple[str, str]:
token = token_conn_obj.password

# set model path in buckets
model_path = bucketfs_operations.get_model_path(sub_dir, model_name)
model_path = bucketfs_operations.get_model_path_with_pretrained(sub_dir, model_name)

# create bucketfs location
bfs_conn_obj = self._exa.get_connection(bfs_conn)
Expand Down
1,069 changes: 499 additions & 570 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "exasol-transformers-extension"
version = "1.0.0"
version = "1.0.1"
description = "An Exasol extension for using state-of-the-art pretrained machine learning models via the Hugging Face Transformers API."

authors = [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def test_model_downloader_udf_script(
for i in range(n_rows):
sub_dir = SUB_DIR.format(id=i)
sub_dirs.append(sub_dir)
model_paths.append(bucketfs_operations.get_model_path(
model_paths.append(bucketfs_operations.get_model_path_with_pretrained(
sub_dir, model_params.tiny_model))
input_data.append((
model_params.tiny_model,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import time
from tests.utils import postprocessing

SUB_DIR = 'test_downloader_with_prediction_sub_dir'
MODEL_NAME = 'gaunernst/bert-tiny-uncased'


def test_prediction_with_downloader_udf(
        setup_database, pyexasol_connection, bucketfs_location):
    """End-to-end integration test: download a model with
    TE_MODEL_DOWNLOADER_UDF, then run TE_FILLING_MASK_UDF on the
    downloaded model and check the prediction output.

    Fixture parameters:
        setup_database: yields (bucketfs_conn_name, schema_name) for the
            prepared test database.
        pyexasol_connection: open pyexasol connection used to execute the
            UDF queries.
        bucketfs_location: BucketFS location handle, used only for cleanup.
    """
    bucketfs_conn_name, schema_name = setup_database

    try:
        # execute downloader UDF
        # Row layout must match the t(...) column list in the query below.
        input_data = (
            MODEL_NAME,
            SUB_DIR,
            bucketfs_conn_name,
            ''  # empty token connection name: public model, no HF token needed — TODO confirm
        )
        query = f"""
            SELECT TE_MODEL_DOWNLOADER_UDF(
            t.model_name,
            t.sub_dir,
            t.bucketfs_conn_name,
            t.token_conn_name
            ) FROM (VALUES {str(input_data)} AS
            t(model_name, sub_dir, bucketfs_conn_name, token_conn_name));
            """

        pyexasol_connection.execute(query).fetchall()
        # NOTE(review): presumably waits for BucketFS to sync the uploaded
        # model archive across nodes before the prediction UDF reads it —
        # confirm; a fixed sleep is inherently racy.
        time.sleep(10)

        # execute the filling mask UDF
        text_data = "I <mask> you so much."
        top_k = 3  # number of mask-fill candidates requested from the model
        # Row layout must match the t(...) column list in the query below.
        input_data = (
            '',  # device_id left empty — presumably selects the default device; verify
            bucketfs_conn_name,
            SUB_DIR,
            MODEL_NAME,
            text_data,
            top_k
        )

        query = f"SELECT TE_FILLING_MASK_UDF(" \
                f"t.device_id, " \
                f"t.bucketfs_conn_name, " \
                f"t.sub_dir, " \
                f"t.model_name, " \
                f"t.text_data," \
                f"t.top_k" \
                f") FROM (VALUES {str(input_data)} " \
                f"AS t(device_id, bucketfs_conn_name, sub_dir, " \
                f"model_name, text_data, top_k));"

        result = pyexasol_connection.execute(query).fetchall()

        # assertions
        # One candidate row is emitted per requested fill, so a single
        # input sentence yields exactly top_k rows.
        assert len(result) == top_k
        # NOTE(review): the last column appears to be an error-message
        # field; None in every row means the UDF raised no errors — confirm
        # against the UDF's emit signature.
        assert all(row[-1] is None for row in result)

    finally:
        # Remove everything uploaded under SUB_DIR so repeated runs and
        # other tests start from a clean BucketFS state.
        postprocessing.cleanup_buckets(bucketfs_location, SUB_DIR)
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def __init__(self, id: str, url_localfs: str, token_conn_name: str):
'bucketfs_conn_name': self.bucketfs_conn_name,
'token_conn_name': self.token_conn_name
}
self.model_path = bucketfs_operations.get_model_path(
self.model_path = bucketfs_operations.get_model_path_with_pretrained(
self.sub_dir, self.tiny_model)
self.bucketfs_connection = Connection(
address=f"{url_localfs}/bucket{id}",
Expand Down Expand Up @@ -118,7 +118,7 @@ def test_model_downloader_udf_implementation():
# assertions
env1_bucketfs_files = env1.list_files_in_bucketfs()
env2_bucketfs_files = env2.list_files_in_bucketfs()
assert ctx.get_emitted()[0] == (str(env1.model_path), str(env1.model_path.with_suffix(".tar.gz"))) \
and ctx.get_emitted()[1] == (str(env2.model_path), str(env2.model_path.with_suffix(".tar.gz"))) \
and str(Path(ctx.get_emitted()[0][1]).relative_to(env1.sub_dir)) in env1_bucketfs_files \
and str(Path(ctx.get_emitted()[1][1]).relative_to(env2.sub_dir)) in env2_bucketfs_files
assert ctx.get_emitted()[0] == (str(env1.model_path), str(env1.model_path.with_suffix(".tar.gz")))
assert ctx.get_emitted()[1] == (str(env2.model_path), str(env2.model_path.with_suffix(".tar.gz")))
assert str(Path(ctx.get_emitted()[0][1]).relative_to(env1.sub_dir)) in env1_bucketfs_files
assert str(Path(ctx.get_emitted()[1][1]).relative_to(env2.sub_dir)) in env2_bucketfs_files
4 changes: 2 additions & 2 deletions tests/unit_tests/udfs/test_model_downloader_udf.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def test_model_downloader(description, count, token_conn_name, token_conn_obj, e
assert mock_cast(mock_model_downloader_factory.create).mock_calls == [
call(bucketfs_location=mock_bucketfs_locations[i],
model_name=base_model_names[i],
model_path=PosixPath(f'{sub_directory_names[i]}/{base_model_names[i]}'),
model_path=PosixPath(f'{sub_directory_names[i]}/{base_model_names[i]}/pretrained/{base_model_names[i]}'),
token=expected_token)
for i in range(count)
]
Expand All @@ -107,7 +107,7 @@ def test_model_downloader(description, count, token_conn_name, token_conn_obj, e
])
assert mock_ctx.output == [
(
f'{sub_directory_names[i]}/{base_model_names[i]}',
f'{sub_directory_names[i]}/{base_model_names[i]}/pretrained/{base_model_names[i]}',
str(mock_model_downloaders[i].upload_to_bucketfs())
)
for i in range(count)
Expand Down

0 comments on commit 7ee6208

Please sign in to comment.