Skip to content

Commit

Permalink
fix: Minor code refactor (#632)
Browse files Browse the repository at this point in the history
* refactor

* minor fix testcase

* bug: fix pip install

* merge master

* remove unused code

* fix mnist udf
  • Loading branch information
gaurav274 committed Apr 4, 2023
1 parent 3f302bd commit 0f41606
Show file tree
Hide file tree
Showing 8 changed files with 101 additions and 119 deletions.
21 changes: 21 additions & 0 deletions eva/binder/binder_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
from eva.catalog.catalog_type import TableType
from eva.catalog.catalog_utils import is_string_col, is_video_table
from eva.catalog.sql_config import IDENTIFIER_COLUMN
from eva.expression.function_expression import FunctionExpression
from eva.parser.alias import Alias

if TYPE_CHECKING:
from eva.binder.statement_binder_context import StatementBinderContext
Expand Down Expand Up @@ -113,3 +115,22 @@ def check_column_name_is_string(col_ref) -> None:
if not is_string_col(col_ref.col_object):
err_msg = "LIKE only supported for string columns"
raise BinderError(err_msg)


def resolve_alias_table_value_expression(node: FunctionExpression):
    """Attach a fully-resolved Alias to a function expression node.

    Three cases, all mutating ``node.alias`` in place:
      * no alias at all      -> lowercased function name + lowercased output column names
      * alias, no col names  -> keep the alias name, default the column names
      * alias with col names -> keep the alias name, lowercase the user-given names

    Asserts that the resolved column-name count matches ``node.output_objs``.
    """
    fallback_col_names = [str(obj.name.lower()) for obj in node.output_objs]
    if not node.alias:
        node.alias = Alias(node.name.lower(), fallback_col_names)
    elif not len(node.alias.col_names):
        node.alias = Alias(node.alias.alias_name, fallback_col_names)
    else:
        lowered_names = [str(name.lower()) for name in node.alias.col_names]
        node.alias = Alias(node.alias.alias_name, lowered_names)

    assert len(node.alias.col_names) == len(
        node.output_objs
    ), f"""Expected {len(node.output_objs)} output columns for {node.alias.alias_name}, got {len(node.alias.col_names)}."""
19 changes: 2 additions & 17 deletions eva/binder/statement_binder.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,14 @@
check_groupby_pattern,
check_table_object_is_video,
extend_star,
resolve_alias_table_value_expression,
)
from eva.binder.statement_binder_context import StatementBinderContext
from eva.catalog.catalog_manager import CatalogManager
from eva.catalog.catalog_type import IndexType, NdArrayType, TableType, VideoColumnName
from eva.expression.abstract_expression import AbstractExpression, ExpressionType
from eva.expression.function_expression import FunctionExpression
from eva.expression.tuple_value_expression import TupleValueExpression
from eva.parser.alias import Alias
from eva.parser.create_index_statement import CreateIndexStatement
from eva.parser.create_mat_view_statement import CreateMaterializedViewStatement
from eva.parser.delete_statement import DeleteTableStatement
Expand Down Expand Up @@ -278,19 +278,4 @@ def _bind_func_expr(self, node: FunctionExpression):
node.output_objs = output_objs
node.projection_columns = [obj.name.lower() for obj in output_objs]

default_alias_name = node.name.lower()
default_output_col_aliases = [str(obj.name.lower()) for obj in node.output_objs]
if not node.alias:
node.alias = Alias(default_alias_name, default_output_col_aliases)
else:
if not len(node.alias.col_names):
node.alias = Alias(node.alias.alias_name, default_output_col_aliases)
else:
output_aliases = [
str(col_name.lower()) for col_name in node.alias.col_names
]
node.alias = Alias(node.alias.alias_name, output_aliases)

assert len(node.alias.col_names) == len(
node.output_objs
), f"""Expected {len(node.output_objs)} output columns for {node.alias.alias_name}, got {len(node.alias.col_names)}."""
resolve_alias_table_value_expression(node)
2 changes: 1 addition & 1 deletion eva/binder/statement_binder_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ def _search_all_alias_maps(self, col_name: str) -> Tuple[str, CatalogColumnType]
alias_match = alias

if num_alias_matches > 1:
err_msg = "Ambiguous Column name {col_name}"
err_msg = f"Ambiguous Column name {col_name}"
logger.error(err_msg)
raise BinderError(err_msg)

Expand Down
57 changes: 24 additions & 33 deletions eva/models/storage/batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,16 @@ def __len__(self):
def columns(self):
return self._frames.columns

def column_as_numpy_array(self, column_name="data"):
return np.array(self._frames[column_name])
def column_as_numpy_array(self, column_name: str) -> np.ndarray:
"""Return a column as numpy array
Args:
column_name (str): the name of the required column
Returns:
numpy.ndarray: the column data as a numpy array
"""
return self._frames[column_name].to_numpy()

def serialize(self):
obj = {"frames": self._frames, "batch_size": len(self)}
Expand All @@ -69,48 +77,34 @@ def deserialize(cls, data):

@classmethod
def from_eq(cls, batch1: Batch, batch2: Batch) -> Batch:
return Batch(
pd.DataFrame(batch1._frames.to_numpy() == batch2._frames.to_numpy())
)
return Batch(pd.DataFrame(batch1.to_numpy() == batch2.to_numpy()))

@classmethod
def from_greater(cls, batch1: Batch, batch2: Batch) -> Batch:
return Batch(
pd.DataFrame(batch1._frames.to_numpy() > batch2._frames.to_numpy())
)
return Batch(pd.DataFrame(batch1.to_numpy() > batch2.to_numpy()))

@classmethod
def from_lesser(cls, batch1: Batch, batch2: Batch) -> Batch:
return Batch(
pd.DataFrame(batch1._frames.to_numpy() < batch2._frames.to_numpy())
)
return Batch(pd.DataFrame(batch1.to_numpy() < batch2.to_numpy()))

@classmethod
def from_greater_eq(cls, batch1: Batch, batch2: Batch) -> Batch:
return Batch(
pd.DataFrame(batch1._frames.to_numpy() >= batch2._frames.to_numpy())
)
return Batch(pd.DataFrame(batch1.to_numpy() >= batch2.to_numpy()))

@classmethod
def from_lesser_eq(cls, batch1: Batch, batch2: Batch) -> Batch:
return Batch(
pd.DataFrame(batch1._frames.to_numpy() <= batch2._frames.to_numpy())
)
return Batch(pd.DataFrame(batch1.to_numpy() <= batch2.to_numpy()))

@classmethod
def from_not_eq(cls, batch1: Batch, batch2: Batch) -> Batch:
return Batch(
pd.DataFrame(batch1._frames.to_numpy() != batch2._frames.to_numpy())
)
return Batch(pd.DataFrame(batch1.to_numpy() != batch2.to_numpy()))

@classmethod
def compare_contains(cls, batch1: Batch, batch2: Batch) -> None:
return cls(
pd.DataFrame(
[all(x in p for x in q) for p, q in zip(left, right)]
for left, right in zip(
batch1._frames.to_numpy(), batch2._frames.to_numpy()
)
for left, right in zip(batch1.to_numpy(), batch2.to_numpy())
)
)

Expand All @@ -119,9 +113,7 @@ def compare_is_contained(cls, batch1: Batch, batch2: Batch) -> None:
return cls(
pd.DataFrame(
[all(x in q for x in p) for p, q in zip(left, right)]
for left, right in zip(
batch1._frames.to_numpy(), batch2._frames.to_numpy()
)
for left, right in zip(batch1.to_numpy(), batch2.to_numpy())
)
)

Expand Down Expand Up @@ -292,14 +284,11 @@ def __add__(self, other: Batch) -> Batch:
if other.empty():
return self

new_frames = pd.concat([self._frames, other.frames], ignore_index=True)

return Batch(new_frames)
return Batch.concat([self, other], copy=False)

@classmethod
def concat(cls, batch_list: Iterable[Batch], copy=True) -> Batch:
"""Concat a list of batches. Avoid the extra copying overhead by
the append operation in __add__.
"""Concat a list of batches.
Notice: only frames are considered.
"""

Expand Down Expand Up @@ -378,11 +367,13 @@ def empty(self):
"""
return len(self) == 0

def unnest(self) -> None:
def unnest(self, cols: List[str] = None) -> None:
"""
Unnest columns and drop columns with no data
"""
self._frames = self._frames.explode(list(self._frames.columns))
if cols is None:
cols = list(self.columns)
self._frames = self._frames.explode(cols)
self._frames.dropna(inplace=True)

def reverse(self) -> None:
Expand Down
15 changes: 8 additions & 7 deletions eva/udfs/udf_bootstrap_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,13 +176,14 @@ def init_builtin_udfs(mode="debug"):
# Disabled as it requires specific pytorch package
# Mvit_udf_query,
]
queries.extend(
[
DummyObjectDetector_udf_query,
DummyMultiObjectDetector_udf_query,
DummyFeatureExtractor_udf_query,
]
)
if mode != "release":
queries.extend(
[
DummyObjectDetector_udf_query,
DummyMultiObjectDetector_udf_query,
DummyFeatureExtractor_udf_query,
]
)

if mode != "minimal":
queries.extend([YoloV5_udf_query])
Expand Down
66 changes: 25 additions & 41 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,9 @@ def read(path, encoding="utf-8"):
minimal_requirement = [
"numpy>=1.19.5,<=1.23.5",
"opencv-python>=4.5.4.60,<4.6.0.66", # bug in easyocr
"pandas>=1.1.5,<2.0.0", # major changes in 2.0.0
"pandas>=1.1.5",
"Pillow>=8.4.0",
"sqlalchemy>=1.4.0,<2.0.0", # major changes in 2.0.0
"sqlalchemy>=1.4.0,<2.0.0", # major changes in 2.0.0
"sqlalchemy-utils>=0.36.6",
"lark>=1.0.0",
"pyyaml>=5.1",
Expand All @@ -50,13 +50,9 @@ def read(path, encoding="utf-8"):
"aenum>=2.2.0",
"diskcache>=5.4.0",
"decord>=0.6.0",
"mock>=4.0.3", # for Dummy UDFs in test/util.py
]

formatter_libs = [
"black>=23.1.0",
"isort>=5.10.1"
]
formatter_libs = ["black>=23.1.0", "isort>=5.10.1"]

test_libs = [
"pytest>=6.1.2",
Expand All @@ -67,53 +63,46 @@ def read(path, encoding="utf-8"):
"pytest-xdist",
"coveralls>=3.0.1",
"flake8>=3.9.1",
"moto[s3]>=4.1.1"
"moto[s3]>=4.1.1",
]

notebook_libs = [
"ipywidgets>=7.7.2",
"matplotlib>=3.3.4",
"nbmake>=1.2.1",
"nest-asyncio>=1.5.6"
"nest-asyncio>=1.5.6",
]

### NEEDED FOR INTEGRATION TESTS ONLY
integration_test_libs = [
"torch>=1.10.0",
"torchvision>=0.11.1",
"faiss-cpu" # faiss-gpu does not work on mac
"faiss-cpu", # faiss-gpu does not work on mac
]

benchmark_libs = [
"pytest-benchmark",
]

doc_libs = [
]
doc_libs = []

dist_libs = [
"wheel>=0.37.1",
"scriv>=0.16.0"
]
dist_libs = ["wheel>=0.37.1", "scriv>=0.16.0"]

### NEEDED FOR AN ALTERNATE DATA SYSTEM OTHER THAN SQLITE
database_libs = [
"pymysql>=0.10.1"
]
database_libs = ["pymysql>=0.10.1"]

### NEEDED FOR A BATTERIES-LOADED EXPERIENCE
udf_libs = [
"facenet-pytorch>=2.5.2", # FACE DETECTION
"easyocr>=1.5.0", # OCR EXTRACTION
"facenet-pytorch>=2.5.2", # FACE DETECTION
"easyocr>=1.5.0", # OCR EXTRACTION
"ipython",
"yolov5<=7.0.6", # OBJECT DETECTION
"detoxify", # TEXT TOXICITY CLASSIFICATION
"thefuzz" # FUZZY STRING MATCHING
"yolov5<=7.0.6", # OBJECT DETECTION
"detoxify", # TEXT TOXICITY CLASSIFICATION
"thefuzz",  # FUZZY STRING MATCHING
]

### NEEDED FOR EXPERIMENTAL FEATURES
experimental_libs = [
]
experimental_libs = []

INSTALL_REQUIRES = minimal_requirement + integration_test_libs + udf_libs
DEV_REQUIRES = (
Expand All @@ -128,9 +117,7 @@ def read(path, encoding="utf-8"):
+ experimental_libs
)

EXTRA_REQUIRES = {
"dev": DEV_REQUIRES
}
EXTRA_REQUIRES = {"dev": DEV_REQUIRES}

setup(
name=NAME,
Expand All @@ -149,22 +136,19 @@ def read(path, encoding="utf-8"):
"License :: OSI Approved :: Apache Software License",
"Programming Language :: Python :: 3",
"Development Status :: 3 - Alpha",
"Operating System :: OS Independent"
"Operating System :: OS Independent",
],
packages=find_packages(exclude=[
"tests",
"tests.*"
]),
packages=find_packages(exclude=["tests", "tests.*"]),
# https://python-packaging.readthedocs.io/en/latest/command-line-scripts.html#the-console-scripts-entry-point
entry_points={"console_scripts": [
"eva_server=eva.eva_server:main",
"eva_client=eva.eva_cmd_client:main"
]},
entry_points={
"console_scripts": [
"eva_server=eva.eva_server:main",
"eva_client=eva.eva_cmd_client:main",
]
},
python_requires=">=3.7",
install_requires=INSTALL_REQUIRES,
extras_require=EXTRA_REQUIRES,
include_package_data=True,
package_data={
"eva": ["eva.yml", "parser/eva.lark"]
}
package_data={"eva": ["eva.yml", "parser/eva.lark"]},
)
2 changes: 1 addition & 1 deletion test/models/storage/test_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def test_batch_serialize_deserialize(self):
def test_frames_as_numpy_array_should_frames_as_numpy_array(self):
batch = Batch(frames=create_dataframe_same(2))
expected = list(np.ones((2, 1, 1)))
actual = list(batch.column_as_numpy_array())
actual = list(batch.column_as_numpy_array(batch.columns[0]))
self.assertEqual(expected, actual)

def test_return_only_frames_specified_in_the_indices(self):
Expand Down
Loading

0 comments on commit 0f41606

Please sign in to comment.