Skip to content

Commit

Permalink
fix: Minor code refactor (#632)
Browse files Browse the repository at this point in the history
* refactor

* minor fix testcase

* bug: fix pip install

* merge master

* remove unused code

* fix mnist udf
  • Loading branch information
gaurav274 committed Apr 4, 2023
1 parent 3f302bd commit 0f41606
Show file tree
Hide file tree
Showing 8 changed files with 101 additions and 119 deletions.
21 changes: 21 additions & 0 deletions eva/binder/binder_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
from eva.catalog.catalog_type import TableType
from eva.catalog.catalog_utils import is_string_col, is_video_table
from eva.catalog.sql_config import IDENTIFIER_COLUMN
from eva.expression.function_expression import FunctionExpression
from eva.parser.alias import Alias

if TYPE_CHECKING:
from eva.binder.statement_binder_context import StatementBinderContext
Expand Down Expand Up @@ -113,3 +115,22 @@ def check_column_name_is_string(col_ref) -> None:
if not is_string_col(col_ref.col_object):
err_msg = "LIKE only supported for string columns"
raise BinderError(err_msg)


def resolve_alias_table_value_expression(node: FunctionExpression):
    """Attach a fully-resolved Alias to a function expression node.

    Three cases, all mutating ``node.alias`` in place:
      * no alias at all      -> lowercased function name + lowercased output column names
      * alias, no col names  -> keep the alias name, default the column names
      * alias with col names -> keep the alias name, lowercase the user-given names

    Asserts that the resolved column-name count matches ``node.output_objs``.
    """
    fallback_col_names = [str(obj.name.lower()) for obj in node.output_objs]
    if not node.alias:
        node.alias = Alias(node.name.lower(), fallback_col_names)
    elif not len(node.alias.col_names):
        node.alias = Alias(node.alias.alias_name, fallback_col_names)
    else:
        lowered_names = [str(name.lower()) for name in node.alias.col_names]
        node.alias = Alias(node.alias.alias_name, lowered_names)

    assert len(node.alias.col_names) == len(
        node.output_objs
    ), f"""Expected {len(node.output_objs)} output columns for {node.alias.alias_name}, got {len(node.alias.col_names)}."""
19 changes: 2 additions & 17 deletions eva/binder/statement_binder.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,14 @@
check_groupby_pattern,
check_table_object_is_video,
extend_star,
resolve_alias_table_value_expression,
)
from eva.binder.statement_binder_context import StatementBinderContext
from eva.catalog.catalog_manager import CatalogManager
from eva.catalog.catalog_type import IndexType, NdArrayType, TableType, VideoColumnName
from eva.expression.abstract_expression import AbstractExpression, ExpressionType
from eva.expression.function_expression import FunctionExpression
from eva.expression.tuple_value_expression import TupleValueExpression
from eva.parser.alias import Alias
from eva.parser.create_index_statement import CreateIndexStatement
from eva.parser.create_mat_view_statement import CreateMaterializedViewStatement
from eva.parser.delete_statement import DeleteTableStatement
Expand Down Expand Up @@ -278,19 +278,4 @@ def _bind_func_expr(self, node: FunctionExpression):
node.output_objs = output_objs
node.projection_columns = [obj.name.lower() for obj in output_objs]

default_alias_name = node.name.lower()
default_output_col_aliases = [str(obj.name.lower()) for obj in node.output_objs]
if not node.alias:
node.alias = Alias(default_alias_name, default_output_col_aliases)
else:
if not len(node.alias.col_names):
node.alias = Alias(node.alias.alias_name, default_output_col_aliases)
else:
output_aliases = [
str(col_name.lower()) for col_name in node.alias.col_names
]
node.alias = Alias(node.alias.alias_name, output_aliases)

assert len(node.alias.col_names) == len(
node.output_objs
), f"""Expected {len(node.output_objs)} output columns for {node.alias.alias_name}, got {len(node.alias.col_names)}."""
resolve_alias_table_value_expression(node)
2 changes: 1 addition & 1 deletion eva/binder/statement_binder_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ def _search_all_alias_maps(self, col_name: str) -> Tuple[str, CatalogColumnType]
alias_match = alias

if num_alias_matches > 1:
err_msg = "Ambiguous Column name {col_name}"
err_msg = f"Ambiguous Column name {col_name}"
logger.error(err_msg)
raise BinderError(err_msg)

Expand Down
57 changes: 24 additions & 33 deletions eva/models/storage/batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,16 @@ def __len__(self):
def columns(self):
return self._frames.columns

def column_as_numpy_array(self, column_name="data"):
return np.array(self._frames[column_name])
def column_as_numpy_array(self, column_name: str) -> np.ndarray:
"""Return a column as numpy array
Args:
column_name (str): the name of the required column
Returns:
numpy.ndarray: the column data as a numpy array
"""
return self._frames[column_name].to_numpy()

def serialize(self):
obj = {"frames": self._frames, "batch_size": len(self)}
Expand All @@ -69,48 +77,34 @@ def deserialize(cls, data):

@classmethod
def from_eq(cls, batch1: Batch, batch2: Batch) -> Batch:
return Batch(
pd.DataFrame(batch1._frames.to_numpy() == batch2._frames.to_numpy())
)
return Batch(pd.DataFrame(batch1.to_numpy() == batch2.to_numpy()))

@classmethod
def from_greater(cls, batch1: Batch, batch2: Batch) -> Batch:
return Batch(
pd.DataFrame(batch1._frames.to_numpy() > batch2._frames.to_numpy())
)
return Batch(pd.DataFrame(batch1.to_numpy() > batch2.to_numpy()))

@classmethod
def from_lesser(cls, batch1: Batch, batch2: Batch) -> Batch:
return Batch(
pd.DataFrame(batch1._frames.to_numpy() < batch2._frames.to_numpy())
)
return Batch(pd.DataFrame(batch1.to_numpy() < batch2.to_numpy()))

@classmethod
def from_greater_eq(cls, batch1: Batch, batch2: Batch) -> Batch:
return Batch(
pd.DataFrame(batch1._frames.to_numpy() >= batch2._frames.to_numpy())
)
return Batch(pd.DataFrame(batch1.to_numpy() >= batch2.to_numpy()))

@classmethod
def from_lesser_eq(cls, batch1: Batch, batch2: Batch) -> Batch:
return Batch(
pd.DataFrame(batch1._frames.to_numpy() <= batch2._frames.to_numpy())
)
return Batch(pd.DataFrame(batch1.to_numpy() <= batch2.to_numpy()))

@classmethod
def from_not_eq(cls, batch1: Batch, batch2: Batch) -> Batch:
return Batch(
pd.DataFrame(batch1._frames.to_numpy() != batch2._frames.to_numpy())
)
return Batch(pd.DataFrame(batch1.to_numpy() != batch2.to_numpy()))

@classmethod
def compare_contains(cls, batch1: Batch, batch2: Batch) -> None:
return cls(
pd.DataFrame(
[all(x in p for x in q) for p, q in zip(left, right)]
for left, right in zip(
batch1._frames.to_numpy(), batch2._frames.to_numpy()
)
for left, right in zip(batch1.to_numpy(), batch2.to_numpy())
)
)

Expand All @@ -119,9 +113,7 @@ def compare_is_contained(cls, batch1: Batch, batch2: Batch) -> None:
return cls(
pd.DataFrame(
[all(x in q for x in p) for p, q in zip(left, right)]
for left, right in zip(
batch1._frames.to_numpy(), batch2._frames.to_numpy()
)
for left, right in zip(batch1.to_numpy(), batch2.to_numpy())
)
)

Expand Down Expand Up @@ -292,14 +284,11 @@ def __add__(self, other: Batch) -> Batch:
if other.empty():
return self

new_frames = pd.concat([self._frames, other.frames], ignore_index=True)

return Batch(new_frames)
return Batch.concat([self, other], copy=False)

@classmethod
def concat(cls, batch_list: Iterable[Batch], copy=True) -> Batch:
"""Concat a list of batches. Avoid the extra copying overhead by
the append operation in __add__.
"""Concat a list of batches.
Notice: only frames are considered.
"""

Expand Down Expand Up @@ -378,11 +367,13 @@ def empty(self):
"""
return len(self) == 0

def unnest(self) -> None:
def unnest(self, cols: List[str] = None) -> None:
"""
Unnest columns and drop columns with no data
"""
self._frames = self._frames.explode(list(self._frames.columns))
if cols is None:
cols = list(self.columns)
self._frames = self._frames.explode(cols)
self._frames.dropna(inplace=True)

def reverse(self) -> None:
Expand Down
15 changes: 8 additions & 7 deletions eva/udfs/udf_bootstrap_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,13 +176,14 @@ def init_builtin_udfs(mode="debug"):
# Disabled as it requires specific pytorch package
# Mvit_udf_query,
]
queries.extend(
[
DummyObjectDetector_udf_query,
DummyMultiObjectDetector_udf_query,
DummyFeatureExtractor_udf_query,
]
)
if mode != "release":
queries.extend(
[
DummyObjectDetector_udf_query,
DummyMultiObjectDetector_udf_query,
DummyFeatureExtractor_udf_query,
]
)

if mode != "minimal":
queries.extend([YoloV5_udf_query])
Expand Down
66 changes: 25 additions & 41 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,9 @@ def read(path, encoding="utf-8"):
minimal_requirement = [
"numpy>=1.19.5,<=1.23.5",
"opencv-python>=4.5.4.60,<4.6.0.66", # bug in easyocr
"pandas>=1.1.5,<2.0.0", # major changes in 2.0.0
"pandas>=1.1.5",
"Pillow>=8.4.0",
"sqlalchemy>=1.4.0,<2.0.0", # major changes in 2.0.0
"sqlalchemy>=1.4.0,<2.0.0", # major changes in 2.0.0
"sqlalchemy-utils>=0.36.6",
"lark>=1.0.0",
"pyyaml>=5.1",
Expand All @@ -50,13 +50,9 @@ def read(path, encoding="utf-8"):
"aenum>=2.2.0",
"diskcache>=5.4.0",
"decord>=0.6.0",
"mock>=4.0.3", # for Dummy UDFs in test/util.py
]

formatter_libs = [
"black>=23.1.0",
"isort>=5.10.1"
]
formatter_libs = ["black>=23.1.0", "isort>=5.10.1"]

test_libs = [
"pytest>=6.1.2",
Expand All @@ -67,53 +63,46 @@ def read(path, encoding="utf-8"):
"pytest-xdist",
"coveralls>=3.0.1",
"flake8>=3.9.1",
"moto[s3]>=4.1.1"
"moto[s3]>=4.1.1",
]

notebook_libs = [
"ipywidgets>=7.7.2",
"matplotlib>=3.3.4",
"nbmake>=1.2.1",
"nest-asyncio>=1.5.6"
"nest-asyncio>=1.5.6",
]

### NEEDED FOR INTEGRATION TESTS ONLY
integration_test_libs = [
"torch>=1.10.0",
"torchvision>=0.11.1",
"faiss-cpu" # faiss-gpu does not work on mac
"faiss-cpu", # faiss-gpu does not work on mac
]

benchmark_libs = [
"pytest-benchmark",
]

doc_libs = [
]
doc_libs = []

dist_libs = [
"wheel>=0.37.1",
"scriv>=0.16.0"
]
dist_libs = ["wheel>=0.37.1", "scriv>=0.16.0"]

### NEEDED FOR AN ALTERNATE DATA SYSTEM OTHER THAN SQLITE
database_libs = [
"pymysql>=0.10.1"
]
database_libs = ["pymysql>=0.10.1"]

### NEEDED FOR A BATTERIES-LOADED EXPERIENCE
udf_libs = [
"facenet-pytorch>=2.5.2", # FACE DETECTION
"easyocr>=1.5.0", # OCR EXTRACTION
"facenet-pytorch>=2.5.2", # FACE DETECTION
"easyocr>=1.5.0", # OCR EXTRACTION
"ipython",
"yolov5<=7.0.6", # OBJECT DETECTION
"detoxify", # TEXT TOXICITY CLASSIFICATION
"thefuzz" # FUZZY STRING MATCHING
"yolov5<=7.0.6", # OBJECT DETECTION
"detoxify", # TEXT TOXICITY CLASSIFICATION
"thefuzz",  # FUZZY STRING MATCHING
]

### NEEDED FOR EXPERIMENTAL FEATURES
experimental_libs = [
]
experimental_libs = []

INSTALL_REQUIRES = minimal_requirement + integration_test_libs + udf_libs
DEV_REQUIRES = (
Expand All @@ -128,9 +117,7 @@ def read(path, encoding="utf-8"):
+ experimental_libs
)

EXTRA_REQUIRES = {
"dev": DEV_REQUIRES
}
EXTRA_REQUIRES = {"dev": DEV_REQUIRES}

setup(
name=NAME,
Expand All @@ -149,22 +136,19 @@ def read(path, encoding="utf-8"):
"License :: OSI Approved :: Apache Software License",
"Programming Language :: Python :: 3",
"Development Status :: 3 - Alpha",
"Operating System :: OS Independent"
"Operating System :: OS Independent",
],
packages=find_packages(exclude=[
"tests",
"tests.*"
]),
packages=find_packages(exclude=["tests", "tests.*"]),
# https://python-packaging.readthedocs.io/en/latest/command-line-scripts.html#the-console-scripts-entry-point
entry_points={"console_scripts": [
"eva_server=eva.eva_server:main",
"eva_client=eva.eva_cmd_client:main"
]},
entry_points={
"console_scripts": [
"eva_server=eva.eva_server:main",
"eva_client=eva.eva_cmd_client:main",
]
},
python_requires=">=3.7",
install_requires=INSTALL_REQUIRES,
extras_require=EXTRA_REQUIRES,
include_package_data=True,
package_data={
"eva": ["eva.yml", "parser/eva.lark"]
}
package_data={"eva": ["eva.yml", "parser/eva.lark"]},
)
2 changes: 1 addition & 1 deletion test/models/storage/test_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def test_batch_serialize_deserialize(self):
def test_frames_as_numpy_array_should_frames_as_numpy_array(self):
batch = Batch(frames=create_dataframe_same(2))
expected = list(np.ones((2, 1, 1)))
actual = list(batch.column_as_numpy_array())
actual = list(batch.column_as_numpy_array(batch.columns[0]))
self.assertEqual(expected, actual)

def test_return_only_frames_specified_in_the_indices(self):
Expand Down
Loading

0 comments on commit 0f41606

Please sign in to comment.