Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

tutorial: toxicity classifier #708

Merged
merged 20 commits into from
May 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions .github/PULL_REQUEST_TEMPLATE.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
<!--
👋 Thanks for submitting a Pull Request to EVA DB!

🙌 We want to make contributing to EVA DB as easy and transparent as possible. Here are a few tips to get you started:
Expand All @@ -9,7 +8,3 @@

👉 Please see our ✅ [Contributing Guide](https://evadb.readthedocs.io/en/stable/source/contribute/index.html) for more details.

🚨 Note that Copilot will summarize this PR below, do not modify the 'copilot:all' line.
-->

copilot:all
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ tutorials/bddtest.zip
tutorials/license.zip
license/
bddtest/
tutorials/*.jpg

# benchmark
.benchmarks
Expand Down
2 changes: 2 additions & 0 deletions eva/executor/create_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from eva.executor.executor_utils import handle_if_not_exists
from eva.plan_nodes.create_plan import CreatePlan
from eva.storage.storage_engine import StorageEngine
from eva.utils.logging_manager import logger


class CreateExecutor(AbstractExecutor):
Expand All @@ -26,6 +27,7 @@ def __init__(self, node: CreatePlan):

def exec(self, *args, **kwargs):
if not handle_if_not_exists(self.node.table_info, self.node.if_not_exists):
logger.debug(f"Creating table {self.node.table_info}")
catalog_entry = self.catalog.create_and_insert_table_catalog_entry(
self.node.table_info, self.node.column_list
)
Expand Down
1 change: 1 addition & 0 deletions eva/executor/drop_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ def exec(self, *args, **kwargs):
)
storage_engine = StorageEngine.factory(table_obj)

logger.debug(f"Dropping table {table_info}")
storage_engine.drop(table=table_obj)

for col_obj in table_obj.columns:
Expand Down
2 changes: 2 additions & 0 deletions eva/executor/executor_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ def apply_predicate(batch: Batch, predicate: AbstractExpression) -> Batch:


def handle_if_not_exists(table_info: TableInfo, if_not_exist=False):
# Table exists
if CatalogManager().check_table_exists(
table_info.table_name,
table_info.database_name,
Expand All @@ -57,6 +58,7 @@ def handle_if_not_exists(table_info: TableInfo, if_not_exist=False):
else:
logger.error(err_msg)
raise ExecutorError(err_msg)
# Table does not exist
else:
return False

Expand Down
6 changes: 5 additions & 1 deletion eva/models/server/response.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from enum import Enum
from typing import Optional

from eva.executor.executor_utils import ExecutorError
from eva.models.storage.batch import Batch
from eva.utils.generic_utils import PickleSerializer

Expand Down Expand Up @@ -45,7 +46,10 @@ def deserialize(cls, data):
return obj

def as_df(self):
assert self.batch is not None, "Response is empty"
if self.error is not None:
raise ExecutorError(self.error)
if self.batch is None:
raise ExecutorError("Empty batch")
return self.batch.frames

def __str__(self):
Expand Down
62 changes: 61 additions & 1 deletion script/formatting/formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
import sys
from pathlib import Path
import asyncio
import nbformat
from nbformat.v4 import new_notebook, new_markdown_cell, new_code_cell

import pkg_resources

Expand Down Expand Up @@ -48,6 +50,7 @@ def wrapped(*args, **kwargs):
EVA_SRC_DIR = os.path.join(EVA_DIR, "eva")
EVA_TEST_DIR = os.path.join(EVA_DIR, "test")
EVA_SCRIPT_DIR = os.path.join(EVA_DIR, "script")
EVA_NOTEBOOKS_DIR = os.path.join(EVA_DIR, "tutorials")

FORMATTING_DIR = os.path.join(EVA_SCRIPT_DIR, "formatting")
PYLINTRC = os.path.join(FORMATTING_DIR, "pylintrc")
Expand Down Expand Up @@ -206,6 +209,54 @@ def format_file(file_path, add_header, strip_header, format_code):

# END FORMAT__FILE(FILE_NAME)

# check the notebooks
def check_notebook_format(notebook_file):
    """Validate a tutorial Jupyter notebook before it ships.

    Checks, in order: the notebook has at least one cell; every cell has a
    valid type (code, markdown, or raw); no code cell is empty; no code cell
    contains a bare ``print(response)``; and some markdown cell carries a
    Colab link that references this notebook's own file name.

    Exits the process with status 1 (after logging a diagnostic) on the
    first violation. Returns True when all checks pass or the notebook is
    explicitly exempt.
    """
    notebook_file_name = os.path.basename(notebook_file)

    # This notebook is deliberately exempt from all checks.
    if notebook_file_name == "ignore_tag.ipynb":
        return True

    with open(notebook_file) as f:
        nb = nbformat.read(f, as_version=4)

    # Check that the notebook contains at least one cell
    if not nb.cells:
        LOG.error(f"ERROR: Notebook {notebook_file} has no cells")
        sys.exit(1)

    # Check that all cells have a valid cell type (code, markdown, or raw)
    for cell in nb.cells:
        if cell.cell_type not in ['code', 'markdown', 'raw']:
            LOG.error(f"ERROR: Notebook {notebook_file} contains an invalid cell type: {cell.cell_type}")
            sys.exit(1)

    # Check that all code cells have a non-empty source code
    for cell in nb.cells:
        if cell.cell_type == 'code' and not cell.source.strip():
            LOG.error(f"ERROR: Notebook {notebook_file} contains an empty code cell")
            sys.exit(1)

    # Check for "print(response)" — raw response dumps clutter rendered output
    for cell in nb.cells:
        if cell.cell_type == 'code' and 'print(response)' in cell.source:
            # BUG FIX: message previously read "contains an a cell"
            LOG.error(f"ERROR: Notebook {notebook_file} contains a cell with this content: {cell.source}")
            sys.exit(1)

    # Check for "Colab link"
    contains_colab_link = False
    for cell in nb.cells:
        if cell.cell_type == 'markdown' and 'colab' in cell.source:
            # Check if colab link is correct:
            # notebook_file_name must appear in the colab link
            if notebook_file_name in cell.source:
                contains_colab_link = True
                break

    if contains_colab_link is False:
        # BUG FIX: previously exited silently with no diagnostic, unlike
        # every other check in this function.
        LOG.error(f"ERROR: Notebook {notebook_file} is missing a Colab link that references {notebook_file_name}")
        sys.exit(1)

    return True

# format all the files in the dir passed as argument
def format_dir(dir_path, add_header, strip_header, format_code):
Expand Down Expand Up @@ -325,4 +376,13 @@ def check_file(file):
)

for file in files:
check_file(file)
check_file(file)

# CHECK ALL THE NOTEBOOKS

# Iterate over all files in the directory
# and check if they are Jupyter notebooks
for file in os.listdir(EVA_NOTEBOOKS_DIR):
if file.endswith(".ipynb"):
notebook_file = os.path.join(EVA_NOTEBOOKS_DIR, file)
check_notebook_format(notebook_file)
4 changes: 2 additions & 2 deletions script/test/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ if [[ "$OSTYPE" != "msys" ]];
then
if [[ "$MODE" = "TEST" || "$MODE" = "ALL" ]];
then
PYTHONPATH=./ pytest --durations=20 --cov-report term-missing:skip-covered --cov-config=.coveragerc --cov-context=test --cov=eva/ -s -v --log-level=WARNING -m "not benchmark"
PYTHONPATH=./ pytest --durations=20 --cov-report term-missing:skip-covered --cov-config=.coveragerc --cov-context=test --cov=eva/ --capture=sys --tb=short -v --log-level=WARNING -m "not benchmark"
elif [[ "$MODE" = "RAY" ]];
then
PYTHONPATH=./ pytest -s -v -p no:cov test/ -m "not benchmark"
Expand Down Expand Up @@ -109,7 +109,7 @@ fi

if [[ ( "$OSTYPE" != "msys" ) && ( "$MODE" = "NOTEBOOK" || "$MODE" = "ALL" ) ]];
then
PYTHONPATH=./ python -m pytest --durations=5 --nbmake --overwrite "./tutorials" -s -v --log-level=WARNING --nbmake-timeout=3000
PYTHONPATH=./ python -m pytest --durations=5 --nbmake --overwrite "./tutorials" --capture=sys --tb=short -v --log-level=WARNING --nbmake-timeout=3000 --ignore="tutorials/08-chatgpt.ipynb"
notebook_test_code=$?
if [ "$notebook_test_code" != "0" ];
then
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def read(path, encoding="utf-8"):
"ipython<8.13.0", # NOTEBOOKS
"thefuzz", # FUZZY STRING MATCHING
"ultralytics", # OBJECT DETECTION
"transformers==4.27.4", # HUGGINGFACE
"transformers>=4.27.4", # HUGGINGFACE
"openai>=0.27.4", # CHATGPT
]

Expand Down
104 changes: 104 additions & 0 deletions test/integration_tests/test_huggingface_udfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
import unittest
from test.util import create_text_csv, file_remove

import pytest

from eva.catalog.catalog_manager import CatalogManager
from eva.executor.executor_utils import ExecutorError
from eva.server.command_handler import execute_query_fetch_all
Expand Down Expand Up @@ -44,6 +46,7 @@ def setUp(self) -> None:
def tearDown(self) -> None:
execute_query_fetch_all("DROP TABLE IF EXISTS DETRAC;")
execute_query_fetch_all("DROP TABLE IF EXISTS VIDEOS;")
execute_query_fetch_all("DROP TABLE IF EXISTS MyCSV;")
file_remove(self.csv_file_path)

def test_io_catalog_entries_populated(self):
Expand Down Expand Up @@ -140,6 +143,7 @@ def test_image_classification(self):

select_query = f"SELECT {udf_name}(data) FROM DETRAC WHERE id < 3;"
output = execute_query_fetch_all(select_query)
print("output: ", output)

# Test that output has 2 columns
self.assertEqual(len(output.frames.columns), 2)
Expand All @@ -159,6 +163,7 @@ def test_image_classification(self):
drop_udf_query = f"DROP UDF {udf_name};"
execute_query_fetch_all(drop_udf_query)

@pytest.mark.benchmark
def test_text_classification(self):
create_table_query = """CREATE TABLE IF NOT EXISTS MyCSV (
id INTEGER UNIQUE,
Expand Down Expand Up @@ -203,6 +208,7 @@ def test_text_classification(self):
execute_query_fetch_all(drop_udf_query)
execute_query_fetch_all("DROP TABLE MyCSV;")

@pytest.mark.benchmark
def test_automatic_speech_recognition(self):
udf_name = "SpeechRecognizer"
create_udf = (
Expand All @@ -223,6 +229,7 @@ def test_automatic_speech_recognition(self):
drop_udf_query = f"DROP UDF {udf_name};"
execute_query_fetch_all(drop_udf_query)

@pytest.mark.benchmark
def test_summarization_from_video(self):
asr_udf = "SpeechRecognizer"
create_udf = (
Expand Down Expand Up @@ -254,3 +261,100 @@ def test_summarization_from_video(self):
execute_query_fetch_all(drop_udf_query)
drop_udf_query = f"DROP UDF {summary_udf};"
execute_query_fetch_all(drop_udf_query)

@pytest.mark.benchmark
def test_toxicity_classification(self):
    """End-to-end check of a HuggingFace toxicity text-classification UDF.

    Creates the UDF from the 'martin-ha/toxic-comment-model' model, loads a
    text CSV into a fresh MyCSV table, runs the UDF over every comment, and
    validates the shape and value domains of the output, then drops the UDF.
    """
    udf_name = "HFToxicityClassifier"
    create_udf_query = f"""CREATE UDF {udf_name}
        TYPE HuggingFace
        'task' 'text-classification'
        'model' 'martin-ha/toxic-comment-model'
    """
    execute_query_fetch_all(create_udf_query)

    # Start from a clean slate in case an earlier test left MyCSV behind.
    drop_table_query = """DROP TABLE IF EXISTS MyCSV;"""
    execute_query_fetch_all(drop_table_query)

    create_table_query = """CREATE TABLE IF NOT EXISTS MyCSV (
        id INTEGER UNIQUE,
        comment TEXT(30)
    );"""
    execute_query_fetch_all(create_table_query)

    # self.csv_file_path is created in setUp via create_text_csv.
    load_table_query = f"""LOAD CSV '{self.csv_file_path}' INTO MyCSV;"""
    execute_query_fetch_all(load_table_query)

    select_query = f"SELECT {udf_name}(comment) FROM MyCSV;"
    output = execute_query_fetch_all(select_query)

    # Test that output has 2 columns
    self.assertEqual(len(output.frames.columns), 2)

    # Test that there exists a column with udf_name.label and each entry is
    # either "non-toxic" or "toxic" (the labels this model emits)
    self.assertTrue(udf_name.lower() + ".label" in output.frames.columns)
    self.assertTrue(
        all(
            x in ["non-toxic", "toxic"]
            for x in output.frames[udf_name.lower() + ".label"]
        )
    )

    # Test that there exists a column with udf_name.score
    # and each entry is a float
    self.assertTrue(udf_name.lower() + ".score" in output.frames.columns)
    self.assertTrue(
        all(
            isinstance(x, float) for x in output.frames[udf_name.lower() + ".score"]
        )
    )

    # Clean up the UDF so later tests can re-create it.
    drop_udf_query = f"DROP UDF {udf_name};"
    execute_query_fetch_all(drop_udf_query)

@pytest.mark.benchmark
def test_multilingual_toxicity_classification(self):
    """End-to-end check of a multilingual HuggingFace toxicity UDF.

    Same flow as the English toxicity test, but using the multilingual
    XLM-RoBERTa toxicity model, whose labels are "LABEL_0"/"LABEL_1"
    rather than human-readable class names.
    """
    udf_name = "HFMultToxicityClassifier"
    create_udf_query = f"""CREATE UDF {udf_name}
        TYPE HuggingFace
        'task' 'text-classification'
        'model' 'EIStakovskii/xlm_roberta_base_multilingual_toxicity_classifier_plus'
    """
    execute_query_fetch_all(create_udf_query)

    # Start from a clean slate in case an earlier test left MyCSV behind.
    drop_table_query = """DROP TABLE IF EXISTS MyCSV;"""
    execute_query_fetch_all(drop_table_query)

    create_table_query = """CREATE TABLE MyCSV (
        id INTEGER UNIQUE,
        comment TEXT(30)
    );"""
    execute_query_fetch_all(create_table_query)

    # self.csv_file_path is created in setUp via create_text_csv.
    load_table_query = f"""LOAD CSV '{self.csv_file_path}' INTO MyCSV;"""
    execute_query_fetch_all(load_table_query)

    select_query = f"SELECT {udf_name}(comment) FROM MyCSV;"
    output = execute_query_fetch_all(select_query)

    # Test that output has 2 columns
    self.assertEqual(len(output.frames.columns), 2)

    # Test that there exists a column with udf_name.label and each entry is
    # either "LABEL_0" or "LABEL_1" (this model's raw label names)
    self.assertTrue(udf_name.lower() + ".label" in output.frames.columns)
    self.assertTrue(
        all(
            x in ["LABEL_1", "LABEL_0"]
            for x in output.frames[udf_name.lower() + ".label"]
        )
    )

    # Test that there exists a column with udf_name.score and each entry is a float
    self.assertTrue(udf_name.lower() + ".score" in output.frames.columns)
    self.assertTrue(
        all(
            isinstance(x, float) for x in output.frames[udf_name.lower() + ".score"]
        )
    )

    # Clean up the UDF so later tests can re-create it.
    drop_udf_query = f"DROP UDF {udf_name};"
    execute_query_fetch_all(drop_udf_query)
19 changes: 14 additions & 5 deletions tutorials/00-start-eva-server.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,22 @@
"execution_count": 1,
"metadata": {
"execution": {
"iopub.execute_input": "2023-04-30T15:16:35.543397Z",
"iopub.status.busy": "2023-04-30T15:16:35.542857Z",
"iopub.status.idle": "2023-04-30T15:16:49.079780Z",
"shell.execute_reply": "2023-04-30T15:16:49.077653Z"
"iopub.execute_input": "2023-05-09T03:37:54.104875Z",
"iopub.status.busy": "2023-05-09T03:37:54.104289Z",
"iopub.status.idle": "2023-05-09T03:38:08.286784Z",
"shell.execute_reply": "2023-05-09T03:38:08.284731Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.2\u001b[0m\r\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\r\n"
]
},
{
"name": "stdout",
"output_type": "stream",
Expand Down Expand Up @@ -133,7 +142,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.10.8"
},
"vscode": {
"interpreter": {
Expand Down
Loading