dcmoura · ricardocchaves · Mar 1, 2022 · Mar 1, 2022 · Mar 1, 2022 · Mar 1, 2022
diff --git a/README.md b/README.md
@@ -93,7 +93,7 @@ Right now, the focus is on building a command-line tool that follows these core
 SELECT [ DISTINCT | PARTIALS ] 
     [ * | python_expression [ AS output_column_name ] [, ...] ]
     [ FROM csv | spy | text | python_expression | json [ EXPLODE path ] ]
-    [ WHERE python_expression ]
+    [ WHERE python_expression [ [NOT] LIKE string] ]
     [ GROUP BY output_column_number | python_expression  [, ...] ]
     [ ORDER BY output_column_number | python_expression
         [ ASC | DESC ] [ NULLS { FIRST | LAST } ] [, ...] ]

diff --git a/spyql/cli.py b/spyql/cli.py
@@ -207,6 +207,42 @@ def parse_select(sel, strings):
     return res, has_distinct, has_partials
 
 
+def parse_wherelike(clause, strings):
+    """splits the LIKE clause and completely supports the SQL syntax
+    https://docs.microsoft.com/en-us/sql/t-sql/language-elements/like-transact-sql?view=sql-server-ver15"""
+    # We're not in a LIKE expression, do nothing
+    if not re.search("LIKE", clause):
+        return clause
+
+    # Supports words containing [a-zA-Z0-9_\-]
+    expr_pattern = re.compile(r"([\w-]+)(?:\s+(NOT))?\s+LIKE\s+([\w-]+)", re.IGNORECASE)
+    groups = re.search(expr_pattern, clause)
+    if groups is None:
+        spyql.log.user_error(
+            f"{clause}",
+            SyntaxError("unexpected EOF while parsing")
+        )
+
+    groups = groups.groups()
+    negate = "NOT" in {groups[1]} # placed within {} because it can be None
+
+    if not groups[2] in strings:
+        spyql.log.user_error(
+            f"{groups[2]}: missing quotes, must be a string",
+            SyntaxError("bad query")
+        )
+
+    # Replacing SQL wildcard '%' for regex wildcard '.*' if not preceded by '\'
+    pattern = strings.put_strings_back(groups[2])
+    pattern = re.compile(r"(?<!\\)%").sub(r".*" , pattern)
+    pattern = re.compile(r"([^\"].*[^\"])").sub(r"^\1$", pattern)
+
+    clause = "re.match({}, str({}))".format(pattern, groups[0])
+    clause = "not " + clause if negate else clause
+
+    return clause
+
+
 def parse_orderby(clause, strings):
     """splits the ORDER BY clause and handles modifiers"""
 
@@ -275,9 +311,10 @@ def parse(query):
         "order by",
     }:
         if prs[clause]:
+            prs[clause] = make_expr_ready(prs[clause], strings)
             if clause in {"where", "from"}:
                 throw_error_if_has_agg_func(prs[clause], clause.upper())
-            prs[clause] = make_expr_ready(prs[clause], strings)
+                prs[clause] = parse_wherelike(prs[clause], strings)
 
     for clause in {"group by"}:
         if prs[clause]:
@@ -400,7 +437,7 @@ def main(query, warning_flag, verbose, unbuffered, input_opt, output_opt):
     SELECT [ DISTINCT | PARTIALS ]
         [ * | python_expression [ AS output_column_name ] [, ...] ]
         [ FROM csv | spy | text | python_expression | json [ EXPLODE path ] ]
-        [ WHERE python_expression ]
+        [ WHERE python_expression [ [NOT] LIKE string] ]
         [ GROUP BY output_column_number | python_expression  [, ...] ]
         [ ORDER BY output_column_number | python_expression
             [ ASC | DESC ] [ NULLS { FIRST | LAST } ] [, ...] ]

diff --git a/spyql/quotes_handler.py b/spyql/quotes_handler.py
@@ -9,6 +9,9 @@ class QuotesHandler:
     def __init__(self):
         self.strings = {}
 
+    def __iter__(self):
+        return iter(self.strings)  # iterates the dict's keys (presumably the placeholders)
+
     # replaces quoted strings by placeholders to make parsing easier
     # populates dictionary of placeholders and the strings they hold
     def extract_strings(self, query):

diff --git a/tests/main_test.py b/tests/main_test.py
@@ -193,6 +193,78 @@ def test_basic():
     )
 
 
+def test_wherelike():
+    base_data = """abc,def
+test1,a
+test2,a
+bla,a
+"""
+
+    # LIKE on a numeric column: of 0, 1, 2 only 1 is expected to match "1"
+    eq_test_1row(
+        'SELECT * FROM range(3) WHERE col1 LIKE "1"', {"col1": 1}
+    )
+
+    # pattern outside the generated range: no rows expected
+    eq_test_nrows(
+        'SELECT * FROM range(3) WHERE col1 LIKE "5"', []
+    )
+
+    # NOT LIKE: expects the complement of the first case (0 and 2)
+    eq_test_nrows(
+        'SELECT * FROM range(3) WHERE col1 NOT LIKE "1"', [{"col1": 0}, {"col1": 2}]
+    )
+
+    # CSV input, pattern equal to none of the abc values: no rows expected
+    eq_test_nrows(
+        'SELECT abc FROM csv WHERE abc LIKE "x"',
+        [],
+        data=base_data
+    )
+
+    # CSV input, pattern without wildcards: only the identical value is expected
+    eq_test_nrows(
+        'SELECT abc FROM csv WHERE abc LIKE "test1"',
+        [{"abc": "test1"}],
+        data=base_data
+    )
+
+    # trailing %: expects the values that start with "test"
+    eq_test_nrows(
+        'SELECT abc FROM csv WHERE abc LIKE "test%"',
+        [{"abc": "test1"}, {"abc": "test2"}],
+        data=base_data
+    )
+
+    # leading %: expects only the appended values that end with "test"
+    eq_test_nrows(
+        'SELECT abc FROM csv WHERE abc LIKE "%test"',
+        [{"abc": "1test"}, {"abc": "2test"}],
+        data=base_data+"1test,a\n2test,a\n"
+    )
+
+    # % on both sides: expects every value that contains "test"
+    eq_test_nrows(
+        'SELECT abc FROM csv WHERE abc LIKE "%test%"',
+        [{"abc": "test1"}, {"abc": "test2"}, {"abc": "1test1"}, {"abc": "2test2"}],
+        data=base_data+"1test1,a\n2test2,a\n"
+    )
+
+    # backslash-escaped %: expected to match a literal percent sign, not a wildcard
+    eq_test_nrows(
+        r'SELECT abc FROM csv WHERE abc LIKE "bla\\%bla"',
+        [{"abc": "bla%bla"}],
+        data=base_data+"bla%bla,a\n"
+    )
+
+    # escaped % between two wildcards: expects only the value containing a literal %
+    eq_test_nrows(
+        r'SELECT abc FROM csv WHERE abc LIKE "%\\%%"',
+        [{"abc": "bla%bla"}],
+        data=base_data+"bla%bla,a\n"
+    )
+
+
 def test_orderby():
     # order by (1 col)
     eq_test_nrows(
@@ -773,6 +845,8 @@ def test_errors():
     exception_test("SELECT DISTINCT count_agg(1)", SyntaxError)
     exception_test("SELECT count_agg(1) GROUP BY 1", SyntaxError)
     exception_test("SELECT 1 FROM range(3) WHERE max_agg(col1) > 0", SyntaxError)
+    exception_test("SELECT * from range(3) WHERE col1 LIKE 1", SyntaxError)
+    exception_test("SELECT * from range(3) WHERE col1 LIKE", SyntaxError)
 
 
 def test_sql_output():