From a7697becf81c1000485a55294b1d856fcccd70b7 Mon Sep 17 00:00:00 2001 From: dcmoura Date: Wed, 2 Jun 2021 07:25:16 +0100 Subject: [PATCH 01/10] moved quote handling to dedicated class --- spyql/quotes_handler.py | 52 +++++++++++++++++++++++++++++++ spyql/spyql.py | 68 ++++++++++++----------------------------- 2 files changed, 72 insertions(+), 48 deletions(-) create mode 100644 spyql/quotes_handler.py diff --git a/spyql/quotes_handler.py b/spyql/quotes_handler.py new file mode 100644 index 0000000..4d28a1b --- /dev/null +++ b/spyql/quotes_handler.py @@ -0,0 +1,52 @@ +import re +import random +import string +import logging + +STRING_PLACEHOLDER_LEN = 32 + +class QuotesHandler: + def __init__(self): + self.strings = {} + + # replaces quoted strings by placeholders to make parsing easier + # populates dictionary of placeholders and the strings they hold + def extract_strings(self, query): + res = [] + quotes = [ + r"\'(\'\'|\\.|[^'])*\'", + r'\"(\"\"|\\.|[^"])*\"'#, + # r'\`(\`\`|\\.|[^`])*\`' + ] + + spans = [(0,0)] + for quote in quotes: + spans.extend([m.span() for m in re.finditer(quote, query)]) + spans.append((len(query), 0)) + + #print(spans) + + self.strings = {} + for i in range(len(spans)-1): + if i>0: + sid = ''.join(random.choice(string.ascii_letters) for _ in range(STRING_PLACEHOLDER_LEN)) + sid = f"_{sid}_" + res.append(sid) + self.strings[sid] = query[spans[i][0]+1:spans[i][1]-1] + + res.append(query[spans[i][1]:spans[i+1][0]]) + + return "".join(res) + + @staticmethod + def string_placeholder_re(): + return r'\_\w{%d}\_'%(STRING_PLACEHOLDER_LEN) + + # replace string placeholders by their actual strings + def put_strings_back(self, text, quote=True): + quote_char = '"' if quote else '' + sids = {m.group(0) for m in re.finditer(self.string_placeholder_re(), text)} + for sid in sids: + text = text.replace(sid, f'{quote_char}{self.strings[sid]}{quote_char}') + return text + diff --git a/spyql/spyql.py b/spyql/spyql.py index fb1ccd2..971b6bd 100644 --- a/spyql/spyql.py +++ b/spyql/spyql.py @@ -13,50 +13,20 @@ # from spyql.processor import Processor +from spyql.quotes_handler import QuotesHandler import sys import re import logging -import random -import string query_struct_keywords = ['select', 'from', 'explode', 'where', 'limit', 'offset', 'to'] -STRING_PLACEHOLDER_LEN = 32 #makes sure that queries start with a space (required for parse_structure) def clean_query(q): q = " " + q.strip() return q -# replaces quoted strings by placeholders to make parsing easier -# also returns dictionary of placeholders and the strings they hold -def get_query_strings(query): - res = [] - quotes = [ - r"\'(\'\'|\\.|[^'])*\'", - r'\"(\"\"|\\.|[^"])*\"'#, - # r'\`(\`\`|\\.|[^`])*\`' - ] - - spans = [(0,0)] - for quote in quotes: - spans.extend([m.span() for m in re.finditer(quote, query)]) - spans.append((len(query), 0)) - - #print(spans) - - strings = {} - for i in range(len(spans)-1): - if i>0: - sid = ''.join(random.choice(string.ascii_letters) for _ in range(STRING_PLACEHOLDER_LEN)) - sid = f"_{sid}_" - res.append(sid) - strings[sid] = query[spans[i][0]+1:spans[i][1]-1] - - res.append(query[spans[i][1]:spans[i+1][0]]) - - return ("".join(res), strings) #parse the supported keywords, which must follow a given order def parse_structure(q): @@ -87,9 +57,6 @@ def parse_structure(q): return d -def string_placeholder_re(): - return r'\_\w{%d}\_'%(STRING_PLACEHOLDER_LEN) - # replaces sql/custom syntax by python syntax def pythonize(s): #todo: check for special SQL stuff such as in, is, like @@ -102,19 +69,11 @@ def pythonize(s): # `json['hello']['planet hearth']` # first replace quoted keys (they do not need quotes) - s = re.compile(r"->(%s)"%(string_placeholder_re())).sub(r"[\1]", s) + s = re.compile(r"->(%s)"%(QuotesHandler.string_placeholder_re())).sub(r"[\1]", s) #then replace unquoted keys (they need quotes) s = re.compile(r"->([^\d\W]\w*)").sub(r"['\1']", s) return s - -# replace string placeholders by their actual strings -def put_strings_back(text, strings, quote=True): - quote_char = '"' if quote else '' - sids = {m.group(0) for m in re.finditer(string_placeholder_re(), text)} - for sid in sids: - text = text.replace(sid, f'{quote_char}{strings[sid]}{quote_char}') - return text def custom_sel_split(s): sin = list(s) @@ -167,7 +126,7 @@ def parse_select(sel, strings): c = "_values" name = '*' else: - name = put_strings_back(name, strings, quote=False) + name = strings.put_strings_back(name, quote=False) c = f"[{make_expr_ready(c, strings)}]" #new_sel[name] = c @@ -175,12 +134,16 @@ def parse_select(sel, strings): return new_sel -def make_expr_ready(expr, strings): - return put_strings_back(pythonize(expr), strings).strip() +def make_expr_ready(expr, strings): + return strings.put_strings_back(pythonize(expr)).strip() + #return pythonize(expr).strip() # parse entry point def parse(query): - (query, strings) = get_query_strings(query) + strings = QuotesHandler() + query = strings.extract_strings(query) + + # (query, strings) = get_query_strings(query) #print(query) #print(strings) prs = parse_structure(query) @@ -257,7 +220,7 @@ def print_select_syntax(): def main(): #sys.tracebacklimit = 0 # no exception traces - #logging.basicConfig(level=logging.INFO) + logging.basicConfig(level=logging.INFO) #logging.basicConfig(level=logging.DEBUG) #default query for simple testing: @@ -275,3 +238,12 @@ def main(): if __name__ == "__main__": main() + + # import cProfile + # import pstats + # from pstats import SortKey + # cProfile.run('main()', 'spyql.stats') + # p = pstats.Stats('spyql.stats').strip_dirs() + + # p.sort_stats(SortKey.CUMULATIVE).dump_stats('spyql.stats.cum') + # p.sort_stats(SortKey.TIME).dump_stats('spyql.stats.time') \ No newline at end of file From b41d963b381f7fcd27b630b4232afbd2f31ce536 Mon Sep 17 00:00:00 2001 From: dcmoura Date: Wed, 2 Jun 2021 07:30:05 +0100 Subject: [PATCH 02/10] single eval of expressions --- spyql/processor.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/spyql/processor.py b/spyql/processor.py index 9f26838..6494a0e 100644 --- a/spyql/processor.py +++ b/spyql/processor.py @@ -1,3 +1,11 @@ +# TODO: optimizations +# [x]: single eval +# [ ]: try to eliminate nested list +# [ ]: try to eliminate wrap row +# [ ]: try to eliminate is instance of +# [ ]: try to eliminate execute (replace vars - needs heads + keywords) + + import csv import json as jsonlib import sys @@ -17,8 +25,9 @@ # extra... need to find some way to add user imports... # e.g. ~/.spyql.py file with python code to run at startup -import requests - + + + class Processor: @staticmethod @@ -63,7 +72,7 @@ def handle_header_row(self, row): # Makes sure a row is always a list of columns (even when there is a single input col) def wrap_row(self, row): - if not isinstance(row, Iterable): + if not isinstance(row, Iterable): #TO DO: change this takes a lot return [row] return row @@ -138,7 +147,7 @@ def _go(self, output_handler): # compiles expressions for calculating outputs cmds = [c[1] for c in self.prs['select']] #todo: rename cmds to out_expressions - cmds = [compile(cmd, '', 'eval') for cmd in cmds] + cmds = compile('[' + ','.join(cmds) + ']', '', 'eval') explode_it_cmd = None explode_inst_cmd = None @@ -189,13 +198,12 @@ def _go(self, output_handler): row_number = row_number + 1 - if not where or eval(where): #filter + if not where or eval(where): #filter (opt: eventually could be done before exploding) # input line is eligeble the_globals = globals() the_locals = locals() # to do: filter out internal vars # calculate outputs - _res = [eval(cmd, the_globals, the_locals) for cmd in cmds] - _res = [item for sublist in _res for item in sublist] #flatten + _res = [item for sublist in eval(cmds, the_globals, the_locals) for item in sublist] output_handler.handle_result(_res) #deal with output if output_handler.is_done(): From 32d998273d2c5c38d44f1a5c12221a29819ecae0 Mon Sep 17 00:00:00 2001 From: dcmoura Date: Wed, 2 Jun 2021 07:46:45 +0100 Subject: [PATCH 03/10] moved strings replacement to processor --- spyql/processor.py | 37 +++++++++++++++++++------------------ spyql/spyql.py | 5 ++--- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/spyql/processor.py b/spyql/processor.py index 6494a0e..b2471c8 100644 --- a/spyql/processor.py +++ b/spyql/processor.py @@ -31,21 +31,21 @@ class Processor: @staticmethod - def make_processor(prs): + def make_processor(prs, strings): processor_name = prs['from'] if not processor_name: - return Processor(prs) + return Processor(prs, strings) processor_name = processor_name.upper() if processor_name == 'JSON': - return JSONProcessor(prs) + return JSONProcessor(prs, strings) if processor_name == 'CSV': - return CSVProcessor(prs) + return CSVProcessor(prs, strings) if processor_name == 'TEXT': #single col - return TextProcessor(prs) + return TextProcessor(prs, strings) - return PythonExprProcessor(prs) + return PythonExprProcessor(prs, strings) # if not reader_name or reader_name == 'CSV': # return CSVWriter(inputfile, options) @@ -57,10 +57,11 @@ def make_processor(prs): - def __init__(self, prs): + def __init__(self, prs, strings): self.prs = prs #parsed query self.row_instantiation_script = None self.input_col_names = [] + self.strings = strings # True after header, metadata, etc in input file def reading_data(self): @@ -146,7 +147,7 @@ def _go(self, output_handler): out_cols_names = [c[0] for c in self.prs['select']] # compiles expressions for calculating outputs - cmds = [c[1] for c in self.prs['select']] #todo: rename cmds to out_expressions + cmds = [self.strings.put_strings_back(c[1]) for c in self.prs['select']] #todo: rename cmds to out_expressions cmds = compile('[' + ','.join(cmds) + ']', '', 'eval') explode_it_cmd = None @@ -164,7 +165,7 @@ def _go(self, output_handler): where = self.prs['where'] if (where): - where = compile(where, '', 'eval') + where = compile(self.strings.put_strings_back(where), '', 'eval') logging.info("-- RESULT --") @@ -210,16 +211,16 @@ def _go(self, output_handler): return #e.g. when reached limit class PythonExprProcessor(Processor): - def __init__(self, prs): - super().__init__(prs) + def __init__(self, prs, strings): + super().__init__(prs, strings) # input is a Python expression def get_input_iterators(self): - return [eval(self.prs['from'])] + return [eval(self.strings.put_strings_back(self.prs['from']))] class TextProcessor(Processor): - def __init__(self, prs): - super().__init__(prs) + def __init__(self, prs, strings): + super().__init__(prs, strings) # reads a text row as a row with 1 column def get_input_iterators(self): @@ -232,8 +233,8 @@ def wrap_row(self, row): class JSONProcessor(Processor): - def __init__(self, prs): - super().__init__(prs) + def __init__(self, prs, strings): + super().__init__(prs, strings) def get_input_iterators(self): return [sys.stdin] #to do: suport files @@ -252,8 +253,8 @@ def make_row_instantiation_script(self): ## CSV class CSVProcessor(Processor): - def __init__(self, prs): - super().__init__(prs) + def __init__(self, prs, strings): + super().__init__(prs, strings) self.has_header = False def get_input_iterators(self): diff --git a/spyql/spyql.py b/spyql/spyql.py index 971b6bd..688c2ff 100644 --- a/spyql/spyql.py +++ b/spyql/spyql.py @@ -135,8 +135,7 @@ def parse_select(sel, strings): return new_sel def make_expr_ready(expr, strings): - return strings.put_strings_back(pythonize(expr)).strip() - #return pythonize(expr).strip() + return pythonize(expr).strip() # parse entry point def parse(query): @@ -197,7 +196,7 @@ def run(query): logging.info(prs) - processor = Processor.make_processor(prs) + processor = Processor.make_processor(prs, strings) processor.go() From 7d0cfbcc84f7eae6533f7262ea8f905b2320fa90 Mon Sep 17 00:00:00 2001 From: dcmoura Date: Wed, 2 Jun 2021 12:55:38 +0100 Subject: [PATCH 04/10] refactored dynamic execution --- spyql/processor.py | 125 ++++++++++++++++++++++----------------------- 1 file changed, 61 insertions(+), 64 deletions(-) diff --git a/spyql/processor.py b/spyql/processor.py index b2471c8..93daf98 100644 --- a/spyql/processor.py +++ b/spyql/processor.py @@ -1,9 +1,9 @@ # TODO: optimizations # [x]: single eval # [ ]: try to eliminate nested list -# [ ]: try to eliminate wrap row -# [ ]: try to eliminate is instance of -# [ ]: try to eliminate execute (replace vars - needs heads + keywords) +# [x]: try to eliminate wrap row +# [x]: try to eliminate is instance of +# [x]: try to eliminate execute (replace vars - needs heads + keywords) import csv @@ -59,9 +59,13 @@ def make_processor(prs, strings): def __init__(self, prs, strings): self.prs = prs #parsed query + self.strings = strings + # by default, a row does not need to be wrapped (only single cols need) + self.wrap_row = False self.row_instantiation_script = None self.input_col_names = [] - self.strings = strings + self.colnames2idx = {} + # True after header, metadata, etc in input file def reading_data(self): @@ -71,15 +75,17 @@ def reading_data(self): def handle_header_row(self, row): pass - # Makes sure a row is always a list of columns (even when there is a single input col) - def wrap_row(self, row): - if not isinstance(row, Iterable): #TO DO: change this takes a lot - return [row] - return row - + # Action for handling the first row of data def handle_1st_data_row(self, row): - self.n_input_cols = len(row) if row else 0 + self.n_input_cols = len(row) if row else 0 + + #dictionary to translate col names to indexes in `_values` + self.colnames2idx.update({self.default_col_name(_i): _i for _i in range(self.n_input_cols)}) + if self.input_col_names: + #TODO check if len(input_col_names) == self.n_input_cols + self.colnames2idx.update({self.input_col_names[_i]: _i for _i in range(self.n_input_cols)}) + # Create list of output column names def make_out_cols_names(self, out_cols_names): @@ -108,26 +114,17 @@ def get_input_iterators(self): def default_col_name(self, idx): return f"col{idx+1}" - def make_row_instantiation_script(self): - # script for instantianting input variables - # should return a list of string with assignment statements - # has access to the `_values` variable, which has a complete input row of values - # this should only be called by `get_row_instantiation_script` - # can be overrided (e.g. json processor overrides this) - - vars_script = [f"{self.default_col_name(_i)} = _values[{_i}]" for _i in range(self.n_input_cols)] - if self.input_col_names: - #TODO check if len(input_col_names) == self.n_input_cols - vars_script = [f"{self.input_col_names[_i]} = {vars_script[_i]}" for _i in range(self.n_input_cols)] - return vars_script + + # replace identifiers (column names) in sql expressions by references to `_values` + # and put (quoted) strings back + def prepare_expression(self, expr): + for id, idx in self.colnames2idx.items(): + pattern = rf"\b({id})\b" + replacement = f"_values[{idx}]" + expr = re.compile(pattern).sub(replacement, expr) + + return self.strings.put_strings_back(expr) - # lazy initialization of the row instantiation script - def get_row_instantiation_script(self): - if not self.row_instantiation_script: - vars_script = '\n'.join(self.make_row_instantiation_script()) - #print(vars_script) - self.row_instantiation_script = compile(vars_script, '', 'exec') - return self.row_instantiation_script # main def go(self): @@ -138,6 +135,8 @@ def go(self): output_handler.finish() def _go(self, output_handler): + vars = globals() # to do: filter out not useful/internal vars + _values = [[]] row_number = 0 vars_script = None @@ -145,10 +144,9 @@ def _go(self, output_handler): # gets user-defined output cols names (with AS alias) out_cols_names = [c[0] for c in self.prs['select']] - - # compiles expressions for calculating outputs - cmds = [self.strings.put_strings_back(c[1]) for c in self.prs['select']] #todo: rename cmds to out_expressions - cmds = compile('[' + ','.join(cmds) + ']', '', 'eval') + + cmds = [] + explode_it_cmd = None explode_inst_cmd = None @@ -163,21 +161,20 @@ def _go(self, output_handler): # an input iterator [[1],[2],[3]] is the same as [[1,2,3]] its_list = self.get_input_iterators() - where = self.prs['where'] - if (where): - where = compile(self.strings.put_strings_back(where), '', 'eval') + where = None logging.info("-- RESULT --") for its in its_list: for it in its: - _values = it + _values = it if not self.reading_data(): self.handle_header_row(_values) continue - _values = self.wrap_row(_values) + if self.wrap_row: + _values = [_values] # print header if row_number == 0: @@ -185,10 +182,15 @@ def _go(self, output_handler): output_handler.writer.writeheader(self.make_out_cols_names(out_cols_names)) if output_handler.is_done(): return # in case of `limit 0` + + # compiles expressions for calculating outputs + cmds = [self.prepare_expression(c[1]) for c in self.prs['select']] #todo: rename cmds to out_expressions + cmds = compile('[' + ','.join(cmds) + ']', '', 'eval') + cmds = [item for sublist in cmds for item in sublist] #flatten (because of '*') + cmds = compile('[' + ','.join(cmds) + ']', '', 'eval') + where = self.prs['where'] + if (where): + #TODO: check if * is not used in where... or pass argument + where = compile(self.prepare_expression(where)[0], '', 'eval') + + if explode_path: + explode_its = eval(explode_it_cmd) - if self.wrap_row: - _values = [_values] - - # print header - if row_number == 0: - self.handle_1st_data_row(_values) - output_handler.writer.writeheader(self.make_out_cols_names(out_cols_names)) - if output_handler.is_done(): - return # in case of `limit 0` - - # TODO: move to function(s) - # compiles expressions for calculating outputs - cmds = [self.prepare_expression(c[1]) for c in self.prs['select']] #todo: rename cmds to out_expressions - cmds = [item for sublist in cmds for item in sublist] #flatten (because of '*') - cmds = compile('[' + ','.join(cmds) + ']', '', 'eval') - where = self.prs['where'] - if (where): + select_expr = [self.prepare_expression(c['expr']) for c in self.prs['select']] + select_expr = [item for sublist in select_expr for item in sublist] #flatten (because of '*') + select_expr = compile('[' + ','.join(select_expr) + ']', '