From a7697becf81c1000485a55294b1d856fcccd70b7 Mon Sep 17 00:00:00 2001
From: dcmoura <dmoura@veniam.com>
Date: Wed, 2 Jun 2021 07:25:16 +0100
Subject: [PATCH 01/10] moved quote handling to dedicated class

---
 spyql/quotes_handler.py | 52 +++++++++++++++++++++++++++++++
 spyql/spyql.py          | 68 ++++++++++++-----------------------------
 2 files changed, 72 insertions(+), 48 deletions(-)
 create mode 100644 spyql/quotes_handler.py

diff --git a/spyql/quotes_handler.py b/spyql/quotes_handler.py
new file mode 100644
index 0000000..4d28a1b
--- /dev/null
+++ b/spyql/quotes_handler.py
@@ -0,0 +1,52 @@
+import re
+import random
+import string
+import logging
+
+STRING_PLACEHOLDER_LEN = 32
+
+class QuotesHandler:
+    """Swaps quoted strings in a query for placeholders and restores them later."""
+
+    # single- or double-quoted string; the quote char may be doubled or backslash-escaped inside
+    QUOTES_RE = re.compile(r"\'(?:\'\'|\\.|[^'])*\'" + "|" + r'\"(?:\"\"|\\.|[^"])*\"')
+
+    def __init__(self):
+        # placeholder -> string it replaced (without the surrounding quotes)
+        self.strings = {}
+
+    def extract_strings(self, query):
+        """Return `query` with each quoted string replaced by a placeholder.
+
+        Resets and repopulates `self.strings`. Both quote styles are matched in
+        a single left-to-right scan, so spans come out ordered and a quote char
+        inside a string of the other style does not start a new string.
+        """
+        self.strings = {}
+        res = []
+        pos = 0  # end of the previous match
+        for m in self.QUOTES_RE.finditer(query):
+            sid = ''.join(random.choice(string.ascii_letters) for _ in range(STRING_PLACEHOLDER_LEN))
+            sid = f"_{sid}_"
+            self.strings[sid] = query[m.start()+1:m.end()-1]
+            res.extend((query[pos:m.start()], sid))
+            pos = m.end()
+        res.append(query[pos:])
+        return "".join(res)
+
+    @staticmethod
+    def string_placeholder_re():
+        """Return the regex (as a string) that matches one placeholder."""
+        return r'\_\w{%d}\_'%(STRING_PLACEHOLDER_LEN)
+
+    def put_strings_back(self, text, quote=True):
+        """Replace placeholders in `text` by their strings.
+
+        The strings are wrapped in double quotes unless `quote` is false.
+        Raises KeyError for a placeholder-shaped token that is not in `self.strings`.
+        """
+        quote_char = '"' if quote else ''
+        for sid in {m.group(0) for m in re.finditer(self.string_placeholder_re(), text)}:
+            text = text.replace(sid, f'{quote_char}{self.strings[sid]}{quote_char}')
+        return text
+
diff --git a/spyql/spyql.py b/spyql/spyql.py
index fb1ccd2..971b6bd 100644
--- a/spyql/spyql.py
+++ b/spyql/spyql.py
@@ -13,50 +13,20 @@
 #  
 
 from spyql.processor import Processor
+from spyql.quotes_handler import QuotesHandler
 import sys
 import re
 
 import logging
-import random
-import string
 
 
 query_struct_keywords = ['select', 'from', 'explode', 'where', 'limit', 'offset', 'to']
-STRING_PLACEHOLDER_LEN = 32
 
 #makes sure that queries start with a space (required for parse_structure)
 def clean_query(q):
     q = " " + q.strip()
     return q
 
-# replaces quoted strings by placeholders to make parsing easier
-# also returns dictionary of placeholders and the strings they hold
-def get_query_strings(query):    
-    res = []
-    quotes = [
-        r"\'(\'\'|\\.|[^'])*\'",
-        r'\"(\"\"|\\.|[^"])*\"'#,
-    #    r'\`(\`\`|\\.|[^`])*\`'
-    ]
-    
-    spans = [(0,0)]
-    for quote in quotes:
-        spans.extend([m.span() for m in re.finditer(quote, query)])
-    spans.append((len(query), 0))
-    
-    #print(spans)
-    
-    strings = {}
-    for i in range(len(spans)-1):
-        if i>0:            
-            sid = ''.join(random.choice(string.ascii_letters) for _ in range(STRING_PLACEHOLDER_LEN))
-            sid = f"_{sid}_"
-            res.append(sid)
-            strings[sid] = query[spans[i][0]+1:spans[i][1]-1] 
-            
-        res.append(query[spans[i][1]:spans[i+1][0]])
-        
-    return ("".join(res), strings)
           
 #parse the supported keywords, which must follow a given order
 def parse_structure(q):    
@@ -87,9 +57,6 @@ def parse_structure(q):
 
     return d
 
-def string_placeholder_re():
-    return r'\_\w{%d}\_'%(STRING_PLACEHOLDER_LEN)
-
 # replaces sql/custom syntax by python syntax
 def pythonize(s):
     #todo: check for special SQL stuff such as in, is, like    
@@ -102,19 +69,11 @@ def pythonize(s):
     #       `json['hello']['planet hearth']`
 
     # first replace quoted keys (they do not need quotes)
-    s = re.compile(r"->(%s)"%(string_placeholder_re())).sub(r"[\1]", s)
+    s = re.compile(r"->(%s)"%(QuotesHandler.string_placeholder_re())).sub(r"[\1]", s)
     #then replace unquoted keys (they need quotes)
     s = re.compile(r"->([^\d\W]\w*)").sub(r"['\1']", s)
 
     return s
-
-# replace string placeholders by their actual strings
-def put_strings_back(text, strings, quote=True):
-    quote_char = '"' if quote else ''
-    sids = {m.group(0) for m in re.finditer(string_placeholder_re(), text)}
-    for sid in sids:
-        text = text.replace(sid, f'{quote_char}{strings[sid]}{quote_char}')
-    return text    
         
 def custom_sel_split(s):
     sin = list(s)
@@ -167,7 +126,7 @@ def parse_select(sel, strings):
             c = "_values"
             name = '*'
         else:            
-            name = put_strings_back(name, strings, quote=False)
+            name = strings.put_strings_back(name, quote=False)
             c = f"[{make_expr_ready(c, strings)}]" 
         
         #new_sel[name] = c
@@ -175,12 +134,16 @@ def parse_select(sel, strings):
     
     return new_sel
 
-def make_expr_ready(expr, strings):
-    return put_strings_back(pythonize(expr), strings).strip()
+def make_expr_ready(expr, strings):    
+    return strings.put_strings_back(pythonize(expr)).strip()
+    #return pythonize(expr).strip()
 
 # parse entry point
 def parse(query):
-    (query, strings) = get_query_strings(query)
+    strings = QuotesHandler()
+    query = strings.extract_strings(query)
+
+   # (query, strings) = get_query_strings(query)
     #print(query)
     #print(strings)
     prs = parse_structure(query)
@@ -257,7 +220,7 @@ def print_select_syntax():
 
 def main():
     #sys.tracebacklimit = 0 # no exception traces
-    #logging.basicConfig(level=logging.INFO)
+    logging.basicConfig(level=logging.INFO)
     #logging.basicConfig(level=logging.DEBUG)
 
     #default query for simple testing:
@@ -275,3 +238,12 @@ def main():
 
 if __name__ == "__main__":    
     main()
+
+    # import cProfile    
+    # import pstats
+    # from pstats import SortKey
+    # cProfile.run('main()', 'spyql.stats')
+    # p = pstats.Stats('spyql.stats').strip_dirs()
+
+    # p.sort_stats(SortKey.CUMULATIVE).dump_stats('spyql.stats.cum')
+    # p.sort_stats(SortKey.TIME).dump_stats('spyql.stats.time')
\ No newline at end of file

From b41d963b381f7fcd27b630b4232afbd2f31ce536 Mon Sep 17 00:00:00 2001
From: dcmoura <dmoura@veniam.com>
Date: Wed, 2 Jun 2021 07:30:05 +0100
Subject: [PATCH 02/10] single eval of expressions

---
 spyql/processor.py | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/spyql/processor.py b/spyql/processor.py
index 9f26838..6494a0e 100644
--- a/spyql/processor.py
+++ b/spyql/processor.py
@@ -1,3 +1,11 @@
+# TODO: optimizations
+# [x]: single eval
+# [ ]: try to eliminate nested list
+# [ ]: try to eliminate wrap row 
+# [ ]: try to eliminate is instance of
+# [ ]: try to eliminate execute (replace vars - needs heads + keywords)
+
+
 import csv
 import json as jsonlib
 import sys
@@ -17,8 +25,9 @@
 
 # extra... need to find some way to add user imports...
 # e.g. ~/.spyql.py file with python code to run at startup
-import requests
- 
+
+
+
 class Processor: 
         
     @staticmethod
@@ -63,7 +72,7 @@ def handle_header_row(self, row):
 
     # Makes sure a row is always a list of columns (even when there is a single input col)
     def wrap_row(self, row):
-        if not isinstance(row, Iterable):
+        if not isinstance(row, Iterable): #TO DO: change this takes a lot
             return [row]            
         return row
 
@@ -138,7 +147,7 @@ def _go(self, output_handler):
 
         # compiles expressions for calculating outputs
         cmds = [c[1] for c in self.prs['select']]  #todo: rename cmds to out_expressions        
-        cmds = [compile(cmd, '', 'eval') for cmd in cmds]
+        cmds = compile('[' + ','.join(cmds) + ']', '', 'eval')
 
         explode_it_cmd = None
         explode_inst_cmd = None
@@ -189,13 +198,12 @@ def _go(self, output_handler):
 
                     row_number = row_number + 1
                     
-                    if not where or eval(where): #filter
+                    if not where or eval(where): #filter (opt: eventually could be done before exploding)
                         # input line is eligeble 
                         the_globals = globals()
                         the_locals = locals() # to do: filter out internal vars
                         # calculate outputs
-                        _res = [eval(cmd, the_globals, the_locals) for cmd in cmds]
-                        _res = [item for sublist in _res for item in sublist] #flatten
+                        _res = [item for sublist in eval(cmds, the_globals, the_locals) for item in sublist]                        
 
                         output_handler.handle_result(_res) #deal with output
                         if output_handler.is_done():

From 32d998273d2c5c38d44f1a5c12221a29819ecae0 Mon Sep 17 00:00:00 2001
From: dcmoura <dmoura@veniam.com>
Date: Wed, 2 Jun 2021 07:46:45 +0100
Subject: [PATCH 03/10] moved strings replacement to processor

---
 spyql/processor.py | 37 +++++++++++++++++++------------------
 spyql/spyql.py     |  5 ++---
 2 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/spyql/processor.py b/spyql/processor.py
index 6494a0e..b2471c8 100644
--- a/spyql/processor.py
+++ b/spyql/processor.py
@@ -31,21 +31,21 @@
 class Processor: 
         
     @staticmethod
-    def make_processor(prs):
+    def make_processor(prs, strings):
         processor_name = prs['from']
         if not processor_name:
-            return Processor(prs)
+            return Processor(prs, strings)
 
         processor_name = processor_name.upper() 
 
         if processor_name == 'JSON':
-            return JSONProcessor(prs)
+            return JSONProcessor(prs, strings)
         if processor_name == 'CSV': 
-            return CSVProcessor(prs)
+            return CSVProcessor(prs, strings)
         if processor_name == 'TEXT': #single col
-            return TextProcessor(prs)
+            return TextProcessor(prs, strings)
 
-        return PythonExprProcessor(prs)
+        return PythonExprProcessor(prs, strings)
         # if not reader_name or reader_name == 'CSV':
         #     return CSVWriter(inputfile, options)
         
@@ -57,10 +57,11 @@ def make_processor(prs):
 
 
 
-    def __init__(self, prs):
+    def __init__(self, prs, strings):
         self.prs = prs #parsed query
         self.row_instantiation_script = None  
         self.input_col_names = []
+        self.strings = strings
 
     # True after header, metadata, etc in input file
     def reading_data(self):
@@ -146,7 +147,7 @@ def _go(self, output_handler):
         out_cols_names = [c[0] for c in self.prs['select']]
 
         # compiles expressions for calculating outputs
-        cmds = [c[1] for c in self.prs['select']]  #todo: rename cmds to out_expressions        
+        cmds = [self.strings.put_strings_back(c[1]) for c in self.prs['select']]  #todo: rename cmds to out_expressions        
         cmds = compile('[' + ','.join(cmds) + ']', '', 'eval')
 
         explode_it_cmd = None
@@ -164,7 +165,7 @@ def _go(self, output_handler):
         
         where = self.prs['where']
         if (where):
-            where = compile(where, '', 'eval') 
+            where = compile(self.strings.put_strings_back(where), '', 'eval') 
 
         logging.info("-- RESULT --")        
         
@@ -210,16 +211,16 @@ def _go(self, output_handler):
                             return #e.g. when reached limit
 
 class PythonExprProcessor(Processor):         
-    def __init__(self, prs):
-        super().__init__(prs)
+    def __init__(self, prs, strings):
+        super().__init__(prs, strings)
 
     # input is a Python expression
     def get_input_iterators(self):
-        return [eval(self.prs['from'])]
+        return [eval(self.strings.put_strings_back(self.prs['from']))]
 
 class TextProcessor(Processor):
-    def __init__(self, prs):
-        super().__init__(prs)
+    def __init__(self, prs, strings):
+        super().__init__(prs, strings)
 
     # reads a text row as a row with 1 column
     def get_input_iterators(self):
@@ -232,8 +233,8 @@ def wrap_row(self, row):
 
     
 class JSONProcessor(Processor):
-    def __init__(self, prs):
-        super().__init__(prs)
+    def __init__(self, prs, strings):
+        super().__init__(prs, strings)
 
     def get_input_iterators(self):
         return [sys.stdin] #to do: suport files
@@ -252,8 +253,8 @@ def make_row_instantiation_script(self):
 
 ## CSV
 class CSVProcessor(Processor):
-    def __init__(self, prs):
-        super().__init__(prs)
+    def __init__(self, prs, strings):
+        super().__init__(prs, strings)
         self.has_header = False
 
     def get_input_iterators(self):
diff --git a/spyql/spyql.py b/spyql/spyql.py
index 971b6bd..688c2ff 100644
--- a/spyql/spyql.py
+++ b/spyql/spyql.py
@@ -135,8 +135,7 @@ def parse_select(sel, strings):
     return new_sel
 
 def make_expr_ready(expr, strings):    
-    return strings.put_strings_back(pythonize(expr)).strip()
-    #return pythonize(expr).strip()
+    return pythonize(expr).strip()
 
 # parse entry point
 def parse(query):
@@ -197,7 +196,7 @@ def run(query):
         
     logging.info(prs)
 
-    processor = Processor.make_processor(prs)
+    processor = Processor.make_processor(prs, strings)
 
     processor.go()
 

From 7d0cfbcc84f7eae6533f7262ea8f905b2320fa90 Mon Sep 17 00:00:00 2001
From: dcmoura <dmoura@veniam.com>
Date: Wed, 2 Jun 2021 12:55:38 +0100
Subject: [PATCH 04/10] refactored dynamic execution

---
 spyql/processor.py | 125 ++++++++++++++++++++++-----------------------
 1 file changed, 61 insertions(+), 64 deletions(-)

diff --git a/spyql/processor.py b/spyql/processor.py
index b2471c8..93daf98 100644
--- a/spyql/processor.py
+++ b/spyql/processor.py
@@ -1,9 +1,9 @@
 # TODO: optimizations
 # [x]: single eval
 # [ ]: try to eliminate nested list
-# [ ]: try to eliminate wrap row 
-# [ ]: try to eliminate is instance of
-# [ ]: try to eliminate execute (replace vars - needs heads + keywords)
+# [x]: try to eliminate wrap row 
+# [x]: try to eliminate is instance of
+# [x]: try to eliminate execute (replace vars - needs heads + keywords)
 
 
 import csv
@@ -59,9 +59,13 @@ def make_processor(prs, strings):
 
     def __init__(self, prs, strings):
         self.prs = prs #parsed query
+        self.strings = strings
+        # by default, a row does not need to be wrapped (only single cols need)
+        self.wrap_row = False 
         self.row_instantiation_script = None  
         self.input_col_names = []
-        self.strings = strings
+        self.colnames2idx = {}
+        
 
     # True after header, metadata, etc in input file
     def reading_data(self):
@@ -71,15 +75,17 @@ def reading_data(self):
     def handle_header_row(self, row):
         pass
 
-    # Makes sure a row is always a list of columns (even when there is a single input col)
-    def wrap_row(self, row):
-        if not isinstance(row, Iterable): #TO DO: change this takes a lot
-            return [row]            
-        return row
-
+    
     # Action for handling the first row of data 
     def handle_1st_data_row(self, row):
-        self.n_input_cols = len(row) if row else 0        
+        self.n_input_cols = len(row) if row else 0   
+
+        #dictionary to translate col names to indexes in `_values`
+        self.colnames2idx.update({self.default_col_name(_i): _i for _i in range(self.n_input_cols)})
+        if self.input_col_names:
+            #TODO check if len(input_col_names) == self.n_input_cols 
+            self.colnames2idx.update({self.input_col_names[_i]: _i for _i in range(self.n_input_cols)})
+     
 
     # Create list of output column names
     def make_out_cols_names(self, out_cols_names):
@@ -108,26 +114,17 @@ def get_input_iterators(self):
     def default_col_name(self, idx):
         return f"col{idx+1}"
 
-    def make_row_instantiation_script(self):
-        # script for instantianting input variables
-        # should return a list of string with assignment statements
-        # has access to the `_values` variable, which has a complete input row of values
-        # this should only be called by `get_row_instantiation_script`
-        # can be overrided (e.g. json processor overrides this)
-        
-        vars_script = [f"{self.default_col_name(_i)} = _values[{_i}]" for _i in range(self.n_input_cols)]        
-        if self.input_col_names:
-            #TODO check if len(input_col_names) == self.n_input_cols 
-            vars_script = [f"{self.input_col_names[_i]} = {vars_script[_i]}" for _i in range(self.n_input_cols)]
-        return vars_script
+    
+    # replace identifiers (column names) in sql expressions by references to `_values`
+    # and put (quoted) strings back
+    def prepare_expression(self, expr):                        
+        for id, idx in self.colnames2idx.items():
+            pattern = rf"\b({id})\b"
+            replacement = f"_values[{idx}]"
+            expr = re.compile(pattern).sub(replacement, expr)
+
+        return self.strings.put_strings_back(expr)
 
-    # lazy initialization of the row instantiation script 
-    def get_row_instantiation_script(self):        
-        if not self.row_instantiation_script:
-            vars_script = '\n'.join(self.make_row_instantiation_script())
-            #print(vars_script)
-            self.row_instantiation_script = compile(vars_script, '', 'exec')         
-        return self.row_instantiation_script
 
     # main
     def go(self):        
@@ -138,6 +135,8 @@ def go(self):
         output_handler.finish()
 
     def _go(self, output_handler):
+        vars = globals() # to do: filter out not useful/internal vars
+
         _values = [[]]
         row_number = 0
         vars_script = None
@@ -145,10 +144,9 @@ def _go(self, output_handler):
 
         # gets user-defined output cols names (with AS alias)
         out_cols_names = [c[0] for c in self.prs['select']]
-
-        # compiles expressions for calculating outputs
-        cmds = [self.strings.put_strings_back(c[1]) for c in self.prs['select']]  #todo: rename cmds to out_expressions        
-        cmds = compile('[' + ','.join(cmds) + ']', '', 'eval')
+        
+        cmds = []      
+        
 
         explode_it_cmd = None
         explode_inst_cmd = None
@@ -163,21 +161,20 @@ def _go(self, output_handler):
         # an input iterator [[1],[2],[3]] is the same as [[1,2,3]]
         its_list = self.get_input_iterators()
         
-        where = self.prs['where']
-        if (where):
-            where = compile(self.strings.put_strings_back(where), '', 'eval') 
+        where = None
 
         logging.info("-- RESULT --")        
         
         for its in its_list: 
             for it in its:        
-                _values = it
+                _values = it                
                 
                 if not self.reading_data():
                     self.handle_header_row(_values)
                     continue
                             
-                _values = self.wrap_row(_values) 
+                if self.wrap_row:
+                    _values = [_values]
                 
                 # print header
                 if row_number == 0:
@@ -185,10 +182,15 @@ def _go(self, output_handler):
                     output_handler.writer.writeheader(self.make_out_cols_names(out_cols_names))
                     if output_handler.is_done():
                         return # in case of `limit 0`
+                
+                    # compiles expressions for calculating outputs
+                    cmds = [self.prepare_expression(c[1]) for c in self.prs['select']]  #todo: rename cmds to out_expressions        
+                    cmds = compile('[' + ','.join(cmds) + ']', '<select>', 'eval')
+                    where = self.prs['where']
+                    if (where):
+                        where = compile(self.prepare_expression(where), '<where>', 'eval') 
 
-                #make input variables (uses `_values`)
-                exec(self.get_row_instantiation_script()) 
-                            
+                    
                 explode_its = [None] # 1 element by default (no explosion)
                 if explode_path:
                     explode_its = eval(explode_it_cmd)
@@ -198,13 +200,14 @@ def _go(self, output_handler):
                         exec(explode_inst_cmd)
 
                     row_number = row_number + 1
-                    
-                    if not where or eval(where): #filter (opt: eventually could be done before exploding)
+
+                    vars["_values"] = _values
+
+                    if not where or eval(where,{},vars): #filter (opt: eventually could be done before exploding)
                         # input line is eligeble 
-                        the_globals = globals()
-                        the_locals = locals() # to do: filter out internal vars
+                        
                         # calculate outputs
-                        _res = [item for sublist in eval(cmds, the_globals, the_locals) for item in sublist]                        
+                        _res = [item for sublist in eval(cmds,{},vars) for item in sublist]                        
 
                         output_handler.handle_result(_res) #deal with output
                         if output_handler.is_done():
@@ -216,46 +219,40 @@ def __init__(self, prs, strings):
 
     # input is a Python expression
     def get_input_iterators(self):
-        return [eval(self.strings.put_strings_back(self.prs['from']))]
+        e = eval(self.strings.put_strings_back(self.prs['from']))
+        if e: 
+            if not isinstance(e, Iterable):
+                e = [e]
+            self.wrap_row = not isinstance(e[0], Iterable)
+        return [e]
 
 class TextProcessor(Processor):
     def __init__(self, prs, strings):
         super().__init__(prs, strings)
+        self.wrap_row = True # since it's a single col, always wrap it
 
     # reads a text row as a row with 1 column
     def get_input_iterators(self):
         #return [sys.stdin] #to do: suport files
         return [[line.rstrip("\n\r") for line in sys.stdin]]
 
-    # since it's a single col, always wrap it
-    def wrap_row(self, row):
-        return [row]
-
+    
     
 class JSONProcessor(Processor):
     def __init__(self, prs, strings):
         super().__init__(prs, strings)
+        self.wrap_row = True # since it's a single col, always wrap it
+        self.colnames2idx.update({"json": 0}) # first column alias as json
 
     def get_input_iterators(self):
         return [sys.stdin] #to do: suport files
 
-    # since it's a single col, always wrap it
-    def wrap_row(self, row):
-        return [row]
-
-    def make_row_instantiation_script(self):
-        # overriding default: json input is considered a single col        
-        return [
-            "json = jsonlib.loads(_values[0])",
-            self.default_col_name(0) + " = _values[0] = json"
-            ]
-
-
 ## CSV
 class CSVProcessor(Processor):
     def __init__(self, prs, strings):
         super().__init__(prs, strings)
         self.has_header = False
+        self.wrap_row = False # since it's a multi col, it is already wraped
 
     def get_input_iterators(self):
         # Part 1 reads sample to detect dialect and if has header

From 7e73b762e54849402fe00a66034e137f98bf4f83 Mon Sep 17 00:00:00 2001
From: dcmoura <dmoura@veniam.com>
Date: Wed, 2 Jun 2021 18:44:51 +0100
Subject: [PATCH 05/10] row evaluation as flat list

---
 spyql/output_handler.py |  1 +
 spyql/processor.py      | 22 ++++++++++++++--------
 spyql/spyql.py          |  6 +++---
 3 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/spyql/output_handler.py b/spyql/output_handler.py
index 767f0ed..6e134ff 100644
--- a/spyql/output_handler.py
+++ b/spyql/output_handler.py
@@ -26,6 +26,7 @@ def write(self, row):
             self.rows_written = self.rows_written + 1
 
     def finish(self):
+        #self.writer.writerow([self.rows_written])
         self.writer.flush()
 
 class LineInLineOut(OutputHandler):
diff --git a/spyql/processor.py b/spyql/processor.py
index 93daf98..0744e3f 100644
--- a/spyql/processor.py
+++ b/spyql/processor.py
@@ -117,13 +117,16 @@ def default_col_name(self, idx):
     
     # replace identifiers (column names) in sql expressions by references to `_values`
     # and put (quoted) strings back
-    def prepare_expression(self, expr):                        
+    def prepare_expression(self, expr):
+        if expr == '*':
+            return [f"_values[{idx}]" for idx in range(self.n_input_cols)]
+
         for id, idx in self.colnames2idx.items():
             pattern = rf"\b({id})\b"
             replacement = f"_values[{idx}]"
             expr = re.compile(pattern).sub(replacement, expr)
 
-        return self.strings.put_strings_back(expr)
+        return [self.strings.put_strings_back(expr)]
 
 
     # main
@@ -162,6 +165,7 @@ def _go(self, output_handler):
         its_list = self.get_input_iterators()
         
         where = None
+        explode_its = [None] # 1 element by default (no explosion)
 
         logging.info("-- RESULT --")        
         
@@ -183,15 +187,16 @@ def _go(self, output_handler):
                     if output_handler.is_done():
                         return # in case of `limit 0`
                 
+                    # TODO: move to function(s)
                     # compiles expressions for calculating outputs
                     cmds = [self.prepare_expression(c[1]) for c in self.prs['select']]  #todo: rename cmds to out_expressions        
-                    cmds = compile('[' + ','.join(cmds) + ']', '<select>', 'eval')
+                    cmds = [item for sublist in cmds for item in sublist] #flatten (because of '*')                    
+                    cmds = compile('[' + ','.join(cmds) + ']', '<select>', 'eval')                    
                     where = self.prs['where']
                     if (where):
-                        where = compile(self.prepare_expression(where), '<where>', 'eval') 
-
-                    
-                explode_its = [None] # 1 element by default (no explosion)
+                        #TODO: check if * is not used in where... or pass argument
+                        where = compile(self.prepare_expression(where)[0], '<where>', 'eval') 
+                
                 if explode_path:
                     explode_its = eval(explode_it_cmd)
                                 
@@ -207,12 +212,13 @@ def _go(self, output_handler):
                         # input line is eligeble 
                         
                         # calculate outputs
-                        _res = [item for sublist in eval(cmds,{},vars) for item in sublist]                        
+                        _res = eval(cmds,{},vars)
 
                         output_handler.handle_result(_res) #deal with output
                         if output_handler.is_done():
                             return #e.g. when reached limit
 
+
 class PythonExprProcessor(Processor):         
     def __init__(self, prs, strings):
         super().__init__(prs, strings)
diff --git a/spyql/spyql.py b/spyql/spyql.py
index 688c2ff..6abaf61 100644
--- a/spyql/spyql.py
+++ b/spyql/spyql.py
@@ -123,11 +123,11 @@ def parse_select(sel, strings):
             c = c[:(sas.span()[0])]
 
         if c.strip() == '*':
-            c = "_values"
+            c = "*"
             name = '*'
         else:            
             name = strings.put_strings_back(name, quote=False)
-            c = f"[{make_expr_ready(c, strings)}]" 
+            c = f"{make_expr_ready(c, strings)}" 
         
         #new_sel[name] = c
         new_sel.append((name,c))
@@ -219,7 +219,7 @@ def print_select_syntax():
 
 def main():
     #sys.tracebacklimit = 0 # no exception traces
-    logging.basicConfig(level=logging.INFO)
+    #logging.basicConfig(level=logging.INFO)
     #logging.basicConfig(level=logging.DEBUG)
 
     #default query for simple testing:

From 70f05c4f9cd7ccdea6f22fa85d47795e6c8f2381 Mon Sep 17 00:00:00 2001
From: dcmoura <dmoura@veniam.com>
Date: Fri, 4 Jun 2021 03:29:43 +0100
Subject: [PATCH 06/10] simplified input iteration

---
 spyql/processor.py | 156 +++++++++++++++++++--------------------------
 1 file changed, 64 insertions(+), 92 deletions(-)

diff --git a/spyql/processor.py b/spyql/processor.py
index 0744e3f..902e68a 100644
--- a/spyql/processor.py
+++ b/spyql/processor.py
@@ -1,11 +1,3 @@
-# TODO: optimizations
-# [x]: single eval
-# [ ]: try to eliminate nested list
-# [x]: try to eliminate wrap row 
-# [x]: try to eliminate is instance of
-# [x]: try to eliminate execute (replace vars - needs heads + keywords)
-
-
 import csv
 import json as jsonlib
 import sys
@@ -14,7 +6,7 @@
 from math import * 
 import logging
 from collections.abc import Iterable
-from itertools import islice
+from itertools import islice, chain
 
 from spyql.writer import Writer
 from spyql.output_handler import OutputHandler
@@ -61,8 +53,6 @@ def __init__(self, prs, strings):
         self.prs = prs #parsed query
         self.strings = strings
         # by default, a row does not need to be wrapped (only single cols need)
-        self.wrap_row = False 
-        self.row_instantiation_script = None  
         self.input_col_names = []
         self.colnames2idx = {}
         
@@ -96,20 +86,14 @@ def make_out_cols_names(self, out_cols_names):
         out_cols_names = [name for sublist in out_cols_names for name in sublist] #flatten
         return out_cols_names 
 
-    # Returns iterator over input
-    # Input iterator should be a list of lists of rows for convinience    
-    # Each row can be a list (in case of multiple columns) or a literal (single column)    
+    # Returns iterator over input (e.g. list if rows)   
+    # Each row is list with one value per column
     # e.g.
-    #   [[1],[2],[3]] is the same as 
-    #   [[1,2],[3]] and is the same as 
-    #   [[1,2,3]]: 3 rows with a single col
-    #
-    #   [[[1,'a']], [[2,'b']], [[3,'c']]] is the same as
-    #   [[[1,'a']], [[2,'b'], [3,'c']]] and is the same as
-    #   [[[1,'a'], [2,'b'], [3,'c']]]: 3 rows with 2 cols 
-    def get_input_iterators(self):        
+    #   [[1] ,[2], [3]]:                3 rows with a single col
+    #   [[1,'a'], [2,'b'], [3,'c']]:    3 rows with 2 cols 
+    def get_input_iterator(self):        
         return [[None]] #default: returns a single line with a 'null' column
-
+    
     # Default column names, e.g. col1 for the first column
     def default_col_name(self, idx):
         return f"col{idx+1}"
@@ -117,7 +101,7 @@ def default_col_name(self, idx):
     
     # replace identifiers (column names) in sql expressions by references to `_values`
     # and put (quoted) strings back
-    def prepare_expression(self, expr):
+    def prepare_expression(self, expr):        
         if expr == '*':
             return [f"_values[{idx}]" for idx in range(self.n_input_cols)]
 
@@ -140,7 +124,7 @@ def go(self):
     def _go(self, output_handler):
         vars = globals() # to do: filter out not useful/internal vars
 
-        _values = [[]]
+        _values = []
         row_number = 0
         vars_script = None
         #json = {}
@@ -148,75 +132,64 @@ def _go(self, output_handler):
         # gets user-defined output cols names (with AS alias)
         out_cols_names = [c[0] for c in self.prs['select']]
         
-        cmds = []      
+        cmds = []
         
-
         explode_it_cmd = None
         explode_inst_cmd = None
         explode_path = self.prs['explode']    
         if (explode_path):
             explode_it_cmd = compile(explode_path, '', 'eval')
-            explode_inst_cmd = compile(f'{explode_path} = explode_it', '', 'exec')
-
-
-        # should not accept than 1 source, joins, etc (at least for now)
-        # input iterator is a list of lists for convinence
-        # an input iterator [[1],[2],[3]] is the same as [[1,2,3]]
-        its_list = self.get_input_iterators()
+            explode_inst_cmd = compile(f'{explode_path} = explode_it', '', 'exec')            
         
         where = None
         explode_its = [None] # 1 element by default (no explosion)
 
         logging.info("-- RESULT --")        
-        
-        for its in its_list: 
-            for it in its:        
-                _values = it                
-                
-                if not self.reading_data():
-                    self.handle_header_row(_values)
-                    continue
+
+        # should not accept more than 1 source, joins, etc (at least for now)    
+        for _values in self.get_input_iterator():
+            
+            if not self.reading_data():
+                self.handle_header_row(_values)
+                continue
+                                    
+            # print header
+            if row_number == 0:
+                self.handle_1st_data_row(_values)
+                output_handler.writer.writeheader(self.make_out_cols_names(out_cols_names))
+                if output_handler.is_done():
+                    return # in case of `limit 0`
+            
+                # TODO: move to function(s)
+                # compiles expressions for calculating outputs
+                cmds = [self.prepare_expression(c[1]) for c in self.prs['select']]  #todo: rename cmds to out_expressions        
+                cmds = [item for sublist in cmds for item in sublist] #flatten (because of '*')                    
+                cmds = compile('[' + ','.join(cmds) + ']', '<select>', 'eval')                    
+                where = self.prs['where']                
+                if (where):
+                    #TODO: check if * is not used in where... or pass argument
+                    where = compile(self.prepare_expression(where)[0], '<where>', 'eval') 
+            
+            if explode_path:
+                explode_its = eval(explode_it_cmd)
                             
-                if self.wrap_row:
-                    _values = [_values]
-                
-                # print header
-                if row_number == 0:
-                    self.handle_1st_data_row(_values)
-                    output_handler.writer.writeheader(self.make_out_cols_names(out_cols_names))
-                    if output_handler.is_done():
-                        return # in case of `limit 0`
-                
-                    # TODO: move to function(s)
-                    # compiles expressions for calculating outputs
-                    cmds = [self.prepare_expression(c[1]) for c in self.prs['select']]  #todo: rename cmds to out_expressions        
-                    cmds = [item for sublist in cmds for item in sublist] #flatten (because of '*')                    
-                    cmds = compile('[' + ','.join(cmds) + ']', '<select>', 'eval')                    
-                    where = self.prs['where']
-                    if (where):
-                        #TODO: check if * is not used in where... or pass argument
-                        where = compile(self.prepare_expression(where)[0], '<where>', 'eval') 
-                
+            for explode_it in explode_its:  
                 if explode_path:
-                    explode_its = eval(explode_it_cmd)
-                                
-                for explode_it in explode_its:  
-                    if explode_path:
-                        exec(explode_inst_cmd)
+                    exec(explode_inst_cmd)
 
-                    row_number = row_number + 1
+                row_number = row_number + 1
 
-                    vars["_values"] = _values
+                vars["_values"] = _values
 
-                    if not where or eval(where,{},vars): #filter (opt: eventually could be done before exploding)
-                        # input line is eligeble 
-                        
-                        # calculate outputs
-                        _res = eval(cmds,{},vars)
+                if not where or eval(where,{},vars): #filter (opt: eventually could be done before exploding)
+                    # input line is eligeble 
+                    
+                    # calculate outputs
+                    _res = eval(cmds,{},vars)
 
-                        output_handler.handle_result(_res) #deal with output
-                        if output_handler.is_done():
-                            return #e.g. when reached limit
+                    output_handler.handle_result(_res) #deal with output
+                    if output_handler.is_done():
+                        return #e.g. when reached limit
 
 
 class PythonExprProcessor(Processor):         
@@ -224,43 +197,42 @@ def __init__(self, prs, strings):
         super().__init__(prs, strings)
 
     # input is a Python expression
-    def get_input_iterators(self):
+    def get_input_iterator(self):
         e = eval(self.strings.put_strings_back(self.prs['from']))
         if e: 
             if not isinstance(e, Iterable):
                 e = [e]
-            self.wrap_row = not isinstance(e[0], Iterable)
-        return [e]
+            if not isinstance(e[0], Iterable):
+                e = [[el] for el in e]
+        return e
 
 class TextProcessor(Processor):
     def __init__(self, prs, strings):
         super().__init__(prs, strings)
-        self.wrap_row = True # since it's a single col, always wrap it
 
     # reads a text row as a row with 1 column
-    def get_input_iterators(self):
-        #return [sys.stdin] #to do: suport files
-        return [[line.rstrip("\n\r") for line in sys.stdin]]
+    def get_input_iterator(self):
+        #to do: support files
+        return [[line.rstrip("\n\r")] for line in sys.stdin]
 
     
-    
 class JSONProcessor(Processor):
     def __init__(self, prs, strings):
         super().__init__(prs, strings)
-        self.wrap_row = True # since it's a single col, always wrap it
         self.colnames2idx.update({"json": 0}) # first column alias as json
 
-    def get_input_iterators(self):
-        return [sys.stdin] #to do: suport files
+    # 1 row = 1 json
+    def get_input_iterator(self):
+        #to do: suport files
+        return [[jsonlib.loads(line)] for line in sys.stdin]
 
 ## CSV
 class CSVProcessor(Processor):
     def __init__(self, prs, strings):
         super().__init__(prs, strings)
         self.has_header = False
-        self.wrap_row = False # since it's a multi col, it is already wraped
 
-    def get_input_iterators(self):
+    def get_input_iterator(self):
         # Part 1 reads sample to detect dialect and if has header
         # TODO: infer data type
         sample_size = 10 #make a input parameter
@@ -274,9 +246,9 @@ def get_input_iterators(self):
         #print(self.has_header)
         #print(dialect)        
         sample.seek(0) #rewinds the sample
-        return [
+        return chain(
             csv.reader(sample, dialect), #goes through sample again (for reading input data)
-            csv.reader(sys.stdin, dialect)] #continues to the rest of the file 
+            csv.reader(sys.stdin, dialect)) #continues to the rest of the file 
             #TODO: suport files
 
     def reading_data(self):          

From 5a84ad5aa5983da58091c3362dc9603eeccd09a1 Mon Sep 17 00:00:00 2001
From: dcmoura <dmoura@veniam.com>
Date: Sun, 6 Jun 2021 18:40:20 +0100
Subject: [PATCH 07/10] cleanup

---
 spyql/processor.py | 20 ++++++++------------
 spyql/spyql.py     |  5 ++++-
 2 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/spyql/processor.py b/spyql/processor.py
index 902e68a..e856f15 100644
--- a/spyql/processor.py
+++ b/spyql/processor.py
@@ -51,10 +51,9 @@ def make_processor(prs, strings):
 
     def __init__(self, prs, strings):
         self.prs = prs #parsed query
-        self.strings = strings
-        # by default, a row does not need to be wrapped (only single cols need)
-        self.input_col_names = []
-        self.colnames2idx = {}
+        self.strings = strings #quoted strings
+        self.input_col_names = [] #column names of the input data
+        self.colnames2idx = {} #map from column names to indexes
         
 
     # True after header, metadata, etc in input file
@@ -64,7 +63,6 @@ def reading_data(self):
     # Action for header row (e.g. column name definition)
     def handle_header_row(self, row):
         pass
-
     
     # Action for handling the first row of data 
     def handle_1st_data_row(self, row):
@@ -97,7 +95,6 @@ def get_input_iterator(self):
     # Default column names, e.g. col1 for the first column
     def default_col_name(self, idx):
         return f"col{idx+1}"
-
     
     # replace identifiers (column names) in sql expressions by references to `_values`
     # and put (quoted) strings back
@@ -124,16 +121,13 @@ def go(self):
     def _go(self, output_handler):
         vars = globals() # to do: filter out not useful/internal vars
 
+        cmds = []
         _values = []
         row_number = 0
-        vars_script = None
-        #json = {}
-
+        
         # gets user-defined output cols names (with AS alias)
         out_cols_names = [c[0] for c in self.prs['select']]
-        
-        cmds = []
-        
+
         explode_it_cmd = None
         explode_inst_cmd = None
         explode_path = self.prs['explode']    
@@ -206,6 +200,7 @@ def get_input_iterator(self):
                 e = [[el] for el in e]
         return e
 
+
 class TextProcessor(Processor):
     def __init__(self, prs, strings):
         super().__init__(prs, strings)
@@ -226,6 +221,7 @@ def get_input_iterator(self):
         #to do: suport files
         return [[jsonlib.loads(line)] for line in sys.stdin]
 
+
 ## CSV
 class CSVProcessor(Processor):
     def __init__(self, prs, strings):
diff --git a/spyql/spyql.py b/spyql/spyql.py
index 6abaf61..e173a94 100644
--- a/spyql/spyql.py
+++ b/spyql/spyql.py
@@ -238,6 +238,8 @@ def main():
 if __name__ == "__main__":    
     main()
 
+    ## For profiling:
+    #
     # import cProfile    
     # import pstats
     # from pstats import SortKey
@@ -245,4 +247,5 @@ def main():
     # p = pstats.Stats('spyql.stats').strip_dirs()
 
     # p.sort_stats(SortKey.CUMULATIVE).dump_stats('spyql.stats.cum')
-    # p.sort_stats(SortKey.TIME).dump_stats('spyql.stats.time')
\ No newline at end of file
+    # p.sort_stats(SortKey.TIME).dump_stats('spyql.stats.time')
+    
\ No newline at end of file

From 8e57b90ab38697dbb7a1c8e96dbd9605ce5de63c Mon Sep 17 00:00:00 2001
From: dcmoura <dmoura@veniam.com>
Date: Mon, 7 Jun 2021 11:07:51 +0100
Subject: [PATCH 08/10] improved code readability

---
 spyql/processor.py | 27 +++++++++++++--------------
 spyql/spyql.py     |  3 +--
 2 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/spyql/processor.py b/spyql/processor.py
index e856f15..5adf252 100644
--- a/spyql/processor.py
+++ b/spyql/processor.py
@@ -121,22 +121,21 @@ def go(self):
     def _go(self, output_handler):
         vars = globals() # to do: filter out not useful/internal vars
 
-        cmds = []
+        select_expr = []
+        where_expr = None
         _values = []
         row_number = 0
+        explode_its = [None] # 1 element by default (no explosion)
         
         # gets user-defined output cols names (with AS alias)
-        out_cols_names = [c[0] for c in self.prs['select']]
+        out_cols_names = [c['name'] for c in self.prs['select']]
 
         explode_it_cmd = None
         explode_inst_cmd = None
         explode_path = self.prs['explode']    
         if (explode_path):
             explode_it_cmd = compile(explode_path, '', 'eval')
-            explode_inst_cmd = compile(f'{explode_path} = explode_it', '', 'exec')            
-        
-        where = None
-        explode_its = [None] # 1 element by default (no explosion)
+            explode_inst_cmd = compile(f'{explode_path} = explode_it', '', 'exec')
 
         logging.info("-- RESULT --")        
 
@@ -156,13 +155,13 @@ def _go(self, output_handler):
             
                 # TODO: move to function(s)
                 # compiles expressions for calculating outputs
-                cmds = [self.prepare_expression(c[1]) for c in self.prs['select']]  #todo: rename cmds to out_expressions        
-                cmds = [item for sublist in cmds for item in sublist] #flatten (because of '*')                    
-                cmds = compile('[' + ','.join(cmds) + ']', '<select>', 'eval')                    
-                where = self.prs['where']                
-                if (where):
+                select_expr = [self.prepare_expression(c['expr']) for c in self.prs['select']] 
+                select_expr = [item for sublist in select_expr for item in sublist] #flatten (because of '*')                    
+                select_expr = compile('[' + ','.join(select_expr) + ']', '<select>', 'eval')                    
+                where_expr = self.prs['where']                
+                if (where_expr):
                     #TODO: check if * is not used in where... or pass argument
-                    where = compile(self.prepare_expression(where)[0], '<where>', 'eval') 
+                    where_expr = compile(self.prepare_expression(where_expr)[0], '<where>', 'eval') 
             
             if explode_path:
                 explode_its = eval(explode_it_cmd)
@@ -175,11 +174,11 @@ def _go(self, output_handler):
 
                 vars["_values"] = _values
 
-                if not where or eval(where,{},vars): #filter (opt: eventually could be done before exploding)
+                if not where_expr or eval(where_expr,{},vars): #filter (opt: eventually could be done before exploding)
                     # input line is eligeble 
                     
                     # calculate outputs
-                    _res = eval(cmds,{},vars)
+                    _res = eval(select_expr,{},vars)
 
                     output_handler.handle_result(_res) #deal with output
                     if output_handler.is_done():
diff --git a/spyql/spyql.py b/spyql/spyql.py
index e173a94..848e371 100644
--- a/spyql/spyql.py
+++ b/spyql/spyql.py
@@ -130,7 +130,7 @@ def parse_select(sel, strings):
             c = f"{make_expr_ready(c, strings)}" 
         
         #new_sel[name] = c
-        new_sel.append((name,c))
+        new_sel.append({"name": name, "expr": c})
     
     return new_sel
 
@@ -248,4 +248,3 @@ def main():
 
     # p.sort_stats(SortKey.CUMULATIVE).dump_stats('spyql.stats.cum')
     # p.sort_stats(SortKey.TIME).dump_stats('spyql.stats.time')
-    
\ No newline at end of file

From f7ca07ce7f920bc6d6d841ca3be2d6408beffb66 Mon Sep 17 00:00:00 2001
From: Daniel Moura <daniel.c.moura@gmail.com>
Date: Mon, 19 Jul 2021 21:23:25 +0100
Subject: [PATCH 09/10] Update spyql/processor.py

Co-authored-by: Diogo Recharte <drecharte@veniam.com>
---
 spyql/processor.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/spyql/processor.py b/spyql/processor.py
index 5adf252..3dd0a88 100644
--- a/spyql/processor.py
+++ b/spyql/processor.py
@@ -84,7 +84,7 @@ def make_out_cols_names(self, out_cols_names):
         out_cols_names = [name for sublist in out_cols_names for name in sublist] #flatten
         return out_cols_names 
 
-    # Returns iterator over input (e.g. list if rows)   
+    # Returns iterator over input (e.g. list of rows)   
     # Each row is list with one value per column
     # e.g.
     #   [[1] ,[2], [3]]:                3 rows with a single col
@@ -267,4 +267,3 @@ def make_str_valid_varname(self, s):
         s = re.sub(r'\s+', '_', s)
         
         return s
-

From 0116811ac89a5461cceae4d0e3bcb1eb0f150fef Mon Sep 17 00:00:00 2001
From: Daniel Moura <daniel.c.moura@gmail.com>
Date: Mon, 19 Jul 2021 21:29:59 +0100
Subject: [PATCH 10/10] Update spyql/processor.py

Co-authored-by: Diogo Recharte <drecharte@veniam.com>
---
 spyql/processor.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/spyql/processor.py b/spyql/processor.py
index 3dd0a88..9ee0e88 100644
--- a/spyql/processor.py
+++ b/spyql/processor.py
@@ -146,7 +146,6 @@ def _go(self, output_handler):
                 self.handle_header_row(_values)
                 continue
                                     
-            # print header
             if row_number == 0:
                 self.handle_1st_data_row(_values)
                 output_handler.writer.writeheader(self.make_out_cols_names(out_cols_names))