# tokenizer demo (RPN)

In [1]:
from tokenizer import *

In [2]:
# TODO: make shunting_yard tolerate spaces before '='

In [3]:
# example 1: a long, multi-line formula nesting IF / INT / INDEX / MATCH / SUMIFS.
# NOTE(review): the "example 2" cell below rebinds `frml`, so when the cells are
# run top to bottom this value is not the one passed to shunting_yard().
frml = '''= IF( $D21=Blank, Zero,
          ( INT( INDEX( TicketTypes_MOIRAheader, MATCH( $I21, TicketTypes_code, Zero ) ) = iTT1_RevShare!$G$12 ) * (
                    SUMIFS( iTT1_RevShare!$I:$I, INDEX( cFlowLookup!$D:$W, Zero, MATCH( P$17, cFlowLookup!$D$19:$W$19, Zero ) ), $F21 )
                    + SUMIFS( cTOCchoice_summary_diff!$J:$J, cTOCchoice_summary_diff!$C:$C, INDEX( Flows, $F21 ), cTOCchoice_summary_diff!$B:$B, P$6 ) * TOCchoice_switch )
             + INT( INDEX( TicketTypes_MOIRAheader, MATCH( $I21, TicketTypes_code, Zero ) ) = iTT1_RevShare!$J$12 ) * (
                    SUMIFS( iTT1_RevShare!$L:$L, INDEX( cFlowLookup!$D:$W, Zero, MATCH( P$17, cFlowLookup!$D$19:$W$19, Zero ) ), $F21 )
                    + SUMIFS( cTOCchoice_summary_diff!$M:$M, cTOCchoice_summary_diff!$C:$C, INDEX( Flows, $F21 ), cTOCchoice_summary_diff!$B:$B, P$6 ) * TOCchoice_switch )
             + INT( INDEX( TicketTypes_MOIRAheader, MATCH( $I21, TicketTypes_code, Zero ) ) = iTT1_RevShare!$M$12 ) * (
                    SUMIFS( iTT1_RevShare!$O:$O, INDEX( cFlowLookup!$D:$W, Zero, MATCH( P$17, cFlowLookup!$D$19:$W$19, Zero ) ), $F21 )
                    + SUMIFS( cTOCchoice_summary_diff!$P:$P, cTOCchoice_summary_diff!$C:$C, INDEX( Flows, $F21 ), cTOCchoice_summary_diff!$B:$B, P$6 ) * TOCchoice_switch ) )
          * $L21 )
'''

In [4]:
# example 2: a short IF whose condition and first result use empty-string
# literals and whose last argument compares two TRUNC() calls.
# This assignment rebinds `frml`, so it is the formula the cells below operate on.
frml = '''= IF( P$7 = "", "", TRUNC( P19 ) = TRUNC( P$15 ))
'''

In [5]:
# Run the current formula through shunting_yard(); the result is iterated node
# by node below and is what rpn_to_ast() consumes as its RPN-ordered input.
rpn = shunting_yard(frml)

In [6]:
# List every node of the RPN sequence: its repr next to its string form.
for tk in rpn:
    print(repr(tk), '\t', tk)

<tokenizer.RangeNode object at 0x000000000550EFD0> 	 P$7
<tokenizer.ASTNode object at 0x0000000005705048> 	 
<tokenizer.ASTNode object at 0x0000000005705080> 	 =
<tokenizer.ASTNode object at 0x00000000057050B8> 	 
<tokenizer.RangeNode object at 0x00000000057050F0> 	 P19
<tokenizer.FunctionNode object at 0x0000000005705128> 	 TRUNC
<tokenizer.RangeNode object at 0x0000000005705160> 	 P$15
<tokenizer.FunctionNode object at 0x0000000005705198> 	 TRUNC
<tokenizer.ASTNode object at 0x00000000057051D0> 	 =
<tokenizer.FunctionNode object at 0x0000000005705208> 	 IF
<tokenizer.RangeNode object at 0x0000000005705240> 	 



# To AST

In [7]:
def rpn_to_ast(rpn):
    '''Link nodes given in RPN order into a tree and return its root.

    Every node gets an ``args`` attribute holding its operands in their
    original (left-to-right) order; nodes that take no operands get ``[]``.
    The node left at the bottom of the working stack is returned, so any
    extra nodes remaining above it are ignored.

    src: https://stackoverflow.com/a/12929794/2802352
    '''
    def arity(node):
        # Number of operands the node consumes from the stack.
        kind = node.token.ttype
        if kind == "operator-infix":
            return 2
        if kind.startswith('operator'):
            return 1
        if kind == 'function':
            return node.num_args
        return 0

    pending = []
    for node in rpn:
        operands = []
        for _ in range(arity(node)):
            operands.append(pending.pop())
        # Operands come off the stack last-first; restore source order.
        operands.reverse()
        node.args = operands
        pending.append(node)
    return pending[0]

In [8]:
def walk(ast):
    '''Yield ``ast`` and then, recursively, every node reachable through ``args``.

    Traversal is pre-order: a node comes before its arguments, and arguments
    are visited in list order. Objects without an ``args`` attribute are
    treated as leaves.

    src: https://stackoverflow.com/a/12929794/2802352
    '''
    yield ast
    children = getattr(ast, 'args', [])
    for child in children:
        yield from walk(child)

In [9]:
# Build the tree from the RPN nodes, then flatten it with walk() into a
# pre-order list (each node before its arguments) for the printout below.
# NOTE(review): despite its name, `ast` holds this flat list, not the tree root.
ast = list(walk(rpn_to_ast(rpn)))

In [10]:
# One row per node of the flattened tree: repr, token value, type and subtype.
for tk in ast:
    print(repr(tk), '\t', tk.token.tvalue, '\t',
          tk.token.ttype, '\t', tk.token.tsubtype)

<tokenizer.FunctionNode object at 0x0000000005705208> 	 IF 	 function 	 
<tokenizer.ASTNode object at 0x0000000005705080> 	 = 	 operator-infix 	 logical
<tokenizer.RangeNode object at 0x000000000550EFD0> 	 P$7 	 operand 	 range
<tokenizer.ASTNode object at 0x0000000005705048> 	  	 operand 	 text
<tokenizer.ASTNode object at 0x00000000057050B8> 	  	 operand 	 text
<tokenizer.ASTNode object at 0x00000000057051D0> 	 = 	 operator-infix 	 logical
<tokenizer.FunctionNode object at 0x0000000005705128> 	 TRUNC 	 function 	 
<tokenizer.RangeNode object at 0x00000000057050F0> 	 P19 	 operand 	 range
<tokenizer.FunctionNode object at 0x0000000005705198> 	 TRUNC 	 function 	 
<tokenizer.RangeNode object at 0x0000000005705160> 	 P$15 	 operand 	 range


# Formula formatting

In [11]:
def pretty_print(expression):
    '''Return an Excel formula re-formatted with one function argument per line.

    The formula string is tokenized with ``ExcelParser`` (the token stream is
    used directly; no RPN conversion or AST is involved). Token values are
    concatenated in order; after every argument separator ``,`` a newline is
    inserted followed by four spaces per currently open bracket.

    Args:
        expression: the formula text, as accepted by ``ExcelParser.parse``.

    Returns:
        The formatted formula as a single string with embedded newlines.
    '''
    parser = ExcelParser()
    parser.parse(expression)

    # Make brackets explicit tokens so the layout pass below only has to look
    # for '(' / ')' values: function start/stop tokens get a separate bracket
    # token, subexpression start/stop tokens are given the bracket as value.
    tokens = []
    for t in parser.tokens.items:
        if t.ttype == "function" and t.tsubtype == "start":
            t.tsubtype = ""
            tokens.append(t)
            tokens.append(f_token('(', 'arglist', 'start'))
        elif t.ttype == "function" and t.tsubtype == "stop":
            tokens.append(f_token(')', 'arglist', 'stop'))
        elif t.ttype == "subexpression" and t.tsubtype == "start":
            t.tvalue = '('
            tokens.append(t)
        elif t.ttype == "subexpression" and t.tsubtype == "stop":
            t.tvalue = ')'
            tokens.append(t)
        else:
            tokens.append(t)

    pieces = []
    lvl = 0
    for tk in tokens:
        if tk.tsubtype == 'text':
            # A string literal is emitted verbatim inside quotes and must never
            # be treated as structure: its content may itself be '(', ',' or
            # ')' (e.g. IF(A1=",", ...)), which would otherwise corrupt the
            # indentation level or add a spurious line break.
            # NOTE(review): the value is quoted as-is; if the tokenizer
            # un-doubles embedded "" quotes they would need re-doubling here
            # -- confirm against the tokenizer.
            pieces.append('"' + str(tk.tvalue) + '"')
            continue

        pieces.append(str(tk.tvalue))

        if tk.tvalue == '(':
            lvl += 1
        elif tk.tvalue == ',':
            # Break after the separator and indent to the current depth.
            pieces.append('\n' + ' ' * 4 * lvl)
        elif tk.tvalue == ')':
            lvl -= 1

    return ''.join(pieces)

In [12]:
# Bare expression: the notebook displays the returned string's repr, so the
# inserted line breaks show up as \n escapes.
pretty_print(frml)

'IF(P$7="",\n    "",\n    TRUNC(P19)=TRUNC(P$15))\n'

In [13]:
# print() renders the embedded newlines, showing the indented layout itself.
print(pretty_print(frml))

IF(P$7="",
    "",
    TRUNC(P19)=TRUNC(P$15))

