# GenParse: Lark Interface

In [None]:
from genparse.util import LarkStuff
from arsenal import Integerizer
from genparse.util import regex_to_greenery, greenery_to_fsa

## Using lark as a front end

In [None]:
# Alternative SQL grammar (Lark syntax) in which whitespace is an explicit `WS`
# terminal between keywords. It is defined for reference only: the cell that
# builds `lark_stuff` selects `grammar1`.
grammar2 = r"""
WS: /[ \t\f\r\n]/
STAR: "*"
NUMBER: /\d+/

start: WS? "SELECT" WS select_expr WS "FROM" WS from_expr
  [WS "WHERE" WS bool_condition] [WS "GROUP BY" WS var_list] [WS "ORDER BY" WS orderby_expr] WS EOS
EOS: "</s>"
select_expr: STAR | select_list
bool_condition: bool_expr | "(" bool_condition WS "AND" WS bool_condition ")" | "(" bool_condition WS "OR" WS bool_condition ")" 
bool_expr: var "=" value | var ">" value | var "<" value
from_expr: "data"
orderby_expr: var_list WS "ASC" | var_list WS "DESC"
select_list: select_var ("," WS select_var)*
var_list: var ("," WS var)*
select_var: var | "AVG(" var ")" | "MEDIAN(" var ")" | "COUNT(" var ")"
var: "age" | "gender" | "year" | "state_color" | "zipcode" | "vote" | "race_ethnicity"
value: NUMBER | "red" | "blue" | "white" | "black" | "latino" | "republican" | "democrat" | "male" | "female"

"""

In [None]:
# SQL-like query grammar in Lark syntax; whitespace between tokens is dropped
# via `%ignore WS`. Declared as a raw string so the regex escapes
# (`\(`, `\)`, `\+`, `\*`) reach Lark verbatim instead of being parsed as
# invalid Python string escapes (a SyntaxWarning on recent Python versions).
grammar1 = r"""
start: query_expr EOS

EOS: "</s>"

query_expr: select [ "ORDER" "BY" (order_by_expr ",")*  order_by_expr] [ "LIMIT" integer_ ] 

select: "SELECT" [(select_expr ",")*] select_expr "FROM" "data" [ "WHERE" bool_expression ] [ "GROUP" "BY" [(expression ",")*] expression ]

select_expr.0: expression_math [ [ "AS" ] alias ] -> select_expression

?expression_math: expression_product
               | expression_math PLUS expression_product -> expression_add
               | expression_math "-" expression_product -> expression_sub
               | AGGREGATION expression_math /\)/ -> sql_aggregation

?expression: (name | STAR) -> column_name
            | literal

?expression_product: expression_parens
                  | expression_product STAR expression_parens
                  | expression_product "/" expression_parens 

?expression_parens: expression
                  | /\(/ expression_parens STAR expression /\)/ 
                  | /\(/  expression_parens "/" expression /\)/ 
                  | /\(/  expression_parens PLUS expression /\)/
                  | /\(/  expression_parens "-" expression /\)/

bool_expression: bool_parentheses
                 | bool_expression "AND" bool_parentheses 
                 | bool_expression "OR" bool_parentheses
bool_parentheses: comparison_type
                 | /\(/   bool_expression "AND" comparison_type /\)/
                 | /\(/  bool_expression "OR" comparison_type /\)/
comparison_type: equals | not_equals | greater_than | less_than | greater_than_or_equal
| less_than_or_equal | is_null | is_not_null
equals: expression_math "=" expression_math
not_equals: expression_math ("<>" | "!=") expression_math
greater_than: expression_math ">" expression_math
less_than: expression_math "<" expression_math
greater_than_or_equal: expression_math ">=" expression_math
less_than_or_equal: expression_math "<=" expression_math
is_null: expression_math "is" "null"
is_not_null: expression_math "is" "not" "null"

alias: /[A-Za-z]+/
name: /[A-Za-z]+/
PLUS: /\+/

order_by_expr: expression_math ["ASC"] -> order_asc
        | expression_math "DESC" -> order_desc

AGGREGATION.8: ("sum(" | "avg(" | "min(" | "max(" | "count(" "distinct" | "count(")
STAR: /\*/
integer_: /[1-9][0-9]*/
?literal: boolean -> bool
       | integer_ -> number
       | ESCAPED_STRING -> string

boolean: "true" -> true
       | "false" -> false

%import common.WS
%ignore WS
%import common.ESCAPED_STRING
    
"""

In [None]:
# Select which grammar definition the rest of the notebook uses.
raw_grammar = grammar1

# Wrap the grammar text in genparse's LarkStuff helper; later cells use it for
# conversion (`convert`), lexing (`lex`, `simple_tokenizer`) and parsing (`parser`).
lark_stuff = LarkStuff(raw_grammar)

In [None]:
# Integerizer used to rename the grammar's nonterminals to integers.
intern = Integerizer()

# Convert the lark grammar and apply the renaming in one step.
g = lark_stuff.convert().rename(intern)

# lark returns a grammar in CNF, so no explicit `g = g.cnf` step is applied here.
assert g.in_cnf()

In [None]:
# Show the converted, integer-renamed grammar as this cell's output.
g

In [None]:
# Sizes of (rules, V, N) for the grammar.
# NOTE(review): presumably V is the terminal alphabet and N the nonterminal set — confirm.
tuple(len(part) for part in (g.rules, g.V, g.N))

In [None]:
# Sorted list of the elements of `g.cnf.V`.
sorted(g.cnf.V)

In [None]:
# from newton.linking import LinkAnalysis
# f = Integerizer()
# links = LinkAnalysis(g.rename(f))
# links.dfs

In [None]:
# g.language(6)

## Tokenization

We can extract lark's tokenizer in a format that we can build on.  We will even make a DIY tokenizer based on Python's `re` library.

| Term         | Synonym |
|--------------|---------|
| tokenization | lexing  |
| tokenizers   | lexers  |
| tokens       | lexemes |



In [None]:
# List the terminals from highest to lowest priority; terminals with equal
# priority keep their original relative order.
sorted(lark_stuff.terminals, key=lambda terminal: terminal.priority, reverse=True)

### DIY tokenizer

In [None]:
# Sample input for the DIY tokenizer. It is kept under its own name so that
# re-running this cell does not clobber the `text` variable that the parsing
# cells lex and parse.
tokenizer_demo_text = (
    "12 + 24 - 36 * 48 / 60 SELECT table.name AS thing WHERE table.potato IS NOT 'banana'"
)

# Print each pair yielded by the tokenizer: the first item left-aligned in a
# 15-character column, the second item shown via repr().
for x, y in lark_stuff.simple_tokenizer(tokenizer_demo_text):
    print(f'{x:15s} -> {y!r}')

### Parsing tokenized input

In [None]:
# Example query to lex and parse; it ends with the `</s>` marker that the
# grammar's EOS terminal matches.
text = 'SELECT name FROM data </s>'

In [None]:
# Lex `text` with lark_stuff and materialize the result as a list so the
# cells below can reuse it.
tokens = list(lark_stuff.lex(text))
tokens

Call the lark parser on the text:

In [None]:
# lark_stuff.instance.parse(text)

We can call the lark parser on these tokens:

In [None]:
# Parse the pre-lexed token list with lark's parser, passing the rule name 'start'.
lark_stuff.parser.parse(tokens, 'start')

We can call our parser on the token types of this text to get its total weight:

In [None]:
# Evaluate the grammar on the sequence made of each token's `.type`.
g([tok.type for tok in tokens])

### Tokenizer State Machines

**Note**: Tokenizers are FSTs, not FSAs.  However, these libraries implement a restricted kind of FST that uses a separate FSA per token type.

In [None]:
# Position in `lark_stuff.terminals` of the token class to inspect.
i = 4
token_class = lark_stuff.terminals[i]

# Token pattern -> regex string -> greenery object -> FSA.
token_regex = token_class.pattern.to_regexp()
m = greenery_to_fsa(regex_to_greenery(token_regex))

# Show the token class first, then the result of `m.min()`.
display(token_class)
display(m.min())

In [None]:
# for e in m.arcs(): print(e)

## Prefix Grammar

In [None]:
# Compare the size of the CNF grammar with that of its trimmed prefix grammar.
# Each count is computed once so the prefix grammar is not built and trimmed
# twice just to report the ratio.
cnf_rule_count = len(g.cnf.rules)
prefix_rule_count = len(g.cnf.prefix_grammar.trim().rules)
(
    cnf_rule_count,
    prefix_rule_count,
    prefix_rule_count / cnf_rule_count,  # prefix-grammar rules per CNF rule
)

In [None]:
# N = G.nullaryremove()    # could be faster with SCC-based prioritization

In [None]:
# Display the CNF form of the trimmed prefix grammar derived from `g.cnf`.
g.cnf.prefix_grammar.trim().cnf

In [None]:
# Prefix weight of the sequence made of each token's `.type`.
g.prefix_weight([tok.type for tok in tokens])