# Hiena Multipass Parser

    hiena_mp()

A Hiena parser takes a Grammar and a Target, and generates a Dictionary tree.

    def hiena_mp(grammar, target, rulename) -> dict:
        ...

`hiena_mp()` is a multi-pass recursive-descent implementation suitable for small files with limited depth.

## Command Interpreter Mode

The 'hiena' parser can be specified as a command interpreter by putting a '#!' on the first line of the grammar file -and/or- in case the grammar is cached, a '#!' item in the grammar dict.

    { "#!": [hienapath, opts], 
       ...


In [9]:
import re
from Dcel import Dcel

class HienaStr(str):
    def __create__(self,string,rematchobj,match_index):
        super.__create__(string)
        
    def __init__(self,string,rematchobj,match_index):
        self.hiena_data_start = rematchobj.start(match_index)
        self.hiena_data_end = rematchobj.end(match_index)
        super.__init__(string)

def hiena_mp(g:dict, text:Dcel, rulename="$__start__"):
    
    assert(type(text) is Dcel)
    
    # This begins its life as a list()
    # it collects the matches for a repeating grammar rule.
    
    tree = list()
    
    # Parse a layer of `text` using current `rulename` from grammar `g`.
    
    if rulename in g:
        
        # Hook for beginning of parsing a grammar.
        # The function is recursive, so the any rule could
        # be a start rule if the function is
        # called programmatically. When APath uses HienaMP
        # as a executable interpreter, it expects $__start__. 
        
        if rulename=="$__start__":
            rulename=g["$__start__"]
        
        # If $__start__ was not specified, this is the rulename
        # called in the function args. Otherwise, it is the rulename
        # resolved from $__start__.
        
        rule = g[rulename]
        
        # all matches within `text`.
        
        m = re.finditer(rule[0], 
                        str(text),
                        re.M
                       )
        
        # next rule that parses each match in `m`.
        nextrulename = rule[1]
        
        # branch rule
        if nextrulename != "":
            for ea in m:
                
                # create fragment Dcel from `text`
                map_fragment = text[ea.start(0):ea.end(0)]
                
                # parse match and collect result in list
                tree.append(hiena_mp(
                      g,
                      # ea.group(0),    # old string version
                      map_fragment,     # new Dcel fragment version
                      nextrulename
                     ))
        
        # terminal rule
        else:
            for ea in m:
                # terminal_value = ea.group(0)  # old string version
                map_fragment = text[ea.start(0):ea.end(0)]  # new Dcel fragment version
                
                # WIP: need to attach data_map to terminal_value object
                # ie. HienaValue(ea.group(valno),ea.start(valno),ea.end(valno))
                # ie. terminal_value.data_map = data_map
                
                # tree.append(terminal_value) 
                tree.append(map_fragment)
                
        # After all matches have been recursively parsed
        # create a dictionary keyed by `labels` provided in field 2
        # of the grammar rule.
        
        # If the `labels` are a dictionary {key:number,value:number}
        # then, extract the label from the text of the match.
        
        # FIXME: validate presence of 'key' and 'value' before entering this block.
        labels = rule[2]
        if type(labels) == dict:
            keyno = labels['key']
            valno = labels['value']
            # FIXME: eleminate this double-run of the grammar
            # by caching the results earlier in the function.
            m = re.finditer(rule[0], 
                    str(text),
                    re.M
                   )
            
            # HACK to populate empty Key-val-pairs with something useful.
            # This should propogate back to the underlier correctly.
            if(ea.start(valno) == -1):
                valno = keyno
            # end HACK
            tree = { ea.group(keyno):text[ea.start(valno):ea.end(valno)]
                    # WIP: need to attach data_map to terminal_value object
                    # ie. HienaValue(ea.group(valno),ea.start(valno),ea.end(valno))
                    for ea in m
                   }
            # WIP: need to attach a data_map to the tree
            # ie. tree.data_map
            return tree
        else:        
            tree = { k:v for k,v 
                    in zip(labels, 
                           tree
                          )}
        return tree

In [11]:
import json

LINE = '^.+$'
WORD = '[^ ]+'
CHAR = '\w'
entryschema = [str(i) for i in range(1,10)]
fieldschema = [
    'spec', 'file', 'vfstype', 
    'mntopts', 'freq', 'passno'
]
KVP = '([^= ,]+)(?:[=]([^=,]+))?'
kvpschema = {'key':1,'value':2}
fstabg = {
    "$__start__": "entry",
    "entry": [LINE, "field", entryschema ],
    "field": [WORD, "keyvaluepair", fieldschema],
    "keyvaluepair": [KVP, "", kvpschema]
}

sample = Dcel("""
one=1,two=2 three four
five six seven eight 23 4
""")

x = hiena_mp(fstabg,sample)
print(type(x))
print(x)

<class 'dict'>
{'1': {'spec': {'one': <Dcel.Dcel object at 0x7f8678653df0>, 'two': <Dcel.Dcel object at 0x7f8678634ca0>}, 'file': {'three': <Dcel.Dcel object at 0x7f8678653a90>}, 'vfstype': {'four': <Dcel.Dcel object at 0x7f8678634fd0>}}, '2': {'spec': {'five': <Dcel.Dcel object at 0x7f8678634c40>}, 'file': {'six': <Dcel.Dcel object at 0x7f8678634c10>}, 'vfstype': {'seven': <Dcel.Dcel object at 0x7f86786349a0>}, 'mntopts': {'eight': <Dcel.Dcel object at 0x7f8678634bb0>}, 'freq': {'23': <Dcel.Dcel object at 0x7f8678654460>}, 'passno': {'4': <Dcel.Dcel object at 0x7f8678654dc0>}}}


In [12]:
print(x['1']['spec']['one'])

1


In [15]:
x['1']['spec']['one'].value = "uno"

In [16]:
import json
from DcelJSONEncoder import DcelJSONEncoder

json.dumps(x,cls=DcelJSONEncoder)


'{"1": {"spec": {"one": "uno", "two": "2"}, "file": {"three": "three"}, "vfstype": {"four": "four"}}, "2": {"spec": {"five": "five"}, "file": {"six": "six"}, "vfstype": {"seven": "seven"}, "mntopts": {"eight": "eight"}, "freq": {"23": "23"}, "passno": {"4": "4"}}}'

In [17]:
print(sample)


one=uno,two=2 three four
five six seven eight 23 4



In [4]:
type(x['1']['file']['three'])
d = x['1']['file']['three']
e = x['1']['vfstype']['four']
d.value = "surprise"
e.value = "party"
print(f"d: value: {d.value}, address: {d.address}, service: {d.service}")
json.dumps(x,cls=DcelJSONEncoder)

print(sample)
# WARNING: after running this, the internal map becomes out-of-sync.
# The sample will need to be reparsed and will break the bindings
# to whatever key-value-pairs have changed in the sample.

KeyError: 'three'

In [6]:
import re

data = '1234'
p = re.compile('1234')
m = p.match(data)

print(m.end(0))

x = HienaStr(m[0],m)


4


TypeError: str() argument 2 must be str, not re.Match

In [22]:
# playground to practice attaching meta-data to a dict() object

from collections import namedtuple
Frag = namedtuple('Frag',['start','len','frags'])

class parsetree(dict):
    def __init__(self,datadict=dict(),fragmap=dict()):
        self.update(datadict)
        self.cbfrag = fragmap
        
tree = parsetree({"one":"uno"},{"one":(0,3,{"char":(1,1,None)})})

print(tree)
print(tree.cbfrag)
print(tree.cbfrag['one'][2])

{'one': 'uno'}
{'one': (0, 3, {'char': (1, 1, None)})}
{'char': (1, 1, None)}


In [2]:
d = { 'key': 1 }

d['key']

1

In [34]:
from Dcel import Dcel
from DictFS import DictFS

fstab = Dcel(address=fstabg, 
               service_class=DictFS
              )

a = Dcel(formula=hiena_mp, 
         args=[fstab,sample]
        )

if a.value is a.value:
    print('same')
    
a.value['1']['one'] = 'uno'

b = Dcel(address=a, 
         service_class=DictFS
        )



print(b)

for ea in b.listdir():
    try:
        print(ea)
    except:
        pass


SyntaxError: invalid syntax (402375693.py, line 12)

In [25]:
q = Dcel()
r = Dcel()
s = Dcel({'q':q,'r':r},service_class=DictFS)
for ea in s.listdir():
    print(s[ea])

None
None


In [4]:
import re

line = '^.+$'
word = '\w+'
flags = re.M

entry = [[word,flags],[1,2],dict()]
field = [[word,flags]
    ['spec', 
     'file', 
     'vfstype', 
     'mntopts',
     'freq',
     'passno'], 
     {'freq':'[0-9]+', 
      'passno':'[0-9]+'
     }
]

sample = """
one two three four 1 0
five six seven eight 23 4
"""

def parse(g, text):
    lex = g[0][0]
    fl  = g[0][1] 
    labels = g[1]
    sublex = g[2]
    m = re.findall(lex,text,flags)
    w = { k:v for k,v in zip(labels,m)}
    print(w)
    for ea in sublex:
        if ea in w:
            print(re.match(sublex[ea],w[ea],0))

parse(entry, sample)
parse(field, sample)

  field = [[word,flags]


TypeError: list indices must be integers or slices, not tuple

In [None]:
for i in 1-10:
    print(i)

In [None]:
import re

ruleref_re = re.compile("(?P<surface_id>[/@]?)(?P<rule_id>[^{} ]+)(?P<qty>[{][*][}])?")

class HienaParser:
    def __init__(self, 
                 target=None,
                 grammar=None,
                ):
        self.target = target
        self.grammar = grammar
    
    def run_lex(self, 
                text:str,
                regex_args:list,
                ) -> list:
        if not type(regex_args) is list:
            raise TypeError('Requires a list of args suitable for re.findall()')
        _re = regex_args[0]
        try:
            _flags = regex_args[1]
        except:
            _flags = 0
        return re.findall(_re, text, _flags)
        
    def parse_rule_reference(self,ref):
        m = re.match(ruleref_re,ref,0)
        surfaceid = ''
        args = [ ref ]
        return (surfaceid,args)
        
    def run_rule(self, 
                 target:str = None,
                 rulename:str = "",
                 quantity:str = "*",
                ):
        rulepart = self.grammar[rulename]
        print('rulepart: '+rulepart)
        # for first element
        i = 0
        try:
            self.run_lex(rulepart[0])
            i += 1
        except:
            pass  

        def process_rulebody(e):
            try:
                a = self.parse_rule_reference(e)
                res = self.run_rule(*a[1])
            except:
                raise
                
        if type(rulepart) is str:
            process_rulebody(rulepart)
            
        if type(rulepart) is list:
            # loop over rulebody
            for e in rulepart[i:]:
                print(e)
                process_rulebody(e)
        
    def run(self):
        startname = self.grammar["$__start__"]
        return self.run_rule(self.target, 
                             rulename=startname
                          )
                        
def hiena1(target: str, grammar: dict) -> (map, dict):
    mapp = { k: re.findall(grammar[k],
                 target)
            for k in grammar }
    dirr = None
    return (mapp,dirr)

def hiena(target: str, grammar: dict) -> (map, dict):
    parser = HienaParser(target,grammar)
    return parser.run()
    

In [None]:
import re
surface_id = "(?P<surface_id>[/@]?)"
rule_id = "(?P<rule_id>[^{}+*? ]+)"
qty = "(?P<qty>[+*?]|(?:[{][1-9]+[}]))?"
carver = "([{][^{}]*[}])"
carvers = "(?:\W*"+carver+")+"
_="\W*"+carver
ruleref = surface_id+rule_id+qty+carvers
quantifier = "([*])|([1-9])|([^*, ]+)"
ruleref_re = re.compile(ruleref)
quantifier_re = re.compile(quantifier)

In [None]:
import re
fstabGrammar = {
    "$__start__": "fstab",
    "fstab": [ "/entry+" ],
    "entry": [ "ENTRY{2}"
               "{@field+:spec,file,vfstype,mntopts,freq:digit,passno:digit} {other} {such}" ], 
    "ENTRY": [[r"^[^#\n]+", re.M]],
    "field": [[r"[^# ]+"]],
    "digit": [[r"[0-9]"]], 
    
    " " : " "
}

In [None]:
print(fstabGrammar['entry'][0])

In [None]:
a = re.search( 
    ruleref_re,
    fstabGrammar['entry'][0], 
    0 
)
print(a.groups())

b = re.findall( 
    carver,
    fstabGrammar['entry'][0], 
    0 
)
print(b)

In [None]:
from fs.osfs import OSFS
from Dcel import Dcel

d = Dcel(address='fs', 
         service_class=OSFS
        )
text = d.path_lookup('.cosm/etc/fstab').value


In [None]:
s = '/entry{*}'
print(s[0])
print(s[1:].split('{')[1][0])
print()

strings = [ '/entry{*}', 
           '/entry',
           'entry{*}',
           'entry'
          ]

import re
for s in strings:
    c = re.compile("(?P<surface_id>[/@]?)(?P<rule_id>[^{} ]+)(?P<qty>[{][*][}])?")
    m = re.match(c,s,0)
    print(m.groupdict())

In [None]:
res = hiena(text,fstabGrammar)

In [None]:
print(res)

In [None]:
grammar = {"word": r"[^ ]+"}
hiena("one two three", grammar)

In [None]:
g = {'fs_entry': r"(.+)\n"}
d = """
sftp://example.com  /  sftpfs

localhost:/example  /  file

files.example.com   /  webdavfs
"""
hiena(d,g)