In [2]:
from lark import Lark
import re

In [3]:
def apply_meta_rules(gramatika):
    gramatika = re.sub(r"([a-z_]+)-TVP",      r' "\1:" UNQUOTED_STRING -> \1',   gramatika)
    gramatika = re.sub(r"([a-z_]+)-BT",       r' "\1:" ( TRUE | FALSE ) -> \1',  gramatika)
    gramatika = re.sub(r"([a-z_]+)-Tag (.*)", r' "\1:" \2 -> \1',                gramatika)
    return gramatika

In [241]:
parser = Lark(apply_meta_rules(r"""
    start : not_important* term_frame+ typedef?
    typedef : "[Typedef]" /(.|\n)+/
    not_important : /.+/? _NL
    
    TRUE: "true"
    FALSE: "false"
       
    class_id : id
    rel_id : id
    instance_id : id
    ?id : URL_AS_ID | PREFIXED_ID | UNPREFIXED_ID
    URL_AS_ID  : ("https:" | "http:")  /(?:[^\s,\}\]\\\\]|\\\\.)+/ 
    PREFIXED_ID :  /[^\s:=]+:\s*(?:[^\s,\}\]\\\\]|\\\\.)+/             
    UNPREFIXED_ID: /(?:[^\s:,=\}\]\\\\]|\\\\.)+/
 
 
        UNQUOTED_STRING: /.+/

// line termination
    eol :  qualifier_block? HIDDEN_COMMENT? _NL
    HIDDEN_COMMENT : "!" /[^\n]*/
    ?qualifier_block : "{" qualifier_list? "}"
    qualifier_list :  qualifier ("," qualifier)*
    qualifier : rel_id  "="  QUOTED_STRING                 // requiers that ID dosn't allow '='
    
    
    xref_list : "[" (xref_no_comma ("," xref_no_comma)*)? "]"
    xref: id QUOTED_STRING?               // instead of 'id' it cold be /\S+/ ???, (does it have to escape ] ?)
    xref_no_comma: id QUOTED_STRING ?
    
        term_frame : "[Term]" _NL "id:" class_id eol (term_frame_clause eol )*
        
        ?term_frame_clause : is_anonymous-BT
                           | name-TVP
                           | namespace-Tag OBO_NAMESPACE
                           | alt_id-Tag id
                           | def-Tag QUOTED_STRING xref_list
                           | comment-TVP
                           | subset-Tag id                                           // Subset-ID  (def in header) 
                           | synonym-Tag QUOTED_STRING SYNONYM_SCOPE NAME? xref_list   // synonym-ID (def in header)
                           | xref-Tag xref
                           | builtin-BT
                           | property_value-Tag RELATION_ID   /.+/        // TODO
                           | is_a-Tag class_id
                           
                           | intersection_of-Tag  /.+/        // TODO: RElATION_ID? class_id
                           
                           | union_of-Tag class_id
                           | equivalent_to-Tag class_id
                           | disjoint_from-Tag class_id
                           | relationship-Tag RELATION_ID class_id
                           | is_obsolete-BT
                           | replaced_by-Tag class_id
                           | consider-Tag id
                           | created_by-Tag /.+/       // Person-ID
                           | creation_date-Tag /.+/       // ISO-8601-DateTime
                           //| /[^:]+:.+/          -> anything

                           
                            
        OBO_NAMESPACE : "cellular_component" | "biological_process" | "molecular_function"
        
        SYNONYM_SCOPE : "EXACT" | "BROAD" | "NARROW" | "RELATED"
        
        RELATION_ID : /\S+/
        
        %import common.ESCAPED_STRING -> QUOTED_STRING
        %import common.CNAME -> NAME
        %import common.NEWLINE -> _NL
        %import common.WS_INLINE
        %ignore WS_INLINE
        
        

    """), parser="lalr", lexer="contextual")

text = r"""
[Term]
id: GO:32
def: "hello" [wikipedia:nesto[lepo\]tako]
"""

with open("../data/go_1.obo", "r") as obo_file:
    #tree = parser.parse(text)
    tree = parser.parse(obo_file.read())
    pass
    
# def showTree(t, ind=""):
#     global tree
#     if not t:
#         return
    
#     print(ind, end='')
#     if type(t) == type(tree):
#         print(t.data)
#         for childe in t.children:
#             showTree(childe, ind+" "*2)
#     else:
#         print(repr(t))
        
# showTree(tree)

CPU times: user 208 ms, sys: 0 ns, total: 208 ms
Wall time: 206 ms


In [75]:
from lark import Transformer

In [243]:
class OBOTransformer(Transformer):  
    
    TRUE = lambda self, _: True
    FALSE = lambda self, _: False
    
    eol = lambda self, _: None
    
    def id(self, item): return item[0].value  # str
    def class_id(self, item): return item  # str
    def rel_id(self, item): return item[0].value  # str
    def instance_id(self, item): return item[0].value  # str
    def relation_id(self, item): return item[0].value  # str

    def xref_no_comma(self, items): return items[0].value # TODO
    xref_list = list



    
    

for x in OBOTransformer().transform(tree).children:
    for y in x.children:
        print y
    print "="*30

[Token(PREFIXED_ID, 'GO:0000001')]
None
Tree(name, [Token(UNQUOTED_STRING, 'mitochondrion inheritance')])
None
Tree(namespace, [Token(OBO_NAMESPACE, 'biological_process')])
None
Tree(def, [Token(QUOTED_STRING, '"The distribution of mitochondria, including the mitochondrial genome, into daughter cells after mitosis or meiosis, mediated by interactions between mitochondria and the cytoskeleton."'), ['GOC:mcc', 'PMID:10873824', 'PMID:11389764']])
None
Tree(term_frame_clause, [Token(QUOTED_STRING, '"mitochondrial inheritance"'), Token(SYNONYM_SCOPE, 'EXACT'), []])
None
Tree(is_a, [[Token(PREFIXED_ID, 'GO:0048308')]])
None
Tree(is_a, [[Token(PREFIXED_ID, 'GO:0048311')]])
None
[Token(PREFIXED_ID, 'GO:0000002')]
None
Tree(name, [Token(UNQUOTED_STRING, 'mitochondrial genome maintenance')])
None
Tree(namespace, [Token(OBO_NAMESPACE, 'biological_process')])
None
Tree(def, [Token(QUOTED_STRING, '"The maintenance of the structure and integrity of the mitochondrial genome; includes replication and 