**Eric Meinhardt / emeinhardt@ucsd.edu**

In [1]:
#Prints **all** console output, not just last item in cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#Stop!-Grammar-time..." data-toc-modified-id="Stop!-Grammar-time...-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Stop! Grammar time...</a></span><ul class="toc-item"><li><span><a href="#Grammar-Notation/NLTK-Demos" data-toc-modified-id="Grammar-Notation/NLTK-Demos-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Grammar Notation/NLTK Demos</a></span></li><li><span><a href="#A-grammar-for-a-subset-of-Krambeck-et-al.-2009's-linear-code" data-toc-modified-id="A-grammar-for-a-subset-of-Krambeck-et-al.-2009's-linear-code-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>A grammar for a subset of Krambeck et al. 2009's linear code</a></span></li></ul></li></ul></div>

# Imports

In [2]:
from funcy import *

In [111]:
from nltk import CFG
from nltk.parse.chart import ChartParser
from nltk.parse.chart import SteppingChartParser

In [4]:
from nltk.parse.generate import generate

# Stop! Grammar time...

## Grammar Notation/NLTK Demos

Below is a simple mathematical/formal context-free grammar of a very simple fragment (subset) of English sentence syntax:

`
S -> NP VP
PP -> P NP
NP -> 'the' N | N PP | 'the' N PP
VP -> V NP | V PP | V NP PP
N -> 'cat'
N -> 'dog'
N -> 'rug'
V -> 'chased'
V -> 'sat'
P -> 'in'
P -> 'on'`

In [5]:
demo_grammar = CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> 'the' N | N PP | 'the' N PP
VP -> V NP | V PP | V NP PP
N -> 'cat'
N -> 'dog'
N -> 'rug'
V -> 'chased'
V -> 'sat'
P -> 'in'
P -> 'on'
""")

In [7]:
for sentence in generate(demo_grammar, n=100):
    print(str_join(' ', sentence))

the cat chased the cat
the cat chased the dog
the cat chased the rug
the cat chased cat in the cat
the cat chased cat in the dog
the cat chased cat in the rug
the cat chased cat in cat in the cat
the cat chased cat in cat in the dog
the cat chased cat in cat in the rug
the cat chased cat in cat in cat in the cat
the cat chased cat in cat in cat in the dog
the cat chased cat in cat in cat in the rug
the cat chased cat in cat in cat in cat in the cat
the cat chased cat in cat in cat in cat in the dog
the cat chased cat in cat in cat in cat in the rug
the cat chased cat in cat in cat in cat in cat in the cat
the cat chased cat in cat in cat in cat in cat in the dog
the cat chased cat in cat in cat in cat in cat in the rug
the cat chased cat in cat in cat in cat in cat in cat in the cat
the cat chased cat in cat in cat in cat in cat in cat in the dog
the cat chased cat in cat in cat in cat in cat in cat in the rug
the cat chased cat in cat in cat in cat in cat in cat in cat in the cat
the 

In [44]:
demo_parser = ChartParser(demo_grammar)

In [46]:
demo_sentence = 'the dog chased the cat on the rug'

for tree in demo_parser.parse(demo_sentence.split()):
    print(tree)
    if tree is None:
        print('None')

(S
  (NP the (N dog))
  (VP (V chased) (NP the (N cat)) (PP (P on) (NP the (N rug)))))
(S
  (NP the (N dog))
  (VP (V chased) (NP the (N cat) (PP (P on) (NP the (N rug))))))


Below is a grammar for a fragment of arithmetic on the integers:

`
expression -> integer | expression binary_operation expression | '(' expression binary_operation expression ')'
binary_operation -> + | - | ⸱ | /
integer -> (-) natural_number
natural_number -> digit | nonzero_digit digit*
digit -> 0 | nonzero_digit
nonzero_digit -> 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9
`

The Kleene-*, the closely-related + operator, and metalinguistic parentheses for indicating optionality are all [syntactic sugar](https://en.wikipedia.org/wiki/Syntactic_sugar) that can be expressed using more basic rewrite rules. Since `nltk`'s `CFG` module doesn't recognize them, we'll re-express the same more compact grammar noted above without them. (Note also that terminals (= literals) are wrapped in single quotes.)

Relevant wikipedia articles:
 - https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form
 - https://en.wikipedia.org/wiki/Context-free_grammar

In [61]:
arithmetic_grammar = CFG.fromstring("""
expression -> integer | expression binary_operation expression | '(' expression binary_operation expression ')'
binary_operation -> '+' | '-' | 'x' | '/'
integer -> natural_number | '-' natural_number
natural_number -> digit | nonzero_digit digit_phrase
digit_phrase -> digit | digit digit_phrase
digit -> '0' | nonzero_digit
nonzero_digit -> '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9'
""")

In [62]:
for expression in generate(arithmetic_grammar, n=1000):
    print(str_join('', expression))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
100
101
102
103
104
105
106
107
108
109
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
10000
10001
10002
10003
10004
10005
10006
10007
10008
10009
100000
100001
100002
100003
100004
100005
100006
100007
100008
100009
1000000
1000001
1000002
1000003
1000004
1000005
1000006
1000007
1000008
1000009
10000000
10000001
10000002
10000003
10000004
10000005
10000006
10000007
10000008
10000009
100000000
100000001
100000002
100000003
100000004
100000005
100000006
100000007
100000008
100000009
1000000000
1000000001
1000000002
1000000003
1000000004
1000000005
1000000006
1000000007
1000000008
1000000009
10000000000
10000000001
10000000002
10000000003
10000000004
10000000005
10000000006
10000000007
10000000008
10000000009
100000000000
100000000001
100000000002
100000000003
100000000004
100000000005
100000000006
100000000007
100000000008
100000000009
1000000000000
1000000000001
1000000000002
1000000000003
1000000000004
1000000000005
1000000000006
10

In [63]:
arithmetic_parser = ChartParser(arithmetic_grammar)

In [66]:
demo_expression = '1'
demo_expression_tokenized = demo_expression.split()
print('Tokenized = {0}'.format(demo_expression_tokenized))

for tree in arithmetic_parser.parse(demo_expression.split()):
    print(tree)

Tokenized = ['1']
(expression (integer (natural_number (digit (nonzero_digit 1)))))


In [67]:
demo_expression = '1 + 2'
demo_expression_tokenized = demo_expression.split()
print('Tokenized = {0}'.format(demo_expression_tokenized))

for tree in arithmetic_parser.parse(demo_expression.split()):
    print(tree)

Tokenized = ['1', '+', '2']
(expression
  (expression (integer (natural_number (digit (nonzero_digit 1)))))
  (binary_operation +)
  (expression (integer (natural_number (digit (nonzero_digit 2))))))


In [70]:
demo_expression = '( 3 x 2 ) + 3 / 4'
demo_expression_tokenized = demo_expression.split()
print('Tokenized = {0}'.format(demo_expression_tokenized))

for tree in arithmetic_parser.parse(demo_expression.split()):
    print(tree)

Tokenized = ['(', '3', 'x', '2', ')', '+', '3', '/', '4']
(expression
  (expression
    (expression
      (
      (expression
        (integer (natural_number (digit (nonzero_digit 3)))))
      (binary_operation x)
      (expression
        (integer (natural_number (digit (nonzero_digit 2)))))
      ))
    (binary_operation +)
    (expression (integer (natural_number (digit (nonzero_digit 3))))))
  (binary_operation /)
  (expression (integer (natural_number (digit (nonzero_digit 4))))))
(expression
  (expression
    (
    (expression (integer (natural_number (digit (nonzero_digit 3)))))
    (binary_operation x)
    (expression (integer (natural_number (digit (nonzero_digit 2)))))
    ))
  (binary_operation +)
  (expression
    (expression (integer (natural_number (digit (nonzero_digit 3)))))
    (binary_operation /)
    (expression (integer (natural_number (digit (nonzero_digit 4)))))))


## A grammar for a subset of Krambeck et al. 2009's linear code

It is conventional that
 - linear code be interpreted from right to left (counter to the norm of formal language theory, but not formally unaccommodatable).
 - the child subtrees of a node be ordered such that the bond locations of the roots of the subtrees descend as you go leftwards among the children.
 
Below is a context-free grammar for a fragment of linear code where each expression describes a single glycan (i.e. no uncertainty operators are expressed):

*wrong*

`
exp -> stem | exp non_rightmost_branch* stem
non_rightmost_branch -> "(" exp ")"
stem -> SU_with_bond_info* SU_bare
SU_with_bond_info -> SU_bare bond_type bond_location
bond_type -> "a" | "b" | "?"
bond_location -> "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "?"
SU_bare -> "A" | "AN" | "B" | "E" | "F" | "G" | "GN" | "G[Q]" | "H" | "H[2Q, 4Q]" | "I" | "K" | "L" | "M" | "NG" | "NJ" | "NN" | "NN[9N]" | "N[5Q]" | "O" | "P" | "PH" | "R" | "S" | "U" | "W" | "X"
`

`
exp -> empty | stem | subexp non_rightmost_branch+ stem
stem -> SU_with_bond_info* SU_bare
non_rightmost_branch -> "(" subexp ")"
subexp -> substem | subexp non_rightmost_branch+ substem
substem -> SU_with_bond_info+
SU_with_bond_info -> SU_bare bond_type bond_location
bond_type -> "a" | "b" | "?"
bond_location -> "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "?"
SU_bare -> "A" | "AN" | "B" | "E" | "F" | "G" | "GN" | "G[Q]" | "H" | "H[2Q, 4Q]" | "I" | "K" | "L" | "M" | "NG" | "NJ" | "NN" | "NN[9N]" | "N[5Q]" | "O" | "P" | "PH" | "R" | "S" | "U" | "W" | "X"
empty -> ""
`

...where
 - `->`, `|`, `*`, `+` are all metalinguistic operators with their usual formal-language theoretic meaning (see any textbook or introductory material).
 - all terminal symbols are quoted string literals.
 - the enumeration of saccharide units is cribbed from a relatively arbitrary mix of what `glypy` supports and what `glymmer` supports.

In [176]:
#right-to-left leftwards-descending uncertainty-operator-free normal form...
# ...formatted in a way that NLTK understands = with no Kleene star or plus, but
# simulating Kleene stars/pluses using left-recursive rules
RTF_LDNF_UOF_g = CFG.fromstring("""
    exp -> empty | stem | subexp non_rightmost_branch_phrase stem
    stem -> SU_with_bond_info_phrase_star SU_bare
    non_rightmost_branch_phrase -> empty | non_rightmost_branch_phrase non_rightmost_branch
    non_rightmost_branch -> '(' subexp ')'
    subexp -> substem | subexp non_rightmost_branch_phrase substem
    substem -> SU_with_bond_info_phrase_plus
    SU_with_bond_info_phrase_star -> empty | SU_with_bond_info | SU_with_bond_info_phrase_star SU_with_bond_info
    SU_with_bond_info_phrase_plus -> SU_with_bond_info | SU_with_bond_info_phrase_star SU_with_bond_info
    SU_with_bond_info -> SU_bare bond_type bond_location
    bond_type -> 'a' | 'b' | '?'
    bond_location -> '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' | '?'
    SU_bare -> 'A' | 'AN' | 'B' | 'E' | 'F' | 'G' | 'GN' | 'G[Q]' | 'H' | 'H[2Q, 4Q]' | 'I' | 'K' | 'L' | 'M' | 'NG' | 'NJ' | 'NN' | 'NN[9N]' | 'N[5Q]' | 'O' | 'P' | 'PH' | 'R' | 'S' | 'U' | 'W' | 'X'
    empty -> 
""")

In [161]:
# current grammar generates a recursion error, but that's not a sign the 
# current grammar is incorrect.
# for lce in generate(RTF_LDNF_UOF_g, n=10000):
#     print(str_join(' ', lce))

In [177]:
RTF_LDNF_UOF_parser = ChartParser(RTF_LDNF_UOF_g)
RTF_LDNF_UOF_parser_step = SteppingChartParser(RTF_LDNF_UOF_g)

In [178]:
for tree in RTF_LDNF_UOF_parser.parse(['M']):
    print(tree)
    
for tree in RTF_LDNF_UOF_parser.parse(['M', 'a', '6', 'M']):
    print(tree)
    
for tree in RTF_LDNF_UOF_parser.parse(['(', 'M', 'a', '6', ')', 'M']):
    print(tree)
    
for tree in RTF_LDNF_UOF_parser.parse(['M', 'a', '4', '(', 'M', 'a', '6', ')', 'M']):
    print(tree)

(exp (stem (SU_with_bond_info_phrase_star (empty )) (SU_bare M)))
(exp
  (stem
    (SU_with_bond_info_phrase_star
      (SU_with_bond_info (SU_bare M) (bond_type a) (bond_location 6)))
    (SU_bare M)))
(exp
  (stem
    (SU_with_bond_info_phrase_star
      (SU_with_bond_info_phrase_star (empty ))
      (SU_with_bond_info (SU_bare M) (bond_type a) (bond_location 6)))
    (SU_bare M)))
(exp
  (subexp
    (substem
      (SU_with_bond_info_phrase_plus
        (SU_with_bond_info
          (SU_bare M)
          (bond_type a)
          (bond_location 6)))))
  (non_rightmost_branch_phrase (empty ))
  (stem (SU_with_bond_info_phrase_star (empty )) (SU_bare M)))
(exp
  (subexp
    (substem
      (SU_with_bond_info_phrase_plus
        (SU_with_bond_info_phrase_star (empty ))
        (SU_with_bond_info
          (SU_bare M)
          (bond_type a)
          (bond_location 6)))))
  (non_rightmost_branch_phrase (empty ))
  (stem (SU_with_bond_info_phrase_star (empty )) (SU_bare M)))
(exp
  (subexp
 

In [167]:
for tree in RTF_LDNF_UOF_parser_step.parse(['M']):
    print(tree)
    
for tree in RTF_LDNF_UOF_parser_step.parse(['M', 'a', '6', 'M']):
    print(tree)
    
for tree in RTF_LDNF_UOF_parser_step.parse(['(', 'M', 'a', '6', ')', 'M']):
    print(tree)
    
for tree in RTF_LDNF_UOF_parser_step.parse(['M', 'a', '4', '(', 'M', 'a', '6', ')', 'M']):
    print(tree)

In [137]:
#Krambeck et al. 2009, Fig. 1
Kea2009Fig1 = [
    'Ma2Ma2Ma3(Ma2Ma3(Ma2Ma6)Ma6)Mb4GNb4GN',
    'Ma2Ma2Ma3(Ma3(Ma2Ma6)Ma6)Mb4GNb4GN',
    'Ga3Ma2Ma2Ma3(Ma2Ma3(Ma2Ma6)Ma6)Mb4GNb4GN']

Kea2009Fig1_tokenized = [['M', 'a', '2', 'M', 'a', '2', 'M', 'a', '3', '(', 'M', 'a', '2', 'M', 'a', '3', '(', 'M', 'a', '2', 'M', 'a', '6', ')', 'M', 'a', '6', ')', 'M', 'b', '4', 'GN', 'b', '4', 'GN'],
                         ['M', 'a', '2', 'M', 'a', '2', 'M', 'a', '3', '(', 'M', 'a', '3', '(', 'M', 'a', '2', 'M', 'a', '6', ')', 'M', 'a', '6', ')', 'M', 'b', '4', 'GN', 'b', '4', 'GN'],
                         ['G', 'a', '3', 'M', 'a', '2', 'M', 'a', '2', 'M', 'a', '3', '(', 'M', 'a', '2', 'M', 'a', '3', '(', 'M', 'a', '2', 'M', 'a', '6', ')', 'M', 'a', '6', ')', 'M', 'b', '4', 'GN', 'b', '4', 'GN']]
print(Kea2009Fig1_tokenized)

[['M', 'a', '2', 'M', 'a', '2', 'M', 'a', '3', '(', 'M', 'a', '2', 'M', 'a', '3', '(', 'M', 'a', '2', 'M', 'a', '6', ')', 'M', 'a', '6', ')', 'M', 'b', '4', 'GN', 'b', '4', 'GN'], ['M', 'a', '2', 'M', 'a', '2', 'M', 'a', '3', '(', 'M', 'a', '3', '(', 'M', 'a', '2', 'M', 'a', '6', ')', 'M', 'a', '6', ')', 'M', 'b', '4', 'GN', 'b', '4', 'GN'], ['G', 'a', '3', 'M', 'a', '2', 'M', 'a', '2', 'M', 'a', '3', '(', 'M', 'a', '2', 'M', 'a', '3', '(', 'M', 'a', '2', 'M', 'a', '6', ')', 'M', 'a', '6', ')', 'M', 'b', '4', 'GN', 'b', '4', 'GN']]


In [138]:
first = ['M', 'a', '2', 'M', 'a', '2', 'M', 'a', '3', '(', 'M', 'a', '2', 'M', 'a', '3', '(', 'M', 'a', '2', 'M', 'a', '6', ')', 'M', 'a', '6', ')', 'M', 'b', '4', 'GN', 'b', '4', 'GN']
print(first)
print(len(first))

['M', 'a', '2', 'M', 'a', '2', 'M', 'a', '3', '(', 'M', 'a', '2', 'M', 'a', '3', '(', 'M', 'a', '2', 'M', 'a', '6', ')', 'M', 'a', '6', ')', 'M', 'b', '4', 'GN', 'b', '4', 'GN']
35


In [139]:
first[-4:]
for tree in RTF_LDNF_UOF_parser.parse(first[-4:]):
    print(tree)

['GN', 'b', '4', 'GN']

(exp
  (stem
    (SU_with_bond_info_phrase
      (SU_with_bond_info
        (SU_bare GN)
        (bond_type b)
        (bond_location 4)))
    (SU_bare GN)))


In [140]:
for i in range(35):
    print(i)
    print(first[-i:])
    for tree in RTF_LDNF_UOF_parser.parse(first[-i:]):
        print(tree)
    print('----------')

0
['M', 'a', '2', 'M', 'a', '2', 'M', 'a', '3', '(', 'M', 'a', '2', 'M', 'a', '3', '(', 'M', 'a', '2', 'M', 'a', '6', ')', 'M', 'a', '6', ')', 'M', 'b', '4', 'GN', 'b', '4', 'GN']
(exp
  (subexp
    (substem
      (SU_with_bond_info_phrase
        (SU_with_bond_info_phrase
          (SU_with_bond_info_phrase
            (SU_with_bond_info
              (SU_bare M)
              (bond_type a)
              (bond_location 2)))
          (SU_with_bond_info
            (SU_bare M)
            (bond_type a)
            (bond_location 2)))
        (SU_with_bond_info
          (SU_bare M)
          (bond_type a)
          (bond_location 3)))))
  (non_rightmost_branch_phrase
    (non_rightmost_branch_phrase (empty ))
    (non_rightmost_branch
      (
      (subexp
        (subexp
          (substem
            (SU_with_bond_info_phrase
              (SU_with_bond_info_phrase
                (SU_with_bond_info
                  (SU_bare M)
                  (bond_type a)
                  (bond

    (SU_bare GN)))
----------
1
['GN']
----------
2
['4', 'GN']
----------
3
['b', '4', 'GN']
----------
4
['GN', 'b', '4', 'GN']
(exp
  (stem
    (SU_with_bond_info_phrase
      (SU_with_bond_info
        (SU_bare GN)
        (bond_type b)
        (bond_location 4)))
    (SU_bare GN)))
----------
5
['4', 'GN', 'b', '4', 'GN']
----------
6
['b', '4', 'GN', 'b', '4', 'GN']
----------
7
['M', 'b', '4', 'GN', 'b', '4', 'GN']
(exp
  (stem
    (SU_with_bond_info_phrase
      (SU_with_bond_info_phrase
        (SU_with_bond_info
          (SU_bare M)
          (bond_type b)
          (bond_location 4)))
      (SU_with_bond_info
        (SU_bare GN)
        (bond_type b)
        (bond_location 4)))
    (SU_bare GN)))
(exp
  (subexp
    (substem
      (SU_with_bond_info_phrase
        (SU_with_bond_info
          (SU_bare M)
          (bond_type b)
          (bond_location 4)))))
  (non_rightmost_branch_phrase (empty ))
  (stem
    (SU_with_bond_info_phrase
      (SU_with_bond_info
        (SU_

['2', 'M', 'a', '3', '(', 'M', 'a', '2', 'M', 'a', '3', '(', 'M', 'a', '2', 'M', 'a', '6', ')', 'M', 'a', '6', ')', 'M', 'b', '4', 'GN', 'b', '4', 'GN']
----------
31
['a', '2', 'M', 'a', '3', '(', 'M', 'a', '2', 'M', 'a', '3', '(', 'M', 'a', '2', 'M', 'a', '6', ')', 'M', 'a', '6', ')', 'M', 'b', '4', 'GN', 'b', '4', 'GN']
----------
32
['M', 'a', '2', 'M', 'a', '3', '(', 'M', 'a', '2', 'M', 'a', '3', '(', 'M', 'a', '2', 'M', 'a', '6', ')', 'M', 'a', '6', ')', 'M', 'b', '4', 'GN', 'b', '4', 'GN']
(exp
  (subexp
    (substem
      (SU_with_bond_info_phrase
        (SU_with_bond_info_phrase
          (SU_with_bond_info
            (SU_bare M)
            (bond_type a)
            (bond_location 2)))
        (SU_with_bond_info
          (SU_bare M)
          (bond_type a)
          (bond_location 3)))))
  (non_rightmost_branch_phrase
    (non_rightmost_branch_phrase (empty ))
    (non_rightmost_branch
      (
      (subexp
        (subexp
          (substem
            (SU_with_bond_info_

    (SU_bare GN)))
(exp
  (subexp
    (subexp
      (substem
        (SU_with_bond_info_phrase
          (SU_with_bond_info
            (SU_bare M)
            (bond_type a)
            (bond_location 2)))))
    (non_rightmost_branch_phrase (empty ))
    (substem
      (SU_with_bond_info_phrase
        (SU_with_bond_info
          (SU_bare M)
          (bond_type a)
          (bond_location 3)))))
  (non_rightmost_branch_phrase
    (non_rightmost_branch_phrase (empty ))
    (non_rightmost_branch
      (
      (subexp
        (subexp
          (subexp
            (substem
              (SU_with_bond_info_phrase
                (SU_with_bond_info
                  (SU_bare M)
                  (bond_type a)
                  (bond_location 2)))))
          (non_rightmost_branch_phrase (empty ))
          (substem
            (SU_with_bond_info_phrase
              (SU_with_bond_info
                (SU_bare M)
                (bond_type a)
                (bond_location 3)))))
        (n

In [141]:
for each, each_tokenized in zip(Kea2009Fig1, Kea2009Fig1_tokenized):
    print('Linear code expression, pre-tokenized = \n{0}'.format(each))

    for tree in RTF_LDNF_UOF_parser.parse(each_tokenized):
        print(tree)
    print('---------------------')

Linear code expression, pre-tokenized = 
Ma2Ma2Ma3(Ma2Ma3(Ma2Ma6)Ma6)Mb4GNb4GN
(exp
  (subexp
    (substem
      (SU_with_bond_info_phrase
        (SU_with_bond_info_phrase
          (SU_with_bond_info_phrase
            (SU_with_bond_info
              (SU_bare M)
              (bond_type a)
              (bond_location 2)))
          (SU_with_bond_info
            (SU_bare M)
            (bond_type a)
            (bond_location 2)))
        (SU_with_bond_info
          (SU_bare M)
          (bond_type a)
          (bond_location 3)))))
  (non_rightmost_branch_phrase
    (non_rightmost_branch_phrase (empty ))
    (non_rightmost_branch
      (
      (subexp
        (subexp
          (substem
            (SU_with_bond_info_phrase
              (SU_with_bond_info_phrase
                (SU_with_bond_info
                  (SU_bare M)
                  (bond_type a)
                  (bond_location 2)))
              (SU_with_bond_info
                (SU_bare M)
                (bond_typ

(exp
  (subexp
    (subexp
      (subexp
        (subexp
          (substem
            (SU_with_bond_info_phrase
              (SU_with_bond_info_phrase
                (SU_with_bond_info
                  (SU_bare G)
                  (bond_type a)
                  (bond_location 3)))
              (SU_with_bond_info
                (SU_bare M)
                (bond_type a)
                (bond_location 2)))))
        (non_rightmost_branch_phrase (empty ))
        (substem
          (SU_with_bond_info_phrase
            (SU_with_bond_info
              (SU_bare M)
              (bond_type a)
              (bond_location 2)))))
      (non_rightmost_branch_phrase (empty ))
      (substem
        (SU_with_bond_info_phrase
          (SU_with_bond_info
            (SU_bare M)
            (bond_type a)
            (bond_location 3)))))
    (non_rightmost_branch_phrase
      (non_rightmost_branch_phrase (empty ))
      (non_rightmost_branch
        (
        (subexp
          (subexp
    

In [145]:
for each, each_tokenized in zip(Kea2009Fig1, Kea2009Fig1_tokenized):
    print('Linear code expression, pre-tokenized = \n{0}'.format(each))

    for tree in RTF_LDNF_UOF_parser.parse(each_tokenized):
        print(tree)
    print('---------------------')

Linear code expression, pre-tokenized = 
Ma2Ma2Ma3(Ma2Ma3(Ma2Ma6)Ma6)Mb4GNb4GN
(exp
  (subexp
    (substem
      (SU_with_bond_info_phrase
        (SU_with_bond_info_phrase
          (SU_with_bond_info_phrase
            (SU_with_bond_info
              (SU_bare M)
              (bond_type a)
              (bond_location 2)))
          (SU_with_bond_info
            (SU_bare M)
            (bond_type a)
            (bond_location 2)))
        (SU_with_bond_info
          (SU_bare M)
          (bond_type a)
          (bond_location 3)))))
  (non_rightmost_branch_phrase
    (non_rightmost_branch_phrase (empty ))
    (non_rightmost_branch
      (
      (subexp
        (subexp
          (substem
            (SU_with_bond_info_phrase
              (SU_with_bond_info_phrase
                (SU_with_bond_info
                  (SU_bare M)
                  (bond_type a)
                  (bond_location 2)))
              (SU_with_bond_info
                (SU_bare M)
                (bond_typ

(exp
  (subexp
    (subexp
      (subexp
        (substem
          (SU_with_bond_info_phrase
            (SU_with_bond_info_phrase
              (SU_with_bond_info
                (SU_bare G)
                (bond_type a)
                (bond_location 3)))
            (SU_with_bond_info
              (SU_bare M)
              (bond_type a)
              (bond_location 2)))))
      (non_rightmost_branch_phrase (empty ))
      (substem
        (SU_with_bond_info_phrase
          (SU_with_bond_info_phrase
            (SU_with_bond_info
              (SU_bare M)
              (bond_type a)
              (bond_location 2)))
          (SU_with_bond_info
            (SU_bare M)
            (bond_type a)
            (bond_location 3)))))
    (non_rightmost_branch_phrase
      (non_rightmost_branch_phrase (empty ))
      (non_rightmost_branch
        (
        (subexp
          (subexp
            (subexp
              (substem
                (SU_with_bond_info_phrase
                  (SU_w

In [146]:
for each, each_tokenized in zip(Kea2009Fig1, Kea2009Fig1_tokenized):
    print('Linear code expression, pre-tokenized = \n{0}'.format(each))

    print(len(list(RTF_LDNF_UOF_parser.parse(each_tokenized))))
    print('---------------------')

Linear code expression, pre-tokenized = 
Ma2Ma2Ma3(Ma2Ma3(Ma2Ma6)Ma6)Mb4GNb4GN
32
---------------------
Linear code expression, pre-tokenized = 
Ma2Ma2Ma3(Ma3(Ma2Ma6)Ma6)Mb4GNb4GN
16
---------------------
Linear code expression, pre-tokenized = 
Ga3Ma2Ma2Ma3(Ma2Ma3(Ma2Ma6)Ma6)Mb4GNb4GN
64
---------------------


In [152]:
parse_trees = []
for each, each_tokenized in [zip(Kea2009Fig1, Kea2009Fig1_tokenized)[0]]:
    print('Linear code expression, pre-tokenized = \n{0}'.format(each))

    for tree in RTF_LDNF_UOF_parser.parse(each_tokenized):
        parse_trees.append(tree)
    print('---------------------')

Linear code expression, pre-tokenized = 
Ma2Ma2Ma3(Ma2Ma3(Ma2Ma6)Ma6)Mb4GNb4GN
---------------------


In [153]:
len(parse_trees)

32

In [154]:
print(parse_trees[0])

(exp
  (subexp
    (substem
      (SU_with_bond_info_phrase
        (SU_with_bond_info_phrase
          (SU_with_bond_info_phrase
            (SU_with_bond_info
              (SU_bare M)
              (bond_type a)
              (bond_location 2)))
          (SU_with_bond_info
            (SU_bare M)
            (bond_type a)
            (bond_location 2)))
        (SU_with_bond_info
          (SU_bare M)
          (bond_type a)
          (bond_location 3)))))
  (non_rightmost_branch_phrase
    (non_rightmost_branch_phrase (empty ))
    (non_rightmost_branch
      (
      (subexp
        (subexp
          (substem
            (SU_with_bond_info_phrase
              (SU_with_bond_info_phrase
                (SU_with_bond_info
                  (SU_bare M)
                  (bond_type a)
                  (bond_location 2)))
              (SU_with_bond_info
                (SU_bare M)
                (bond_type a)
                (bond_location 3)))))
        (non_rightmost_branch_phrase

In [155]:
print(parse_trees[1])

(exp
  (subexp
    (substem
      (SU_with_bond_info_phrase
        (SU_with_bond_info_phrase
          (SU_with_bond_info_phrase
            (SU_with_bond_info
              (SU_bare M)
              (bond_type a)
              (bond_location 2)))
          (SU_with_bond_info
            (SU_bare M)
            (bond_type a)
            (bond_location 2)))
        (SU_with_bond_info
          (SU_bare M)
          (bond_type a)
          (bond_location 3)))))
  (non_rightmost_branch_phrase
    (non_rightmost_branch_phrase (empty ))
    (non_rightmost_branch
      (
      (subexp
        (subexp
          (substem
            (SU_with_bond_info_phrase
              (SU_with_bond_info_phrase
                (SU_with_bond_info
                  (SU_bare M)
                  (bond_type a)
                  (bond_location 2)))
              (SU_with_bond_info
                (SU_bare M)
                (bond_type a)
                (bond_location 3)))))
        (non_rightmost_branch_phrase

In [159]:
for tree in RTF_LDNF_UOF_parser.parse(['M', 'a', '6', '(', 'M', 'a', '4', ')', 'M']):
    print(tree)