In [1]:
import io
from dataclasses import dataclass
from enum import Enum, auto

import pymupdf
from lark import Lark, Tree, UnexpectedInput

from deep_statutes.pdf.token_stream import pdf_to_token_stream, PDFTokenConversionOptions
from deep_statutes.pdf.util import Pos

In [2]:
# these need to be inferred for the particular document
LeftMargin = 72.
IndentSize = 36.

In [3]:
path = "/Users/eric/Development/statutes_data/wy/raw_pdf/title03.pdf"

doc = pymupdf.open(path)

In [4]:
options = PDFTokenConversionOptions(
    left_margin=LeftMargin,
    indent_size=IndentSize,
    infer_centered=False,
    font_sizes=[float('inf'), 12, float('inf'), float('inf')],
)

text = "\n".join(pdf_to_token_stream(doc, options))

with open('foo.txt', 'w') as f:
    f.write(text)

In [5]:
grammar = r'''
_start: _LINE* title
_heading_start: title_start | chapter_start | article_start | section_start

title: title_start _blank_line chapter+

chapter: chapter_start _blank_line article+

article: article_start _blank_line many_sections 

_centeredish: _INDENT*

!title_number: "TITLE " NAT
!chapter_number: "CHAPTER " NAT
!article_number: "ARTICLE " NAT
subtitle: RAW_UPPER_TEXT _cont_upper_line?

title_start.2: _LINE* _centeredish _SPAN_M title_number " - " subtitle
chapter_start.2: _LINE* _centeredish _SPAN_M chapter_number " - " subtitle
article_start.2: _LINE* _centeredish _SPAN_M article_number " - " subtitle
_cont_upper_line: _LINE _centeredish _span_m_upper

!section_number: NAT "-" NAT "-" NAT
section_start: _INDENT _SPAN_M_B section_number "." section_subtitle?
section_subtitle: RAW_TEXT (_LINE _sep{_span_m_b, _LINE})? 
section: section_start (_blank_line _sep{paragraph, _blank_line})?

many_sections: _sep{section, _blank_line}

_blank_line: _LINE _LINE

paragraph: _INDENT* _sep{_multi_span_m, _LINE}

_span_m: _SPAN_M RAW_TEXT
_span_m_upper: _SPAN_M RAW_UPPER_TEXT
_span_m_b: _SPAN_M_B RAW_TEXT

_multi_span_m: _span_m+
_multi_span_m_b: _span_m_b+

NAT: /[1-9][0-9]*/

RAW_TEXT: /(?:(?!<<)[^\n])+/
RAW_UPPER_TEXT: /(?:(?!<<)[^\na-z])+/

_sep{x, sep}: x (sep x)*

//_CENTER.2: /<<CENTER>>/
_INDENT.2: /<<INDENT>>/
_LINE.2: /<<LINE>>/
_BLOCK.2: /<<BLOCK>>/
_PAGE.2: /<<PAGE>>/

_SPAN_M.2: /<<SPAN_M>>/
_SPAN_M_B.2: /<<SPAN_M_B>>/

EOL: /\n/

%ignore EOL
'''

In [6]:
lark = Lark(grammar, parser='lalr', start='_heading_start', propagate_positions=True)

In [24]:
END_ACTION = '$END'

def find_matches(text: str, start: str,lark: Lark) -> list[Tree]:
    matched_parse_trees = []
    token_end_pos = []
    if not lark.options.parser == 'lalr':
        raise ValueError("Only lalr parser is supported")
    
    pi = lark.parse_interactive(text, start=start)
    lex_stream = pi.lexer_thread.lex(pi.parser_state)
    last_token_end_pos = None
    while True:
        # check if we are in the valid end state for a rule
        choices = pi.choices()
        if END_ACTION in choices:
            #valid_parses.append(choices[END_ACTION])
            # we feed the parser EOF to trick it into giving us back the parse tree for
            # this range of the input text
            r = pi.copy().feed_eof()
            matched_parse_trees.append(r)
            token_end_pos.append(last_token_end_pos)
        try:
            token = next(lex_stream)
            last_token_end_pos = token.end_pos
        except StopIteration:
            break
        except UnexpectedInput as e:
            break
        pi.feed_token(token)

    return matched_parse_trees, token_end_pos

In [26]:
idx_parses = []
for idx in range(len(text)):
    valid_parses, end_pos = find_matches(text[idx:], '_heading_start', lark)
    for p, e in zip(valid_parses, end_pos):
        idx_parses += [((idx, idx+e), p) for p in valid_parses]

In [29]:
for r, parse in idx_parses[::177]:
    print(r, parse)
    print(text[r[0]:r[1]+1])
    print()

(18, 102) Tree(Token('RULE', '_heading_start'), [Tree(Token('RULE', 'title_start'), [Tree(Token('RULE', 'title_number'), [Token('__ANON_0', 'TITLE '), Token('NAT', '3')]), Tree(Token('RULE', 'subtitle'), [Token('RAW_UPPER_TEXT', 'GUARDIAN AND WARD ')])])])

<<INDENT>>
<<INDENT>>
<<INDENT>>
<<INDENT>>
<<SPAN_M>>
TITLE 3 - GUARDIAN AND WARD 


(21908, 21967) Tree(Token('RULE', '_heading_start'), [Tree(Token('RULE', 'chapter_start'), [Tree(Token('RULE', 'chapter_number'), [Token('__ANON_1', 'CHAPTER '), Token('NAT', '2')]), Tree(Token('RULE', 'subtitle'), [Token('RAW_UPPER_TEXT', 'GUARDIANSHIPS ')])])])
<<INDENT>>
<<INDENT>>
<<SPAN_M>>
CHAPTER 2 - GUARDIANSHIPS 


(61625, 61674) Tree(Token('RULE', '_heading_start'), [Tree(Token('RULE', 'article_start'), [Tree(Token('RULE', 'article_number'), [Token('__ANON_2', 'ARTICLE '), Token('NAT', '1')]), Tree(Token('RULE', 'subtitle'), [Token('RAW_UPPER_TEXT', 'OPENING CONSERVATORSHIPS ')])])])

<<SPAN_M>>
ARTICLE 1 - OPENING CONSERVATORSHIPS 


(84

In [10]:
tree = idx_parses[0][1]
idx_parses[0][0]

18

In [11]:
for node in tree.iter_subtrees_topdown():
    print(node.data, node.meta.start_pos, node.meta.end_pos)

_heading_start 1 None
title_start 1 None
title_number 56 63
subtitle 66 None


In [12]:
tree.children[-1].children[-1].children[-1].end_pos

In [13]:
tree.meta.line, tree.meta.column, tree.meta.start_pos, tree.meta.end_pos

(2, 1, 1, None)

In [14]:
dir(tree.meta)

['__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__firstlineno__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__static_attributes__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'column',
 'container_column',
 'container_end_column',
 'container_end_line',
 'container_end_pos',
 'container_line',
 'container_start_pos',
 'empty',
 'end_column',
 'end_line',
 'end_pos',
 'line',
 'start_pos']

In [15]:
for _, tree in idx_parses[::377]:
    print(tree.pretty())

_heading_start
  title_start
    title_number
      TITLE 
      3
    subtitle	GUARDIAN AND WARD 

_heading_start
  article_start
    article_number
      ARTICLE 
      6
    subtitle	DUTIES AND POWERS OF CONSERVATOR 

_heading_start
  section_start
    section_number
      3
      -
      6
      -
      112

_heading_start
  article_start
    article_number
      ARTICLE 
      3
    subtitle	STATUTORY FORMS 



In [16]:


class HeadingType(Enum):
    Title = auto()
    Chapter = auto()
    Article = auto()
    Section = auto()


@dataclass
class Heading:
    type: HeadingType
    text: str
    sub_text: str
    pos: Pos


    

In [17]:
def _child(tree: Tree, idx: int, name: str) -> Tree:
    child_tree = tree.children[idx]
    if child_tree.data != name:
        raise ValueError(f"Expected {name}, got {child_tree.data}")
    return child_tree


def _text(tree: Tree):
    """
    Get all text from the tree's children.

    This assumes that all child nodes are tokens.
    """
    t = []
    for child in tree.children:
        if isinstance(child, str):
            t.append(child)
        else:
            raise ValueError(f"Expected Token/string but got {child}")
    return ''.join(t)
 

def _parse_heading(pos: Pos, root: Tree) -> Heading:
    if root.data != '_heading_start':
        raise ValueError(f"Expected heading start, got {root.data}")
    if len(root.children) != 1:
        raise ValueError(f"Expected exactly one child, got {len(root.children)}")
 
    tree = root.children[0]
    match tree.data:
        case 'title_start':
            text = _text(_child(tree, 0, 'title_number'))
            sub_text = _text(_child(tree, 1, 'subtitle'))
            return Heading(HeadingType.Title, text, sub_text, pos)
        case 'chapter_start':
            text = _text(_child(tree, 0, 'chapter_number'))
            sub_text = _text(_child(tree, 1, 'subtitle'))
            return Heading(HeadingType.Chapter, text, sub_text, pos)
        case 'article_start':
            text = _text(_child(tree, 0, 'article_number'))
            sub_text = _text(_child(tree, 1, 'subtitle'))
            return Heading(HeadingType.Article, text, sub_text, pos)
        case 'section_start':
            text = _text(_child(tree, 0, 'section_number'))
            if len(tree.children) > 1:
                sub_text = _text(_child(tree, 1, 'section_subtitle'))
            else:
                sub_text = ''
            return Heading(HeadingType.Section, text, sub_text, pos)
        case _:
            raise ValueError(f"Unknown heading type: {tree.type}")


In [18]:
headings = []
for pos, tree in idx_parses:
    headings.append(_parse_heading(pos, tree))


In [19]:
headings

[Heading(type=<HeadingType.Title: 1>, text='TITLE 3', sub_text='GUARDIAN AND WARD ', pos=18),
 Heading(type=<HeadingType.Title: 1>, text='TITLE 3', sub_text='GUARDIAN AND WARD ', pos=19),
 Heading(type=<HeadingType.Title: 1>, text='TITLE 3', sub_text='GUARDIAN AND WARD ', pos=29),
 Heading(type=<HeadingType.Title: 1>, text='TITLE 3', sub_text='GUARDIAN AND WARD ', pos=30),
 Heading(type=<HeadingType.Title: 1>, text='TITLE 3', sub_text='GUARDIAN AND WARD ', pos=40),
 Heading(type=<HeadingType.Title: 1>, text='TITLE 3', sub_text='GUARDIAN AND WARD ', pos=41),
 Heading(type=<HeadingType.Title: 1>, text='TITLE 3', sub_text='GUARDIAN AND WARD ', pos=51),
 Heading(type=<HeadingType.Title: 1>, text='TITLE 3', sub_text='GUARDIAN AND WARD ', pos=52),
 Heading(type=<HeadingType.Title: 1>, text='TITLE 3', sub_text='GUARDIAN AND WARD ', pos=62),
 Heading(type=<HeadingType.Title: 1>, text='TITLE 3', sub_text='GUARDIAN AND WARD ', pos=63),
 Heading(type=<HeadingType.Chapter: 2>, text='CHAPTER 1', su