In [106]:
# ! pdf2txt --layoutmode=normal --char-margin=10 -o xml/0.c.html PathofPurification2011.pdf

In [1]:
from lxml import etree
import numpy as np
import re
import itertools
import copy
import string
import roman
from lxml.html.clean import Cleaner


cleaner = Cleaner(remove_unknown_tags=False, safe_attrs_only=False, style=False, remove_tags=['br'])
open('xml/0.c1.html','w').write(cleaner.clean_html(open('xml/0.c.html').read()))

tree=etree.parse('xml/0.c1.html',etree.XMLParser())
root=tree.getroot()
body=root[0]
print(body.tag)

# remove XHTML namespaces which we don't need
for elem in root.getiterator():
    if isinstance(elem, (etree._Comment,etree._ProcessingInstruction)): continue
    # Remove a namespace URI in the element's name
    elem.tag=etree.QName(elem).localname


# map custom encoding of the URWPalladioPali to unicode
# + change linebreaks to space
c0='áÁòÒóÓþÞìÌúÚíÍ¿÷²ðåõºøý' +'\n'
c1='āĀṅṄṇṆṭṬīÌūŪṃṂḷēĕḍṣōṛśḥ' +'\n'
c01=str.maketrans(c0,c1)
preserve='Ññéàù©§üöïäë}"\n'
punctuation=' .-,;:!—–+*=“”\'‘’…()[]?&/'# +'\n'
# š is rendered as ṝ, should be removed (is in index, last line 763: "m. šformation (citta-saòkhára)", should be just formation)
# ` is (1) extra at the beginning of §82: `In the description of ...
#      (2) 718: "has ceased-reckoned as `been and gone’" should be opening single quote (looks the same in PDF, anyway)
typos='š`'
valid=set(string.ascii_letters+string.digits+preserve+typos+punctuation+c1)
for p in root.xpath('//span'):
    if p.text is None: continue
    p.text=p.text.translate(c01)
    if set(p.text)-valid:
        print(set(p.text)-valid,tree.getelementpath(p),p.text)
for p in root.xpath('//div'):
    if p.text=='\n': p.text=''
    
patTop=re.compile(r'\btop:([0-9]+)px;')
def getTop(e):
    m=patTop.search(e.attrib['style'])
    return int(m.group(1))
patLeft=re.compile(r'\bleft:([0-9]+)px;')
def getLeft(e):
    m=patLeft.search(e.attrib['style'])
    if m is None: return -1 # print(e.attrib['style'])
    return int(m.group(1))

# detect page beginnings based on anchors (and remove those)
# selects parent div which contains <a name="...">
pageTops=[]
for p in root.xpath('//div[a[@name]]'):
    pageTops.append(getTop(p))
    del p[0]
    
pageTops.append(pageTops[-1]*2)
    
pageTops=np.array(pageTops)


pageElems=[list() for i in range(len(pageTops))]
for e in root.xpath('//div[contains(@style,"position:absolute")] | //span[contains(@style,"position:absolute")]'):
    if 0:
        if e.tag in ('span','div') and len(e)==0 and (e.text is None or e.text=='') and ('black 1px solid' not in e.attrib['style']): continue   
    t=getTop(e)
    pg=np.searchsorted(pageTops,t)
    pageElems[pg-1].append(e)
# sort elements on each page by height
for pg in range(len(pageElems)): pageElems[pg].sort(key=lambda e: (getTop(e),getLeft(e)))
print('Number of elements per page')
print(tuple(np.array([len(pageElems[i]) for i in range(len(pageElems))])))

body
Number of elements per page
(2, 11, 7, 7, 8, 18, 14, 7, 13, 9, 9, 9, 12, 15, 8, 12, 12, 9, 14, 10, 28, 22, 11, 16, 13, 18, 13, 14, 15, 41, 55, 45, 11, 15, 25, 11, 12, 11, 12, 14, 14, 11, 12, 13, 14, 14, 13, 12, 13, 16, 16, 11, 13, 13, 14, 14, 13, 5, 6, 5, 6, 5, 18, 16, 20, 16, 18, 20, 22, 18, 16, 38, 16, 13, 16, 18, 17, 12, 18, 15, 16, 15, 15, 16, 13, 14, 16, 14, 14, 15, 13, 14, 17, 20, 16, 18, 17, 18, 15, 16, 15, 20, 19, 19, 13, 18, 19, 13, 15, 20, 22, 16, 16, 16, 21, 18, 33, 20, 18, 20, 18, 20, 22, 23, 14, 20, 20, 20, 18, 27, 21, 18, 17, 18, 10, 5, 6, 5, 17, 15, 17, 17, 15, 17, 21, 16, 15, 14, 14, 13, 17, 20, 15, 16, 15, 17, 21, 24, 22, 14, 15, 21, 23, 15, 15, 15, 15, 16, 16, 11, 13, 20, 17, 20, 15, 17, 15, 14, 15, 15, 18, 17, 14, 13, 14, 16, 19, 17, 15, 13, 16, 12, 17, 15, 17, 18, 16, 14, 16, 15, 15, 15, 17, 12, 18, 16, 16, 16, 16, 18, 19, 19, 18, 15, 13, 15, 13, 21, 10, 17, 16, 18, 19, 19, 19, 13, 17, 19, 16, 16, 20, 19, 19, 17, 16, 14, 19, 22, 12, 15, 15, 19, 11, 15, 19, 23, 

In [2]:
##
## new book document, with pages elements
##

# create a completely new tree now, putting all element inside <page id="..">
book=etree.Element("book")
for pgno in itertools.count(1):
    if pgno<=5: continue
    if pgno>=845: break # discard tables at the end
    if pgno>=len(pageTops): break
    pgtop=pageTops[pgno]
    page=etree.Element("page",id=str(pgno+1),top=str(pgtop))
    for e in pageElems[pgno]:
        e.attrib['y']=str(getTop(e)-pgtop)
        e.attrib['x']=str(getLeft(e))
        page.append(copy.deepcopy(e))
    book.append(page)
    
open('xml/book.1.xml','w').write(etree.tostring(book,encoding='unicode',pretty_print=True))
    
# cleanup styles for everything
for e in book.xpath('//span | //div'):
    style=e.attrib.get('style',None)
    if not style: continue
    def _pair(s): return (_:=s.split(':'))[0].strip(),_[1].strip()
    ss=[_pair(item) for item in style.split(';') if len(item)>0]
    s2=dict(ss)
    # things we definitely want to drop
    for k in ('writing-mode','width','position','top','left','height'): s2.pop(k,None)
    # sematic fonts (family attribute)
    if ff:=s2.pop('font-family',None):
        if ff in ('URWPalladioPali','URWPalladioITU','TimesNewRoman','Times_BPS'): pass
        elif ff.startswith('TT'): pass # special fonts for tables
        elif ff in ('URWPalladioPali-Italic','TimesNewRomanPS-ItalicMT','Times_BPS,Italic'): e.attrib['family']='italic'
        elif ff in ('URWPalladioPali-Bold','Times_BPS,Bold'): e.attrib['family']='bold'
        elif ff in ('URWPalladioPali-BoldItalic','Times_BPS-BoldItalic'): e.attrib['family']='bold-italic'
        else: raise ValueError(f'Unhandled font-family {ff}')
    # size attribute
    if sz:=s2.pop('font-size',None):
        assert sz.endswith('px')
        if sz[:-2] not in ('8','9'): e.attrib['size']=sz[:-2]
    # print(s2)
    if len(s2): e.attrib['style']='; '.join([f'{k}:{v}' for k,v in s2.items()])
    else: del e.attrib['style']



    
if 1:        
    # drop footers with accesstoinsight
    if 1:
        for e in book.xpath('page/div[contains(span[2],"accesstoinsight")]'):
            e.getparent().remove(e)
    
    # drop page bottom lines
    for e in book.findall('page/span[@style="border:gray 1px solid"]'):
        assert e.text is None or e.text==' ' or e.text=='\n' or e.text=='|'
        e.getparent().remove(e)

    # all top-level divs (paragraphs) have this, drop it
    for e in book.xpath('//div[@style="border:textbox 1px solid"]'): del e.attrib['style']

    # funny divs at the end of the page
    for e in book.xpath('page/div[@x="-1"]'): e.getparent().remove(e)

    pgnosMain=[]
    pgnosFront=[]
    # remove page numbers (always last div on page)
    for e in book.xpath('page/div[@y>="605"]'):
        pgno=int(e.getparent().attrib['id'])
        if len(e)!=1: continue
        if e[0].text is None: continue
        # print(pgno,e[0].text)
        #pgnos.append(e[0].text.strip())
        t=e[0].text.strip()
        if sum([c in t for c in 'xivcl'])>0: # front, roman numerals
            import roman
            pgnosFront.append(roman.fromRoman(t.replace(' ','').upper()))
        else:
            pgnosMain.append(int(t))
        e.getparent().attrib['pageno']=e[0].text.strip()
        e.getparent().remove(e)
        
    missingOk={'main':[78,79,80,428,429,430,748,749,750],'front':[]}
    for pp,what in [(pgnosFront,'front'),(pgnosMain,'main')]:
        # print(what,pp)
        for i in range(pp[0],pp[-1]+1):
            if i not in pp and i not in missingOk[what]:print(what,'missing footer',(roman.toRoman(i) if what=='front' else str(i)))

    # remove page headings
    for e in book.xpath('(page/div[1])[@y<=35]'): e.getparent().remove(e)
    for e in book.xpath('(page/div[1])[@y<=35]'): e.getparent().remove(e)
    
    # drop these divs
    for e in book.xpath('.//div[@style="border:figure 1px solid"]'):
        e.getparent().remove(e)

# create <footnote_separator/> for horizontal lines at certain positions (heuristics)
for e in book.xpath('//span[contains(@style,"border:black 1px solid") and (@x="40" or @x="42" or @x="48" or @x="49")]'):
    if e.getparent().attrib['id']=='30' and int(e.attrib['y'])<200: continue
    e.getparent().attrib['footnote_pos']=str(e.attrib['y'])
    # print(e)
    if 0: e.getparent().remove(e)
    else:
        e.tag='footnote_separator'
        del e.attrib['style']

## HACK
for s in book.findall('.//span'):
    if s.text!='107\n': continue
    pgno=s.getparent().getparent().attrib['id']
    print(pgno)
    s.getparent().remove(s)
    
# discard page with title after introduction
for p in book.xpath('/book/page[@id="59"]'): p.getparent().remove(p)
    
open('xml/book.2.xml','w').write(etree.tostring(book,encoding='unicode',pretty_print=True))

3657623

In [156]:
##
## FOOTNOTES
##

from lxml import etree
import itertools,copy,re,string

book=tree=etree.parse('xml/book.2.xml',etree.XMLParser()).getroot() 
# footnotes 


# detect footnote marks (small font)
footCurr=0
for page in book:
    pgno=page.attrib['id']
    for e in page.findall('*/span[@size="5"]'):
        if e.text in (None,'st','nd','rd','th'): continue
        t0=e.text[:]
        hadEol=(e.text.endswith('\n') or e.text.endswith('|'))
        #if pgno=='285':
        #    print(e.text,e.text.strip(),hadEol)
        e.text=e.text.strip().replace('|','')
        if set(e.text)-set(string.digits): raise RuntimeError(f'Unhandled size=5 text: "{e.text}"')
        try: f=int(e.text)
        except ValueError:
            # raise RuntimeError(f'{pgno=}: {e.text=} {t0=}')
            print(f'{pgno=}: error with size=5: {e.text=} {t0=}, SKIPPING')
            continue
        if f not in (footCurr+1,1): print(f'{pgno}: {footCurr} → {f} (repeated footmark)')
        # print(f'{pgno} {f}')
        footCurr=f
        e.tag='footref'
        e.attrib['page_id']=e.getparent().getparent().attrib['id']
        del e.attrib['size']
        # preserve linebreak after footnote (important in verse)
        if hadEol:
            eol=etree.Element('span')
            eol.text='\n'
            e.getparent().insert(e.getparent().index(e)+1,eol)


        
# find footnote marks in footnote area

pgDebug=-1
footLast=0
footVerbose=False
# footrefs=[]
for page in book:
    global footLast
    pgno=int(page.attrib['id'])
    # print(f'{pgno} {footLast}')
    # if pgno>=100: break
    fy=int(page.attrib.get('footnote_pos','0'))
    footrefs=[int(e.text) for e in page.findall('.//footref')]
    if footrefs and footrefs[0]==1:
        footLast=0
    if fy==0: continue # no footnotes on this page
    pageFootNums=[]
    
    if footrefs: footMin,footMax=footLast+1,footrefs[-1]+1
    else: footMin,footMax=footLast+1,footLast+2
    ## exceptions
    if pgno==65: footMax=4
    elif pgno==224: footMax=5
    elif pgno==260: footMax=20
    elif pgno==690: footMax=3
    elif pgno==498: footMax=14
    elif pgno==558: footMax=15
    
    if pgno==pgDebug or footVerbose: print(f'{pgno=} {footMin=} {footMax=} {footLast=} {footrefs=}')

    divs=[d for d in page.findall(f'div') if int(d.attrib['y'])>fy]
    for num in range(footMin,footMax+1):
        numLone=re.compile(rf'^({num})$')
        numPat=re.compile(rf'\b({num})\.')
        numPatLine=re.compile(rf'\n({num}).(?=\n)')
        numPatNoDot=re.compile(rf'\b({num})\b(?!\))')
        for currPat,strategy in (numLone,'$'),(numPatLine,'|'),(numPat,'='),(numPatNoDot,'#'):
            if pgno==76 and strategy=='#': continue
            dobreak=False
            for div in divs:
                for span in div:
                    if span.tag!='span' or not span.text: continue
                    et=span.text.replace("\n","|")
                    if pgno==pgDebug: print(f'{num=} {et=} {strategy=} {currPat=}')
                    matches=list(currPat.finditer(span.text))
                    if not matches: continue
                    if len(matches)>1: print(f'{pgno}: multiple matches for {num}, using first match.')
                    m=matches[0]
                    if num!=footLast+1: print(f'{pgno}: non-sequential footnotes {footLast} → {num}')
                    if span.text[:m.span()[0]].endswith(' note '):
                        print('@@')
                        continue
                    footLast=num
                    # replace elements
                    m0,m1=m.span()
                    if strategy=='|':
                        if pageFootNums: # (pgno,num) in [(35,7),]:
                            eLeft,eMiddle,eRight=etree.Element("span"),etree.Element('footmark',id=m.group(1)),etree.Element("span")
                            eLeft.text=span.text[:m0]
                            eRight.text=span.text[m1:]
                            span.addprevious(eLeft)
                            span.addprevious(eMiddle)
                            span.addprevious(eRight)
                            span.getparent().remove(span)
                        else:
                            span.text=span.text[:m0]+span.text[m1:]
                            span.getparent().insert(0,etree.Element("footmark",id=m.group(1)))
                            print(f'[{pgno} {m.group(1)}]',end=" ")
                    else:
                        if m0>0:
                            e=etree.Element("span")
                            e.text=span.text[:m0]
                            span.addprevious(e)
                        span.addprevious(emark:=etree.Element("footmark",id=m.group(1)))
                        if m1<len(span.text):
                            e=etree.Element("span")
                            e.text=span.text[m1:]
                            span.addprevious(e)
                        span.getparent().remove(span)
                    pageFootNums.append(num)
                    dobreak=True
                    break
                if dobreak: break
            if dobreak: break
open('xml/book.3.xml','w').write(etree.tostring(book,encoding='unicode',pretty_print=True))


book=tree=etree.parse('xml/book.3.xml',etree.XMLParser()).getroot() # remove_blank_text=True)).getroot()
# move overflown footnotes to the pages where they belong
prevFooter=None
for page in book:
    pgno=int(page.attrib['id'])
    sep=page.find('footnote_separator')
    if sep is None:
        prevFooter=None
        # print(f'{pgno}: reset prevFooter')
        continue
    overflow=True
    for footer in sep.itersiblings():
        for e in footer:
            if e.tag=='footmark': overflow=False
            if overflow:
                # hack
                if pgno==581:
                    footer.insert(0,etree.Element('footmark',id='18'))
                    overflow=False
                else:
                    #print(f'({pgno})',end=" ")
                    print(f'{pgno}→{prevFooter.getparent().attrib["id"]}',end=" ")
                    prevFooter.append(e)
    if not overflow:
        prevFooter=footer
        #print(f'{pgno}: {prevFooter=}')
    else:
        pass
        #print(f'{pgno}: still overflowing, {prevFooter=}')
            
#if 0:   
#    p65=book.find('page[@id="65"]')
#    assert 'border:black 1px solid' in p65[-1].attrib['style']
#    p65.remove(p65[-1])

print('\n'+20*'--')
# second pass: put footnotes together
pgDebug=564
footVerb=True
prevNotes={}
prevRefs={}
for page in book:
    pgno=int(page.attrib['id'])
    sep=page.find('footnote_separator')
    # if sep is None and not prevNotes: continue
    ff={}
    # print('\n** pg',pgno)
    mark=None
    if sep is not None:
        # iterate through divs ("paragraphs") in the footer area
        nextPara=False
        for footer in sep.itersiblings():
            marks=[]
            if nextPara: ff[mark].append(etree.Element('div')) # new paragraph of the last footnote
            for e in footer:
                # this is a new footnote
                if e.tag=='footmark':
                    mark=e.attrib['id']
                    marks.append(e)
                    nextPara=False
                    continue
                if mark is None: raise RuntimeError(f'{pgno}')
                if mark not in ff: ff[mark]=[etree.Element('div')]
                ff[mark][-1].append(e)
                # print(f'{pgno}: {mark}')
            if mark and len(ff[mark][-1])>0: nextPara=True
            for e in marks: e.getparent().remove(e)
    for mark,elem in prevRefs.items():
        for e in ff[mark]: elem.append(e)
    if ff or prevNotes:
        # print(f'{ff} {prevNotes}')
        print(f'*{pgno}: {",".join(ff.keys())}',end=" ")
        if prevNotes: print(f'[{",".join(prevNotes.keys())}]',end=" ")
    prevRefs={}
    # print()
    #for footer in sep.itersiblings():
    for ref in page.findall('.//footref'):
        if pgno==pgDebug: print(100*'*'+f'F {pgno}: {ref.text}')
        ref.tag='footnote'
        ref.attrib['page_id']=page.attrib['id']
        ref.attrib['mark']=ref.text
        src=(prevNotes if ref.text in prevNotes else ff)
        if pgno==69 and ref.text=='8' and '8' not in src:
            ref.attrib['reference_existing_footnote']="1"
            # ref.append(etree.Element('footnote_extra_ref_TODO',id='8',page_id='69'))
        elif pgno==397 and ref.text=='8': pass
        elif ref.text in src:
            for e in src.pop(ref.text): ref.append(e)
        else:
            # print(f'[{ref.text} unavailable]',list(prevNotes.keys()),list(ff.keys()))
            prevRefs[ref.text]=ref
        ref.text=None
        if len(ref)==0 or len(ref[-1])==0: pass #  print('@')
        else:
            tail=ref[-1][-1].text.strip()
            if tail.endswith('with Ce of M-a and A-a'): ref[-1][-1].text,tail=ref[-1][-1].text[:-1]+'.',tail+'.'
            if sum([tail.endswith(s) for s in ['.','.”',')”','.]','?','.’',')','…”','”…','nābhinandati …',]])==0:
                # print(f'\n{pgno=} {ref.text} {tail[-30:]}')
                #ref[-1][-1].text=ref[-1][-1].text+'(END-FOOTNOTE-ERROR)'
                raise RuntimeError(f'Error in footnote ending {pgno=} {tail[:-30]}')
    prevNotes=ff
    # print(f'{pgno}: {list(ff.keys())} {list(prevNotes.keys())} {list(prevRefs.keys())}')
    if prevRefs: print(f' ({",".join(prevRefs.keys())})',end=" ")
    
# remove all separators        
for page in book:
    sep=page.find('footnote_separator')
    if sep is None: continue
    for f in sep.itersiblings():
        # print(f'{page.attrib["id"]} {f.tag} {len(f)=} {f.text}')
        if f.tag=='div' and len(f)==0 and f.text=='\n': f.getparent().remove(f)
    #sep.getparent().remove(sep)
    #del page.attrib['footnote_pos']

open('xml/book.4.xml','w').write(etree.tostring(book,encoding='unicode',pretty_print=True))

69: 8 → 8 (repeated footmark)
pgno='499': error with size=5: e.text='' t0=' ', SKIPPING
179: multiple matches for 11, using first match.
@@
@@
@@
30→29 30→29 30→29 30→29 30→29 30→29 30→29 30→29 36→35 36→35 36→35 36→35 36→35 36→35 36→35 36→35 36→35 36→35 36→35 36→35 36→35 36→35 56→55 56→55 56→55 56→55 56→55 56→55 56→55 56→55 56→55 56→55 56→55 56→55 81→80 81→80 81→80 81→80 81→80 81→80 81→80 81→80 81→80 81→80 81→80 81→80 81→80 81→80 81→80 87→86 88→87 88→87 88→87 99→98 133→132 133→132 133→132 133→132 133→132 133→132 133→132 133→132 133→132 133→132 133→132 165→164 165→164 165→164 178→177 178→177 178→177 178→177 178→177 180→179 180→179 180→179 180→179 180→179 180→179 180→179 180→179 180→179 180→179 180→179 180→179 180→179 180→179 180→179 180→179 180→179 180→179 180→179 180→179 180→179 180→179 180→179 180→179 180→179 180→179 180→179 180→179 180→179 180→179 180→179 180→179 180→179 180→179 180→179 193→192 193→192 193→192 193→192 193→192 245→244 245→244 245→244 245→244 245→244 245→244 245→244 24

3681753

In [157]:
##
## paragraphs accross pages, small caps, verse
##

book=tree=etree.parse('xml/book.4.xml',etree.XMLParser()).getroot()
# book=tree=etree.parse('vism.book.4.xml',etree.XMLParser()).getroot()

# this is page 65
for s in book.xpath('.//span[@style="border:black 1px solid" and @y>500]'):
    s.getparent().remove(s)


# remove a few more useless tags
for page in book:
    pgno=int(page.attrib['id'])
    for e in page.findall('.//div'):
        if (
            e.text in (None,'\n','','|',' ') and ('style' not in e.attrib)
            and (len(e)==0 or (len(e)==1 and e[0].tag=='span' and e[0].text in (None,'\n','','|',' ')))
            ): e.getparent().remove(e)
    if len(page)==0: continue
    if page[-1].tag=='footnote_separator':
        page.remove(page[-1])
    
# try to detect Small Caps runs
for page in book:
    pgno=int(page.attrib['id'])
    # if pgno>100: break
    # if pgno!=145: continue
    for span in page.findall('.//span'):
        def _textIsNum(text):
            return re.match('^(?P<head>\([xivcm]+\)) (?P<tail>[^a-z]+)$',text)
        def _spanSc(s,ini=False):
            if s.tag!='span' or s.text is None: return False
            # if not ini: print(f'  test: {s.text}')
            if sum(map(str.islower,s.text)) and not _textIsNum(s.text): return False
            if (upper:=sum(map(str.isupper,s.text)))<(2 if ini else 0): return False
            if ini and upper<len(s.text)//2: return False
            if '.......' in s.text: return False # in TOC
            return True
        def _elemScCompat(s,s2,sizes):
            if not _spanSc(s2,ini=False): return False
            if not _textIsNum(s2.text) and s.attrib.get('family','normal')!=s2.attrib.get('family','normal'): return False
            if len(sizes)==2 and s.attrib.get('size','9') not in sizes: return False
            sizes.add(s.attrib.get('size','9'))
            sizes.add(s2.attrib.get('size','9'))
            return True
        if not _spanSc(span,ini=True): continue
        #  print(f'{pgno}: starting from "{span.text}"')
        run=[span]
        sizes=set()
        for e in span.itersiblings(preceding=True):
            if not _elemScCompat(span,e,sizes):
                # print('no')
                break
            #print(f'  - "{e.text}"')
            run.insert(0,e)
        for e in span.itersiblings():
            if not _elemScCompat(span,e,sizes): break
            #print(f'  + "{e.text}"')
            run.append(e)
        if len(run)<2: continue
        bigger=max([int(s) for s in sizes])
        parent=span.getparent()
        
        if m:=_textIsNum(run[0].text):
            parent.insert(parent.index(span),e:=etree.Element('span',size=str(bigger)))
            e.text=m.group('head')
            run[0].text=m.group('tail')            
        span.text=''.join([s.text if int(s.attrib.get('size','9'))==bigger else s.text.lower() for s in run])
        span.attrib['family']='smallcaps'
        span.attrib['size']=str(bigger)
        for e in run:
            if e!=span: parent.remove(e)
        #print(pgno,sizes,'_'.join([e.text for e in run]))
        #print(pgno,span.text)

        
open('xml/book.4a.xml','w').write(etree.tostring(book,encoding='unicode',pretty_print=True))
        
# detect paragraphs continuing from preceding page
prevDone=False
for page in book:
    pgno=int(page.attrib['id'])
    if len(page)==0: continue
    # if page[0].tag!='div': print(f'{pgno}')
    d0=page[0]
    if d0.tag!='div':
        print(f'{pgno} {d0.tag}',end=" ")
        continue
    assert d0.tag=='div'
    if int(d0.attrib['y'])<60 and int(d0.attrib['x'])<50:
        # unindented start of new paragraph, not continuation
        if re.match('^[0-9]+\. ',d0[0].text): continue
        if prevDone: print(f'{pgno} ?')
        d0.attrib['continuation']="1"
    prevDone=(page[-1][-1].text is not None and page[-1][-1].text.endswith('.'))
    

msg=False
# remove table from introduction
for div in list(book.xpath('page[(@id>=30) and (@id<=32)]/*[self::div or self::span]')):
    pg,y=int(div.getparent().attrib['id']),int(div.attrib['y'])
    # print(f'{div.tag=} {pg=} {y=}')
    #if not (30<=pg<=32): continue
    if pg==30 and y<150: continue
    if pg==32 and y>350: continue
    if not msg:
        d2=etree.Element('div',**div.attrib)
        del d2.attrib['style']
        d2.append(etree.Element('TODO',id='ceylon-king-tab',desc='table with kings of Ceylon',y=div.attrib['y'],x=div.attrib['y']))
        div.addprevious(d2)
    msg=True
    #print(pg,end=' ')
    div.getparent().remove(div)


    
# detect verse divs
for page in book:
    # break
    pgno=int(page.attrib['id'])
    if pgno in (591,724): continue # hack: chapter title would be interpreted as verse
    # if pgno!=273: continue
    # print(pgno)
    for div in page:
        if not (112<int(div.attrib['x'])<125): continue
        if sum([s.text.count('\n') for s in div if s.text])<2: continue
        div.tag='verse'
        lines=[]
        buf=[]
        for s in div:
            if s.text is None or '\n' not in s.text:
                buf.append(s)
                continue
            #print('2',s.text)
            splits=s.text.split('\n')
            for i,l in enumerate(splits):
                # print(f'{i} {l} {buf=}')
                if len(l)>0:
                    e=etree.Element('span')
                    e.text=l
                    for a in ('size','family'):
                        if a in s.attrib: e.attrib[a]=s.attrib[a]
                    buf.append(e)
                if i<len(splits)-1:
                    lines.append(buf)
                    buf=[]
        if buf: lines.append(buf)
        for e in div: div.remove(e)
        for line in lines:
            el=etree.Element('line')
            for e in line: el.append(e)
            div.append(el)

    
    
open('xml/book.4b.xml','w').write(etree.tostring(book,encoding='unicode',pretty_print=True))

14 span 31 span 32 span 

3635190

In [158]:
##
## SECTIONING
## 
import numpy as np

book=tree=etree.parse('xml/book.4b.xml',etree.XMLParser()).getroot()
# detect headings
for sp in book.xpath('.//div[starts-with(span,"Part ")]'):
    if len(sp)!=3 or sp[0].tag!='span' or not 'size' in sp[0].attrib or sp[0].attrib['size']!='26' or not sp[0].text.startswith('Part'): continue
    # h=etree.Element('heading-1-part')
    sp.tag='heading-1-part'
    sp.attrib['toc_name']=sp[0].text.split('\n')[0]
    sp.attrib['toc_num']=sp.attrib['toc_name'].split()[-1]
    ee=[]
    for i,s in enumerate(sp):
        e=etree.Element('span')
        e.text=(s.text.split('\n')[1] if i==0 else s.text).replace('\n','')
        if 'family' in s.attrib: e.attrib['family']=s.attrib['family']
        ee.append(e)
    for s in sp: sp.remove(s)
    for e in ee: sp.append(e)

# keep track of page numbers for chapters -- used for sections/subections below
chap2page=[]
for d1 in book.xpath('.//div[starts-with(span,"Chapter ")]'):
    # skip stuff in TOC
    pgno=int(d1.getparent().attrib['id'])
    if pgno<29: continue
    chap2page.append(pgno)
    d2=d1.getnext()
    d3=d2.getnext()
    d1.tag='heading-2-chapter'
    d1.attrib['toc_name']=d1[0].text.strip()
    d1.attrib['toc_num']=d1[0].text.strip().split()[-1]
    d2.attrib['anchor']=d1.attrib['toc_num']
    for c in d1: d1.remove(c)
    def mk_span(text,**kw):
        ret=etree.Element('span')
        for k,v in kw.items(): ret.attrib[k]=v
        ret.text=text
        return ret
    # print(pgno,d2[0].text.strip())
    d1.append(mk_span(d2[0].text.strip().replace('\n',' ')))
    d1.attrib['subtitle_pali']=d3[1].text
    p=d1.getparent()
    p.remove(d2)
    p.remove(d3)

# intro sectioning

for div in book.xpath('page/div[span/@family="smallcaps" and (span/@size="12" or span/@size="9")]'):
    #div=span.getparent()
    pgno=int(div.getparent().attrib['id'])
    if not 24<pgno<61: continue
    size=div[-1].attrib['size']
    div.tag=('heading-2-chapter' if size=='12' else 'heading-3-section')
    print('*' if size=="12" else '  *',pgno,'|'.join([s.text.strip() for s in div if s.text]))
    for s in div:
        s.attrib.pop('size')
        s.attrib.pop('family')
        if s.text is not None: s.text=s.text.replace('\n',' ')# strip()
    #
    #span.attrib.pop('family'); span.attrib.pop('size'); span.attrib['x']=div.attrib['x']; span.attrib['y']=div.attrib['y']
    #div.addprevious(span)
    #if len(div)==0: div.getparent().remove(div)

#
# tricky, some of these headings are sections and some are subsections
# they use the same font, though, are only indented differently in the ToC (that info is lost in the XML unfortunately)
#
chap2page=np.array(chap2page)
for d in book.xpath('.//div[span/@size="9" and span/@family="smallcaps"]'):
    # skip stuff in TOC
    pgno=int(d.getparent().attrib['id'])
    chap=np.searchsorted(chap2page,pgno,side='right')
    if pgno==63 and chap!=1: raise RuntimeError('Chapters out of sync? Chapter I is on page 63')
    if pgno<30: continue
    # if pgno<139 or pgno>139: continue
    # print(pgno,len(d),'#',"|".join([s.text for s in d if s.text is not None])[:100])
    heading=None
    def _flatten(ss): return ' '.join([s.text for s in ss if s.text is not None])
            
            
    if d[0].text.startswith('[') and len(d)==1:
        heading=d[0].text.strip()
        # away=1
    elif m:=re.match(r'^(\([xiv]+\))',d[0].text):
        print(pgno,len(d),':',"|".join([s.text for s in d if s.text is not None])[:100])
        if '?' in d[0].text:
            heading=d[0].text
        else:
            num=m.group(1)
            spl=d[1].text.split('?')
            heading=num+' '+spl[0].strip()+'?'
            d[1].text=(spl[1] if len(spl)>1 else '')
    elif m:=re.match(r'^(?P<para>[0-9]+\.)\s+(?P<num>\([xiv]+\))(?P<tail>.*)$',d[0].text,re.DOTALL):
        print('##',pgno,len(d),":","|".join([s.text for s in d if s.text is not None]).replace('\n','|')[:100])
        if 0:
            if '?' in d[0].text:
                heading=m.group('num')+' '+d[0].text.split('?')[0]            
            else:
                spl=d[1].text.split('?')
                heading=m.group('num')+' '+m.group('tail')+spl[0]+'?'
                d[1].text=(spl[1] if len(spl)>1 else '')
    else:
        print('??',pgno,len(d),":","|".join([s.text for s in d if s.text is not None]).replace('\n','|')[:100])
        # pass
    # print(heading)
    if heading is not None:   
        # determine whether this is section (level 3) or subsection (level 4)
        # this depends on chapter number and title text
        if 1<=chap<=6: level=3
        elif chap in (7,): level=(3 if heading.startswith('[(') else 4)
        elif chap in (8,9,10): level=3
        elif chap in (11,12,13): level=(4 if (heading.startswith('[(') or (chap==13 and heading=='[General]')) else 3)
        elif chap==14:
            #if re.match('[ABCD]\. ',heading): level=3
            #elif '&' in heading: level=3
            if heading.endswith('Aggregate]') or heading=='Materiality' or heading.startswith('[Perception,'): level=4
            else: level=3
        elif chap==15: level=3
        elif chap==16: level=(4 if heading.startswith('[(') else 3)
        elif chap==17: level=(3 if heading.startswith('[Section ') else 4)
        elif chap==18: level=(4 if heading.startswith('[(') else 3)
        elif chap==19: level=3
        elif chap==20: level=(4 if heading.startswith('[(') else 3)
        elif chap==21: level=(4 if re.match('^\[[0-9]\.',heading) else 3)
        elif chap in (22,23): level=3
        else: raise RuntimeError(f'{pgno} unhandled chapter {chap}')
        #c=list(d)
        #for c in d: d.remove(c)
        d.remove(d[0])
        if level==3: e=etree.Element('heading-3-section',x=d.attrib['x'],y=d.attrib['y'])
        elif level==4: e=etree.Element('heading-4-subsection',x=d.attrib['x'],y=d.attrib['y'])
        print(level*'    '+str(pgno)+' '+heading)
        e.text=heading
        if pgno==697 and e.text.endswith('Etc.'): e.text=e.text+']'
        d.addprevious(e)


   
open('xml/book.4c.xml','w').write(etree.tostring(book,encoding='unicode',pretty_print=True))

* 25 Message from his Holiness the Dalai Lama
* 26 Publisher’s Foreword to Third Edition
* 26 Publisher’s Foreword to Fourth Edition
* 27 Translator’s Preface
* 29 Introduction
  * 29 Background and Main Facts
  * 34 The|Visuddhimagga|and its Author
  * 46 The|Vimuttimagga
  * 47 Trends in the Development of Theravāda Doctrine
  * 49 The|Paramatthamañjusā
  * 51 Concerning the Translation
  * 55 Concluding  Remarks
            63 [I. Introductory]
            68 [II. Virtue]
68 5 : (i) |What is virtue?| It is the states beginning with volition present in one who
abstains from kill
            68 (i) What is virtue?
?? 69 34 : as to the rest—|19. (ii) I|n what sense is it virtue?| It is virtue (|sīla|)| |in the sense of compo
## 69 3 : 20. (iii) Now,  |what  are  its  characteristic,  function,  manifestation,  and  proximate|cause?| 
## 70 3 : 23. (iv) |What  are  the  benefits  of  virtue?|  Its  benefits  are  the  acquisition  of  the|seve
## 72 2 : 25. (v) Now,  here  is  the  answ

3633254

In [159]:
##
## PARAGRAPHS
##

# now we have chapters so can confidently know where paragraphs reset
book=tree=etree.parse('xml/book.4c.xml',etree.XMLParser()).getroot()
# open('vism.book.5.xml','w').write(etree.tostring(book,encoding='unicode',pretty_print=True))

# pg 654, missing paragraph number
## HACK: missing paragraph numbers
##
for prefix,txt in [
        ('87. ','Thus there comes to be the removal of [false] view'), # pg654
        ('7. ','impermanent  in  the  sense  of  destruction,  painful'), # pg692
        ('8. ','Understanding of defining by summarization thus, ‘With birth'), # pg962
    ]:
    fix0=book.xpath('.//div[contains(span,"'+txt+'")]')
    print(prefix,len(fix0))
    assert(len(fix0)==1)
    fix0[0][0].text=prefix+fix0[0][0].text


nextPar=None
currChap=None
for page in list(book):
    pgno=int(page.attrib['id'])
    if pgno<63: continue
    # if pgno>64: continue
    if pgno>802: break
    print(f'[{pgno}:',end="")
    for d in page:
        if d.tag=='heading-2-chapter':
            nextPar=1
            currChap=d.attrib['toc_num']
            continue
        if d.tag!='div' and d.tag!='verse' and not d.tag.startswith('heading-'): print(f'{pgno} {d.tag}')
        if pgno==491 and int(d.attrib['x'])>60: continue
        if pgno==692 and nextPar==10: continue 
        while True:
            found=False
            pat=re.compile(r'^\s*'+str(nextPar)+r'(\.|\s|$)(?P<tail>.*)$',re.DOTALL)
            for s in d:
                if s.text is None: continue
                #if pgno==64: print(s.text,nextPar,)
                # forgotten in the PDF, found in the older paper edition
                if pgno==596 and nextPar==22 and re.match('The\s+first',s.text):
                    # print(f'@HACK@{nextPar}',end="")
                    s.addprevious(e:=etree.Element('vism-para',num=str(nextPar),anchor=currChap+'.'+str(nextPar)))
                    # s.text=str(nextPar)+'. '+s.text
                    e.text=str(nextPar)
                    nextPar+=1
                    found=True
                # one case (pg 98, §122) forgot the dot, so try both with and without
                elif m:=pat.match(s.text): # ,s.text.startswith(str(nextPar)+'. ') or s.text.startswith(str(nextPar)+' '): 
                    print(f'={nextPar}',end="")
                    s.text=m.group('tail').lstrip()
                    # if s[0]==' ': s=s[1:]
                    s.addprevious(e:=etree.Element('vism-para',num=str(nextPar),anchor=currChap+'.'+str(nextPar)))
                    e.text=str(nextPar)
                    # XX.42: has dot extra as italics, remove it
                    if s.text=='': s.getparent().remove(s)
                    if ((n:=e.getnext()) is not None) and n.text=='. ': n.getparent().remove(n)
                    nextPar+=1
                    found=True
            if found: continue
            # if not found at the beginning of the span, search inside the span, as a separate line
            for s in d:
                if s.text is None: continue
                if len(sp:=s.text.split('\n'+str(nextPar)+'.'))>1:
                    assert len(sp)==2
                    print(f'#{nextPar}',end="")
                    s.addprevious(sLeft:=copy.deepcopy(s))
                    sLeft.text=sp[0]
                    s.text=sp[1]
                    s.addprevious(e:=etree.Element('vism-para',num=str(nextPar),anchor=currChap+'.'+str(nextPar)))
                    e.text=str(nextPar)
                    nextPar+=1
                    found=True
            if not found: break
    # add anchors for footnotes in this chapter
    for fn in page.findall('.//footnote'):
        # a=etree.Element(',
        #fn.insert(0
        fn.attrib['anchor']=f'{currChap}.n{fn.attrib["mark"]}'
        
        
    print(']',end=" ")

open('xml/book.5.xml','w').write(etree.tostring(book,encoding='unicode',pretty_print=True))

87.  1
7.  1
8.  1
[63:=1=2] [64:=3=4=5#6] [65:=7] [66:=8=9=10] [67:=11=12=13] [68:=14=15=16=17] [69:=18#19=20] [70:=21=22=23] [71:=24] [72:=25=26] [73:=27=28=29=30=31] [74:=32=33=34=35] [75:=36=37=38=39] [76:=40=41=42] [77:=43=44] [78:=45=46=47] [79:=48=49=50=51=52] [80:=53=54] [81:=55] [82:=56=57=58] [83:=59=60=61=62] [84:=63=64=65=66=67=68] [85:=69=70] [86:=71=72#73=74] [87:=75=76=77=78=79=80] [88:=81=82=83] [89:=84=85=86=87] [90:=88=89=90=91=92] [91:=93=94] [92:=95=96] [93:=97=98=99] [94:=100=101=102=103] [95:=104=105=106=107] [96:=108=109=110=111=112] [97:=113=114=115=116=117=118] [98:=119=120=121=122] [99:=123=124=125=126] [100:=127=128=129] [101:=130=131] [102:=132=133] [103:=134=135=136] [104:=137=138=139=140] [105:] [106:=141=142=143] [107:=144=145=146=147=148=149=150=151] [108:=152=153=154=155] [109:=156=157] [110:=158] [111:=159] [112:=160=161] [113:=1=2=3] [114:=4=5#6=7] [115:=8=9=10=11=12] [116:=13=14=15=16] [117:=17=18=19] [118:=20=21=22] [119:=23=24=25=26] [120:=27=28=29

3771573

In [160]:
##
## LOGICAL STRUCTURE:
## - book, index, glossary as separate documents
## - 
##
from lxml import etree
import itertools,copy,re

book=tree=etree.parse('xml/book.5.xml',etree.XMLParser(remove_blank_text=True)).getroot() # remove_blank_text=True)).getroot()
index=etree.Element("index")
##
## get rid of pages, structure everything logically
##
for page in list(book):
    pgno=int(page.attrib['id'])
    assert page.tag=='page'
    if len(page)==0 or pgno<25 or pgno>806:
        # separate output for index and glossary
        if pgno>=809:
            for i,d in enumerate(page):
                # drop headers for index and glossary
                if pgno in (809,832) and i in (0,1,2): continue 
                d.attrib['page_id']=page.attrib['id']
                index.append(d)
        book.remove(page)
        continue
    for d in page:
        d.attrib['page_id']=page.attrib['id']
        d.attrib['page_no']=page.attrib.get('pageno','N/A')
    if 'pageno' in page.attrib: 
        e=page[0]
        while e.tag.startswith('heading-'): e=e.getnext()
        e.insert(0,br:=etree.Element('printed_page',edition='BPS2011',page_id=d.attrib['page_id']))
        br.text=page.attrib['pageno']
    if 'continuation' in page[0].attrib:
        # print(pgno,book[-1].tag)
        if book[-1].tag=='div':
            for s in page[0]: book[-1].append(s)
            # print(f'{pgno} {page[0].tag} {len(page[0])} {page[0].text}')
            assert len(page[0])==0 and (page[0].text is None or page[0].text.strip()=='')
            page.remove(page[0])
        else: book.append(page[0])
        print('.',end='')
    # assert 'continuation' not in page[0]
    for d in page: book.append(d)
    assert len(page)==0
    book.remove(page)
front=etree.Element('heading-1-part',toc_name='(Front)')
front.text='(Front)'
book.insert(0,front)

def _splitSpan(span,headText,mids,tailText):
    if len(headText)==0: # don't prepend empty span
        for e in mids: span.addprevious(e)
        span.text=tailText
    elif len(tailText)==0: # don't append empty span
        for e in mids: span.addnext(e)
        span.text=headText
    else: # split
        s0=copy.deepcopy(span)
        s0.text=headText
        span.addprevious(s0)
        for e in mids: span.addprevious(e)
        span.text=tailText

# locate PTS page marks [25] etc in all spans (includes nested in verse etc), sequentially
pagePat=re.compile(r'^(?P<head>.*)\[(?P<num>[0-9]{1,3})\](?P<tail>.*)$',re.DOTALL)
for span in book.findall('.//span'):
    if span.text is None: continue
    while m:=pagePat.match(span.text):
        head,num,tail=m.group('head'),m.group('num'),m.group('tail')
        pg=etree.Element('printed_page',edition='PTS')
        pg.text=num
        _splitSpan(span,head,[pg],tail)
        print(f'[{num}]',end=' ')
        
## create section tree
currSect=[]
for e in book:
    if e.tag=='heading-1-part':
        p=etree.Element('struct-1-part',name=e.attrib['toc_name'])
        # p.text=e.text
        book.insert(book.index(e),p)
        currSect=[p]
    elif e.tag=='heading-2-chapter':
        p=etree.Element('struct-2-chapter',name=(e.get('toc_name',' '.join([s.text for s in e if s.text]))))
        # p.text=e.text
        currSect[0].append(p)
        currSect=[currSect[0],p]
    elif e.tag=='heading-3-section':
        p=etree.Element('struct-3-section',name=(e.text if e.text else ' '.join([s.text for s in e if s.text is not None])))
        # p.text=e.text
        # print(e.attrib['page_no'])
        # print(len(currSect))
        if len(currSect)<=1:
            print(e.attrib['page_no'],'no chapter yet!',e.text)
            c=etree.Element('struct-2-chapter',name='[fake-chapter]')
            c.text='[fake chapter]'
            currSect[0].append(c)
            currSect.append(c)
        currSect[1].append(p)
        currSect=[currSect[0],currSect[1],p]
    elif e.tag=='heading-4-subsection':
        p=etree.Element('struct-4-subsection',name=(e.text if e.text else ' '.join([s.text for s in e if s.text is not None])))
        if len(currSect)<=2:
            print(e.attrib['page_no'],'no section yet!',e.text)
            c=etree.Element('struct-3-section',name='[fake-section]')
            currSect[1].append(c)
            currSect.append(c)
        currSect[2].append(p)
        currSect=[currSect[0],currSect[1],currSect[2],p]
    currSect[-1].append(e)

## fuse paragraph tails
for d1 in book.findall('.//div'):
    if len(d1)!=1 or d1[0].tag!='span' or d1[0].text is None: continue
    if (d0:=d1.getprevious()) is None or len(d0)<1 or d0[-1].tag!='span' or d0[-1].text is None: continue
    s1,s0=d1[0],d0[-1]
    t1,t0=s1.text.strip(),s0.text.strip()
    #if len(s1)==0: continue
    if len(t1)>=100: continue
    if (not t0.endswith('.')) and t1[0].islower() and (t1.endswith('.') or t1.endswith('?')):
        # print(t1)
        s0.text+=s1.text
        d1.remove(s1)
    # print(s1.text)
    
    

open('xml/book.5a.xml','w').write(etree.tostring(book,encoding='unicode',pretty_print=True))
open('xml/index.0.xml','w').write(etree.tostring(index,encoding='unicode',pretty_print=True))

...................................................................................................................................................................................................................................................................................................................................................................................................................................................................................[1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15] [16] [17] [18] [19] [20] [21] [22] [23] [24] [25] [26] [27] [28] [29] [30] [31] [32] [33] [34] [35] [36] [37] [38] [39] [40] [41] [42] [43] [44] [45] [46] [47] [48] [49] [50] [51] [52] [53] [54] [55] [56] [57] [58] [59] [60] [61] [62] [63] [64] [65] [66] [67] [68] [69] [70] [71] [72] [73] [74] [75] [76] [77] [78] [79] [80] [81] [82] [83] [84] [85] [86] [87] [88] [89] [90] [91] [92] [93] [94] [95] [96] [97] [98] [99] [100] [101] [102] [103] [104] [105] [106] [107]

299129

In [161]:
##
## INDEX+GLOSSARY (1): fix markup, split to entries
##

index=tree=etree.parse('xml/index.0.xml',etree.XMLParser()).getroot() 

# improper spacing from pdf export, fix that
if 1:
    for e in index.xpath('.//span[contains(text(),"c i t t a")]'): e.text=e.text.replace(' ','')
    for e in index.xpath('.//span[contains(text(),"k n o c k i n g ")]'): e.text=e.text.replace(' ','')

# remove hyphenation
patHyph=re.compile('(?<=\w)-\n(?=\w)')
# patHyph=re.compile('\w-\n\w')
for e in index.findall('.//span'):
    if e.text is None: continue
    e.text,n=patHyph.subn('',e.text)
    # HACK
    e.text=e.text.replace("XIVn.27","XIV.n27").replace('XXI–','XXI-')
    
# remove heading: sort everything
def ix_key(k):
    # page,column,vertical position
    return int(k.attrib['page_id']),(1 if int(k.attrib['x'])<100 else 2),int(k.attrib['y'])
index[:]=sorted(index,key=ix_key)
# fix offset left/right
for d in index:
    if int(d.attrib['page_id'])%2==0: d.attrib['x']=str(int(d.attrib['x'])+6)
# is this conuation?
# X is a hack for one case...
def _isCont(d): return (52<(x:=int(d.attrib['x']))<58) or (231<x<236) or (len(d)>0 and (d[0].text in ('XV') or d[0].text.startswith('95')))

index2=etree.Element("list",title='Index')
glossary=etree.Element("list",title='Glossary')

prevPgno=None
prev=None
for d in index:
    pgno=int(d.attrib['page_id'])
    if pgno==832 and prevPgno!=832:
        index2.append(prev)
        prev=None
    prevPgno=pgno
    if prev is None: prev=etree.Element('entry',page_id=d.attrib['page_id'])
    if not _isCont(d):
        if prev is not None and len(prev)>0: (index2 if pgno<832 else glossary).append(prev)
        prev=etree.Element('entry',page_id=d.attrib['page_id'])
    if d.attrib['y']=='54' and d.attrib['x']=='226' and d.attrib['page_id']=='810':
        print(d[0].text,_isCont(d))
    for e in d:
        prev.append(e)
if prev is not None: index.append(prev)


open('xml/index.1.xml','w').write(etree.tostring(index2,encoding='unicode',pretty_print=True))
open('xml/gloss.1.xml','w').write(etree.tostring(glossary,encoding='unicode',pretty_print=True))

XXI.10, 27, 37, n.6; XXII.5, 44, 79; XXI-II.7
ascetic practice ( False


59099

In [162]:
##
## INDEX+GLOSSARY (2): split fused entries, parse entries to have title+text
##


index=tree=etree.parse('xml/index.1.xml',etree.XMLParser()).getroot() 
glossary=tree=etree.parse('xml/gloss.1.xml',etree.XMLParser()).getroot() 
def splitentry2(e,span,iline,dbg=False):
    ispan=e.index(span)
    ll=span.text.split('\n')
    s1=etree.Element('span')
    s1.text='\n'.join(ll[iline:])
    span.text='\n'.join(ll[:iline])
    if dbg: print(ll[:iline],ll[iline:])
    e1=etree.Element('entry',page_id=e.attrib['page_id'])
    e1.append(s1)
    for s in e[ispan+1:]: e1.append(s)
    e.addnext(e1)
    if span.text=='' and len(e)==1: 
        assert e==span.getparent()
        e.getparent().remove(e)
    
from unidecode import unidecode

for INDEX in (index,glossary):
    for i in range(15):
        for entry in INDEX:
            if entry.getnext() is None or len(entry.getnext())==0: continue
            for span in entry:
                if span.tag!='span': continue
                doBreak=False
                for iline,l in enumerate(ll:=span.text.split('\n')):
                    if l.strip()=='': continue
                    w0=unidecode(l.split()[0]).lower()
                    SPLIT=False
                    if INDEX==index:                        
                        if w0[0] in ('*“'): w0=w0[1:]
                        # alphabetically between start of this entry and beginning of the next one
                        wPrev,wNext=unidecode(entry[0].text).lower(),unidecode(entry.getnext()[0].text).lower()
                        # if entry[0].text.startswith('desir'): print('   ',l,'|',wPrev[:15],'..',wNext[:15],wPrev<w0<wNext,entry[0].text[:15])
                        SPLIT=(wPrev<w0<wNext)
                        dbg=False
                    else:
                        # in the glossary
                        if w0.startswith('*'): SPLIT=True
                        elif re.match('^[a-zA-Z-]+--',w0): SPLIT=True # unidecode normalizes em-dash to --
                        dbg=False
                        # print(w0,SPLIT)
                    if SPLIT and not (span==entry[0] and iline==0):
                        # print(wPrev[:10],l,wNext[:10])
                        splitentry2(entry,span,iline,dbg=dbg)
                        doBreak=True
                        break
                if doBreak: break
    for entry in INDEX:
        for s in entry:
            s.text=s.text.replace('\n',' ').replace('  ',' ').replace('  ',' ')
            if 'size' in s.attrib: del s.attrib['size']
            # if s.tail and s.tail.strip()=='': s.tail='' ##### XXXXX???
            # if s.tail.strip()!='': print('$')
        entry.tail='\n  '

open('xml/index.1a.xml','w').write(etree.tostring(index,encoding='unicode',pretty_print=True))
        
# split entries into title and text
for ent in index:
    # continue
    s0=ent[0]
    if 'prompted' in s0.text: print('A',s0.text)
    t=s0.text
    # if there is chaptere reference just after the word, only match for entry whatever precedes
    if chapref:=re.search(r'[XIV]{2,}',t): t=t[:chapref.span()[0]]
    m=re.match(r'^(?P<title>[\w\s’-]+)((,|;| \(|[XIV]+\.|).*$)',t)
    if m is None: ent.attrib['title']='[???]'
    else:
        ent.attrib['title']=m.group('title').strip()
        # print(m.group('title'),'|',m.group('tail'))
        s0.text=s0.text[len(m.group('title')):].strip()
        if s0.text=='': s0.getparent().remove(s0)

        
# print(len(glossary))
for ent in glossary:
    s0=ent[0]
    if '—' not in s0.text: ent.attrib['title']='[???]'
    else: ent.attrib['title'],s0.text=s0.text.split('—',maxsplit=1)
        
    
#        
print(f'index: {len(index)} entries')
print(f'glossary: {len(glossary)} entries')
open('xml/index.2.xml','w').write(etree.tostring(index,encoding='unicode',pretty_print=True))
open('xml/gloss.2.xml','w').write(etree.tostring(glossary,encoding='unicode',pretty_print=True))

A prompted, prompting (
index: 965 entries
glossary: 920 entries


92534

In [163]:
##
## find internal references (for book, index and glossary)
##
from lxml import etree
import re, itertools

index=tree=etree.parse('xml/index.2.xml',etree.XMLParser()).getroot() 
glossary=tree=etree.parse('xml/gloss.2.xml',etree.XMLParser()).getroot() 
book=tree=etree.parse('xml/book.5a.xml',etree.XMLParser(remove_blank_text=True)).getroot()

# HACK
for s in book.xpath('.//span[contains(text(),"Ch. III n. 5")]'):
    s.text.replace('Ch. III n. 5','Ch. III, n. 5')

for TOP in index,book,glossary:
    repl=-1
    while repl!=0:
        repl=0
        ## fuse adjacent spans without attributes
        for s in TOP.findall('.//span'):
            # if len(s.attrib)>0: continue
            if s.getparent() is None or (s1:=s.getnext()) is None or s1.tag!='span': continue # might be a span we just removed
            #if len(s.attrib)!=len(s1.attrib): continue
            if s.attrib.get('family','')!=s1.attrib.get('family',''): continue
            if s.attrib.get('size','9')!=s1.attrib.get('size','9'): continue
            if s.text is None: continue
            # s.text=s.text.replace('  ',' ').replace('  ',' ')
            if s.text is None or s1.text is None: continue
            s.text+=' '+s1.text ### XXX? unwanted spaces?
            s.text=s.text.replace('  ',' ')
            s.getparent().remove(s1)
            repl+=1
            print('=',end='')

        ## fuse interleaved-family spans where the mid-span no letters (only punctuation)
        for s0 in TOP.findall('.//span'):
            if s0.getparent() is None: continue
            if (s1:=s0.getnext()) is None: continue
            if (s2:=s1.getnext()) is None: continue
            if s.text is None or s1.text is None or s2.text is None: continue
            def _it(s): return s.attrib.get('family','normal')=='italic'
            ii=_it(s0),_it(s1),_it(s2)
            alpha1=sum([c.isalnum() for c in s1.text])
            if alpha1>0 or ')' in s1.text or '(' in s1.text: continue
            if ii in [(0,1,0),(1,0,1)]:
                # print('=-=' if ii[0]==1 else '-=-',s0.text[-10:],'|',s1.text,'|',s2.text[:10])
                s0.text=(s0.text+s1.text+s2.text).replace('  ',' ')
                s1.getparent().remove(s1)
                s2.getparent().remove(s2)
                repl+=1
    

# open('xml/index.2a.xml','w').write(etree.tostring(index,encoding='unicode',pretty_print=True))

def _matchHeadTail(m):
    return m.string[:m.span()[0]],m.string[m.span()[1]:]

            
def mkVismRef(*,target,text,type='vism',loc=None):
    e=etree.Element('ref',type=type,target=target)
    if loc is not None: e.attrib['loc']=loc
    if text=='' or text is None: raise RuntimeError(e)
    e.text=text.replace('\n',' ')
    return e

# references to chapter . paragraph (possibly chained)
def _mkXrefChapPar_2(top):
    import re
    patRefStart=re.compile(r'''
        \b(?P<chapter>
            ((?P<ch0>[IVX]{1,5})\.)
            |
            ((Ch.|Chapter)\s+(?P<ch1>[IVX]{1,5}),?\s+)
            |
            ((?P<ch2>[IVX]{1,5})\s+(?=passim))
        )
    ''',re.X)
    patPara=re.compile(r'§?(?P<num>[0-9]+)(f\.|ff\.)?')
    patNote=re.compile(r'(note |n. |n.)(?P<num>[0-9]+)\b')
    patPassim=re.compile(r'passim\b')
    patCont=re.compile(r'(and\s+|,\s+|-|–)\b')
    retRefs=0
    for span in top.findall('.//span'):
        if span.text is None: continue
        pgno=int(span.getparent().attrib.get('page_id',"0"))
        #dbg=(pgno==812 and (entry:=span.getparent()).tag=='entry' and entry.attrib['title']=='concentration')
        dbg=False
        chPos0=0
        while chapm:=patRefStart.search(span.text,pos=chPos0):
            chap=[chapm.group(g) for g in ['ch0','ch1','ch2'] if chapm.group(g) is not None][0]
            head,tail=_matchHeadTail(chapm)
            chPos0=chapm.span()[1]
            for nth in itertools.count():
                if nth==0: pass
                elif mc:=patCont.match(span.text):
                    head,tail=span.text[:mc.span()[1]],span.text[mc.span()[1]:]
                    if dbg: print(f'  {nth} continuation: {head=} {tail=}')
                else:
                    if dbg: print(f'  {nth} NO continuation: {tail=}')
                    break
                if m0:=patPara.match(tail):
                    if dbg: print(f'  {nth} match: § {tail=}')
                    mm,target=m0,f'{chap}.{m0.group("num")}'
                elif m1:=patNote.match(tail):
                    if dbg: print(f'  {nth} match: N {tail=}')
                    mm,target=m1,f'{chap}.n{m1.group("num")}'
                elif m2:=patPassim.match(tail):
                    if dbg: print(f'  {nth} match: passim {tail=}')
                    mm,target=m2,f'{chap}'
                else:
                    if dbg: print(f'  NO match: {tail=}')
                    break
                e=mkVismRef(text=(chapm.group(0) if nth==0 else '')+mm.group(0),target=target)
                retRefs+=1
                tail=tail[mm.span()[1]:]
                if dbg: print(f'     {head=} {e=} {tail=}')
                _splitSpan(span,head,[e],tail)
                chPos0=0 # span changed
            # print(span)
    return retRefs




# references to chapter only
def _mkXrefChap(top):
    patChap=re.compile(r'''
        ((Ch.|Chapter)\s+(?P<num>([IVX]{1,5})))
    ''',re.X)
    ret=0
    for span in top.findall('.//span'):
        if span.text is None: continue
        pgno=int(span.getparent().attrib.get('page_id',"0"))
        pos0=0
        while chapm:=patChap.search(span.text,pos=pos0):
            ret+=1
            head,tail=_matchHeadTail(chapm)
            pos0=chapm.span()[1]
            chap=chapm.group('num')
            _splitSpan(span,head,[mkVismRef(text=chapm.group(0),target=chapm.group('num'))],tail)
    return ret
        
     
## references to bibliography
refRomDec=re.compile(r'(?P<book>S|M|D|A|A-a|Dhp-a|J-a|M-a|Paṭis|S-a|Vism|Nidd|Paṭṭh|Vin)(?P<loc>\s+[XIVC]+\s+[0-9–]+(f\.)?)')

# |Nidd\s+I+|Paṭṭh\s+I+|Vin\s+[IV]+
refDec=re.compile(r'''
    # (\b|^)
    (?P<book>Sn|Ud|Cp|Cp-a|Dhp|Dhs|Dhs-a|Dhs-ṭ|Dhātuk|It|Kv|Kathāvatthu|Mil|Netti|Nikāya-s|Paṭis-a|Peṭ|Pv|Sn-a|Th|Vibh|Vibh-a|Vibh-ṭ|Vv|Vism-mhṭ|Vism mhṭ)
    (?P<loc>
        (\s+|,)
        (
            [0-9§.–]+
            |
            \(p.\s*[0-9.–]+\)
            |
            \s*p\.\s*[0-9]+
        )
        (f\.|\b|$)?
    )
''',re.X)
refMhv=re.compile(r'(?P<book>Mhv)(?P<loc>(\s+pp\.)?\s+[0-9XIV.–]+)')
# make sure the suffixed variants (like Dhs-a) come before the stem (like Dhs), otherwise the stem matches first
refLone=re.compile(r'(^|\b)(?P<book>Sn|Ud|Cp|Cp-a|Dhp|Dhs-a|Dhs-ṭ|Dhs|Dhātuk|Kv|Mil|Netti|Nikāya-s|Paṭis-a|Peṭ|Pv|Sn-a|Th|Vibh-a|Vibh-ṭ|Vibh|Vv|Vism-mhṭ|Vism mhṭ|A-a|Dhp|Dhp-a|J-a|M-a|Paṭis|S-a|Vin|Nidd|Paṭṭh)(\b|$)')

import unidecode

bibTargetFixes={'Vism mhṭ':'Vism-mhṭ'}

def _mkXrefBib(top):
    ret=0
    for span in top.findall('.//span'):
        if span.text is None: continue
        pgno=int(span.getparent().attrib.get('page_id',"0"))
        while (
               (bibm:=refRomDec.search(span.text))
            or (bibm:=refDec.search(span.text))
            or (bibm:=refLone.search(span.text))
            or (bibm:=refMhv.search(span.text))
        ):
            ret+=1
            head,tail=_matchHeadTail(bibm)
            book=bibm.group('book')
            loc=(bibm.group('loc') if 'loc' in bibm.groupdict() else None)
            _splitSpan(span,head,[mkVismRef(text=bibm.group(0),target=bibTargetFixes.get(book,book),type='bib',loc=loc)],tail)
    return ret

        



for obj,what in [(book,'book'),(index,'index'),(glossary,'glossary')]:
    print(what)
    print('  para',_mkXrefChapPar_2(obj))
    print('  chap',_mkXrefChap(obj))
    print('  bib ',_mkXrefBib(obj))
   

# clean linebreaks (not needed anymore) and double spaces allover the place

for s in book.findall('.//span'):
    if s.text is not None:
        s.text=s.text.replace('\n',' ')
        s.text=s.text.replace('  ',' ').replace('  ',' ').replace('  ',' ')
        # em and en dashes
        s.text=s.text.replace('— ','—').replace('– ','–')
        
# rename all divs to p
for div in book.findall('.//div'):
    div.tag='p'

open('xml/book.5b.xml','w').write(etree.tostring(book,encoding='unicode',pretty_print=True))
open('xml/index.2a.xml','w').write(etree.tostring(index,encoding='unicode',pretty_print=True))
open('xml/gloss.2a.xml','w').write(etree.tostring(glossary,encoding='unicode',pretty_print=True))

  para 394
  chap 42
  bib  1973
index
  para 6400
  chap 0
  bib  0
glossary
  para 345
  chap 0
  bib  41


106940

In [164]:
book=tree=etree.parse('xml/book.5b.xml',etree.XMLParser(remove_blank_text=True)).getroot()
index=tree=etree.parse('xml/index.2a.xml',etree.XMLParser()).getroot() 
glossary=tree=etree.parse('xml/gloss.2a.xml',etree.XMLParser()).getroot() 

## patterns to REMOVE the hyphne, in book
pats='''conscious- ness
appre- hending
Anurādha- pura
existing- ness
Concen- tration
Dhamm- asaṅgaṇī
sammappa- dhāna
compre- hending
aris- ing
conscious- ness-originated
pain- ful
im- permanent
disappear- ance
understand- ing
unaban- doned
or-what- ever-states
cakkhuviññāṇa- dhātuyā
behav- iour
differ- ence
imperma- nence
under- standing'''
for s in book.findall('.//span'):
    if s.text is None: continue
    s.text=s.text.replace('  ',' ')
    if s.text.endswith('- ') and (pg:=s.getnext()) is not None and pg.tag=='printed_page' and (s2:=pg.getnext()) is not None and s2.tag=='span' and s2.text is not None:
        tail,head=s.text.split()[-1],s2.text.split()[0]
        s.text=s.text[:-1]+head
        s2.text=s2.text[len(head):]
    for p in pats.split('\n'):
        if p in s.text: s.text=s.text.replace(p,p.replace('- ',''))
    s.text=s.text.replace('- ','-')

## patterns to KEEP the hyphen, in index and glossary
pats='''power- wielder
adhered- to
not- self
adukkham- asukha
### questionmarks make the pattern not match, check how it should be
?-suta- muta
?kāma- bhava
?kalyāṇa- puthujjana
?sampatta- visaya'''
for TOP in index,glossary:
    for s in TOP.findall('.//span'):
        if s.text is None: continue
        s.text=s.text.replace('  ',' ')
        for p in pats.split('\n'):
            if p in s.text: s.text.replace('  ',' ').replace(p,p.replace('- ','-'))
        s.text=s.text.replace('  ',' ').replace('- ','')

for TOP in index,glossary,book:
    for s in TOP.findall('.//span[@family="italic"]'):
        if len(s.attrib)>1: continue
        assert len(s.attrib)==1
        del s.attrib['family']
        s.tag='em'
# move namo tassa after chapter title
for namotassa in book.xpath('.//p[@page_id="63" and @x="160" and @y=126]'):
    assert 'Namo tassa bhagavato' in namotassa[-1].text
    #print('@')
    # namotassa.getparent().remove(namotassa)
    n=namotassa.getnext()
    print(n.tag)
    #print(n[0].tag)
    assert n.tag=='struct-2-chapter' and n[0].tag=='heading-2-chapter'
    n.insert(1,namotassa)
    # assert 
        
#for TOP in index,glossary,book:
#    for s in TOP.xpath('.//span'):
#        if s.text is None: s.getparent().remove(s)

#for e in book.xpath('.//span[@style="border:black 1px solid"]'):
#    #assert len(e)==0
#    for c in e:
#    e.getparent().remove(e)
       
open('xml/book.6.xml','w').write(etree.tostring(book,encoding='unicode',pretty_print=True))
open('xml/index.3.xml','w').write(etree.tostring(index,encoding='unicode',pretty_print=True))
open('xml/gloss.3.xml','w').write(etree.tostring(glossary,encoding='unicode',pretty_print=True))

struct-2-chapter


105620

In [165]:
!ln -sf book.6.xml xml/book.final.xml
!ln -sf index.3.xml xml/index.final.xml
!ln -sf gloss.3.xml xml/gloss.final.xml

In [169]:
##
## LaTeX output
##
book=tree=etree.parse('xml/book.final.xml',etree.XMLParser()).getroot() 
index=tree=etree.parse('xml/index.final.xml',etree.XMLParser()).getroot() 
gloss=tree=etree.parse('xml/gloss.final.xml',etree.XMLParser()).getroot() 

def _latex_writer(e,lev=0,ord=-1):
    def _rep(t): return t.replace('&','\\&')
    def _recurse(e,lev=lev):
        if e.text is not None and len(e)==0: return _rep(e.text)
        return ''.join([_latex_writer(e2,lev=lev+1,ord=ord) for ord,e2 in enumerate(e)])
    def _nobraces(t):
        #if t.strip().endswith(']'):
        return t.replace('[','').replace(']','')
        #return t
    def _title(sect,e):
        t=_recurse(e)
        return f'\\{sect}[{_nobraces(t)}]'+'{'+t+'}\n'
    ret=''
    ind=2*lev*'  '
    if e.tag=='em':
        assert len(e)==0
        if e.text is None: return ''
        return '\\emph{'+_rep(e.text)+'}'
    elif e.tag=='span':
        assert len(e)==0
        if e.text is None: return ''
        tx=_rep(e.text)
        if (fam:=e.attrib.get('family',None)) is None: return tx
        elif fam=='italic': return '\\emph{'+tx+'}'
        elif fam=='bold': return '\\textbf{'+tx+'}'
        elif fam=='smallcaps': return '\\textsc{'+tx+'}'
        elif fam=='bold-italic': return '\\textbf{\\emph{'+tx+'}}'
        else: raise RuntimeError(f'Unrecognized family {fam}')
    elif e.tag=='p': return ('\n\n'+ind if ord>0 else '')+_recurse(e)
    elif e.tag=='vism-para': return '\\par\\noindent\\textbf{§'+e.text+'.}\\vismHypertarget{'+e.attrib['anchor']+'}{}\\marginnote{\\footnotesize\\textcolor{purple}{'+e.attrib['anchor']+'}}{}\n'+ind
    elif e.tag=='footnote':
        check=r'\vismAssertFootnoteCounter{'+e.attrib['mark']+'}'
        if 'reference_existing_footnote' in e.attrib: return check+r'\footnotemark[\value{footnote}]'
        elif anchor:=e.attrib.get('anchor',None): return '\\footnote{'+check+'\\vismHypertarget{'+anchor+'}{}\\marginnote{\\footnotesize\\textcolor{purple}{'+anchor+'}}'+_recurse(e)+'}'
        else: return '\\footnote{'+check+_recurse(e)+'}'
    elif e.tag=='verse':
        assert e[-1].tag=='line'
        e[-1].attrib['last-line']="1"
        return '\n'+ind+'\\begin{verse}\n'+_recurse(e)+ind+'\\end{verse}\n'
    elif e.tag=='line': return ind+_recurse(e)+(r'\\{}' if not 'last-line' in e.attrib else '')+'\n'
    elif e.tag=='heading-1-part':
        if e.attrib['toc_name']=='(Front)': return '' # ind+'\\frontmatter\n\n'
        return '\n'+ind+_title('part',e) # '\\part{'+_recurse(e)+'}\n'
    elif e.tag=='heading-2-chapter':
        # \label is jsut for PlasTeX which will then name the output file accordingly
        if toc_num:=e.attrib.get('toc_num',None): return '\n'+ind+_title('chapter',e)+ind+'\\vismHypertarget{'+toc_num+'}\n'
        else: return '\n'+ind+_title('chapter',e)
    elif e.tag=='heading-3-section': return '\n'+ind+_title('section',e)
    elif e.tag=='heading-4-subsection': return '\n'+ind+_title('subsection',e)
    elif e.tag in ('struct-2-chapter','struct-3-section','struct-4-subsection'): return _recurse(e)
    elif e.tag=='struct-1-part':
        # if e.attrib['name']=='(Front)': pre,post='' # ind+'\\frontmatter',''
        if e.attrib['name']=='Part I': pre,post=ind+'\\mainmatter',''
        elif e.attrib['name']=='Part III': pre,post='',ind+'\\appendix'
        else: pre,post='',''
        return pre+_recurse(e)+post
    #elif e.tag=='footref': return '\\textbf{ERROR: footnote reference '+e.text+'}'
    #elif e.tag=='footnote_separator': return r'\textbf{ERROR: footnote\textunderscore{}separator}'
    elif e.tag=='printed_page':
        #if e.attrib['edition']=='BPS2011': return r'{\small\textbf{\href[page='+e.attrib['page_id']+']{PathofPurification2011.pdf}{\{'+e.text+' ('+e.attrib['page_id']+')\}}}}' # marginpar{['+e.text+r']}'
        if e.attrib['edition']=='BPS2011':
            # return r'\marginnote[\footnotesize\{'+e.text+'('+e.attrib['page_id']+r')\}]{}[-1ex]' # this is too complicated for PlasTeX
            return r'\marginnote{\textcolor{teal}{\footnotesize\{'+e.text+'('+e.attrib['page_id']+r')\}}}{} '
        elif e.attrib['edition']=='PTS': return r'\textcolor{brown}{\textit{['+e.text+']}} '
        assert False
    elif e.tag=='ref':
        if e.attrib['type']=='vism': return r'\hyperlink{'+e.attrib['target']+r'}{'+e.text+'}{}'
        elif e.attrib['type']=='bib': return r'\textbf{\cite{'+e.attrib['target']+'}'+(e.attrib['loc'] if 'loc' in e.attrib else '')+'}'
        # r'\fbox{'+e.text+'→'+e.attrib['target']+'}'
        assert False
    elif e.tag=='list':
        #print(e.tag,e.attrib)
        #title=e.attrib['title']
        return r'\chapter{'+e.attrib['title']+'}'+r'\begin{multicols}{2}\parskip=.2\baselineskip\RaggedRight\parindent=-1em\leftskip=1em '+_recurse(e)+r'\end{multicols}'
    elif e.tag=='entry':
        return r'\par\textbf{'+e.attrib['title']+'} '+_recurse(e)+'\n'
    elif e.tag=='TODO':
        return r'\textbf{TODO '+e.attrib['id']+': '+e.attrib['desc']+'}'
    raise RuntimeError(f'Unhandled tag <{e.tag}>')
    
open('latex/vism-body.tex','w').write(''.join(_latex_writer(e) for e in book))
open('latex/vism-index.tex','w').write(''.join(_latex_writer(index)))
open('latex/vism-glossary.tex','w').write(''.join(_latex_writer(gloss)))

    

56214

# TODO

* maybe detect some more text headings? (subsections)
* ? find all word from glossary and turn those to hyperlinks
* (needed?) tag indented paragraphs, unindented paragraphs (§)
* use dictionary to distinguish `<em>` and `<pali>` (semantically)

# DONE
* paragraphs in footnotes disappear (should be there)* paragraphs in footnotes disappear (should be there)
* scan for bibliography references, turn them into `<ref type="bib" target="...">...</ref>`
* detect chapter-only hyperlinks (Ch. XXX, Chapter XXX and such)
* "IV passim" reference as chapters* "IV passim" reference as chapters
* figure out point sizes and styles for remaining headings* (mostly done) figure out point sizes and styles for remaining headings
* turn index & glossary into keyword-definition pairs (split where appropriate, glossary uses em-dash)* turn index & glossary into keyword-definition pairs (split where appropriate, glossary uses em-dash)
* (re.DOTALL for multi-line regexps!): FIXME: some headings are eatn in XIX and XX (possibly others), perhaps some nodes get killd accidentally when headings are created?
* sort elements in two-column parts (index) so that second column comes after the first one
* detect all Vism. references (es. in the glossary/index) in text and turn them to hyperlinks
* main text: em-dash+space (was em-dash + linebreak): remove space
* undo hyphenation
   
   
Hand Work
==========

* glossary: theOrder,theCommunity (fused)
* II.2 II.10 II.24 II.34 ?

In [167]:
from lxml import etree
##
## Sphinx
##
book=tree=etree.parse('xml/book.final.xml',etree.XMLParser()).getroot() 
index=tree=etree.parse('xml/index.final.xml',etree.XMLParser()).getroot() 
gloss=tree=etree.parse('xml/gloss.final.xml',etree.XMLParser()).getroot() 


class SphinxWriter(object):
    def __init__(self,outdir):
        self.footnotes={}
        self.outdir=outdir
        self.chapter=0
        self.part=0
    def fixanchor(self,a):
        # return a.replace('.','')
        return a
    def _flush(self):
        if not self.footnotes: return ''
        ret='\n\n.. rubric:: Footnotes\n\n'
        # TODO: multi-paragraph footnotes
        for k,vv in self.footnotes.items(): ret+=f'\n\n.. [#{k}] '+'\n    '.join([v for v in vv.split('\n')])+'\n'
        self.footnotes={}
        return ret
    def _rep(self,t): return t # .replace('&','\\&') 
    def recurse(self,e):
        if e.text is not None and len(e)==0: return self._rep(e.text)
        return ''.join([self.write(e2,ord=ord) for ord,e2 in enumerate(e)])
    def title(self,e,level,anchor=None,prefix=None):
        ret=''
        if anchor: ret+='\n\n.. _'+anchor+':'
        t=(e if isinstance(e,str) else self.recurse(e))
        if prefix: t=prefix+'. '+t
        return ret+'\n\n'+t+'\n'+len(t)*('#*=-^"'[level])
    def enclose(self,t,c):
        if t.strip()=='': return ' '
        ret=t
        if ret.endswith(' '): ret=ret.rstrip()+c+' '
        else: ret=ret+c+'\\ '
        if ret.startswith(' '): ret=' '+c+ret.lstrip()
        else: ret=c+ret
        return ret

    def write(self,e,ord=-1,list_type=None):
        if list_type is not None: self.list_type=list_type
        def _nobraces(t):
            #if t.strip().endswith(']'):
            return t.replace('[','').replace(']','')
            #return t
        if e.tag=='em':
            assert len(e)==0
            if e.text is None: return ''
            return self.enclose(self._rep(e.text),'*')
        elif e.tag=='span':
            assert len(e)==0
            if e.text is None: return ''
            tx=self._rep(e.text)
            if (fam:=e.attrib.get('family',None)) is None: return tx
            elif fam=='italic': return self.enclose(tx,'*')
            elif fam=='bold': return self.enclose(tx,'**')
            elif fam=='smallcaps': return self.enclose(tx,'``')
            elif fam=='bold-italic': return self.enclose(tx,'``')
            else: raise RuntimeError(f'Unrecognized family {fam}')
        elif e.tag=='p': return ('\n\n' if ord>0 else '')+self.recurse(e)
        elif e.tag=='vism-para': return f'\n\n.. _{self.fixanchor(e.attrib["anchor"])}:\n\n**§{e.text}** '
        elif e.tag=='footnote':
            anchor=self.fixanchor(e.attrib.get("anchor",str(len(self.footnotes)+1)))
            if 'reference_existing_footnote' in e.attrib: return '[#{anchor}]_'
            self.footnotes[anchor]=self.recurse(e)
            return f' [#{anchor}]_ '
        elif e.tag=='verse':
            return '\n\n'+self.recurse(e)
        elif e.tag=='line': return ('\n\n' if ord==0 else '')+'\n| '+self.recurse(e)+('\n' if 'last-line' in e.attrib else '')
        elif e.tag=='heading-1-part':
            self.partOut.write(self.title(e,level=1,prefix=e.attrib["toc_name"])+'\n\n.. toctree::\n   :numbered:\n   :maxdepth: 6\n\n')
            return ''
        elif e.tag=='heading-2-chapter':
            return self.title(e,level=2,anchor=e.attrib.get('toc_num',None),prefix=e.attrib.get('toc_num',None))
        elif e.tag=='heading-3-section':
            return self.title(e,level=3)
        elif e.tag=='heading-4-subsection':
            return self.title(e,level=4)
        elif e.tag in ('struct-3-section','struct-4-subsection'): return self.recurse(e)
        elif e.tag=='struct-1-part':
            self.part+=1
            f=f'{self.outdir}/part-{self.part}.rst'
            self.partOut=open(f,'w')
            print(f'→ {f}')
            for chap in e:
                self.partOut.write('\n   '+self.write(chap))
            self.partOut.close()
            return None
        elif e.tag=='struct-2-chapter':
            self.chapter+=1
            f=f'ch-{self.chapter:02d}.rst'
            ff=f'{self.outdir}/{f}'
            print(f'   → {ff}')
            out=open(ff,'w')
            out.write(self.recurse(e)+self._flush())
            return f
        elif e.tag=='book':
            for e2 in e: self.write(e2)
            return None
        elif e.tag=='footref': return f'[#{e.text}]_'
        elif e.tag=='printed_page':
            #if e.attrib['edition']=='BPS2011': return r'{\small\textbf{\href[page='+e.attrib['page_id']+']{PathofPurification2011.pdf}{\{'+e.text+' ('+e.attrib['page_id']+')\}}}}' # marginpar{['+e.text+r']}'
            if e.attrib['edition']=='BPS2011': return f'*[{e.text}/{e.attrib["page_id"]}]*\n'
            elif e.attrib['edition']=='PTS': return f' ``{e.text}`` '
            assert False
        elif e.tag=='ref':
            if e.attrib['type']=='vism':
                #return f'`{e.text} <{self.fixanchor(e.attrib["target"])}>`_ '
                return f':ref:`{e.text} <{self.fixanchor(e.attrib["target"])}>` '
            elif e.attrib['type']=='bib': return f' [{e.attrib["target"]}]_ '+(self.enclose(e.attrib["loc"],'*') if 'loc' in e.attrib else '')+' '
            assert False
        elif e.tag=='list':    
            if self.list_type=='index': name,ret='index_',self.title(e='Index',level=1,anchor='index')+'\n\n.. glossary::'
            if self.list_type=='glossary': name,ret='glossary',self.title(e='Glossary',level=1,anchor='glossary')+'\n\n.. glossary::'
            ret+=self.recurse(e)
            f=f'{self.outdir}/{name}.rst'
            print(f'→ {f}')
            open(f,'w').write(ret)
            return
            # return r'\chapter{'+e.attrib['title']+'}'+r'\begin{multicols}{2}\parskip=.2\baselineskip\RaggedRight\parindent=-1em\leftskip=1em '+_recurse(e)+r'\end{multicols}'
        elif e.tag=='entry':
            #if self.list_type=='index':
            #    return f'\n   {e.attrib["title"]}: '+self.recurse(e)
            #elif self.list_type=='glossary':
            title=e.attrib["title"].replace("*","\\*")
            return f'\n\n   {title}\n          '+self.recurse(e) # {e.attrib["desc"]}'
        elif e.tag=='TODO':
            return f'\n\n.. todo:: {e.attrib["id"]}\n\n    e.attrib["desc"]'
        raise RuntimeError(f'Unhandled tag <{e.tag}>')
    
writer=SphinxWriter(outdir='sphinx/source')
writer.write(book)
writer.write(index,list_type='index')
writer.write(gloss,list_type='glossary')
# ''.join(_latex_writer(e) for e in book))
#open('/tmp/vism-index.tex','w').write(''.join(_latex_writer(index,list='index')))
#open('/tmp/vism-glossary.tex','w').write(''.join(_latex_writer(gloss,list='glossary')))

    

→ sphinx/source/part-1.rst
   → sphinx/source/ch-01.rst
   → sphinx/source/ch-02.rst
   → sphinx/source/ch-03.rst
   → sphinx/source/ch-04.rst
   → sphinx/source/ch-05.rst
→ sphinx/source/part-2.rst
   → sphinx/source/ch-06.rst
   → sphinx/source/ch-07.rst
→ sphinx/source/part-3.rst
   → sphinx/source/ch-08.rst
   → sphinx/source/ch-09.rst
   → sphinx/source/ch-10.rst
   → sphinx/source/ch-11.rst
   → sphinx/source/ch-12.rst
   → sphinx/source/ch-13.rst
   → sphinx/source/ch-14.rst
   → sphinx/source/ch-15.rst
   → sphinx/source/ch-16.rst
   → sphinx/source/ch-17.rst
   → sphinx/source/ch-18.rst
→ sphinx/source/part-4.rst
   → sphinx/source/ch-19.rst
   → sphinx/source/ch-20.rst
   → sphinx/source/ch-21.rst
   → sphinx/source/ch-22.rst
   → sphinx/source/ch-23.rst
   → sphinx/source/ch-24.rst
   → sphinx/source/ch-25.rst
   → sphinx/source/ch-26.rst
   → sphinx/source/ch-27.rst
   → sphinx/source/ch-28.rst
→ sphinx/source/index_.rst
→ sphinx/source/glossary.rst
