In [7]:
from lxml import etree
import roman
srcStem='origin/vimm7a'
import zipfile, re, subprocess
def getOdfFile(odf,what,mode='r'):
    '''Return the full contents of member `what` stored in the ODF (zip) archive `odf`.

    `mode` is handed unchanged to `zipfile.Path.open`.
    '''
    member=zipfile.Path(odf,at=what)
    with member.open(mode=mode) as stream:
        return stream.read()
def setOdfFile(odf,what,data):
    '''Store `data` as member `what` of the ODF (zip) archive `odf`.

    Any existing member of that name is first deleted by running the external
    `zip` program (its exit status is ignored); the new content is then
    appended deflate-compressed. `data` may be bytes or str; str is encoded
    as UTF-8.
    '''
    assert isinstance(data,(str,bytes))
    deleteCmd=['zip','--quiet','--delete',odf,what]
    subprocess.call(deleteCmd)
    with zipfile.ZipFile(odf,'a',compression=zipfile.ZIP_DEFLATED) as archive:
        with archive.open(what,'w') as member:
            if isinstance(data,bytes): payload=data
            else: payload=data.encode('utf-8')
            member.write(payload)
def _E(tag,text=None,subs=[],**kw):
    '''Create an lxml element.

    tag:  element tag
    text: text content of the element (None for no text)
    subs: iterable of elements appended as children, in order (never modified here)
    kw:   attributes of the element
    '''
    node=etree.Element(tag,**kw)
    node.text=text
    for child in subs:
        node.append(child)
    return node

# Parse content.xml taken from the source .odt document.
dta=getOdfFile(f'{srcStem}.odt',what='content.xml',mode='rb')
vimm=etree.fromstring(dta,etree.XMLParser())
# keyword arguments for xpath calls, used as vimm.xpath(expr,**ns)
ns={'namespaces':vimm.nsmap}

# "{uri}" prefixes for building qualified names in the text: and style: namespaces
textNs,styleNs=('{'+vimm.nsmap[prefix]+'}' for prefix in ('text','style'))

def _tag(e): return e.tag.split('}')[-1]

# listMap: automatic list-style name -> num-prefix + num-format + num-suffix
# of the style's first child, which must describe numbering level 1.
listMap={}
styles=vimm.xpath('.//office:automatic-styles',**ns)[0]
for ls in styles:
    if _tag(ls)!='list-style': continue
    name=ls.attrib[styleNs+'name']
    ls0=ls[0]
    assert _tag(ls0)=='list-level-style-number'
    assert ls0.attrib[textNs+'level']=='1'
    pns=ls0.get(styleNs+'num-prefix',''),ls0.get(styleNs+'num-format',''),ls0.get(styleNs+'num-suffix','')
    listMap[name]=''.join(pns)

# pMap: automatic paragraph style name -> parent style name (only styles declaring a parent)
pMap=dict([(s.attrib[styleNs+'name'],s.attrib[styleNs+'parent-style-name']) for s in styles if (s.get(styleNs+'family',None)=='paragraph' and styleNs+'parent-style-name' in s.attrib)])

# Dump the parsed document pretty-printed; the `with` block guarantees the
# file is flushed and closed before anything reads it back.
# NOTE(review): the file is written in the platform default encoding — confirm that is intended.
with open(srcStem+'.content.xml','w') as contentFile:
    contentFile.write(etree.tostring(vimm,encoding='unicode',pretty_print=True))


# Abbreviations of bibliography entries, one per line.
# Order matters for first-match prefix lookups: an entry must precede any
# shorter entry that is a prefix of it ("It.-a." before "It.",
# "M. Vyut." before "M.").
bibs=('''A.
Abhms.
Abhmv.
Ap.
As.
It.-a.
It.
Ud.-a.
Ud.
C.Pit.
Cv.
J.
Th.
Thī.
D.
Dh.-a.
Dh.
Dhs.
Nd1
Nidd. I.
Netti.
Pts.-a.
Pts.
Peṭaka.
Pm.
Ps.
M. Vyut.
M.
Mv.
Mhv.
Mil.
Rūpārūp.
Lal.V.
Vin.
Vis. Mag.
Vbh.-a.
Vbh.
S.
Saddh.
Sn.-a.
Sn.
Sp.
Spk.
Sv.'''.split('\n'))

if 1:
    from lxml import etree
    from rich.pretty import pprint
    # Re-parse the pretty-printed dump (presumably so that `sourceline` in
    # diagnostics refers to lines of that file — confirm). The `with` block
    # closes the file handle instead of leaving that to garbage collection.
    with open(srcStem+'.content.xml') as contentFile:
        dta=contentFile.read()
    vimm=etree.fromstring(dta,etree.XMLParser())
    ns=dict(namespaces=vimm.nsmap)
    
    # Output skeleton: <TEI><teiHeader/><text/></TEI>; teiText is the <text> element.
    book=_E('TEI',subs=[
        _E('teiHeader'),
        teiText:=_E('text')
    ])
    textNs='{'+vimm.nsmap['text']+'}'
    styleNs='{'+vimm.nsmap['style']+'}'    
    # quote a value for display; None is shown as an empty string
    def _(t): return f'"{"" if t is None else t}"'
    # tag without its "{namespace}" prefix
    def _tag(e): return e.tag.split('}')[-1]
    # value of the text:style-name attribute (KeyError when absent)
    def _style(e): return e.attrib[textNs+'style-name']
    txt=vimm.xpath('.//office:text',**ns)[0]
    
    # levels[0] is the output <text>; levels[n] is the division most recently
    # opened for outline level n, or None when none is open.
    levels=[teiText,None,None,None,None,None]
    def newSection(elem,level):
        '''Open a new division for heading element `elem` at outline `level` (1..5).

        The heading is exported as the <head> child of the new division, which
        is appended to the division open at `level-1` and recorded in
        `levels[level]`; all deeper levels are closed. Level-1 divisions are
        tagged front/main/back/backback according to how many children
        `teiText` already has; every other level uses <div>.
        Raises KeyError for a level outside 1..5 and IndexError for a level-1
        heading when `teiText` already has four children.
        '''
        divType={1:'1-part-unused',2:'2-chapter',3:'3-section',4:'4-subsection',5:'5-subsubsection'}[level]
        # Index the part names only for level-1 headings: doing it for every
        # level raised IndexError on sub-headings once four parts existed.
        divTag=['front','main','back','backback'][len(teiText)] if level==1 else 'div'
        heading=exportParaInternal(elem)
        heading.tag='head'
        # headings whose content starts with a child element have no leading text
        name=elem.text if elem.text is not None else '...'
        heading.attrib['name']=name
        struct=_E(divTag,subs=[heading],name=name,type=divType)
        if name=='Bibliography': struct.attrib['rend']='hanging'
        # level-2 divisions created while teiText has exactly two children get
        # a roman numeral taken from the child count of the open level-1 division
        if level==2 and len(teiText)==2: struct.attrib['n']=roman.toRoman(len(levels[1]))
        levels[level-1].append(struct)
        levels[level]=struct
        for l in range(level+1,len(levels)): levels[l]=None
    def exportPara(p,emph=False):
        '''Export paragraph `p` according to its paragraph style.

        Returns None for style VimmExportSkip, the result of exportVerse for
        the two verse styles, and the result of exportParaInternal otherwise
        (emphasized for VimmAuthorComment and VimmSectEnded, or when `emph`
        is passed true). Raises RuntimeError for any other style.
        '''
        sty=_style(p)
        if sty=='VimmExportSkip': return None
        if sty in ('VimmVersePali','VimmVerseEnglish'): return exportVerse(p)
        if sty in ('VimmAuthorComment','VimmSectEnded'):
            emph=True
        elif sty not in ('VimmBody','Standard','VimmAdded'):
            raise RuntimeError(f'{p.sourceline}: Unhandled paragraph style {sty}')
        return exportParaInternal(p,emph=emph)
    def exportParaInternal(p,emph=False,pTag='p'):
        '''Convert the inline content of element `p` into a new `pTag` element.

        p:    source element whose text and children are converted
        emph: wrap plain text runs in <em> instead of <span>
        pTag: tag of the returned element
        Returns the new element; its children are the converted runs in order.
        Raises RuntimeError for span styles, footnote children or child tags
        that have no conversion rule.
        '''
        ret=[]
        spanTag=('em' if emph else 'span')
        if p.text and p.text.strip(): ret+=[_E(spanTag,text=p.text)]
        # NOTE(review): p.tail is the text following p itself, yet it is emitted
        # ahead of p's children — confirm this is intended.
        if p.tail and p.tail.strip(): ret+=[_E(spanTag,text=p.tail.strip())]
        for e in p:
            if (tag:=_tag(e))=='span':
                if (sty:=_style(e))=='VimmBibQuote': ret+=[_E('em',text=e.text)]
                elif sty=='VimmBibRef':
                    # a span without text is treated as an empty reference
                    ref=e.text or ''
                    # the longest matching abbreviation wins, so that e.g.
                    # "M. Vyut. ..." is not filed under the shorter "M."
                    b=max((cand for cand in bibs if ref.startswith(cand)),key=len,default=None)
                    if b is not None:
                        ret+=[_E('ptr',type='bib',target=b,loc=ref[len(b):],text=ref)]
                    else:
                        print(f'{e.sourceline}: unrecognized bibliography entry in {ref} (SKIPPING)')
                        ret+=[_E('ptr',type='bib',target='?',loc=ref,text=ref)]
                elif sty=='VimmPageNo':
                    # expected form: [[<pdf page>|<page number>]]
                    e.text=e.text.strip()
                    assert e.text.startswith('[[')
                    assert e.text.endswith(']]')
                    mid=e.text[2:-2]
                    pgid,pgno=mid.split('|')
                    ret+=[_E('pb',ed='BPS1995',pdf_page=pgid,n=pgno)]
                elif sty=='VimmPageNoOrig': ret+=[_E('pb',ed='PTS',n=e.text)]
                elif sty=='VimmPali': ret+=[_E('em',text=e.text)]
                elif sty=='VimmBold': ret+=[_E('span',text=e.text,rend='bold')]
                elif sty=='VimmQA': ret+=[_E('span',rend='bold',text=e.text)]
                elif sty=='VimmTODO':
                    # text starting with "<" is an XML directive; anything else
                    # (including a span without text) becomes a TODO note
                    if (e.text or '').startswith('<'):
                        directive=etree.fromstring(e.text,etree.XMLParser())
                        if directive.tag=='include':
                            # splice in the root element of the referenced file
                            ret+=[etree.parse('origin/'+directive.attrib['file'],etree.XMLParser(remove_blank_text=True,remove_comments=True)).getroot()]
                        else: raise RuntimeError(f'{e.sourceline}: unhandled XML tag {directive.tag}')
                    else:
                        ret+=[_E('note',type='TODO',text=e.text)]
                # exact comparison: `in` against a bare parenthesized string is a substring test
                elif sty=='Default_20_Paragraph_20_Font': ret+=[_E('span',text=e.text)]
                else: raise RuntimeError(f'{e.sourceline}: unhandled span style: {sty}')
            elif tag=='note':
                assert e.attrib[textNs+'note-class']=='footnote'
                assert e[0].tag.endswith('note-citation')
                bb=list(e.xpath('text:note-body',**ns))
                assert len(bb)==1
                fn=_E('note',n=e[0].text,place='foot')
                # separate loop variable: the parameter `p` must stay bound to the paragraph
                for sub in list(bb[0]):
                    subTag=_tag(sub)
                    if subTag=='p':
                        if (ex:=exportPara(sub)) is not None: fn.append(ex)
                    elif subTag=='list': fn.append(exportList(sub))
                    else: raise RuntimeError(f'{e.sourceline}: unhandled tag in footnote: {subTag}')
                ret+=[fn]
            elif tag=='a': pass
            elif tag=='s': ret+=[_E('span',text=' ')]
            elif tag.startswith('bookmark-'): pass
            elif tag=='p': pass # FIXME??|
            elif tag=='soft-page-break': pass
            elif tag=='tab': ret+=[_E('span',text=' ')]
            # wrap in a list: `ret+=element` would add the element's children instead of the element
            elif tag=='list': ret+=[exportList(e)]
            elif tag=='verse': ret+=[exportVerse(e)]
            elif tag in ('annotation','annotation-end'): pass # comments
            # .get(): `p` may carry no style attribute, which must not mask this error
            else: raise RuntimeError(f'{e.sourceline}: unhandled tag in paragraph {p.get(textNs+"style-name")}: {tag}')
            if e.tail: ret+=[_E(spanTag,text=e.tail)]
        return _E(pTag,subs=ret)
    def exportVerse(ppp):
        '''Export verse paragraph `ppp` as an <lg> element holding one <l> per line.

        Lines are delimited by line-break children of `ppp`; every other child
        is appended to (and thereby moved into) the line being built. Lines are
        emphasized when the paragraph style is VimmVersePali.
        '''
        emph=_style(ppp)=='VimmVersePali'
        current=_E('p',text=ppp.text)
        lines=[current]
        for child in ppp:
            if _tag(child)=='line-break':
                # text after the break starts the next line
                current=_E('p',text=child.tail)
                lines.append(current)
            else:
                current.append(child)
        exported=[exportParaInternal(line,emph=emph,pTag='l') for line in lines]
        return _E('lg',subs=exported)
    def exportList(lst):
        '''Export list element `lst` as <list type="numbered"> with one <item> per list-item.

        The `subtype` attribute is the numbering pattern recorded in `listMap`
        for the list's style (KeyError when the style is not in `listMap`).
        Each child of a list-item is exported with exportPara.
        '''
        ret=_E('list',type='numbered',subtype=listMap[_style(lst)])
        for item in lst:
            assert _tag(item)=='list-item'
            # exportPara returns None for paragraphs excluded from export;
            # passing None on as a child would raise TypeError
            paras=[ex for i in item if (ex:=exportPara(i)) is not None]
            ret.append(_E('item',subs=paras))
        return ret
    buf,bufStyle=[],None
    # Walk the top-level children of office:text and build the TEI tree.
    for para in txt:
        # deepest outline level that currently has an open division
        currLevel=[l for l in range(len(levels)) if levels[l] is not None ][-1]
        if _tag(para)=='sequence-decls': continue
        # .get() instead of _style(): an element without text:style-name must
        # reach the dispatch below rather than fail here with KeyError
        style=para.get(textNs+'style-name')
        if _tag(para)=='h':
            level=int(para.attrib[textNs+'outline-level'])
            newSection(para,level=level)
        elif _tag(para)=='p':
            ex=exportPara(para)
            if ex is not None: levels[currLevel].append(ex)
        elif _tag(para)=='list': levels[currLevel].append(exportList(para))
        else: raise RuntimeError(f'Unhandled top-level tag {_tag(para)}')
    # Write the result pretty-printed and compact. The serialized text carries
    # no XML declaration, so the files are written as UTF-8 explicitly, and the
    # `with` blocks close them deterministically.
    with open(srcStem+'.pretty.tei','w',encoding='utf-8') as teiFile:
        teiFile.write(etree.tostring(book,encoding='unicode',pretty_print=True))
    with open(srcStem+'.tei','w',encoding='utf-8') as teiFile:
        teiFile.write(etree.tostring(book,encoding='unicode',pretty_print=False))


4184: unrecognized bibliography entry in Ch. XXXII, 73 (SKIPPING)
4197: unrecognized bibliography entry in Ch.II (SKIPPING)
4200: unrecognized bibliography entry in p. liv (SKIPPING)
4860: unrecognized bibliography entry in Ibid. 380 (SKIPPING)
4860: unrecognized bibliography entry in Ibid.: (SKIPPING)
4860: unrecognized bibliography entry in   (SKIPPING)
4865: unrecognized bibliography entry in Vim. Mag. (SKIPPING)
5048: unrecognized bibliography entry in Sddh.v. 621 (SKIPPING)
5643: unrecognized bibliography entry in Mp. III, 274 (SKIPPING)
5685: unrecognized bibliography entry in Ibid. 27 (SKIPPING)
5721: unrecognized bibliography entry in Cy. (SKIPPING)
5734: unrecognized bibliography entry in Ibid. 127 (SKIPPING)
5744: unrecognized bibliography entry in I, 75 (SKIPPING)
5780: unrecognized bibliography entry in Vim. Mag. and Vis. Mag. p. 49 (SKIPPING)
5782: unrecognized bibliography entry in Ibid. 147. ff (SKIPPING)
5786: unrecognized bibliography entry in p. 49, Vim. Mag. and Vis.

In [62]:
if 0:
    # Disabled cell: load a previously exported book and prepare the TOC tree.
    book=etree.parse(f'{srcStem}.exported.xml',etree.XMLParser(remove_blank_text=True)).getroot()
    toc=_E('toc')
    # every element of the book in document order
    flatBook=[*book.iter()]
    # levels[0] is the TOC root; levels[n] is the last entry created for level n
    levels=[toc,None,None,None,None,None]

    def getText(struct,dbg):
        '''Return the first (up to) 12 space-separated words of the first <p> child of `struct` that has text.

        Only the text of direct <span> and <em> children of each <p> is
        collected. Returns '' when no <p> child yields any text. With `dbg`
        true, the visited elements are printed.
        '''
        ret=''
        for p in struct:
            if dbg: print(f'{p.sourceline} {p.tag}')
            if p.tag!='p': continue
            for e in p:
                if dbg: print(f'  {e.sourceline} {e.tag}')
                if e.tag not in ('span','em'): continue
                # an element without text has text None; treat it as empty
                text=e.text or ''
                if dbg: print(f'    {text[:50]}')
                ret+=text
            if len(ret)>0: return ' '.join(ret.split(' ')[:12])
        return ''


    # Create one TOC entry per element whose tag looks like struct-<level>-...
    for e in flatBook:
        if not (m:=re.match('^struct-([0-9])-.*',e.tag)): continue
        level=int(m.group(1))
        # the first child must be a heading holding exactly one child with text
        heading=e[0]
        assert heading.tag=='heading'
        assert len(heading)==1
        assert heading[0].text is not None
        title=heading[0].text
        toc_num=e.get('toc_num',None)
        starts_at=getText(e,dbg=(title=='Salutation'))
        # levels above 2 become <sect> with title and opening words;
        # levels 1 and 2 become <chapter>, numbered when toc_num is set
        if level>2: sect=_E('sect',title=title,starts_at=starts_at)
        else:
            sect=_E('chapter')
            if toc_num: sect.attrib['num']=toc_num
        levels[level-1].append(sect)
        levels[level]=sect
    # the second top-level entry becomes the root of the written TOC
    toc=toc[1]
    toc.tag='toc'
    # The serialized text carries no XML declaration, so write UTF-8
    # explicitly; the `with` block closes the file deterministically.
    with open('toc.xml','w',encoding='utf-8') as tocFile:
        tocFile.write(etree.tostring(toc,encoding='unicode',pretty_print=True))


2451 heading
2454 p
  2455 span
    Homage to the Blessed One, the Consummate One, the
  2456 footnote


54599