In [20]:
from lxml import etree
srcStem='origin/vimm7a'
import zipfile, re, subprocess
def getOdfFile(odf,what,mode='r'):
    '''Return the content of member *what* of the ODF (zip) package *odf*.

    odf:  path of the package (e.g. an ``.odt`` file).
    what: name of the member inside the package, e.g. ``'content.xml'``.
    mode: ``'r'`` returns ``str``, ``'rb'`` returns ``bytes``.

    Text is always decoded as UTF-8, so the result does not depend on the
    locale of the machine running the script.
    '''
    # zipfile.Path.open() rejects encoding arguments in binary mode, so only pass one for text.
    textArgs={} if 'b' in mode else dict(encoding='utf-8')
    with zipfile.Path(odf,at=what).open(mode=mode,**textArgs) as item: return item.read()
def setOdfFile(odf,what,data):
    '''Store *data* as member *what* of the ODF (zip) package *odf*, replacing any existing member of that name.

    odf:  path of the package; it is created if it does not exist yet.
    what: member name inside the package, e.g. ``'content.xml'``.
    data: ``bytes`` are written verbatim, ``str`` is encoded as UTF-8.

    Raises TypeError if *data* is neither ``str`` nor ``bytes``.

    All other members keep their order and their compression method, and the
    new member is added at the end. Only the standard library is used, so no
    external ``zip`` executable is needed.
    '''
    if not isinstance(data,(str,bytes)):
        raise TypeError(f'data must be str or bytes, not {type(data).__name__}')
    import os, shutil, tempfile
    payload=data if isinstance(data,bytes) else data.encode('utf-8')
    tmp=None
    if os.path.exists(odf):
        with zipfile.ZipFile(odf) as old:
            members=old.infolist()
            kept=[m for m in members if m.filename!=what]
            # zipfile cannot delete in place: copy everything else into a sibling
            # temporary archive, which then replaces the original in one step.
            if len(kept)<len(members):
                fd,tmp=tempfile.mkstemp(suffix='.zip',dir=os.path.dirname(os.path.abspath(odf)))
                os.close(fd)
                try:
                    with zipfile.ZipFile(tmp,'w') as new:
                        # passing the ZipInfo keeps each member's own compression method
                        for m in kept: new.writestr(m,old.read(m))
                    shutil.copymode(odf,tmp)
                except BaseException:
                    os.unlink(tmp)
                    raise
        if tmp is not None: os.replace(tmp,odf)
    with zipfile.ZipFile(odf,'a',compression=zipfile.ZIP_DEFLATED) as odf2:
        with odf2.open(what,'w') as content:
            content.write(payload)
def _E(tag,text=None,subs=(),**kw):
    '''Build an element in one call.

    tag:  tag name of the new element.
    text: text content of the element (``None`` for no text).
    subs: iterable of elements appended, in order, as children.
    kw:   every other keyword argument becomes an XML attribute.

    Returns the new element.
    '''
    # The default for subs is an immutable tuple, so no list is shared between calls.
    ret=etree.Element(tag,**kw)
    ret.text=text
    ret.extend(subs)
    return ret

# Extract content.xml from the source document and parse it.
dta=getOdfFile(srcStem+'.odt',what='content.xml',mode='rb')
vimm=etree.fromstring(dta,etree.XMLParser())
# Keep a pretty-printed copy next to the source document; the context manager
# makes sure the file is flushed and closed before anything reads it back.
with open(srcStem+'.content.xml','w') as dump:
    dump.write(etree.tostring(vimm,encoding='unicode',pretty_print=True))


# Abbreviations of cited works. The text of a VimmBibRef span is expected to
# start with one of these; the remainder of the span is the location in that work.
bibs={
    'A.','Abhms.','Abhmv.','Ap.','As.',
    'It.-a.','It.','Ud.-a.','Ud.',
    'C.Pit.','Cv.','J.','Th.','Thī.',
    'D.','Dh.-a.','Dh.','Dhs.',
    'Nd1','Nidd. I.','Netti.',
    'Pts.-a.','Pts.','Peṭaka.','Pm.','Ps.',
    'M.','Mv.','Mhv.','Mil.','M. Vyut.',
    'Rūpārūp.','Lal.V.',
    'Vin.','Vis. Mag.','Vbh.-a.','Vbh.',
    'S.','Saddh.','Sn.-a.','Sn.','Sp.','Spk.','Sv.',
}

if 1:  
    from lxml import etree
    from rich.pretty import pprint
    # Parse the pretty-printed dump rather than the archive member
    # (presumably so that sourceline in diagnostics matches the lines of that file — TODO confirm).
    with open(srcStem+'.content.xml') as dump: dta=dump.read()
    vimm=etree.fromstring(dta,etree.XMLParser())
    # keyword arguments for xpath() calls that use the document's own namespace prefixes
    ns=dict(namespaces=vimm.nsmap)

    # root of the exported tree
    book=_E('book')
    # namespace prefixes in lxml's '{uri}' notation, for building qualified attribute names
    textNs='{'+vimm.nsmap['text']+'}'
    styleNs='{'+vimm.nsmap['style']+'}'
    def _(t):
        '''Wrap *t* in double quotes for display; ``None`` is shown as an empty string.'''
        shown='' if t is None else t
        return f'"{shown}"'
    def _tag(e):
        '''Local name of element *e*: the part of its tag after the last ``}``.'''
        return e.tag.rpartition('}')[2]
    def _style(e):
        '''Value of the ``style-name`` attribute (``text`` namespace) of *e*; ``KeyError`` if it has none.'''
        return e.attrib[f'{textNs}style-name']
    # first office:text element of the document; its children are walked by the export loop
    txt=vimm.xpath('.//office:text',namespaces=ns['namespaces'])[0]

    # innermost open structure per outline level: slot 0 is the book itself,
    # slots 1-5 are filled in as headings of that level are encountered
    levels=[book]+[None]*5
    def newSection(elem,level):
        '''Open a new structural element for heading *elem* and make it the current one at *level*.

        elem:  the heading element from the source document.
        level: outline level of the heading, 1 (part) to 5 (subsubsection).

        The new ``struct-*`` element is appended to the open structure one level
        up, recorded in ``levels[level]``, and all deeper levels are closed.

        Raises RuntimeError for a level outside 1-5, or when no structure is open
        one level up (a heading level was skipped in the document).
        '''
        kinds={1:'1-part',2:'2-chapter',3:'3-section',4:'4-subsection',5:'5-subsubsection'}
        if level not in kinds:
            raise RuntimeError(f'{elem.sourceline}: unsupported heading level {level}')
        parent=levels[level-1]
        if parent is None:
            raise RuntimeError(f'{elem.sourceline}: heading of level {level} has no open level-{level-1} structure to go into')
        heading=exportParaInternal(elem)
        heading.tag='heading'
        # Only text directly at the start of the heading is used as its name;
        # a heading that begins with a child element gets the placeholder.
        name=elem.text if elem.text is not None else '...'
        heading.attrib['name']=name
        struct=_E('struct-'+kinds[level],subs=[heading],name=name)
        parent.append(struct)
        levels[level]=struct
        # anything deeper belonged to the previous section of this level
        for l in range(level+1,len(levels)): levels[l]=None
    def exportPara(p,emph=False):
        '''Export paragraph *p* according to its paragraph style.

        Returns ``None`` for styles that are excluded from the export, a ``verse``
        element for the verse styles, and a plain exported paragraph for body-like
        styles (where *emph* is handed on to ``exportParaInternal``).
        Raises RuntimeError for any other style.
        '''
        psty=_style(p)
        if psty in ('VimmExportSkip','VimmSectEnded'):
            return None
        if psty in ('VimmVersePali','VimmVerseEnglish'):
            return exportVerse(p)
        if psty not in ('VimmBody','VimmAuthorComment','Standard'):
            raise RuntimeError(f'{p.sourceline}: Unhandled paragraph style {psty}')
        return exportParaInternal(p,emph=emph)
    def _exportBibRef(e):
        '''Turn a ``VimmBibRef`` span into a ``ref`` element of type ``bib``.

        The span text is expected to start with one of the abbreviations in
        ``bibs``; the rest of the text becomes the ``loc`` attribute. Text that
        matches no abbreviation is reported on stdout and exported with ``target='?'``.
        '''
        label=e.text or ''
        # Longest abbreviation first: some entries are prefixes of others
        # ('It.' / 'It.-a.', 'M.' / 'M. Vyut.'), and iterating the set directly
        # would let its arbitrary order decide which of them wins.
        for b in sorted(bibs,key=len,reverse=True):
            if label.startswith(b):
                return _E('ref',type='bib',target=b,loc=label[len(b):],text=e.text)
        print(f'{e.sourceline}: unrecognized bibliography entry in {e.text} (SKIPPING)')
        return _E('ref',type='bib',target='?',loc=label,text=e.text)
    def _exportSpan(e):
        '''Translate one ``text:span`` into a single export element, chosen by its character style.

        Raises RuntimeError for an unknown style or an unknown XML directive in a ``VimmTODO`` span.
        '''
        sty=_style(e)
        if sty in ('VimmBibQuote','VimmPali'): return _E('em',text=e.text)
        if sty=='VimmBibRef': return _exportBibRef(e)
        if sty=='VimmPageNo':
            # page marker of the form [[page_id|page_no]]
            e.text=e.text.strip()
            assert e.text.startswith('[[')
            assert e.text.endswith(']]')
            pgid,pgno=e.text[2:-2].split('|')
            return _E('printed_page',edition='BPS1995',text=pgno,page_id=pgid)
        if sty=='VimmPageNoOrig': return _E('printed_page',edition='PTS',text=e.text)
        if sty=='VimmQA': return _E('span',family='bold',text=e.text)
        if sty=='VimmTODO':
            # text starting with '<' is an XML directive, anything else a plain TODO note
            if not (e.text or '').startswith('<'): return _E('TODO',text=e.text)
            directive=etree.fromstring(e.text,etree.XMLParser())
            if directive.tag!='include':
                raise RuntimeError(f'{e.sourceline}: unhandled XML tag {directive.tag}')
            # <include file="..."/> is replaced by the root element of that file
            return etree.parse('origin/'+directive.attrib['file'],etree.XMLParser(remove_blank_text=True,remove_comments=True)).getroot()
        if sty=='Default_20_Paragraph_20_Font': return _E('span',text=e.text)
        raise RuntimeError(f'{e.sourceline}: unhandled span style: {sty}')
    def _exportFootnote(note):
        '''Translate a ``text:note`` of class ``footnote`` into a ``footnote`` element.

        The citation text becomes the ``mark`` attribute; paragraphs and lists of
        the note body are exported as children, skipped paragraphs are left out.
        '''
        assert note.attrib[textNs+'note-class']=='footnote'
        assert note[0].tag.endswith('note-citation')
        bodies=list(note.xpath('text:note-body',**ns))
        assert len(bodies)==1
        fn=_E('footnote',mark=note[0].text)
        for child in list(bodies[0]):
            childTag=_tag(child)
            if childTag=='p':
                if (ex:=exportPara(child)) is not None: fn.append(ex)
            elif childTag=='list': fn.append(exportList(child))
            else: raise RuntimeError(f'{note.sourceline}: unhandled tag in footnote: {childTag}')
        return fn
    def exportParaInternal(p,emph=False,pTag='p'):
        '''Export the inline content of paragraph-like element *p*.

        p:    element whose text and children are exported.
        emph: when true, plain text runs are wrapped in ``em`` instead of ``span``.
        pTag: tag of the returned container element.

        Returns a new *pTag* element holding one child per text run or exported
        inline element. Raises RuntimeError for an inline tag it does not know.
        '''
        ret=[]
        spanTag=('em' if emph else 'span')
        if p.text: ret+=[_E(spanTag,text=p.text)]
        # NOTE(review): the element's own tail is emitted here, ahead of its children — confirm this is intended.
        if p.tail and p.tail.strip(): ret+=[_E(spanTag,text=p.tail.strip())]
        for e in p:
            tag=_tag(e)
            if tag=='span': ret+=[_exportSpan(e)]
            elif tag=='note': ret+=[_exportFootnote(e)]
            elif tag in ('s','tab'): ret+=[_E('span',text=' ')]
            # wrapped in a list on purpose: += with a bare element would splice in its children instead
            elif tag=='list': ret+=[exportList(e)]
            elif tag=='verse': ret+=[exportVerse(e)]
            elif tag in ('a','soft-page-break') or tag.startswith('bookmark-'): pass
            elif tag=='p': pass # FIXME??
            else:
                # verse lines are synthetic elements without a style attribute, hence .get()
                raise RuntimeError(f'{e.sourceline}: unhandled tag in paragraph {p.attrib.get(textNs+"style-name")}: {tag}')
            # text following the child is kept even when the child itself is dropped
            if e.tail: ret+=[_E(spanTag,text=e.tail)]
        return _E(pTag,subs=ret)
    def exportVerse(ppp):
        '''Export verse paragraph *ppp* as a ``verse`` element with one ``line`` child per verse line.

        Lines are separated by ``line-break`` children in the source paragraph.
        Text of paragraphs styled ``VimmVersePali`` is exported emphasised.

        The children of *ppp* are moved into temporary per-line elements, so the
        source paragraph is left without its inline content afterwards.
        '''
        emph=(_style(ppp)=='VimmVersePali')
        lines=[_E('p',text=ppp.text)]
        # Iterate over a snapshot: appending a child to another element takes it out of ppp.
        for child in list(ppp):
            # the text after a line break starts the next line
            if _tag(child)=='line-break': lines.append(_E('p',text=child.tail))
            else: lines[-1].append(child)
        return _E('verse',subs=[exportParaInternal(line,emph=emph,pTag='line') for line in lines])
    def exportList(lst):
        '''Export list element *lst* as an ``enum`` element with one ``li`` per list item.

        The ``labels`` attribute is ``'I.'`` for list style ``L58`` and ``'(a)'``
        for every other style. Paragraphs that ``exportPara`` skips (it returns
        ``None`` for them) are left out of their ``li``.
        '''
        ret=_E('enum',labels='I.' if _style(lst)=='L58' else '(a)')
        for item in lst:
            assert _tag(item)=='list-item'
            # None cannot be appended to an element, so drop skipped paragraphs here
            paras=[ex for i in item if (ex:=exportPara(i)) is not None]
            ret.append(_E('li',subs=paras))
        return ret
    buf,bufStyle=[],None
    # Walk the top-level elements of the document body: headings open new
    # structures, paragraphs and lists go into the deepest structure currently open.
    for para in txt:
        tag=_tag(para)
        if tag=='sequence-decls': continue
        # levels[0] is the book itself, so there is always at least one open level
        currLevel=max(i for i,lv in enumerate(levels) if lv is not None)
        # The style is deliberately not looked up here: elements without a
        # style-name attribute must reach the dispatch below instead of failing early.
        if tag=='h':
            level=int(para.attrib[textNs+'outline-level'])
            newSection(para,level=level)
        elif tag=='p':
            ex=exportPara(para)
            if ex is not None: levels[currLevel].append(ex)
        elif tag=='list': levels[currLevel].append(exportList(para))
        else: raise RuntimeError(f'Unhandled top-level tag {tag}')
    # write the result; the context manager closes (and flushes) the file
    with open(srcStem+'.exported.xml','w') as out:
        out.write(etree.tostring(book,encoding='unicode',pretty_print=True))


4212: unrecognized bibliography entry in Ibid. 380 (SKIPPING)
4212: unrecognized bibliography entry in Ibid.: (SKIPPING)
4212: unrecognized bibliography entry in   (SKIPPING)
4217: unrecognized bibliography entry in Vim. Mag. (SKIPPING)
4401: unrecognized bibliography entry in Vis Mag. 36 (SKIPPING)
4401: unrecognized bibliography entry in Sddh.v. 621 (SKIPPING)
4410: unrecognized bibliography entry in Vis.Mag. 59 (SKIPPING)
4636: unrecognized bibliography entry in Netti 164 (SKIPPING)
4996: unrecognized bibliography entry in Mp. III, 274 (SKIPPING)
5006: unrecognized bibliography entry in Cp. S. V, 97 (SKIPPING)
5038: unrecognized bibliography entry in Ibid. 27 (SKIPPING)
5074: unrecognized bibliography entry in Cy. (SKIPPING)
5075: unrecognized bibliography entry in Peṭaka, 142 (SKIPPING)
5087: unrecognized bibliography entry in Ibid. 127 (SKIPPING)
5097: unrecognized bibliography entry in Cp. I, 75 (SKIPPING)
5104: unrecognized bibliography entry in Petaka. 147-8 (SKIPPING)
5108: un