In [1]:
import openpyxl
from packaging import version
assert version.parse(openpyxl.__version__)>=version.parse('3.1.0')
from lxml import etree


In [5]:
def fix_xlsx_encoding(xlsx0,xlsx1):
    '''Replace mis-encoded characters in all text cells of *xlsx0* and save the workbook as *xlsx1*.

    Every character listed in ``c0`` is replaced by the character at the same
    position in ``c1``. Both plain string cells and rich-text cells are handled;
    cells holding anything else (``None``, numbers, dates, ...) are left untouched.

    :param xlsx0: path of the workbook to read.
    :param xlsx1: path the converted workbook is written to.
    '''
    # NOTE(review): 'Ì' maps to itself while 'ì' maps to 'ī' — possibly 'Ī' was intended; confirm before changing.
    c0='áÁòÒóÓþÞìÌúÚíÍ¿÷²ðåõºøý' +'\n'
    c1='āĀṅṄṇṆṭṬīÌūŪṃṂḷēĕḍṣōṛśḥ' +'\n'
    c01=str.maketrans(c0,c1)
    from openpyxl.cell.rich_text import CellRichText

    wb=openpyxl.load_workbook(xlsx0,rich_text=True)
    for sheet in wb:
        for row in sheet:
            for cell in row:
                value=cell.value
                if isinstance(value,CellRichText):
                    # a CellRichText is a list mixing plain str runs and TextBlock runs;
                    # str runs are immutable and must be replaced by index
                    for i,run in enumerate(value):
                        if isinstance(run,str): value[i]=run.translate(c01)
                        else: run.text=run.text.translate(c01)
                elif isinstance(value,str):
                    cell.value=value.translate(c01)
                # anything else (None, int, float, datetime, ...) has no text to translate
    wb.save(xlsx1)
#fix_xlsx_encoding('raw-intro-ceylon-king-tab.0.xlsx','raw-intro-ceylon-king-tab.xlsx')
# fix_xlsx_encoding('raw-intro-commentaries.0.xlsx','raw-intro-commentaries.xlsx')

In [6]:
# Smoke test of the character mapping on a short XML snippet.
t='''
          <line><em>Ciraí tiþþhatu saddhammo</em></line>
          <line><em>sabbe sattá bhavantu sukhitattá</em></line>
'''
# c0 holds the characters to replace, c1 the replacement at the same index
# (the trailing newline maps onto itself)
c0='áÁòÒóÓþÞìÌúÚíÍ¿÷²ðåõºøý' +'\n'
c1='āĀṅṄṇṆṭṬīÌūŪṃṂḷēĕḍṣōṛśḥ' +'\n'
c01=str.maketrans(c0,c1)

fixed=t.translate(c01)
print(fixed)


          <line><em>Ciraṃ tiṭṭhatu saddhammo</em></line>
          <line><em>sabbe sattā bhavantu sukhitattā</em></line>



In [3]:
import re
from lxml import etree

def _E(e,text=None,*,subs=[],xml_id=None,**kw):
    '''Build an lxml element with tag *e*.

    :param e: tag name of the new element.
    :param text: text content of the element; must be ``None`` or a ``str``.
    :param subs: children to append in order; ``None`` entries are skipped, and
        for a child whose tag is ``__FLATTEN__`` its own children are appended
        in its place.
    :param xml_id: when not ``None``, stored in the element's ``id`` attribute.
    :param kw: further attributes, passed to ``etree.Element``.
    :return: the new element.
    '''
    assert text is None or isinstance(text,str)
    node=etree.Element(e,**kw)
    if xml_id is not None:
        node.set('id',xml_id)
    node.text=text
    for child in subs:
        if child is None:
            continue
        if child.tag!='__FLATTEN__':
            node.append(child)
            continue
        # placeholder container: splice in its children rather than the container itself
        for grandchild in child:
            node.append(grandchild)
    return node


## references to bibliography
# book abbreviation followed by a roman-numeral part and a decimal part, e.g. "M I 10f."
refRomDec=re.compile(r'(?P<book>S|M|D|A|A-a|Dhp-a|J-a|M-a|Paṭis|S-a|Vism|Nidd|Paṭṭh|Vin)(?P<loc>\s+[XIVC]+\s+[0-9–]+(f\.|f)?)')

# |Nidd\s+I+|Paṭṭh\s+I+|Vin\s+[IV]+
# book abbreviation followed by a decimal location, a "(p. N)" or a "p. N" page reference.
# The order of the book alternatives is not critical here: the mandatory location
# after the book forces the engine to backtrack from a stem (Dhs) to the longer variant (Dhs-a).
# NOTE(review): the dot in "\(p." is unescaped and matches any character; left unchanged.
refDec=re.compile(r'''
    # (\b|^)
    (?P<book>Sn|Ud|Cp|Cp-a|Dhp|Dhs|Dhs-a|Dhs-ṭ|Dhātuk|It|Kv|Kathāvatthu|Mil|Netti|Nikāya-s|Paṭis-a|Peṭ|Pv|Sn-a|Th|Vibh|Vibh-a|Vibh-ṭ|Vv|Vism-mhṭ|Vism mhṭ)
    (?P<loc>
        (\s+|,)
        (
            [0-9§.–]+
            |
            \(p.\s*[0-9.–]+\)
            |
            \s*p\.\s*[0-9]+
        )
        (f\.|\b|$)?
    )
''',re.X)
# the longer suffixes (ff., ff) must be tried before the shorter ones (f., f),
# otherwise "ff." is matched as "f" only and the rest is left in the text
refMhv=re.compile(r'(?P<book>Mhv)(?P<loc>(\s+pp\.)?\s+[0-9XIV.–]+(ff\.|ff|f\.|f)?)')
# make sure the suffixed variants (like Dhs-a) come before the stem (like Dhs), otherwise the stem matches first
# (nothing is required after the book here, so the first alternative that fits wins)
refLone=re.compile(r'(^|\b)(?P<book>Sn-a|Sn|Ud|Cp-a|Cp|Dhp-a|Dhp|Dhs-a|Dhs-ṭ|Dhs|Dhātuk|Kv|Mil|Netti|Nikāya-s|Paṭis-a|Paṭis|Peṭ|Pv|Th|Vibh-a|Vibh-ṭ|Vibh|Vv|Vism-mhṭ|Vism mhṭ|A-a|J-a|M-a|S-a|Vin-a|Vin|Nidd|Paṭṭh)(\b|$)')

import unidecode

def _matchHeadTail(m):
    return m.string[:m.span()[0]],m.string[m.span()[1]:]


def _leaf(txt,fmt,em):
    '''Split *txt* into plain-text and bibliography-reference fragments rendered for *fmt*.

    The text is scanned with ``refRomDec``, ``refDec``, ``refLone`` and ``refMhv``.
    Each recognized reference becomes a citation fragment; the text between
    references is emitted as (optionally emphasized) plain fragments.

    :param txt: the text of one cell or one rich-text run.
    :param fmt: ``'latex'``, ``'rst'`` or ``'docbook'``.
    :param em: truthy when the plain text should be emphasized.
    :return: list of fragments in document order: strings for ``latex``/``rst``,
        lxml elements for ``docbook``.
    '''
    def _leaf_txt(t):
        if fmt=='latex': return ('\\emph{'+t+'}' if em else t)
        elif fmt=='rst': return (f'*{t}*' if em else t)
        elif fmt=='docbook': return _E('emphasis' if em else 'phrase',t)
    # listed in priority order; used as tie-breaker when two patterns match at the same position
    patterns=(refRomDec,refDec,refLone,refMhv)
    ret=[]
    while True:
        found=[m for m in (p.search(txt) for p in patterns) if m]
        if not found: break
        # Take the match that starts first in the text (min() keeps the first of equal
        # candidates, i.e. the higher-priority pattern). Taking simply the first pattern
        # that matches anywhere would push earlier references, matched only by a
        # lower-priority pattern, into the plain-text head where they are never cited.
        bibm=min(found,key=lambda m: m.start())
        head,tail=_matchHeadTail(bibm)
        book=bibm.group('book')
        loc=(bibm.group('loc').replace('\n',' ') if 'loc' in bibm.groupdict() else None)
        if head: ret.append(_leaf_txt(head))
        if fmt=='latex': ret+=[r'\textbf{\cite{'+book+'}'+(loc if loc else '')+'}']
        elif fmt=='rst': ret+=[f' [{book}]_ '+(f'*{loc.strip()}*' if (loc is not None and len(loc.strip())>0) else ' ')]
        elif fmt=='docbook':
            if loc: ret+=[_E('phrase',subs=[_E('citation',book),_E('phrase',loc)])]
            else: ret+=[_E('citation',book)]
        # continue after the reference; everything before it has been emitted
        txt=tail
    if len(txt)>0: ret.append(_leaf_txt(txt))
    return ret

    # _splitSpan(span,head,[mkVismRef(text=bibm.group(0),target=bibTargetFixes.get(book,book),type='bib',loc=loc)],tail)





In [4]:
# print(xlsx_to_vism('raw-intro-ceylon-king-tab.xlsx',fmt='docbook',sheet='Sheet1',pretty_print=True))

In [9]:
CELL=None  # not read anywhere in the visible code; kept in case another cell relies on it
def xlsx_to_vism(xlsx,fmt,sheet='Sheet1',pretty_print=False):
    '''Render one worksheet of the workbook *xlsx* as a table in the format *fmt*.

    Cell text is passed through ``_leaf``, so bibliography references are turned
    into citations and italic text into emphasis. The first row is the header row.

    :param xlsx: path of the workbook.
    :param fmt: ``'latex'``, ``'rst'`` or ``'docbook'``.
    :param sheet: title of the worksheet to render.
    :param pretty_print: passed to ``etree.tostring``; only used for ``docbook``.
    :return: the table as a string (LaTeX source, an RST ``list-table`` or a
        serialized ``informaltable`` element).
    :raises AssertionError: for an unknown *fmt*, or for ``latex`` when the sheet
        does not have 2 or 3 columns.
    '''
    assert fmt in ('latex','rst','docbook')
    from openpyxl.cell.rich_text import CellRichText
    ws=openpyxl.load_workbook(xlsx,rich_text=True,read_only=False)[sheet]
    # latex/rst: one list of string fragments for the whole table, joined at the end;
    # docbook: the list is reset for every cell and holds that cell's child elements
    ret=[]
    table=(_E('informaltable') if fmt=='docbook' else None)
    nRows=sum(1 for _ in ws)
    nCols=0
    for iRow,row in enumerate(ws):
        if fmt=='docbook':
            tRow=_E('tr')
            # first row goes to <thead>, all following rows share one <tbody>
            if iRow==0: table.append(_E('thead',subs=[tRow]))
            elif iRow==1:
                tGrp=_E('tbody',subs=[tRow])
                table.append(tGrp)
            else: tGrp.append(tRow)
        if fmt=='latex': ret.append('    ')
        for cell in row:
            nCols=max(cell.column,nCols)
            if fmt=='docbook': ret=[]
            if fmt=='rst': ret.append('\n  * - ' if cell.column==1 else '\n    - ')
            if fmt=='latex' and cell.column>1: ret.append(' & ')
            value=cell.value
            if value is None: pass
            elif isinstance(value,CellRichText):
                for run in value:
                    # rich text mixes TextBlock runs with plain str runs; a plain run
                    # carries no font of its own, so the cell font decides on emphasis
                    if isinstance(run,str): ret+=_leaf(run,em=cell.font.i,fmt=fmt)
                    else: ret+=_leaf(run.text,em=run.font.i,fmt=fmt)
            else:
                # numbers, dates etc. are rendered through str(); _leaf needs text
                ret+=_leaf(value if isinstance(value,str) else str(value),em=cell.font.i,fmt=fmt)
            if fmt=='docbook': tRow.append(_E('td',subs=ret,valign='top'))
        # row terminator; the last row gets none
        if fmt=='latex': ret.append(('\\\\' if iRow<nRows-1 else '')+'\n')
    if fmt=='latex':
        assert nCols in (2,3)
        lCols=('|' if nCols>2 else '').join(nCols*['l'])
        xCols=('X[2]|X[4]|X[3]' if nCols==3 else 'll')
        body=''.join(ret)
        plastex='\\begin{tabular}{'+lCols+'}\n'+body+'\\end{tabular}'
        latexLong='\\begin{longtblr}[theme=vismLong]{colspec={'+xCols+'},rowhead=1}\n'+body+'\\end{longtblr}\n'
        latexShort='\\begin{tblr}{colspec={'+xCols+'},rowhead=1}\n'+body+'\\end{tblr}\n'
        # tables with more than 10 rows use the page-breaking longtblr environment
        return f'\n\n\\ifplastex\n{plastex}\n\\else\n{latexLong if nRows>10 else latexShort}\\fi\n'

    elif fmt=='rst': return '.. list-table::\n  :header-rows: 1\n  '+''.join(ret)
    return etree.tostring(table,encoding='unicode',pretty_print=pretty_print)
# Export every sheet of every workbook to DocBook, LaTeX and RST.
for tab in ('raw-intro-ceylon-king-tab','raw-intro-commentaries'): #,raw-intro-):
    xlsx=f'{tab}.xlsx'
    # open read-only just to list the sheet titles; close even if listing fails
    wb=openpyxl.load_workbook(xlsx,read_only=True)
    try: sheets=[s.title for s in wb]
    finally: wb.close()
    for fmt in ('docbook','latex','rst'):
        for sheet in sheets:
            # the sheet title is appended to the file name only for multi-sheet workbooks
            suffix=('-'+sheet if len(sheets)>1 else '')
            out={'docbook':f'../docbook/{tab}{suffix}.xml','latex':f'../latex/{tab}{suffix}.tex','rst':f'../sphinx/{tab}{suffix}.rst'}[fmt]
            # explicit UTF-8: the tables contain diacritics that a non-UTF-8 platform
            # default encoding cannot represent; the context manager closes the file
            with open(out,'w',encoding='utf-8') as f:
                f.write(xlsx_to_vism(xlsx,fmt=fmt,sheet=sheet,pretty_print=True))
            print(f'→ {out}')

→ ../docbook/raw-intro-ceylon-king-tab.xml
→ ../latex/raw-intro-ceylon-king-tab.tex
→ ../sphinx/raw-intro-ceylon-king-tab.rst
→ ../docbook/raw-intro-commentaries-vinaya.xml
→ ../docbook/raw-intro-commentaries-sutta.xml
→ ../docbook/raw-intro-commentaries-suttanipata.xml
→ ../docbook/raw-intro-commentaries-abhidhamma.xml
→ ../latex/raw-intro-commentaries-vinaya.tex
→ ../latex/raw-intro-commentaries-sutta.tex
→ ../latex/raw-intro-commentaries-suttanipata.tex
→ ../latex/raw-intro-commentaries-abhidhamma.tex
→ ../sphinx/raw-intro-commentaries-vinaya.rst
→ ../sphinx/raw-intro-commentaries-sutta.rst
→ ../sphinx/raw-intro-commentaries-suttanipata.rst
→ ../sphinx/raw-intro-commentaries-abhidhamma.rst
