In [7]:
import re
import os

# Bibliographic description of each printed volume of the Fowler translation,
# keyed by the volume number as a string ('1' .. '4'). The four entries differ
# only in the volume number, so they are generated from a single template.
volumeinfo = {
    str(vol): (
        'The Works of Lucian of Samosata, complete, with exceptions specified in the preface, Vol. '
        + str(vol)
        + '. Fowler, H. W. and Fowler, F.G., translators. Oxford at the Clarendon Press, 1905.'
    )
    for vol in range(1, 5)
}

# CTS reference declaration for works cited by book and section.
# Entries containing the regex escape \w are raw strings: in a normal string
# literal "\w" is an invalid escape sequence (a SyntaxWarning on Python 3.12+).
# The resulting text is unchanged.
# NOTE(review): in the code shown here this list is only printed by a later
# cell; splitvol() copies its <refsDecl> from bookref.txt instead.
refsbook = [
    '<refsDecl n="CTS">',
    r'<cRefPattern matchPattern="(\w+).(\w+)"',
    'replacementPattern="#xpath(/tei:TEI/tei:text/tei:body/tei:div/tei:div[@n=\'$1\']/tei:div[@n=\'$2\'])">',
    '<p>This pointer pattern extracts book and section</p>',
    '</cRefPattern>',
    r'<cRefPattern matchPattern="(\w+)"',
    'replacementPattern="#xpath(/tei:TEI/tei:text/tei:body/tei:div/tei:div[@n=\'$1\'])">',
    '<p>This pointer pattern extracts book</p>',
    '</cRefPattern>',
    '</refsDecl>',
]

# CTS reference declaration for works cited by section only.
refssection = [
    '<refsDecl n="CTS">',
    r'<cRefPattern matchPattern="(\w+)"',
    'replacementPattern="#xpath(/tei:TEI/tei:text/tei:body/tei:div/tei:div[@n=\'$1\'])">',
    '<p>This pointer pattern extracts section.</p>',
    '</cRefPattern>',
    '</refsDecl>',
]


# Text-group identifier (not referenced by the rest of the code shown here).
tgroup = 'tlg0062'
# Edition suffix used in output file names and in xml:base URNs.
edition = 'perseus-eng4'

# Indexes filled by splitvol(), all keyed by work id such as 'tlg0062.tlg001'.
worktitles = {}   # work id -> title taken from the first <head>, via normcase()
hasbooks = {}     # work id -> 1 when a subtype="book" div was seen
firstpages = {}   # work id -> page number (string) current when the work starts
metadata = {}     # work id -> volumeinfo entry of the volume containing the work

# work id -> list of output lines for that work; replaced on every splitvol() call.
worklines = {}


def normcase(s):
    """Return `s` in title case, keeping short function words lower-case.

    The string is split on whitespace and every word is lower-cased. Words
    that are not in the stop list get their first character upper-cased. The
    words are joined with single spaces and the first character of the result
    is always upper-cased, so a leading stop word ("the ...") is capitalised.

    Empty or whitespace-only input returns '' (previously an IndexError).
    """
    stopwords = {'a', 'an', 'and', 'in', 'is', 'of', 'on', 'or', 'that', 'the', 'with'}

    words = []
    for word in s.split():
        word = word.lower()
        if word not in stopwords:
            # Plain slicing rather than re.sub: the capital letter must not be
            # interpreted as a replacement template (a leading backslash
            # used to raise re.error).
            word = word[0].upper() + word[1:]
        words.append(word)

    if not words:
        return ''

    title = ' '.join(words)
    return title[0].upper() + title[1:]

def add2teiheader(l,teiheader):
    """Normalise one source header line and append the result to `teiheader`.

    l         -- one line of the source TEI header (trailing whitespace removed
                 by the caller).
    teiheader -- list of header lines; modified in place. Nothing is returned.

    Rewrites applied:
      * the abbreviated translator <name> lines are replaced by full names;
      * xml:lang "en" becomes "eng";
      * any line containing <resp> is replaced by a fixed <resp> line;
      * <editor> gains role="translator";
      * a <revisionDesc> block is inserted just before </teiHeader>;
      * an <idno type="filename"> placeholder follows the "...Library" authority.
    """
    if(re.search('<name xml:lang="en">H. W. Fowler</name>',l)):
        teiheader.append('<name>Henry Watson Fowler</name>')
        return

    if(re.search('<name xml:lang="en">F. G. Fowler</name>',l)):
        teiheader.append('<name>Francis George Fowler</name>')
        return

    # Must run after the two <name> checks above, which look for "en".
    l = re.sub('"en"','"eng"',l)

    if(re.search('<resp>',l)):
        teiheader.append('<resp>Digital conversion and editing</resp>')
        return

    l = re.sub('<editor>','<editor role="translator">',l)

    # The original tested for '</teHeader' (missing "i"), so this branch never
    # ran and no <revisionDesc> was ever written. Skip the insertion if the
    # header collected so far already has one, to avoid a duplicate element.
    if(re.search('</teiHeader',l)):
        if(not any('<revisionDesc' in seen for seen in teiheader)):
            teiheader.append('<revisionDesc>')
            teiheader.append('<change when="2023-08-01" who="Gregory Crane">GRC found this file largely tagged but cannot find a record (or remember) how it was originally entered. He did the final work to make this compatible with Scaife.</change>')
            teiheader.append('</revisionDesc>')

    teiheader.append(l)
    # IDNO is a placeholder that splitvol() later replaces with the file name.
    if(re.search('Library</authority',l)):
        teiheader.append('<idno type="filename">IDNO</idno>')

def _scan_volume(lines, volnum):
    """First pass: build the shared TEI header and index the works of a volume.

    Every line before the first <text> element is passed through
    add2teiheader(). After that point the module-level dicts `hasbooks`,
    `metadata`, `firstpages` and `worktitles` are filled per work id.

    Returns the list of header lines. It still contains the WORKTITLE, IDNO
    and PNUM placeholders, which are resolved per work at write time.
    """
    curpagelink = 'https://archive.org/details/worksoflucianofs0'+volnum+'luciuoft/page/PNUM/mode/1up'
    teiheader = []
    curpage = ''
    curwork = ''
    sawtitle = False
    intext = False

    for l in lines:
        # \b keeps header elements such as <textClass> from ending the header early.
        if(re.search(r'<text\b',l)):
            intext = True

        if(not intext):
            l = re.sub(r'target="(https://archive[^"]+)','target="'+curpagelink,l)
            # Only the first <title ...> line becomes the per-work title placeholder.
            if(re.search('<title ',l) and not sawtitle):
                l = '<title xml:lang="eng">WORKTITLE</title>'
                sawtitle = True
            add2teiheader(l,teiheader)
            continue

        m = re.search(r'<pb n="([0-9]+)',l)
        if(m):
            curpage = m[1]

        if(re.search('subtype="book"',l)):
            hasbooks[curwork] = 1

        # Dot escaped so this matches exactly what the second pass matches.
        m = re.search(r'(tlg006[12]\.tlg[0-9]+)',l)
        if(m):
            curwork = m[1]
            metadata[curwork] = volumeinfo[volnum]
            if(curwork not in firstpages):
                if(curpage):
                    firstpages[curwork] = curpage
                else:
                    # Report only a genuinely missing page; the original also
                    # printed this when a first page was already recorded.
                    print('nofirstpage',curwork)

        m = re.search('<head>([^<]+)',l)
        if(m and curwork not in worktitles):
            worktitles[curwork] = normcase(m[1])

    return teiheader


def _collect_worklines(lines):
    """Second pass: group the lines of a volume by work id.

    Any line containing a work id starts a new work. That line itself is
    replaced by <text>, <body> and a translation <div> carrying the work's
    URN. Lines before the first work id and lines containing </TEI> are dropped.

    Returns a dict mapping work id -> list of lines.
    """
    grouped = {}
    curwork = ''
    for l in lines:
        m = re.search(r'(tlg006[12]\.tlg[0-9]+)',l)
        if(m):
            curwork = m[1]
            grouped[curwork] = [
                '<text xml:lang="eng">',
                '<body>',
                '<div type="translation" n="urn:cts:greekLit:'+curwork+'.'+edition+'" xml:lang="eng">',
            ]
            continue
        if(re.search('</TEI>',l)):
            continue
        if(curwork):
            grouped[curwork].append(l)
    return grouped


def _write_work(curwork, volnum, teiheader, body, outdir):
    """Write one work as a standalone TEI file under `outdir`.

    The header placeholders are resolved, the <refsDecl> is replaced by the
    content of bookref.txt or sectionref.txt (read from the working directory),
    and the body lines get volume-qualified page and note numbers, xml:lang on
    notes and xml:base on book and section divs.

    Raises KeyError if no title or first page was recorded for `curwork`.
    """
    xmlfile = curwork + '.' + edition + '.xml'
    outfname = outdir + re.sub(r'(tlg006[12])\.(tlg[0-9]+)',r'\g<1>/\g<2>/',curwork) + xmlfile
    print(outfname)

    refsfile = 'bookref.txt' if curwork in hasbooks else 'sectionref.txt'
    urn = 'urn:cts:greekLit:'+curwork+'.'+edition

    with open(outfname,'w',encoding='utf-8') as outf:
        inrefsdecl = False
        for l in teiheader:
            l = re.sub('<date>2023','<date type="release">2023-08-01',l)
            if(re.search('WORKTITLE',l)):
                # str.replace: titles are data, not regex replacement templates.
                l = l.replace('WORKTITLE',worktitles[curwork])
                l = l + '\n<author>Lucian</author>'
                l = l + '\n<editor role="translator">Henry Watson Fowler</editor>'
                l = l + '\n<editor role="translator">Francis George Fowler</editor>'
            l = l.replace('IDNO',xmlfile)
            l = l.replace('PNUM',firstpages[curwork])
            if(re.search('<refsD',l)):
                # Stay in "skip" mode only if the element is not closed on this
                # same line; otherwise the rest of the header would be dropped.
                inrefsdecl = re.search('</refsD',l) is None
                with open(refsfile,encoding='utf-8') as curf:
                    outf.write(curf.read())
                continue
            if(re.search('</refsD',l)):
                inrefsdecl = False
                continue
            if(inrefsdecl):
                continue
            print(l,file=outf)

        curbook = ''
        for l in body:
            # Each <pb> keeps its own number; the original rewrote every <pb>
            # on a line with the number of the first one.
            l = re.sub(r'<pb n="([0-9]+)"',r'<pb n="v.'+volnum+r'.p.\g<1>"',l)

            # page.note -> v.<vol>.p.<page>.n.<note>. The original class
            # `[^>*]` (misplaced "]") only matched when n was the first attribute.
            l = re.sub(r'(<note\b[^>]*?\s)n="([0-9]+)\.([0-9]+)"',r'\g<1>n="v.'+volnum+r'.p.\g<2>.n.\g<3>"',l)

            # Tag each complete note on the line with a language. Tagged notes
            # are renamed to <xnote> so the next search skips them, and are
            # renamed back afterwards.
            # NOTE(review): [α-ω] covers only unaccented lower-case Greek.
            while(True):
                m = re.search(r'(<note[^>]*>)(.+?</note>)',l)
                if(not m):
                    break
                lang = 'mul' if re.search('[α-ω]',m[2]) else 'eng'
                l = re.sub(r'(<note)([^>]*>.+?</note>)','<xnote xml:lang="'+lang+r'"\g<2>',l,count=1)
            l = re.sub('<xnote','<note',l)

            m = re.search(r'<div[^>]+subtype="book" n="([^"]+)"',l)
            if(m):
                curbook = m[1]
                l = re.sub(r'(subtype="[^"]+")',r'\g<1> xml:base="'+urn+'"',l)

            if(re.search(r'<div[^>]+subtype="section" n="([^"]+)"',l)):
                l = re.sub(r'(subtype="[^"]+")',r'\g<1> xml:base="'+urn+':'+curbook+'"',l)

            # Bare opening tags that were not handled by the loop above,
            # e.g. a note whose closing tag is on a later line.
            l = re.sub('<note>','<note xml:lang="eng">',l)
            print(l,file=outf)

        print('</body></text></TEI>',file=outf)


def splitvol(volnum, outdir='/Users/gcrane/github/first1kwork/data/'):
    """Split one volume file into one TEI file per work.

    volnum -- volume number as a string ('1'..'4'); the input file
              'lucian-vol<volnum>-eng.xml' is read from the working directory.
    outdir -- base directory (with trailing slash) for the output tree
              <outdir><textgroup>/<work>/<work id>.<edition>.xml.
              Defaults to the location used originally.

    Side effects: fills the module-level dicts worktitles, hasbooks,
    firstpages and metadata, replaces the module-level `worklines`, and
    prints each output path.
    """
    global worklines

    fname = 'lucian-vol' + volnum + '-eng.xml'
    # Explicit UTF-8: the notes contain Greek, so the platform default
    # encoding must not be relied on. Files are closed by the with-blocks.
    with open(fname,encoding='utf-8') as f:
        lines = [l.rstrip() for l in f]

    teiheader = _scan_volume(lines, volnum)
    worklines = _collect_worklines(lines)

    for curwork in worklines:
        _write_work(curwork, volnum, teiheader, worklines[curwork], outdir)

        


# Split each of the four printed volumes into per-work TEI files.
for i in range(1,5):
    splitvol(str(i))


# Locations of the catalogue files that are read and written below.
CANONICAL_DIR = '/Users/gcrane/github/canonical-greekLit/data/tlg0062/'
FIRST1K_DIR = '/Users/gcrane/github/first1kwork/data/'
# When True, an existing backup under ctsfiles/ is left untouched. The
# original guard was disabled with "and 0", so backups are rewritten each run.
KEEP_EXISTING_BACKUP = False

#don't add these again
# For every work that splitvol() found, back up the canonical __cts__.xml and
# write a copy with a <ti:translation> entry for this edition inserted before
# </ti:work>.
for i in range(1,71):
    curwork = 'tlg0062.tlg%03d' % i

    if(curwork not in worktitles):
        print('\n'+curwork,'skipped')
        continue

    group, work = curwork.split('.')

    # Explicit UTF-8 and with-blocks: the original relied on the platform
    # default encoding and never closed these handles.
    with open(CANONICAL_DIR + work + '/__cts__.xml',encoding='utf-8') as f:
        text = f.read()

    newfname = 'ctsfiles/' + curwork + '-cts.xml'
    if(KEEP_EXISTING_BACKUP and os.path.exists(newfname)):
        print('exists',newfname)
    else:
        with open(newfname,'w',encoding='utf-8') as backup:
            backup.write(text)
        print('created',newfname)

    outfname = FIRST1K_DIR + group + '/' + work + '/__cts__.xml'
    print('outfname',outfname)

    fullmeta = '<ti:translation urn="urn:cts:greekLit:'+curwork+'.'+edition+'" xml:lang="eng" workUrn="urn:cts:greekLit:'+curwork+'">\n'
    fullmeta = fullmeta + '<ti:label xml:lang="eng">' + worktitles[curwork] + '</ti:label>\n'
    fullmeta = fullmeta + '<ti:description xml:lang="eng">' + metadata[curwork] + '</ti:description>\n\n</ti:translation>\n\n</ti:work>\n'

    # str.replace: fullmeta is literal text, not a regex replacement template.
    newtext = text.replace('</ti:work>',fullmeta)
    with open(outfname,'w',encoding='utf-8') as outf:
        outf.write(newtext)
    print('writing',outfname,fullmeta)

/Users/gcrane/github/first1kwork/data/tlg0062/tlg029/tlg0062.tlg029.perseus-eng4.xml
/Users/gcrane/github/first1kwork/data/tlg0062/tlg064/tlg0062.tlg064.perseus-eng4.xml
/Users/gcrane/github/first1kwork/data/tlg0062/tlg007/tlg0062.tlg007.perseus-eng4.xml
/Users/gcrane/github/first1kwork/data/tlg0062/tlg014/tlg0062.tlg014.perseus-eng4.xml
/Users/gcrane/github/first1kwork/data/tlg0062/tlg022/tlg0062.tlg022.perseus-eng4.xml
/Users/gcrane/github/first1kwork/data/tlg0062/tlg020/tlg0062.tlg020.perseus-eng4.xml
/Users/gcrane/github/first1kwork/data/tlg0062/tlg068/tlg0062.tlg068.perseus-eng4.xml
/Users/gcrane/github/first1kwork/data/tlg0062/tlg067/tlg0062.tlg067.perseus-eng4.xml
/Users/gcrane/github/first1kwork/data/tlg0062/tlg066/tlg0062.tlg066.perseus-eng4.xml
/Users/gcrane/github/first1kwork/data/tlg0062/tlg035/tlg0062.tlg035.perseus-eng4.xml
/Users/gcrane/github/first1kwork/data/tlg0062/tlg023/tlg0062.tlg023.perseus-eng4.xml
/Users/gcrane/github/first1kwork/data/tlg0062/tlg027/tlg0062.tlg0

In [2]:
# Show the book-level refsDecl template, one list entry per output line.
for entry in refsbook:
    print(entry)

<refsDecl n="CTS">
<cRefPattern matchPattern="(\w+).(\w+)"
replacementPattern="#xpath(/tei:TEI/tei:text/tei:body/tei:div/tei:div[@n='$1']/tei:div[@n='$2'])">
<p>This pointer pattern extracts book and section</p>
</cRefPattern>
<cRefPattern matchPattern="(\w+)"
replacementPattern="#xpath(/tei:TEI/tei:text/tei:body/tei:div/tei:div[@n='$1'])">
<p>This pointer pattern extracts book</p>
</cRefPattern>
</refsDecl>


In [106]:
# NOTE(review): in the first cell `teiheader` is assigned only as a local
# inside splitvol() and is not declared global there, so this cell presumably
# displays a value left in the kernel by an earlier version of the code —
# confirm it still runs after Restart & Run All.
teiheader

['<?xml version="1.0" encoding="UTF-8"?>',
 '<?xml-model href="https://epidoc.stoa.org/schema/latest/tei-epidoc.rng" schematypens="http://relaxng.org/ns/structure/1.0"?>',
 '<TEI xmlns="http://www.tei-c.org/ns/1.0">',
 '<teiHeader xml:lang="eng">',
 '<fileDesc>',
 '<titleStmt>',
 '<title xml:lang="eng">WORKTITLE</title>',
 '<sponsor>Tufts University</sponsor>',
 '<funder>National Endowment for the Humanities</funder>',
 '<principal xml:id="GRC">Gregory Crane</principal>',
 '<respStmt>',
 '<persName>Gregory Crane</persName>',
 '<resp>Digital conversion and editing</resp>',
 '</respStmt>',
 '',
 '</titleStmt>',
 '<publicationStmt>',
 '<authority>Tufts University</authority>',
 '<idno type="filename">IDNO</idno>',
 '<authority>Perseus Digital Library</authority>',
 '<idno type="filename">IDNO</idno>',
 '<availability>',
 '<licence target="https://creativecommons.org/licenses/by-sa/4.0/">Available under a Creative Commons Attribution-ShareAlike 4.0 International License</licence>',
 '</ava