In [None]:
from bs4 import BeautifulSoup

In [None]:
import pandas as pd

In [None]:
import re

In [None]:
import os

# Conversion to TEI

In [None]:
class TEI:
    """EcoCor TEI document object.

    Wraps a BeautifulSoup tree parsed from the stub file ``aux/EcoStub.xml``
    and fills it with the metadata and, when a source file exists, the plain
    text of one work. ``serialize`` renders the tree, ``output_TEI`` writes it
    to ``tei/<lang>/<has_text|no_text_yet>/<outputfilename>.xml``.

    Source-text conventions understood by ``add_text`` (one unit per line):
    ``##`` opens a chapter, a single ``#`` opens a group (part), ``^`` marks
    front matter, every other non-blank line becomes a paragraph.
    """
    # Language labels as they appear in the metadata -> code used for
    # xml:lang and for the output folder name.
    langmap = {'Englisch':'en', 
               'Deutsch':'de',
               'Orig. Franz. / Englisch': 'en'
              }
    # Class-wide running number; the next newly assigned xml:id uses this value.
    teiid = 1

    def __init__(self, title, author, sourcetxtname):
        """Set up the document and load the TEI stub.

        Parameters
        ----------
        title, author : str
            Inserted into the stub's <title> and <author> elements.
        sourcetxtname : str
            File name of the plain-text source inside ``ecocorMD/`` (with or
            without ``.txt``), or the sentinel ``'NoSourceTxt'`` when the work
            has no source text.
        """
        self.title = title
        print(title)  # progress output while a batch of rows is converted
        self.author = author
        self.sourcetxtname = sourcetxtname.replace('.txt','')
        if sourcetxtname != 'NoSourceTxt':
            self.sourcetxtpath = f'ecocorMD/{self.sourcetxtname}.txt'
            self.outputfilename = self.sourcetxtname
            # A named but missing file is treated like "no text yet".
            self.empty = not os.path.exists(self.sourcetxtpath)
        else:
            self.empty = True
            self.sourcetxtpath = None
            # Without a source file the output is named after the title.
            self.outputfilename = title

        # Metadata the caller fills in after construction.
        self.year = None
        self.wikidataid = None
        self.wikidataidauth = None
        self.lang = 'Undefined'
        # Explicit encoding so parsing does not depend on the platform default
        # (assumes the stub is UTF-8 encoded -- TODO confirm).
        with open('aux/EcoStub.xml', encoding='utf-8') as ecostub:
            self.tree = BeautifulSoup(ecostub, 'xml')
        self.len_words = 0
        # xml:id of this document; assigned on the first update_tree() call.
        self.xmlid = None

    @staticmethod
    def _format_xml_id(number):
        """Return the xml:id for a running number: 'eco' + number zero-padded to six digits."""
        return f'eco{number:06d}'

    def isolang(self):
        """Return the language code mapped to ``self.lang``, or None if the label is not in ``langmap``."""
        return self.langmap.get(self.lang)

    def measure_length(self, text):
        """Return the number of words (maximal runs of word characters) in ``text``."""
        return len(re.findall(r'\w+', text))

    def wikify_tree(self, treetitle, treeauthor):
        """Store the Wikidata references in the ``ref`` attributes of the title and author tags."""
        treeauthor['ref'] = self.wikidataidauth
        treetitle['ref'] = self.wikidataid

    def checkchapterheader(self, paragraph):
        """Return True if the line starts with '##' (chapter heading)."""
        return paragraph.startswith('##')

    def checkpartheader(self, paragraph):
        """Return True if the line starts with exactly one leading '#' (part heading)."""
        return paragraph.startswith('#') and not paragraph.startswith('##')

    def checkfront(self, paragraph):
        """Return True if the line starts with '^' (front matter)."""
        return paragraph.startswith('^')

    def _new_div(self, divtype, heading):
        """Create a <div type=divtype> whose first child is a <head> with the heading text."""
        div = self.tree.new_tag('div')
        div['type'] = divtype
        head = self.tree.new_tag('head')
        # Drop the leading '#' markers and the surrounding whitespace/newline.
        head.append(heading.lstrip('#').strip())
        div.append(head)
        return div

    def add_text(self):
        """Rebuild the content of the stub's <text> element from the source file.

        Empties <text>, then converts the source line by line into an optional
        <front> followed by a <body> (see the class docstring for the line
        conventions) and stores the word count in ``self.len_words``.

        Returns
        -------
        'NoText' if there is no readable source file (``self.empty`` is set
        and <text> stays empty), otherwise None.
        """
        teitext = self.tree.find('text')
        teitext.clear()
        body = self.tree.new_tag('body')
        if self.sourcetxtpath is None or not os.path.exists(self.sourcetxtpath):
            self.empty = True
            return 'NoText'
        # assumes the source texts are UTF-8 encoded -- TODO confirm
        with open(self.sourcetxtpath, encoding='utf-8') as opensourcefile:
            text_to_insert = opensourcefile.readlines()
        self.len_words = self.measure_length(''.join(text_to_insert))

        currentdiv = body  # container that receives plain paragraphs
        front = None       # created lazily; a <text> gets at most one <front>
        for paragraph in text_to_insert:
            if self.checkchapterheader(paragraph):
                # NOTE(review): chapters are appended to <body> as siblings of
                # group divs, not nested inside them -- confirm that this flat
                # structure is intended.
                currentdiv = self._new_div('chapter', paragraph)
                body.append(currentdiv)
            elif self.checkpartheader(paragraph):
                currentdiv = self._new_div('group', paragraph)
                body.append(currentdiv)
            elif self.checkfront(paragraph):
                if front is None:
                    front = self.tree.new_tag('front')
                    teitext.append(front)
                content = paragraph.strip('^ \n')
                if content:  # a bare '^' line contributes no paragraph
                    p = self.tree.new_tag('p')
                    p.append(content)
                    front.append(p)
                # Plain lines that follow a front line are collected in <front>
                # until the next heading.
                currentdiv = front
            elif paragraph.strip():
                p = self.tree.new_tag('p')
                p.append(paragraph)
                currentdiv.append(p)
        teitext.append(body)

    def update_tree(self):
        """Write title, author, Wikidata refs, text, word count, xml:id and xml:lang into the tree.

        The xml:id is taken from the class-wide counter the first time this
        runs for an instance and reused afterwards, so serialising a document
        twice does not consume a second id.
        """
        treetitle = self.tree.find('title') # find title tag in the stub
        treetitle.clear() # remove text from stub
        treetitle.append(self.title) # add current title
        treeauthor = self.tree.find('author')
        treeauthor.clear() # remove text from stub
        treeauthor.append(self.author) # add current author

        self.wikify_tree(treetitle, treeauthor) # add wikidata links

        self.add_text()

        # The stub's <measure> element carries the word count.
        numpages = self.tree.find('measure')
        numpages.clear()
        numpages.append(str(self.len_words))

        root = self.tree.find('TEI')
        if self.xmlid is None:
            self.xmlid = self._format_xml_id(TEI.teiid)
            TEI.teiid += 1
        root['xml:id'] = self.xmlid
        # NOTE(review): isolang() is None for labels missing from langmap, so
        # xml:lang is then set to None -- confirm how such rows should be handled.
        root['xml:lang'] = self.isolang()

    def serialize(self):
        """Update the tree from the current attributes and return it as pretty-printed XML text."""
        self.update_tree()
        return self.tree.prettify()

    def choose_folder(self):
        """Return the output sub-folder: 'no_text_yet' for empty documents, else 'has_text'."""
        return 'no_text_yet' if self.empty else 'has_text'

    def output_TEI(self):
        """Serialize the document and write it to tei/<lang>/<folder>/<outputfilename>.xml.

        The target folder is created if it does not exist. For a language
        label missing from ``langmap`` the <lang> path component is the text
        'None'.
        """
        folder = f'tei/{self.isolang()}/{self.choose_folder()}'
        os.makedirs(folder, exist_ok=True)
        # Written as UTF-8 so the file's bytes do not depend on the platform default.
        with open(f'{folder}/{self.outputfilename}.xml', 'w', encoding='utf-8') as outfile:
            outfile.write(self.serialize())
        
        

In [None]:
def row_to_tei(row):
    """Convert one metadata row into a TEI file on disk.

    Builds a ``TEI`` object from the row's title, author and source file
    name, copies year, Wikidata identifiers and language onto it and writes
    the document with ``output_TEI``.

    Parameters
    ----------
    row : mapping-like
        Must provide the keys 'Autor*in', 'Titel', 'Filename', 'Jahr',
        'Wiki-Data ID Work', 'Wiki-Data ID Author' and 'Sprache'.

    Returns
    -------
    str
        The row's 'Filename' value followed by ' success'.
    """
    author, title, sourcetxtname = (
        row[column] for column in ('Autor*in', 'Titel', 'Filename')
    )
    document = TEI(title, author, sourcetxtname)

    # The work id is stored as found in the row; the author id is expanded
    # into a Wikidata URL.
    document.year = row['Jahr']
    document.wikidataid = row['Wiki-Data ID Work']
    document.wikidataidauth = f'https://www.wikidata.org/wiki/{row["Wiki-Data ID Author"]}'
    document.lang = row['Sprache']

    document.output_TEI()
    return f'{sourcetxtname} success'

In [None]:
# Load the corpus metadata spreadsheet.
df = pd.read_excel(io='aux/EcoCorMetadata.xlsx')

In [None]:
# Rows without a source file get the sentinel name that TEI.__init__ checks for.
df['Filename'] = df['Filename'].where(df['Filename'].notna(), 'NoSourceTxt')

In [None]:
# Convert every metadata row; the cell displays one status string per row.
df.apply(row_to_tei, axis='columns')