In [None]:
from bs4 import BeautifulSoup

In [None]:
import pandas as pd

In [None]:
import re

In [None]:
import os

In [None]:
import wikidataintegrator as wdi

# Conversion to TEI

In [159]:
class TEI:
    """EcoCor TEI document object.

    Wraps a BeautifulSoup tree parsed from the ``aux/EcoStub.xml`` stub and
    fills it with the metadata and, when a source file exists, the plain text
    of one work.

    Class attributes:
        langmap: maps the language labels stored in ``lang`` to the codes used
            in document ids and output paths.
        authors_formatted: cache of formatted author strings keyed by the
            author's Wikidata URL; shared by all instances.
        teiid: running document number; ``update_tree`` increments it.
    """
    langmap = {'Englisch':'en', 
               'Deutsch':'de',
               'Orig. Franz. / Englisch': 'en'
              }
    authors_formatted = {}
    teiid = 1

    def __init__(self, title, author, sourcetxtname):
        """Set up paths and defaults and parse the XML stub.

        Args:
            title: title of the work; also printed as progress output.
            author: author name as a plain string.
            sourcetxtname: name of the plain-text source file (with or without
                ``.txt``), or the sentinel ``'NoSourceTxt'``.
        """
        self.title = title
        print(title)
        self.author = author
        self.empty = False
        self.sourcetxtname = sourcetxtname.replace('.txt','')
        if sourcetxtname != 'NoSourceTxt':
            self.sourcetxtpath = f'ecocorMD/{self.sourcetxtname}.txt'
            self.outputfilename = self.sourcetxtname
            if not os.path.exists(self.sourcetxtpath):
                # A file name was given but the file is missing: treat as empty.
                self.empty = True
        else:
            self.empty = True
            self.sourcetxtpath = None
            self.outputfilename = title
        self.outputfilename = self.outputfilename.replace(' ', '')

        # Metadata filled in by the caller after construction.
        self.year = None
        self.wikidataid = None
        self.wikidataidauth = None
        self.lang = 'Undefined'
        with open('aux/EcoStub.xml') as ecostub:
            stubsoup = BeautifulSoup(ecostub, 'xml')
        self.tree = stubsoup
        self.len_words = 0

    def isolang(self):
        """Return the language code for ``self.lang``, or None if it is not in ``langmap``."""
        return self.langmap.get(self.lang)

    def measure_length(self, text):
        """Return the number of word tokens (runs of ``\\w`` characters) in ``text``."""
        return len(re.findall(r'\b\w.*?\b', text))

    def heur_conv(self, auth_string):
        '''Heuristic conversion of an author string for those who have no wikidata.

        The last whitespace-separated word is taken as the probable surname and
        moved to the front: ``'Mary Shelley'`` -> ``'Shelley, Mary'``.
        A string with fewer than two words is returned stripped, without a comma.
        '''
        words = auth_string.split()
        if len(words) < 2:
            return auth_string.strip()
        probable_surname = words[-1]
        rest = ' '.join(words[:-1])
        return f'{probable_surname}, {rest}'

    def wikify_tree(self, treetitle, treeauthor):
        """Write the formatted author name and the Wikidata references into the tree.

        Args:
            treetitle: the stub's title tag; receives ``self.wikidataid`` as ``ref``.
            treeauthor: the stub's author tag; its text is replaced and
                ``self.wikidataidauth`` is stored as ``ref``.
        """
        treeauthor.clear() # remove text from stub
        if 'entity/Q' not in self.wikidataidauth:
            # No usable Q-id: fall back to the name heuristic.
            auth_corr_format = self.heur_conv(self.author)
        elif self.wikidataidauth not in TEI.authors_formatted:
            auth_corr_format = get_author_correct_format(self.wikidataidauth)
            TEI.authors_formatted[self.wikidataidauth] = auth_corr_format
        else:
            auth_corr_format = TEI.authors_formatted[self.wikidataidauth]
        treeauthor.append(auth_corr_format)
        treeauthor['ref'] = self.wikidataidauth
        treetitle['ref'] = self.wikidataid

    def checksubchapterheader(self, paragraph):
        """True if the line starts with ``###`` (or more ``#``)."""
        return paragraph.startswith('###')

    def checkchapterheader(self, paragraph):
        """True if the line starts with exactly two ``#``."""
        return paragraph.startswith('##') and not paragraph.startswith('###')

    def checkpartheader(self, paragraph):
        """True if the line starts with exactly one ``#``."""
        return paragraph.startswith('#') and not paragraph.startswith('##')

    def checkfront(self, paragraph):
        """True if the line starts with ``^``, the marker for front matter."""
        return paragraph.startswith('^')

    def get_paragraph_id(self, count):
        """Return the ``xml:id`` for paragraph number ``count`` (document id + ``count*10``)."""
        return f'{self.get_full_id()}_{count*10}'

    def _new_div(self, divtype, paragraph):
        """Build a ``<div type=divtype>`` whose ``<head>`` is the header line without its ``#`` marks."""
        div = self.tree.new_tag('div')
        head = self.tree.new_tag('head')
        head.append(paragraph.strip('#'))
        div['type'] = divtype
        div.append(head)
        return div

    def add_text(self):
        """Fill the stub's ``<text>`` element from the plain-text source file.

        Every line of the source file is one paragraph. Lines starting with
        ``#``, ``##`` and ``###`` open a part (``div type="group"``), a chapter
        and a subchapter respectively; lines starting with ``^`` go into a
        ``<front>`` element; other non-blank lines become ``<p>`` elements in
        the most recently opened container. Also stores the word count of the
        file in ``self.len_words``.

        Returns:
            ``'NoText'`` when there is no source file (``self.empty`` is then
            set and ``<text>`` is left cleared); otherwise None.
        """
        teitext = self.tree.find('text')
        teitext.clear()
        if self.sourcetxtpath is None or not os.path.exists(self.sourcetxtpath):
            self.empty = True
            return 'NoText'
        with open(self.sourcetxtpath) as opensourcefile:
            text_to_insert = opensourcefile.readlines()
        self.len_words = self.measure_length(''.join(text_to_insert))

        body = self.tree.new_tag('body')
        toptag = body      # container for chapters: body or the current part
        chapter = None     # latest chapter inside the current part, if any
        currentdiv = body  # container that receives plain paragraphs
        pcount = 1
        for paragraph in text_to_insert:
            if self.checksubchapterheader(paragraph):
                div = self._new_div('subchapter', paragraph)
                # A subchapter that precedes any chapter hangs directly under
                # the part/body instead of failing on an unset `chapter`.
                parent = chapter if chapter is not None else toptag
                parent.append(div)
                currentdiv = div
            elif self.checkchapterheader(paragraph):
                div = self._new_div('chapter', paragraph)
                toptag.append(div)
                chapter = div
                currentdiv = div
            elif self.checkpartheader(paragraph):
                div = self._new_div('group', paragraph)
                body.append(div)
                toptag = div
                # Chapters of the previous part must not adopt later subchapters.
                chapter = None
                currentdiv = div
            elif self.checkfront(paragraph):
                # NOTE(review): every '^' line opens its own <front>, and the
                # emptiness check strips only '^', so it passes for any line
                # that still carries its newline. Kept as-is; confirm intent.
                front = self.tree.new_tag('front')
                p = self.tree.new_tag('p')
                p['xml:id'] = self.get_paragraph_id(pcount)
                pcount+=1
                p.append(paragraph.strip('^ \n'))
                if len(paragraph.strip('^')) > 0:
                    front.append(p)
                teitext.append(front)
                currentdiv = front
            else:
                p = self.tree.new_tag('p')
                p['xml:id'] = self.get_paragraph_id(pcount)
                # Blank lines consume a paragraph number too, so ids may skip.
                pcount+=1
                p.append(paragraph)
                if len(paragraph.strip()) > 0:
                    currentdiv.append(p)
        teitext.append(body)

    def get_full_id(self):
        """Return the document id ``eco_<lang>_<number>``.

        The number is ``TEI.teiid`` zero-padded to six digits, so the width
        stays constant beyond 99 documents.
        """
        return f'eco_{self.isolang()}_{TEI.teiid:06d}'

    def update_tree(self):
        """Write all metadata and the text into the tree.

        Not idempotent: every call appends the year to the first-edition date
        again and increments ``TEI.teiid``.
        """
        treetitle = self.tree.find('title') # find title tag in the stub
        treetitle.clear() # remove text from stub
        treetitle.append(self.title) # add current title

        sdesc = self.tree.find('sourceDesc')
        for bibl in sdesc.find_all('bibl'):
            if bibl.attrs.get('type') == 'firstEdition':
                thisdate = bibl.find('date')
                thisdate.append(str(self.year))
                thisdate['when'] = str(self.year)[:4]

        treeauthor = self.tree.find('author')
        self.wikify_tree(treetitle, treeauthor) ## add wikidata links

        self.add_text()

        # The stub's <measure> element receives the word count from add_text.
        numpages = self.tree.find('measure')
        numpages.clear()
        numpages.append(str(self.len_words))

        root = self.tree.find('TEI')
        root['xml:id'] = self.get_full_id()
        # Paragraph ids were built from the same number, so bump it only now.
        TEI.teiid+=1
        root['xml:lang'] = self.isolang()

    def serialize(self):
        """Update the tree and return it as a pretty-printed XML string."""
        self.update_tree()
        return self.tree.prettify()

    def choose_folder(self):
        """Return the output subfolder: ``'no_text_yet'`` if empty, else ``'has_text'``."""
        if self.empty:
            return 'no_text_yet'
        return 'has_text'

    def output_TEI(self):
        """Serialize the document and write it to ``tei/<lang>/<folder>/<outputfilename>.xml``.

        The file is written as UTF-8 so the result does not depend on the
        platform's default encoding.
        """
        with open(f'tei/{self.isolang()}/{self.choose_folder()}/{self.outputfilename}.xml',
                  'w', encoding='utf-8') as outfile:
            outfile.write(self.serialize())
        
        

In [160]:
def row_to_tei(row):
    """Build a TEI document from one metadata row and write it to disk.

    Args:
        row: mapping with the keys 'Autor*in', 'Titel', 'Filename', 'Jahr',
            'Wiki-Data ID Work', 'Wiki-Data ID Author' and 'Sprache'.

    Returns:
        The string '<Filename> success'.
    """
    author, title, sourcetxtname = row['Autor*in'], row['Titel'], row['Filename']
    document = TEI(title, author, sourcetxtname)

    document.year = row['Jahr']
    # Work ids are stored as wiki page URLs; the TEI needs the entity URL.
    work_url = row['Wiki-Data ID Work']
    document.wikidataid = work_url.replace('/wiki/','/entity/')
    # Author ids are stored as bare Q-ids and get the entity URL prefix.
    author_qid = row["Wiki-Data ID Author"]
    document.wikidataidauth = 'https://www.wikidata.org/entity/{}'.format(author_qid)
    document.lang = row['Sprache']

    document.output_TEI()
    return '{} success'.format(sourcetxtname)

In [161]:
# Load the EcoCor metadata spreadsheet; row_to_tei later reads one row of it per work.
df = pd.read_excel('aux/EcoCorMetadata.xlsx')

In [162]:
# Rows without a source file get the 'NoSourceTxt' sentinel that TEI.__init__ checks for.
# Note: this overwrites the column in place, so the raw NaN values are gone after this cell.
df['Filename'] = df['Filename'].fillna('NoSourceTxt')

### Getting the normalized author string in ELTeC format
https://github.com/dracor-org/eco_en/issues/4

When used within a `<titleStmt>`, an author's name is given in a standardized format (surname, forename/s, (YYYY-YYYY)), as shown in this example:
`<author ref="viaf:31996364">Forster, Edward Morgan (1879-1970)</author>`

In [None]:
def query_wikidata_for_author_data(auth_wikidata_id):
    """Run a SPARQL query for an author's name parts and life dates.

    The query selects the labels of the given name (P735) and family name
    (P734) plus the values of P569 and P570 for the item. None of the four
    patterns is OPTIONAL, so an item lacking any of them yields no rows.

    Args:
        auth_wikidata_id: bare Wikidata item id, e.g. ``'Q5879'``.

    Returns:
        Whatever ``WDItemEngine.execute_sparql_query`` returns for the query.
    """
    sparql = f"""
    SELECT ?family_nameLabel ?given_nameLabel ?dob ?dod
    WHERE {{
      wd:{auth_wikidata_id} wdt:P735 ?given_name.
      wd:{auth_wikidata_id} wdt:P734 ?family_name.
      wd:{auth_wikidata_id} wdt:P569 ?dob.
      wd:{auth_wikidata_id} wdt:P570 ?dod.

      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
    }}
    """
    engine = wdi.wdi_core.WDItemEngine
    return engine.execute_sparql_query(sparql)

In [None]:
def get_author_correct_format(wikidata_id):
    """Return an author's name as ``'Surname, Forename (YYYY-YYYY)'``.

    Args:
        wikidata_id: bare Q-id or full ``https://www.wikidata.org/entity/...``
            URL of the author.

    Returns:
        The formatted string built from the last result row of
        ``query_wikidata_for_author_data``; the years are the first four
        characters of the ``dob`` and ``dod`` values.

    Raises:
        LookupError: if the query returns no rows. That query requires all
            four properties, so an author missing one of them lands here.
    """
    qid = wikidata_id.replace('https://www.wikidata.org/entity/','')
    bindings = query_wikidata_for_author_data(qid)['results']['bindings']
    if not bindings:
        # Previously this fell through to the return and failed on unset locals.
        raise LookupError(f'Wikidata returned no name/date data for {qid}')
    # When several rows come back, the last one is used.
    item = bindings[-1]
    surname = item['family_nameLabel']['value']
    name = item['given_nameLabel']['value']
    yob = item['dob']['value'][:4]
    yod = item['dod']['value'][:4]
    return f'{surname}, {name} ({yob}-{yod})'

In [None]:
# Manual check of the author lookup for a single Q-id.
# NOTE(review): presumably sends a live query to Wikidata, so it needs network access — confirm.
get_author_correct_format('Q5879')

## Run

In [163]:
# Reset the running document number so ids start at 1 each time this cell runs,
# then build and write one TEI file per metadata row.
TEI.teiid = 1
df.apply(row_to_tei, axis=1)

Utopia
Nova Atlantis
Robinson Crusoe
The Mysteries of Udolpho
Frankenstein
Moby-Dick
The Last Man
The Paradise of Bachelors and the Tartarus of Maids
Little House on the Prairie
Paris in the Twentieth Century
The Purchase of the North Pole
Tess of the d’Urbervilles 
The American Claimant 
Earth Revisited
The Aerial Brickfield
Dracula
A Corner in Lightning
The Wreck of the South Pole, or the Great Dissembler 
Heart of Darkness
The White Battalions
The Evacuation of England: The Twist in the Gulf Stream
The Great Weather Syndicate 
England's The Air Trust
The Man Who Rocked The Earth
Erewhon
Wuthering Heights
After London, or Wild England
The Hoosier Schoolmaster: A Story of Backwoods Life in Indiana
Deephaven
A Crystal Age
Wilhelm Meisters Lehrjahre
Heinrich von Ofterdingen
Das Erdbeben in Chili
Die Elfen
Das fremde Kind
Die Bergwerke zu Falun
Lebens-Ansichten des Katers Murr
Wilhelm Meisters Wanderjahre
Die Judenbuche
Der Hochwald
Bunte Steine
Der Grüne Heinrich (zweite Fassung)
Nachso

0                                 NoSourceTxt success
1                                 NoSourceTxt success
2                                 NoSourceTxt success
3                      1794_Radcliffe_Udolpho success
4                   1818_Shelley_Frankenstein success
                           ...                       
63            1836_Chamisso_Reise-um-die-Welt success
64     1789_White_Natural-History-of-Selborne success
65                    1850_Cooper_Rural-Hours success
66                        1854_Thoreau_Walden success
67    1911_Muir_My-First-Summer-in-the-Sierra success
Length: 68, dtype: object