## Imports

In [None]:
import os

In [None]:
import re

In [None]:
from bs4 import BeautifulSoup

In [None]:
import pandas as pd

In [None]:
#!pip3 install yiddish

In [None]:
import yiddish

## Data import and initial exploration of metadata

In [None]:
# Load the tab-separated metadata table; the path is relative to the current working directory.
table = pd.read_csv('../preparatory_data/benyehuda_drama.12092023.tsv', sep='\t')

In [None]:
table

In [None]:
table['html']

In [None]:
table.columns

In [None]:
table.info()

In [None]:
# Replace missing English author names with the placeholder 'NoAuthor':
# parse_play() calls .lower() on this column to build the output file name.
table['Author_EN'] = table['Author_EN'].fillna('NoAuthor')

In [None]:
# Bar chart of the number of rows per English author name.
table['Author_EN'].value_counts().plot.bar(figsize=(15,8));

In [None]:
# The same per-author counts, drawn as a pie chart.
table['Author_EN'].value_counts().plot.pie(figsize=(10,10));

In [None]:
# Cast the publication date column to datetime64 so the `.dt` accessor can be used on it below.
table = table.astype({'orig_publication_date': 'datetime64[ns]'})

In [None]:
table['orig_publication_date'].dt.year

In [None]:
# Histogram (20 bins) of the original publication years.
table['orig_publication_date'].dt.year.plot.hist(figsize=(15,8), bins=20);

## TEI creation functions

In [None]:
def parse_table(table):
    """Run ``parse_play`` once for every row of *table*.

    The value returned by ``DataFrame.apply`` is discarded, so this
    function returns ``None``; the work happens through ``parse_play``.
    """
    table.apply(parse_play, axis='columns')

In [None]:
def parse_play(row):
    """Convert one metadata row (including its HTML text) into a TEI file.

    The row must provide the columns ``html``, ``Creation_Date``,
    ``raw_publication_date``, ``Author_EN`` and ``TextID`` (plus whatever
    ``populate_the_tei_header`` reads). The resulting document is written
    to ``../tei/<author_en>_<TextID>.xml``, where the author name is
    lower-cased and its spaces are replaced by underscores.

    Returns ``None``.
    """
    html = row['html']
    # No parser is named here, so BeautifulSoup picks one from those installed.
    # NOTE(review): consider pinning a parser so results do not depend on the
    # environment -- confirm which one the existing output was produced with.
    soup = BeautifulSoup(html)
    html_body = soup.find('body')
    # The <text> element is returned by the stub builder but not needed here.
    tei_tree, tei_header, standoff, _tei_text, tei_body = generate_tei_stub(soup)
    populate_the_tei_body(tei_body, html_body, soup)
    populate_the_tei_header(row, tei_header, soup)
    creation_date = row['Creation_Date']
    print_date = row['raw_publication_date']
    fill_standoff_with_dates(standoff, written=creation_date, print_date=print_date)
    new_file_path = f'../tei/{row["Author_EN"].lower().replace(" ", "_")}_{row["TextID"]}.xml'
    # The plays contain non-ASCII text, so the encoding must be explicit:
    # the platform default is not UTF-8 everywhere.
    with open(new_file_path, 'w', encoding='utf-8') as output:
        output.write(tei_tree.prettify())

In [None]:
def populate_the_tei_body(tei_body, html_body, soup):
    """Append one parsed speech to *tei_body* for every ``<p>`` in *html_body*.

    Each paragraph found by ``html_body.find_all('p')`` is converted with
    ``parse_p`` and the result is appended in document order.
    """
    for paragraph in html_body.find_all('p'):
        tei_body.append(parse_p(paragraph, soup))

In [None]:
def salvage_speaker_regex(newsp, soup):
    """Rebuild *newsp* from its plain text, guessing the speaker by pattern.

    The element's text is taken, the element is emptied, and the text is
    matched against ``NAME:rest`` where NAME is a whitespace-free token at
    the very start. On a match a ``<speaker>`` (with an ``id`` derived via
    ``create_id``) is added, followed by a ``<p>`` holding the rest;
    otherwise the whole text is put into a single ``<p>``.

    *newsp* is modified in place; nothing is returned.
    """
    text = newsp.text
    newsp.clear()
    # DOTALL lets the speech part span line breaks; without it everything
    # after the first newline would be silently dropped.
    lookup = re.search(r"(^[^\s]+):(.+)", text, flags=re.DOTALL)
    # The speech always goes into a <p>, matching what parse_p produces.
    speechtext = soup.new_tag("p")
    if lookup is not None:
        speakername = lookup.group(1)
        speaker_id = create_id(speakername)
        newspeaker = soup.new_tag("speaker", id=speaker_id)
        newspeaker.append(speakername)
        newsp.append(newspeaker)
        speechtext.append(lookup.group(2))
    else:
        speechtext.append(text)
    newsp.append(speechtext)

In [None]:
def parse_p(item, soup):
    """Turn one HTML ``<p>`` element into a new ``<sp>`` element.

    Every ``<strong>`` child becomes a ``<speaker>`` whose ``id`` comes from
    ``create_id``; every other child is moved into its own ``<p>`` inside
    the ``<sp>``. If no ``<strong>`` child was seen, the speaker is guessed
    from the text by ``salvage_speaker_regex``.

    Non-``<strong>`` children are moved out of *item*, so *item* is modified.
    Returns the new ``<sp>`` tag.
    """
    newsp = soup.new_tag("sp")
    found_bold = False
    # Iterate over a snapshot: appending a child to another tag detaches it
    # from `item`, and mutating the child list while iterating over it
    # makes the loop skip the following sibling.
    for child in list(item.contents):
        if child.name == 'strong':
            found_bold = True
            speakername = child.text
            speaker_id = create_id(speakername)
            newspeaker = soup.new_tag("speaker", id=speaker_id)
            newspeaker.append(speakername)
            newsp.append(newspeaker)
        else:
            speechtext = soup.new_tag("p")
            speechtext.append(child)
            newsp.append(speechtext)
    if not found_bold:
        salvage_speaker_regex(newsp, soup)
    return newsp

In [None]:
def create_id(speakername):
    """Derive an identifier string from a speaker name.

    The name is lower-cased and any leading/trailing ``.,:!;`` characters
    and spaces are stripped. If the result contains at least one character
    from the Hebrew-script character class below, it is passed through
    ``yiddish.transliterate``; otherwise it is returned as is.
    """
    normalized = speakername.lower().strip('.,:!; ')
    has_hebrew_script = re.search('[אאַאָבבֿגדהוװוּױזחטייִײײַככּךלמםנןסעפּפֿףצץקרששׂתּת]', normalized)
    if not has_hebrew_script:
        return normalized
    return yiddish.transliterate(normalized)
    

In [None]:
# Quick check: a name in Hebrew script takes the transliteration branch of create_id.
create_id('פלָוִיוס')

In [None]:
# Quick check: a Latin-script name is only lower-cased and stripped.
create_id('Daniel')

### TEI generation functions

In [None]:
def generate_tei_stub(soup):
    """Build an empty TEI skeleton using *soup* as the tag factory.

    The skeleton is ``<TEI>`` (with the TEI namespace set as ``xmlns``)
    containing, in this order, ``<teiHeader>``, ``<standOff>`` and
    ``<text>``; ``<text>`` in turn contains an empty ``<body>``.

    Returns the tuple ``(TEI, teiHeader, standOff, text, body)``.
    """
    root = soup.new_tag("TEI")
    root['xmlns'] = "http://www.tei-c.org/ns/1.0"

    header = soup.new_tag("teiHeader")
    stand_off = soup.new_tag("standOff")
    text = soup.new_tag("text")
    body = soup.new_tag("body")

    text.append(body)
    for section in (header, stand_off, text):
        root.append(section)

    return root, header, stand_off, text, body

In [None]:
def populate_the_tei_header(row, tei_header, soup):
    """Fill *tei_header* with a ``<fileDesc>`` and the metadata from *row*.

    A ``<fileDesc>`` holding an empty ``<titleStmt>`` is created, the
    publication statement and source description are added to it, and it
    is appended to *tei_header*. ``populate_header_from_metadata`` then
    adds the values taken from *row*.
    """
    file_desc = soup.new_tag('fileDesc')
    file_desc.append(soup.new_tag('titleStmt'))
    add_pbstmt(file_desc)
    add_sourcedesc(file_desc)
    tei_header.append(file_desc)
    populate_header_from_metadata(row, tei_header, soup)
    

In [None]:
def populate_header_from_metadata(row, tei_header, soup):
    """Copy the title and author names from *row* into *tei_header*.

    Reads the ``Title``, ``Author`` and ``Author_EN`` entries of *row* and
    hands them to ``add_title_to_header`` and ``add_author_to_header``.
    """
    add_title_to_header(tei_header, row['Title'], soup)
    add_author_to_header(tei_header, row['Author'], row['Author_EN'], soup)

In [None]:
def add_author_to_header(header, author_name, soup):
    """Append an ``<author>`` holding *author_name* to the ``<titleStmt>``.

    NOTE: a four-argument function with the same name is defined further
    down in this file; once both definitions have been executed, the later
    one replaces this one.
    """
    title_stmt = header.find('titleStmt')
    author_tag = soup.new_tag('author')
    author_tag.append(author_name)
    title_stmt.append(author_tag)

In [None]:
def add_title_to_header(header, play_title, soup):
    """Append ``<title type="main">`` holding *play_title* to the ``<titleStmt>``.

    The ``<titleStmt>`` is looked up inside *header*; *soup* is only used
    to create the new tag.
    """
    title_stmt = header.find('titleStmt')
    main_title = soup.new_tag('title')
    main_title.append(play_title)
    main_title['type'] = 'main'
    title_stmt.append(main_title)

In [None]:
def add_author_to_header(header, author_name, author_name_eng, soup):
    """Append two ``<author>`` elements to the ``<titleStmt>`` of *header*.

    The first holds *author_name* and has no attributes; the second holds
    *author_name_eng* and carries ``lang="eng"``.
    """
    title_stmt = header.find('titleStmt')
    # NOTE(review): TEI normally expresses language with `xml:lang` -- confirm
    # whether the plain `lang` attribute is intended here.
    variants = ((author_name, {}), (author_name_eng, {'lang': 'eng'}))
    for name, attributes in variants:
        author_tag = soup.new_tag('author')
        for key, value in attributes.items():
            author_tag[key] = value
        author_tag.append(name)
        title_stmt.append(author_tag)

In [None]:
def add_pbstmt(filedesc):
    """Append the fixed DraCor ``<publicationStmt>`` to *filedesc*.

    The statement is a constant XML snippet (publisher DraCor, the
    dracor.org URL and a CC0 1.0 licence) that is parsed with the ``xml``
    parser; only its ``<publicationStmt>`` element is appended.
    """
    publication_xml = """
      <publicationStmt>
        <publisher xml:id="dracor">DraCor</publisher>
        <idno type="URL">https://dracor.org</idno>
        <availability>
          <licence>
            <ab>CC0 1.0</ab>
            <ref target="https://creativecommons.org/publicdomain/zero/1.0/">Licence</ref>
          </licence>
        </availability>
      </publicationStmt>
    """
    parsed = BeautifulSoup(publication_xml, 'xml')
    filedesc.append(parsed.find('publicationStmt'))

In [None]:
def add_sourcedesc(filedesc):
    """Append a template ``<sourceDesc>`` to *filedesc*.

    The snippet is constant: the source name and URL are still the
    placeholder texts below and are not filled in by this function.
    Only the ``<sourceDesc>`` element of the parsed snippet is appended.
    """
    source_xml = """
      <sourceDesc>
        <bibl type="digitalSource">
          <name>ENTER SOURCE NAME HERE</name>
          <idno type="URL">ENTER SOURCE URL HERE</idno>
          <availability status="free">
            <p>In the public domain.</p>
          </availability>
        </bibl>
      </sourceDesc>
    """
    parsed = BeautifulSoup(source_xml, 'xml')
    filedesc.append(parsed.find('sourceDesc'))

In [None]:
def fill_standoff_with_dates(standoff, 
                 print_date=None,
                 written=None,
                 premiere=None):
    """Append a ``<listEvent>`` with print/premiere/written events to *standoff*.

    Three ``<event>`` elements are always created, in the order ``print``,
    ``premiere``, ``written``, each with an empty ``<desc/>``. An event gets
    a ``when`` attribute only if its date is present: ``None``, NaN/NaT and
    blank strings are treated as missing, so no ``when="None"`` or
    ``when="nan"`` is ever written. Date values are converted with ``str``
    and escaped before being placed in the attribute.

    *standoff* is modified in place; nothing is returned.
    """
    # Local import keeps this cell self-contained.
    from xml.sax.saxutils import quoteattr

    def _when_attribute(value):
        """Return ' when="..."' for a usable date, or '' when it is missing."""
        if value is None or pd.isna(value) or not str(value).strip():
            return ''
        # quoteattr adds the surrounding quotes and escapes &, <, > and quotes.
        return f' when={quoteattr(str(value))}'

    dated_events = (
        ('print', print_date),
        ('premiere', premiere),
        ('written', written),
    )
    events_xml = ''.join(
        f'<event type="{event_type}"{_when_attribute(value)}><desc/></event>'
        for event_type, value in dated_events
    )
    # Append the <listEvent> element itself rather than the whole parsed
    # document, as the other header helpers do with their snippets.
    listevent = BeautifulSoup(f'<listEvent>{events_xml}</listEvent>', 'xml').listEvent
    standoff.append(listevent)

## Use

In [None]:
# Convert every row of the table; parse_play writes one XML file per row under ../tei/.
parse_table(table)