## Imports

In [None]:
import os

In [None]:
import re

In [None]:
from bs4 import BeautifulSoup

In [None]:
import pandas as pd

## Data import and initial exploration of metadata

In [None]:
# Load the tab-separated source file into a DataFrame; later cells read its
# 'html', 'TextID', 'AuthorWikidata' and 'orig_publication_date' columns.
table = pd.read_csv('../preparatory_data/benyehuda.drama.29112022.tsv', sep='\t')

In [None]:
# Show the loaded DataFrame as the cell output.
table

In [None]:
# Inspect the 'html' column, which parse_play() reads as each row's markup.
table['html']

In [None]:
# List the available column names.
table.columns

In [None]:
# Print pandas' summary of the DataFrame (columns, dtypes, non-null counts).
table.info()

In [None]:
# Bar chart of how many rows carry each distinct 'AuthorWikidata' value.
# The trailing semicolon suppresses the textual repr of the returned plot
# object in the notebook output.
table['AuthorWikidata'].value_counts().plot.bar(figsize=(15,8));

In [None]:
# Same per-'AuthorWikidata' row counts as above, drawn as a pie chart.
table['AuthorWikidata'].value_counts().plot.pie(figsize=(10,10));

In [None]:
# Cast 'orig_publication_date' to datetime64[ns] so the .dt accessor used in
# the following cells is available.
# NOTE(review): astype raises on values it cannot convert - confirm the column
# is clean, or consider pd.to_datetime(..., errors='coerce').
table = table.astype({'orig_publication_date': 'datetime64[ns]'})

In [None]:
# Show the year component of each original publication date.
table['orig_publication_date'].dt.year

In [None]:
# Histogram (20 bins) of original publication years.
table['orig_publication_date'].dt.year.plot.hist(figsize=(15,8), bins=20);

## TEI creation functions

In [None]:
def parse_table(table):
    """Convert every row of ``table`` by passing it to ``parse_play``.

    The rows are visited with a row-wise ``DataFrame.apply``; the value that
    ``apply`` produces is discarded, so this function returns ``None`` and is
    used purely for ``parse_play``'s side effects.
    """
    row_wise = 1  # axis=1: hand each row (not each column) to the callback
    table.apply(parse_play, axis=row_wise)

In [None]:
def parse_play(row):
    """Convert one table row into a TEI-like XML file on disk.

    Reads the markup from ``row['html']``, builds an empty TEI skeleton,
    fills its body from the ``<p>`` elements of the HTML ``<body>`` and writes
    the prettified result to ``../tei/<TextID>.xml``.

    Args:
        row: A mapping-like row (e.g. a pandas Series) providing the
            ``'html'`` and ``'TextID'`` entries.

    Returns:
        None. The only result is the file written to disk.
    """
    html = row['html']
    # NOTE(review): no parser is named, so bs4 picks whichever is installed
    # and results may differ between machines - consider pinning one.
    soup = BeautifulSoup(html)
    html_body = soup.find('body')
    # The intermediate <text> element is returned by the stub but not needed here.
    tei_tree, _tei_text, tei_body = generate_tei_stub(soup)
    populate_the_tei_body(tei_body, html_body, soup)
    new_file_path = f'../tei/{row["TextID"]}.xml'
    # Write UTF-8 explicitly: the platform default encoding (e.g. cp1252 on
    # Windows) cannot represent arbitrary non-ASCII play text and would either
    # raise UnicodeEncodeError or produce XML in an unexpected encoding.
    with open(new_file_path, 'w', encoding='utf-8') as output:
        output.write(tei_tree.prettify())

In [None]:
def generate_tei_stub(soup):
    """Build an empty ``TEI > text > body`` element skeleton.

    All three elements are created through ``soup.new_tag`` and then nested
    inside one another.

    Args:
        soup: Object providing ``new_tag(name)`` (a BeautifulSoup instance in
            this notebook).

    Returns:
        The tuple ``(tei, text, body)`` of the newly created elements.
    """
    tei_root, text_el, body_el = (
        soup.new_tag(tag_name) for tag_name in ("TEI", "text", "body")
    )
    text_el.append(body_el)
    tei_root.append(text_el)
    return tei_root, text_el, body_el

In [None]:
def populate_the_tei_body(tei_body, html_body, soup):
    """Append one converted element to ``tei_body`` per ``<p>`` in ``html_body``.

    Every ``<p>`` found by ``html_body.find_all('p')`` is passed to
    ``parse_p`` and the element it returns is appended to ``tei_body`` in
    document order.

    Args:
        tei_body: Target element that receives the converted paragraphs.
        html_body: Source element searched for ``<p>`` descendants.
        soup: Passed through to ``parse_p`` for creating new tags.
    """
    for paragraph in html_body.find_all('p'):
        tei_body.append(parse_p(paragraph, soup))

In [None]:
def populate_the_tei_header(tei_header, metadata):
    """Placeholder for enriching the teiHeader with metadata from the CSV.

    Not implemented yet: both arguments are ignored and ``None`` is returned.
    """
    # TODO: select the relevant row from ``metadata`` (e.g. metadata.loc[...])
    # and copy its fields into ``tei_header``.
    return None
    

In [None]:
def salvage_speaker_regex(newsp, soup):
    """Recover a ``Name: speech`` speaker from the plain text of ``newsp``.

    Fallback for paragraphs in which no speaker markup was found. The
    element's text is read, its children are cleared, and the content is
    rebuilt as follows:

    * The text is split at the FIRST colon. The part of that line preceding
      the colon becomes a ``<speaker id=...>`` element and everything after
      the colon is appended as the speech.
    * Any complete lines preceding the speaker line are re-appended as text in
      front of the ``<speaker>`` element.
    * If there is no colon, or the name or the speech is blank, the original
      text is appended unchanged.

    The previous greedy pattern ``(.+):(.+)`` took the speaker name up to the
    last colon of a line and, for multi-line text, silently discarded every
    line except the one that matched. A plain split keeps all of the text.
    (The function name is kept for callers even though no regular expression
    is used any more.)

    Args:
        newsp: Element providing ``.text``, ``.clear()`` and ``.append()``;
            modified in place.
        soup: Object providing ``new_tag(name, **attrs)``.

    Returns:
        None.
    """
    text = newsp.text
    newsp.clear()
    head, colon, speech = text.partition(":")
    # Only the last line before the colon is the speaker name; earlier lines
    # are kept as leading text rather than being folded into the name.
    leading, newline, speaker_name = head.rpartition("\n")
    if not (colon and speaker_name.strip() and speech.strip()):
        newsp.append(text)
        return
    if leading or newline:
        newsp.append(leading + newline)
    newspeaker = soup.new_tag("speaker", id=speaker_name)
    newspeaker.append(speaker_name)
    newsp.append(newspeaker)
    newsp.append(speech)

In [None]:
def parse_p(item, soup):
    """Convert one HTML ``<p>`` element into an ``<sp>`` (speech) element.

    Each direct child of ``item`` is handled in order:

    * a ``<strong>`` child is treated as the speaker name and becomes a
      ``<speaker id=...>`` element containing that name;
    * any other child is moved into the new ``<sp>`` element as is.

    If no ``<strong>`` child was seen, ``salvage_speaker_regex`` is asked to
    recover a speaker from the element's plain text instead.

    Args:
        item: The source ``<p>`` element; iterating it yields its children.
            Non-``<strong>`` children are moved out of it.
        soup: Object providing ``new_tag(name, **attrs)``.

    Returns:
        The newly created ``<sp>`` element.
    """
    newsp = soup.new_tag("sp")
    found_bold = False
    # Iterate over a snapshot of the children: appending a child to ``newsp``
    # detaches it from ``item``, and mutating the live child list while
    # iterating over it makes the loop skip the sibling after each moved node.
    for child in list(item):
        if child.name == 'strong':
            found_bold = True
            speaker_name = child.text
            newspeaker = soup.new_tag("speaker", id=speaker_name)
            newspeaker.append(speaker_name)
            newsp.append(newspeaker)
        else:
            newsp.append(child)
    if not found_bold:
        salvage_speaker_regex(newsp, soup)
    return newsp

## Use

In [None]:
# Run the conversion for every row of the table; parse_play() writes one
# '../tei/<TextID>.xml' file per row.
parse_table(table)