## imports

In [82]:
import sys
import re
#from transliterate import translit
from bs4 import BeautifulSoup, Tag
from datetime import datetime

In [83]:
#from translitua import translit, UkrainianISO9

In [84]:
import os

## functions

In [85]:
def parse_file(fpath):
    """Convert one ezdrama file into an XML file written next to it.

    The input lines are turned into a tree by ``parse_lines``, enriched by
    ``post_process``, serialised with all newlines removed, passed through
    ``add_spaces_inline_stages`` and written to ``fpath`` with its trailing
    ``.txt``/``.md`` extension replaced by ``.xml``.

    Args:
        fpath: path of the ezdrama source; must end in ``.txt`` or ``.md``.

    Raises:
        ValueError: if ``fpath`` does not end in ``.txt``/``.md``; the output
            path would then equal the input path and overwrite the source.
    """
    # Anchor on the end of the path so only the extension is rewritten,
    # not a '.txt'/'.md' that happens to occur inside a directory name.
    out_path = re.sub(r'\.(txt|md)$', '.xml', fpath)
    if out_path == fpath:
        raise ValueError(
            f'expected a .txt or .md file, got {fpath!r}; '
            'refusing to overwrite the input with the XML output'
        )
    # Explicit UTF-8: the plays contain non-ASCII text and must not depend on
    # the platform's default locale encoding.
    with open(fpath, encoding='utf-8') as openfile:
        file_lines = openfile.readlines()
    soup = post_process(parse_lines(file_lines))
    tree_to_write = str(soup).replace('\n', '')
    tree_to_write = add_spaces_inline_stages(tree_to_write)
    with open(out_path, 'w', encoding='utf-8') as outfile:
        outfile.write(tree_to_write)

In [86]:
def add_spaces_inline_stages(tree_as_string):
    """Separate ``<stage>`` elements from directly adjacent non-space characters.

    A single space is inserted after every ``</stage>`` that is immediately
    followed by a non-whitespace character, and before every ``<stage>`` that
    is immediately preceded by one. Returns the modified string.
    """
    after_closing_tag = re.compile(r'</stage>(\S)')
    before_opening_tag = re.compile(r'(\S)<stage>')

    padded_after = after_closing_tag.sub(
        lambda match: '</stage> ' + match.group(1), tree_as_string
    )
    return before_opening_tag.sub(
        lambda match: match.group(1) + ' <stage>', padded_after
    )

In [87]:
def add_pbstmt(filedesc):
    """Append a fixed DraCor ``<publicationStmt>`` to ``filedesc``.

    The statement (publisher DraCor, its URL and a CC0 1.0 licence) is parsed
    from a template with the ``'xml'`` parser and appended to ``filedesc``.
    """
    publication_xml = """
      <publicationStmt>
        <publisher xml:id="dracor">DraCor</publisher>
        <idno type="URL">https://dracor.org</idno>
        <availability>
          <licence>
            <ab>CC0 1.0</ab>
            <ref target="https://creativecommons.org/publicdomain/zero/1.0/">Licence</ref>
          </licence>
        </availability>
      </publicationStmt>
    """
    parsed_template = BeautifulSoup(publication_xml, 'xml')
    filedesc.append(parsed_template.publicationStmt)

In [88]:
def add_sourcedesc(filedesc):
    """Append a placeholder ``<sourceDesc>`` to ``filedesc``.

    The template contains a digital-source ``<bibl>`` whose name and URL are
    literal "ENTER ... HERE" placeholders to be filled in by hand, plus a
    public-domain availability note.
    """
    source_xml = """
      <sourceDesc>
        <bibl type="digitalSource">
          <name>ENTER SOURCE NAME HERE</name>
          <idno type="URL">ENTER SOURCE URL HERE</idno>
          <availability status="free">
            <p>In the public domain.</p>
          </availability>
        </bibl>
      </sourceDesc>
    """
    parsed_template = BeautifulSoup(source_xml, 'xml')
    filedesc.append(parsed_template.sourceDesc)

In [89]:
def add_rev_desc(header):
    """Append a ``<revisionDesc>`` with one placeholder change to ``header``.

    The single ``<change>`` is dated with today's date (``YYYY-MM-DD``) and
    carries the literal text "DESCRIBE CHANGE" to be replaced by hand.
    """
    revision_xml = f"""
    <revisionDesc>
         <listChange>
        <change when="{datetime.today().strftime('%Y-%m-%d')}">DESCRIBE CHANGE</change>
        </listChange>
    </revisionDesc>"""
    parsed_template = BeautifulSoup(revision_xml, 'xml')
    header.append(parsed_template.revisionDesc)

In [90]:
def create_header():
    """Build and return the skeleton ``<teiHeader>``.

    The header holds one ``<fileDesc>`` containing, in order, an empty
    ``<titleStmt>``, the publication statement and the source description.
    """
    file_desc = Tag(name='fileDesc')
    file_desc.append(Tag(name='titleStmt'))
    add_pbstmt(file_desc)
    add_sourcedesc(file_desc)

    tei_header = Tag(name='teiHeader')
    tei_header.append(file_desc)
    return tei_header

In [91]:
def get_div_level(line):
    """Return the div nesting level encoded by the leading ``#`` characters.

    ``line`` is the heading line with its first ``#`` already removed, so the
    result is 1 plus the number of ``#`` characters at the start of ``line``
    (level 0 is reserved for ``<body>`` in this model).
    """
    remaining_hashes = len(line) - len(line.lstrip('#'))
    return 1 + remaining_hashes

In [92]:
def handle_line_with_markup(first_character, rest_of_line, current_lowest_tag, current_lowest_div):
    '''Handle a line that starts with one of the ezdrama markup symbols.

    Supported symbols:
      ``$``  stage direction; becomes the new lowest tag
      ``(``  stage direction; the lowest tag is left unchanged
      ``@``  speech (``<sp>``); becomes the new lowest tag
      ``^``  cast list (``<castList>``); becomes the new lowest tag
      ``#``  new ``<div>`` with a ``<head>``; becomes both the lowest tag and
             the lowest div

    Args:
        first_character: the markup symbol the line started with.
        rest_of_line: the line without that first character.
        current_lowest_tag: element that currently receives unmarked lines.
        current_lowest_div: innermost open ``<div>`` (or ``<body>``); it must
            carry an integer ``level`` attribute.

    Returns:
        ``(current_lowest_tag, current_lowest_div)`` after handling the line.
        Both are returned unchanged for an unrecognised symbol.
    '''
    if first_character == '$':
        new_stage = Tag(name='stage')
        new_stage.append(rest_of_line.strip())
        current_lowest_div.append(new_stage)
        # If this assignment is disabled, the stage stops being multi-line and
        # no longer swallows the unmarked lines that follow it.
        current_lowest_tag = new_stage
    elif first_character == '(':
        new_stage = Tag(name='stage')
        new_stage.append(first_character)
        new_stage.append(rest_of_line.strip())
        current_lowest_div.append(new_stage)
    elif first_character == '@':
        new_sp = Tag(name='sp')
        new_sp.append(rest_of_line)
        current_lowest_div.append(new_sp)
        current_lowest_tag = new_sp
    elif first_character == '^':
        new_cl = Tag(name='castList')
        new_cl.append(rest_of_line)
        current_lowest_div.append(new_cl)
        current_lowest_tag = new_cl
    elif first_character == '#':
        new_div = Tag(name='div')
        head = Tag(name='head')
        head.append(rest_of_line.strip('#'))
        new_div['level'] = get_div_level(rest_of_line)
        new_div.append(head)
        # Climb from the innermost open div until an ancestor with a strictly
        # lower level is found. This gives the same parent as the previous
        # fixed ".parent" / ".parent.parent" steps for regular nesting, and
        # also works when several levels close at once or a level was skipped.
        parent = current_lowest_div
        while (parent.parent is not None
               and parent.get('level') is not None
               and parent['level'] >= new_div['level']):
            parent = parent.parent
        parent.append(new_div)
        current_lowest_div = new_div
        current_lowest_tag = new_div
    return current_lowest_tag, current_lowest_div

In [93]:
def add_author_to_header(header, line):
    """Add an ``<author>`` element to the header's ``<titleStmt>``.

    Args:
        header: the ``<teiHeader>`` element containing a ``<titleStmt>``.
        line: the ``@author ...`` line; everything after the keyword is used
            as the author name.
    """
    title_stmt = header.find('titleStmt')
    author = Tag(name='author')
    # '@author' is 7 characters long, so the remainder still starts with the
    # separator; strip it so the name carries no leading whitespace.
    author.append(line[len('@author'):].strip())
    title_stmt.append(author)

In [94]:
def add_title_to_header(header, line):
    """Add a ``<title type="main">`` to the header's ``<titleStmt>``.

    The title text is ``line`` from index 7 onwards, i.e. the ``@title``
    keyword and the one character following it are dropped.
    """
    main_title = Tag(name='title')
    main_title['type'] = 'main'
    main_title.append(line[7:])
    header.find('titleStmt').append(main_title)

In [95]:
def add_subtitle_to_header(header, line):
    """Add a ``<title type="sub">`` to the header's ``<titleStmt>``.

    The subtitle text is ``line`` from index 10 onwards, i.e. the
    ``@subtitle`` keyword and the one character following it are dropped.
    """
    sub_title = Tag(name='title')
    sub_title['type'] = 'sub'
    sub_title.append(line[10:])
    header.find('titleStmt').append(sub_title)

In [96]:
def add_standoff(tei):
    """Append a placeholder ``<standOff>`` block to ``tei``.

    The block lists three events (print, premiere, written), each dated with
    the current year as a stand-in value, and a wikidata relation whose
    ``active``/``passive`` attributes are literal "INSERT" placeholders.
    """
    # Despite its name, this holds only the four-digit current year.
    today = datetime.today().strftime('%Y')
    standoff_xml = f'''
    <standOff>
        <listEvent>
        <event type="print" when="{today}">
        <desc/>
        </event>
        <event type="premiere" when="{today}">
        <desc/>
        </event>
        <event type="written" when="{today}">
        <desc/>
        </event>
        </listEvent>
        <listRelation>
        <relation name="wikidata" active="INSERT" passive="INSERT"/>
        </listRelation>
    </standOff>
    '''
    parsed_template = BeautifulSoup(standoff_xml, 'xml')
    tei.append(parsed_template.standOff)

In [97]:
def parse_lines(file_lines):
    '''Parse the lines of an ezdrama file into a TEI tree and return its root.

    At this stage only the following is identified:
    -- the basic <div> structure
    -- <stage> directions outside of speeches
    -- <sp> elements without internal markup
    Lines starting with @author, @title or @subtitle go to the header; every
    other line either opens a new element (when it starts with a markup
    symbol) or is appended as text to the current lowest tag.
    '''
    markup_symbols = '@$^#('
    header_directives = (
        ('@author', add_author_to_header),
        ('@title', add_title_to_header),
        ('@subtitle', add_subtitle_to_header),
    )

    tei_root = Tag(name='TEI')
    header = create_header()
    tei_root.append(header)
    add_standoff(tei_root)
    body = Tag(name='body')
    text = Tag(name='text')
    text.append(body)
    tei_root.append(text)

    # <body> acts as the level-0 div and initially receives all content.
    body['level'] = 0
    lowest_tag = body
    lowest_div = body

    for line in file_lines:
        for prefix, add_to_header in header_directives:
            if line.startswith(prefix):
                add_to_header(header, line.strip())
                break
        else:
            # Split off the first character (a possible markup symbol)
            # from the rest of the line.
            symbol, remainder = line[:1], line[1:]
            if symbol in markup_symbols:
                lowest_tag, lowest_div = handle_line_with_markup(
                    symbol, remainder, lowest_tag, lowest_div
                )
            else:
                lowest_tag.append(line)
    return tei_root

In [98]:
def check_prose(first_line_of_speech):
    """Return ``False`` if the line starts with the ``~`` marker, else ``True``."""
    return not first_line_of_speech.startswith('~')

In [99]:
def clean_after_translit(line):
    """Normalise a transliterated name and return it.

    Applied in order: leftover Cyrillic letters are mapped to Latin ones,
    apostrophes, guillemets and the prime sign are removed, and spaces are
    replaced by underscores.
    """
    ordered_replacements = (
        ('і', 'i'),
        ('ї', 'i'),
        ('і', 'i'),
        ('є', 'e'),
        ("'", ""),
        ("’", ""),
        ("«", ""),
        ("»", ""),
        ("′", ""),
        (" ", "_"),
    )
    for old, new in ordered_replacements:
        line = line.replace(old, new)
    return line

In [100]:
def add_cast_items(soup):
    """Split the raw text of the first ``<castList>`` into head and items.

    The first line of the cast list text becomes its ``<head>``; every
    following non-blank line becomes one ``<castItem>``. Does nothing when the
    tree contains no ``<castList>`` (a play without a ``^`` line).

    Args:
        soup: root of the tree produced by ``parse_lines``; modified in place.
    """
    castList = soup.find('castList')
    if castList is None:
        # No cast list in this play; nothing to restructure.
        return
    head_line, *item_lines = castList.text.split('\n')
    castList.clear()
    # first line is head
    castHead = Tag(name='head')
    castHead.append(head_line)
    castList.append(castHead)
    # next lines -- castItems
    for line in item_lines:
        # The text ends with a newline, so splitting always leaves at least
        # one empty string; skip blank lines instead of emitting empty items.
        if not line.strip():
            continue
        castItem = Tag(name='castItem')
        castItem.append(line)
        castList.append(castItem)

In [101]:
def post_process_sp(sp):
    """Turn the raw text of one ``<sp>`` into speaker, stage and speech markup.

    The first text line is the speaker; a parenthesised part of it becomes a
    ``<stage>`` inside the ``<sp>``. The remaining lines form the speech: prose
    goes into a ``<p>``, verse into ``<lg>``/``<l>``. A line starting with
    ``~`` starts a new ``<lg>``, and parenthesised text inside a line becomes
    an inline ``<stage>``. Finally ``sp['who']`` is set to ``#`` plus a
    lower-cased ID derived from the speaker name.

    Args:
        sp: the ``<sp>`` element holding only text; rebuilt in place.
    """
    text_of_sp = sp.text
    sp.clear()

    text_split_in_lines = text_of_sp.split('\n')
    first_line = text_split_in_lines[0]
    speaker = Tag(name='speaker')
    sp.append(speaker)
    # Raw string: '\(' in a normal string literal is an invalid escape sequence.
    check_stage = re.search(r'([^()]+)(\(.+?\))([.,:!;])?', first_line)
    if check_stage:
        speaker.append(check_stage.group(1).strip())
        inside_stage = Tag(name='stage')
        inside_stage.append(check_stage.group(2).strip())
        sp.append(inside_stage)

        ending_punct = check_stage.group(3)
        if ending_punct is not None:
            speaker.append(ending_punct)
    else:
        speaker.append(first_line.strip())

    # A speaker line without a trailing newline (e.g. the last line of a file)
    # yields a single element; treat it like a speech with an empty first line.
    if len(text_split_in_lines) < 2:
        text_split_in_lines.append('')

    prose = check_prose(text_split_in_lines[1])
    if prose:
        speechtext = Tag(name='p')
    else:
        speechtext = Tag(name='lg')
        # Remove the marker so the first verse line does not open a second <lg>.
        text_split_in_lines[1] = text_split_in_lines[1].strip('~')
    for line in text_split_in_lines[1:]:
        if len(line) > 0:
            switch_to_poetry = not check_prose(line)
            check_inline_brackes = re.findall(r'([^()]*)(\(.+?\)[.,:!;]?)([^()]*)', line)
            if check_inline_brackes:
                # Each triplet is (text before, "(stage)" plus optional
                # punctuation, text after).
                for triplet in check_inline_brackes:
                    if len(triplet[0]) > 0:
                        if not prose:
                            poetic_line = Tag(name='l')
                            poetic_line.append(triplet[0].strip('~'))
                            speechtext.append(poetic_line)
                        else:
                            speechtext.append(triplet[0])
                    inside_stage = Tag(name='stage')
                    inside_stage.append(triplet[1].strip())
                    speechtext.append(inside_stage)
                    if len(triplet[2]) > 0:
                        if not prose:
                            poetic_line = Tag(name='l')
                            poetic_line.append(triplet[2])
                            speechtext.append(poetic_line)
                        else:
                            speechtext.append(triplet[2])
            else:
                if switch_to_poetry:
                    # A '~' line closes the current block and opens a new <lg>.
                    sp.append(speechtext)
                    speechtext = Tag(name='lg')
                    poetic_line = Tag(name='l')
                    poetic_line.append(line.strip('~'))
                    speechtext.append(poetic_line)
                    prose = False
                elif prose:
                    speechtext.append(line)
                else:
                    poetic_line = Tag(name='l')
                    poetic_line.append(line)
                    speechtext.append(poetic_line)

    sp.append(speechtext)

    speaker_name = speaker.text.strip('.,:!; ')
    if re.search('[йцукенгшщзхъфывапролджэячсмитью]', speaker.text.lower()):
        try:
            romanized = translit(speaker_name, UkrainianISO9)
        except NameError:
            # `translit` / `UkrainianISO9` are not in scope when the translitua
            # import is disabled (it is commented out in this notebook); build
            # the ID from the untransliterated name instead of failing.
            romanized = speaker_name
        clean_who = clean_after_translit(romanized).lower()
    else:
        clean_who = speaker_name.lower()
    sp['who'] = f'#{clean_who}'

In [102]:
def post_process(soup):
    """Finish the tree built by ``parse_lines`` and return it.

    Steps: structure the cast list, expand every ``<sp>``, replace the internal
    ``level`` attribute of each ``<div>`` by a ``type`` (level 1 is ``act``,
    level 2 is ``scene``), add the participant description and the revision
    description to the header, and set the TEI namespace on the root.

    Args:
        soup: root element returned by ``parse_lines``; modified in place.

    Returns:
        The same ``soup`` object.
    """
    set_of_char_pairs = set()  # set of (who ID, speaker string) pairs
    add_cast_items(soup)
    del soup.find('body')['level']
    for sp in soup.find_all('sp'):
        post_process_sp(sp)
        if 'who' in sp.attrs:
            set_of_char_pairs.add((sp['who'], sp.speaker.text.strip('. ')))

    div_type_by_level = {1: 'act', 2: 'scene'}
    for div in soup.find_all('div'):
        level = div['level']
        if level in (0, 1, 2):
            div.attrs = {}
            if level in div_type_by_level:
                div['type'] = div_type_by_level[level]
        else:
            # Deeper divs get no type, but the working `level` attribute
            # must not end up in the output either.
            del div['level']
    add_particdesc_to_header(soup, set_of_char_pairs)
    add_rev_desc(soup.teiHeader)

    soup['xmlns'] = "http://www.tei-c.org/ns/1.0"

    return soup

In [103]:
def add_particdesc_to_header(soup, set_of_char_pairs):
    """Append a ``<profileDesc>`` listing all speakers to the ``<teiHeader>``.

    Args:
        soup: tree containing a ``<teiHeader>`` element; modified in place.
        set_of_char_pairs: iterable of ``(who, name)`` string pairs; ``who`` is
            the speaker reference whose ``#`` characters are stripped to form
            the ``xml:id``, ``name`` is the text of ``<persName>``.
    """
    profileDesc = Tag(name='profileDesc')
    particDesc = Tag(name='particDesc')
    profileDesc.append(particDesc)
    listPerson = Tag(name='listPerson')
    particDesc.append(listPerson)
    # Sort the pairs: iterating a set of strings directly gives a different
    # order on every run, which makes the generated file non-reproducible.
    for who, name in sorted(set_of_char_pairs):
        person = Tag(name='person')
        person['xml:id'] = who.strip('#')
        persName = Tag(name='persName')
        person.append(persName)
        persName.append(name)
        listPerson.append(person)
    teiHeader = soup.find('teiHeader')
    teiHeader.append(profileDesc)

## Use

In [104]:
# ezdrama source file to convert (another sample is 'sample_text.txt').
path = 'sample.txt'
parse_file(path)

In [105]:
# Name of the XML file written by parse_file, which maps both .txt and .md
# inputs to .xml.
newfile = re.sub(r'\.(txt|md)$', '.xml', path)

## adding indentation

Take the DraCor indentation scheme from GitHub.

In [106]:
!wget https://raw.githubusercontent.com/dracor-org/gerdracor/main/format.conf

--2023-04-20 20:08:17--  https://raw.githubusercontent.com/dracor-org/gerdracor/main/format.conf
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8000::154, 2606:50c0:8001::154, 2606:50c0:8002::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8000::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 183 [text/plain]
Saving to: 'format.conf.3'


2023-04-20 20:08:18 (11.6 MB/s) - 'format.conf.3' saved [183/183]



#### indent with xml formatter

Preparation: install the formatter on your machine: http://www.kitebird.com/software/xmlformat/

If the link above doesn't work, try `brew install xmlformat` in a terminal, or use another package manager for your OS.

The following two commands are only needed when running in Google Colab.

In [107]:
 # Colab only: copy xmlformat.pl from the working directory to
 # /usr/local/bin/xmlformat. Fails if xmlformat.pl is not present.
 !cp xmlformat.pl /usr/local/bin/xmlformat

cp: xmlformat.pl: No such file or directory


In [108]:
!chmod 755 -R /usr/local/bin/xmlformat

chmod: -R: No such file or directory


Run the actual indentation:

In [109]:
# Run xmlformat with the downloaded configuration on the generated XML and
# redirect its output to "<name>_indented.xml"; evaluates to the exit status.
os.system(
    'xmlformat --config-file=format.conf "{source}" > "{stem}_indented.xml" '.format(
        source=newfile,
        stem=newfile.replace(".xml", ""),
    )
)

0

In [110]:
# Delete the unindented intermediate file. os.remove takes the path as-is,
# whereas interpolating it unquoted into a shell command breaks on spaces
# and other shell metacharacters.
if os.path.exists(newfile):
    os.remove(newfile)

0