This notebook takes a folder of plays in DBNL XML format as input and creates, as output, a folder of NAF files that contain the spoken text of these plays.

In [None]:
import sys
import os
import nltk
from nltk.tokenize import word_tokenize
import lxml.etree as etree
from KafNafParserPy import *
from collections import OrderedDict

In [None]:
def extract_text(xmlfile):
    '''
    Extracts the spoken text from a dbnl xml file representing a play

    Every <sp> element yields one entry. The speaker is read from the <speaker> child
    (from its first subelement if it has one) and is 'Unknown' if no name is found.
    The spoken text is collected from the <l> children: the text of <l> itself or,
    if <l> has no text of its own, the text of its <hi> subelements.

    :param xmlfile: xmlinput file
    :return: ordered dictionary with keys string n_speaker values spoken text
    '''

    my_xml = etree.parse(xmlfile)
    speaker_text = OrderedDict()

    #the running number keeps the keys unique when a speaker has several speeches
    for counter, speech in enumerate(my_xml.xpath('//sp'), start=1):
        speaker = 'Unknown'
        pieces = []
        for child in speech:
            if child.tag == 'speaker':
                #the name is either in the first subelement or in the element itself
                name = child[0].text if len(child) > 0 else child.text
                speaker = name if name is not None else 'Unknown'
            elif child.tag == 'l':
                #this file has a subelement with <hi> where the text is embedded. if child.text is None, check for child of child
                if child.text is not None:
                    pieces.append(child.text)
                    continue
                for gchild in child:
                    if gchild.tag != 'hi':
                        continue
                    if gchild.text is not None:
                        pieces.append(gchild.text)
                    elif len(gchild) > 0 and gchild[0].text is not None:
                        #text can be nested one level deeper; an empty nested element is skipped instead of raising a TypeError
                        pieces.append(gchild[0].text)
        speaker_text[str(counter) + "_" + speaker] = " ".join(pieces).lstrip(" ")

    return speaker_text

In [None]:

def turn_text_to_tokens(speaker_text):
    '''
    Tokenizes the spoken text of every speech with nltk's word_tokenize

    :param speaker_text: ordered dictionary with keys string n_speaker values spoken text
    :return: ordered dictionary with the same keys in the same order, values list of tokens
    '''

    return OrderedDict(
        (speech_id, word_tokenize(spoken))
        for speech_id, spoken in speaker_text.items()
    )

In [None]:

def _locate_token(text, token, start):
    '''
    Finds the position of a token in the text it was tokenized from

    :param text: original (untokenized) text
    :param token: token string as produced by the tokenizer
    :param start: position in text where the search starts
    :return: tuple (position of the token in text, number of characters of text it covers)
    '''
    pos = start
    text_len = len(text)
    #any amount of whitespace may separate two tokens
    while pos < text_len and text[pos].isspace():
        pos += 1
    if text.startswith(token, pos):
        return pos, len(token)
    #word_tokenize rewrites quotes: `` and '' can stand for " (or `` for '') in the text
    if token in ('``', "''"):
        if text.startswith('"', pos):
            return pos, 1
        if text.startswith("''", pos):
            return pos, 2
    #resynchronise on the next literal occurrence of the token
    found = text.find(token, pos)
    if found != -1:
        return found, len(token)
    #token does not occur literally: stay where we are and consume nothing,
    #so that the following tokens can still be located
    return pos, 0


def create_naf_file(text_dict, token_dict, outputfile):
    '''
    This function will create a naf file with a token layer

    Offsets are counted over all speeches together: the first token of a speech is
    placed directly after the last token of the previous speech. The length written
    for a token is the length of the token string.

    :param text_dict: ordered dictionary with keys string n_speaker values spoken text
    :param token_dict: ordered dictionary with keys string n_speaker values list of tokens
    :param outputfile: path of the naf file that is written
    :return: None
    '''
    Nafparser = KafNafParser(type = 'NAF')
    #NOTE(review): language is hard-coded to 'en'; confirm this matches the plays being processed
    Nafparser.set_language('en')

    my_lp = Nafparser.create_linguistic_processor('text', 'nltk', nltk.__version__)
    Nafparser.add_linguistic_processor('text', my_lp)

    index = 0
    counter = 0
    sent = 1

    for k, v in token_dict.items():
        text = text_dict.get(k)
        if text is None:
            print('ERROR: orginal text not found for:', k)
            continue
        span = []
        #offset of this speech in the overall text, and position within the speech
        speech_start = index
        cursor = 0
        for token in v:
            counter += 1
            position, consumed = _locate_token(text, token, cursor)
            cursor = position + consumed
            offset = str(speech_start + position)
            wid = 'w' + str(counter)
            Nafparser.create_wf(token, str(sent), offset, wid, str(len(token)))
            span.append(wid)
            #only a full stop starts a new sentence
            if token == ".":
                sent += 1
        index = speech_start + cursor
        #TODO: once all tokens are added, add markable identifying the speaker (span holds the token ids)

    Nafparser.dump(outputfile)


In [None]:

def clean_xml(xmlfile):
    '''
    Writes a copy of an xml file in which every "&nbsp;" is replaced by "nbsp"

    The copy is stored next to the original, with "-cleand" inserted before the
    file extension (play.xml becomes play-cleand.xml). The original is not changed.
    NOTE(review): presumably the entity is removed because the xml parser rejects it;
    the replacement leaves the literal string "nbsp" in the text - confirm this is intended.

    :param xmlfile: path to the xml file
    :return: path of the cleaned copy
    '''
    #only the extension is touched: a ".xml" elsewhere in the path stays as it is, and
    #a path without ".xml" no longer maps onto itself (which would truncate the input)
    root, extension = os.path.splitext(xmlfile)
    output = root + "-cleand" + extension

    with open(xmlfile, "r") as myfile, open(output, "w") as outfile:
        for line in myfile:
            outfile.write(line.replace("&nbsp;", "nbsp"))
    return output

In [None]:
def from_xml_to_naf(xmlfile, outputfile):
    '''
    Converts one dbnl xml file into a naf file with the spoken text of the play

    The xml is cleaned first, then the spoken text is extracted and tokenized and
    the tokens are written to the naf file. No naf file is written if the play
    contains no speeches.

    :param xmlfile: path to the dbnl xml file
    :param outputfile: path of the naf file to create
    :return: None
    '''

    speeches = extract_text(clean_xml(xmlfile))
    if not speeches:
        #nothing to write for a file without speeches
        return
    create_naf_file(speeches, turn_text_to_tokens(speeches), outputfile)

In [None]:
#provide paths from the input directory to the output directory
inputdir = '../../dbnl_input_test/'
outputdir = '../../naf_input_test/'

#create outputdir if it does not exist yet, including missing parent folders
os.makedirs(outputdir, exist_ok=True)

#sorted so that the files are processed in the same order on every run
for filename in sorted(os.listdir(inputdir)):
    #assumes all dbnl files end with .xml, adjust if there are other endings
    if not filename.endswith(".xml"):
        continue
    #clean_xml stores its "-cleand.xml" copies in inputdir; skip them so that
    #running this cell again does not convert the copies as well
    if filename.endswith("-cleand.xml"):
        continue
    inputfile = os.path.join(inputdir, filename)
    print(inputfile)
    outputfile = os.path.join(outputdir, filename)
    from_xml_to_naf(inputfile, outputfile)