In [1]:
import gzip
import pandas as pd
from tqdm import tqdm
import xml.etree.ElementTree as ET

In [2]:
# Load the plot summaries from a tab-separated file. Column names are supplied
# explicitly, so the first line of the file is read as data rather than as a header.
PLOT_SUMMARY_COLUMNS = ['id', 'plot']
movie_plots = pd.read_csv('MovieSummaries/plot_summaries.txt', sep='\t', names=PLOT_SUMMARY_COLUMNS)

In [3]:
# Dependency `type` labels used to select and classify dependencies.
# They are kept as lists because generate_dependencies concatenates them with `+`.

# Types for which the dependent can be tagged as the 'agent' of its governor.
agent_verbs_args = [
    'nsubj',
    'agent',
]

# Types for which the dependent is tagged as the 'patient' of its governor.
# 'prep_' is a marker rather than a full label: generate_person_rw_couples
# tests it with a substring check ('prep_' in typ).
patient_verbs_args = [
    'dobj',
    'nsubjpass',
    'iobj',
    'prep_',
]

# Types for which an 'attribute' pair is also recorded with the dependent as key.
attributes_governors_args = [
    'nsubj',
    'appos',
]

# Types that yield the 'attribute' role (pair recorded with the governor as key).
attributes_dependents_args = [
    'nsubj',
    'appos',
    'amod',
    'nn',
]

In [4]:
def display_mention(mention, root):
    """Print the words covered by a coreference mention, then its head word.

    Parameters
    ----------
    mention : xml.etree.ElementTree.Element
        A mention element with `sentence`, `start`, `end` and `head` children.
        `end` is treated as exclusive. An optional `representative="true"`
        attribute marks the representative mention.
    root : xml.etree.ElementTree.Element
        Document root; tokens are looked up under
        ./document/sentences/sentence[@id]/tokens/token.

    Prints the mention's words on one line and the head word on the next.
    If the head index is not inside [start, end), the head word is printed as None.
    """
    # .get() avoids a KeyError when the mention carries attributes other than 'representative'.
    if mention.attrib.get('representative') == 'true':
        print('This is the representative mention')
    sentence_id = mention.find('sentence').text
    start = int(mention.find('start').text)
    end = int(mention.find('end').text)
    head = int(mention.find('head').text)
    # Initialised up front so the final print cannot hit an unbound name
    # when no token in the span matches the head index.
    head_word = None
    for element in root.findall('./document/sentences/sentence/[@id="'+sentence_id+'"]/tokens/token'):
        token_id = int(element.attrib['id'])
        if start <= token_id < end:
            word = element.find('word').text
            if token_id == head:
                head_word = word
            print(word, end=" ")
    print(" ")
    print("Head word is: ", head_word)

In [5]:
def generate_dependencies(root):
    """Collect the dependencies whose type is relevant for character words.

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Document root; dependencies are read from
        ./document/sentences/sentence/collapsed-ccprocessed-dependencies/dep.

    Returns
    -------
    list of (int, int, int, str)
        (sentence id, dependent idx, governor idx, dependency type), in document order.

    A dependency is kept when its type is listed in one of the module-level
    type lists, or when it starts with 'prep_'. The 'prep_' entry of
    patient_verbs_args is a prefix marker: an exact-membership test alone would
    never match types such as 'prep_to', and generate_person_rw_couples (which
    checks `'prep_' in typ`) would never see them.
    """
    # Built once per call instead of re-concatenating four lists for every <dep>.
    wanted_types = set(
        agent_verbs_args + patient_verbs_args + attributes_governors_args + attributes_dependents_args
    )
    dependencies = []
    for sentence in root.findall("./document/sentences/sentence"):
        for dependency in sentence.findall("./collapsed-ccprocessed-dependencies/dep"):
            dep_type = dependency.attrib['type']
            if dep_type in wanted_types or dep_type.startswith('prep_'):
                dependencies.append((
                    int(sentence.attrib['id']),
                    int(dependency.find('dependent').attrib['idx']),
                    int(dependency.find('governor').attrib['idx']),
                    dep_type,
                ))
    return dependencies

In [6]:
def _index_tokens(root):
    """Map (sentence id, token id) attribute strings to the first matching token element."""
    tokens = {}
    for sentence in root.findall("./document/sentences/sentence"):
        sentence_id = sentence.attrib.get('id')
        for token in sentence.findall("./tokens/token"):
            tokens.setdefault((sentence_id, token.attrib.get('id')), token)
    return tokens


def _index_representatives(root):
    """Map each mention's (sentence, head) to the representative (head, sentence) of every chain containing it."""
    representatives = {}
    for coref in root.findall("./document/coreference/coreference"):
        rep_head = coref.find("./mention/[@representative='true']/head")
        rep_sentence = coref.find("./mention/[@representative='true']/sentence")
        if rep_head is None or rep_sentence is None:
            # A chain without a representative mention cannot be resolved.
            continue
        representative = (int(rep_head.text), int(rep_sentence.text))
        seen = set()
        for mention in coref.findall("./mention"):
            key = (int(mention.find('./sentence').text), int(mention.find('./head').text))
            # A chain contributes its representative at most once per (sentence, head).
            if key not in seen:
                seen.add(key)
                representatives.setdefault(key, []).append(representative)
    return representatives


def generate_person_rw_couples(dependencies, people, root):
    """Turn dependencies into (person, (role, word)) couples.

    Parameters
    ----------
    dependencies : iterable of (int, int, int, str)
        (sentence id, dependent idx, governor idx, type), as returned by
        generate_dependencies.
    people : iterable of str
        Names to keep; a couple is returned only when its first element is one of them.
    root : xml.etree.ElementTree.Element
        Document root providing ./document/sentences and ./document/coreference.

    Returns
    -------
    list of (str, (str, str))
        (lemma, (role, lemma)) with role in {'agent', 'patient', 'attribute'}.

    Notes
    -----
    * Dependent and governor are replaced by the head of their representative
      coreference mention when exactly one distinct representative exists.
    * The "governor is a verb" check uses the dependency's own governor in its
      own sentence. It tests `is not None` instead of the element's truth value:
      an Element without children is falsy, so a truth test on a <POS> element
      always fails and the 'agent' role is never produced.
    * Couples whose token or lemma cannot be resolved are skipped.
    """
    # Both indexes are built once, so each dependency costs dictionary lookups
    # instead of XPath scans over the whole document.
    tokens = _index_tokens(root)
    representatives = _index_representatives(root)

    def child_text(sentence_id, token_id, tag):
        token = tokens.get((str(sentence_id), str(token_id)))
        if token is None:
            return None
        child = token.find(tag)
        return None if child is None else child.text

    rw_bag = []
    for (s, dep, gov, typ) in dependencies:
        # Keep the syntactic governor: the POS check must not mix sentence `s`
        # with an index taken from a representative mention in another sentence.
        syntactic_gov = gov
        dep_sentence = s
        gov_sentence = s

        # Replace dep and gov by the head of the representative mention, if unambiguous.
        dep_heads = set(representatives.get((s, dep), ()))
        gov_heads = set(representatives.get((s, gov), ()))
        if len(dep_heads) == 1:
            (dep, dep_sentence), = dep_heads
        if len(gov_heads) == 1:
            (gov, gov_sentence), = gov_heads

        # Decide the role carried by this dependency.
        if (typ in patient_verbs_args) or ('prep_' in typ):
            r = 'patient'
        elif (typ in agent_verbs_args) and ("VB" in (child_text(s, syntactic_gov, 'POS') or '')):
            r = 'agent'
        elif typ in attributes_dependents_args:
            r = 'attribute'
        else:
            continue

        dep_lemma = child_text(dep_sentence, dep, 'lemma')
        gov_lemma = child_text(gov_sentence, gov, 'lemma')
        if dep_lemma is None or gov_lemma is None:
            continue

        # Append bag of (r, w)
        if (r in ('agent', 'patient')) or (typ in attributes_governors_args):
            rw_bag.append((dep_lemma, (r, gov_lemma)))
        if r == 'attribute':
            rw_bag.append((gov_lemma, (r, dep_lemma)))

    # Return (r, w) couples related to people
    known_people = set(people)
    return [couple for couple in rw_bag if couple[0] in known_people]

In [7]:
# For every movie id, parse its CoreNLP XML file and collect, for each token
# tagged PERSON, the distinct (role, word) couples attached to that name.
# Result: words_per_movie is a list of (movie id, [(person, [(role, word), ...]), ...]).
words_per_movie = []
for j in tqdm(movie_plots['id']):
    with gzip.open('corenlp_plot_summaries/' + str(j) + '.xml.gz', 'rt', encoding='utf-8') as f:
        root = ET.fromstring(f.read())

    # dict.fromkeys de-duplicates while keeping first-seen order, so the output
    # is the same on every run (set iteration order for strings is not).
    # findtext() is used so that a token without an <NER> child is ignored, not a crash.
    characters = list(dict.fromkeys(
        token.find('word').text
        for token in root.findall("./document/sentences/sentence/tokens/token")
        if token.findtext('NER') == 'PERSON'
    ))

    dependencies = generate_dependencies(root)
    rw_list = generate_person_rw_couples(dependencies, characters, root)

    # Group the couples by person in a single pass; inner dict keys act as an ordered set.
    rw_by_person = {}
    for person, rw in rw_list:
        rw_by_person.setdefault(person, {})[rw] = None

    # Keep only the characters that have at least one couple.
    words_per_character = [
        (person, list(rw_by_person[person]))
        for person in characters
        if person in rw_by_person
    ]
    words_per_movie.append((j, words_per_character))

42303it [1:11:03,  9.92it/s]


In [8]:
# print('The total number of characters is: {}.'.format(len(words_per_character)))

In [9]:
# words_per_character[:5]

In [11]:
# Serialise words_per_movie as XML:
#   <root><document><movie id=...><character name=...><ROLE>word</ROLE>...
# where each ROLE element is named after the role string of the couple.
root = ET.Element("root")
doc = ET.SubElement(root, "document")

for movie_id, movie_characters in words_per_movie:
    movie_node = ET.SubElement(doc, "movie", id=str(movie_id))
    for character_name, role_word_pairs in movie_characters:
        character_node = ET.SubElement(movie_node, "character", name=character_name)
        for role, word in role_word_pairs:
            word_node = ET.SubElement(character_node, role)
            word_node.text = word

tree = ET.ElementTree(root)
tree.write("generated_character_words_per_movie.xml")

In [12]:
import lxml.etree
import lxml.builder    

# Pretty-printed export: one <character> element per (movie, character) with
# <name>, <movie_id> and one <word name=ROLE>word</word> child per couple.
E = lxml.builder.ElementMaker()
ROOT = E.root
DOC = E.document
CHARACTER = E.character
WORD = E.word

# NOTE(review): `doc` is the element returned by ROOT(...), so the characters
# appended below become siblings of <document> rather than its children.
# Left unchanged in case readers of the file rely on it; confirm.
doc = ROOT( DOC() ) 

for movie_id, words_per_character in words_per_movie:
    for character, character_words in words_per_character:
        ch = E.character(
            E.name(character),
            E.movie_id(str(movie_id)),
            *(E.word(w, name=r) for (r,w) in character_words)
        )
        doc.append(ch)

# The context manager closes the file even if serialisation or the write raises.
with open("generated_character_words_prettyprint_per_movie.xml", "w") as pretty_output:
    pretty_output.write(lxml.etree.tostring(doc, pretty_print=True).decode())