In [1]:
import xml.etree.ElementTree as ET
import core2sent
import pandas as pd

In [3]:
FOLDER_PATH = "../data/corenlp_plot_summaries/"
PLOT_SUMMARY_PATH = "../data/plot_summaries.txt"

In [4]:
plot_summaries = pd.read_csv(PLOT_SUMMARY_PATH, sep='\t', header=None)
plot_summaries.columns = ['wiki_id', 'plot']
plot_summaries.head()

Unnamed: 0,wiki_id,plot
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...


In [37]:
def map_mentions_to_first_mention(coreferences, tokens_per_sentence):
    """
    :param coreferences: dict of coreferences
    :param tokens_per_sentence: list of tokens per sentence
    return: mapped sentences
    """
    # generate the token dict
    token_dict = {}
    for sentence_id, sentence in enumerate(tokens_per_sentence):
        token_dict[sentence_id] = []
        for token in sentence['tokens']:
            token_dict[sentence_id].append(token[0])

    # map the mentions to the first mention
    for mention_replacements in coreferences:
        for mention_id, mention in enumerate(mention_replacements['mentions']):
            sentence_id = int(mention['sentence'])
            start = int(mention['head'])
            end = int(mention['end'])
            print(sentence_id, start, end, ' '.join(token_dict[sentence_id][start:end]))
            if mention_id == 0:
                representative_mention_name = token_dict[sentence_id][start:end]
            else:
                other_mention_name = token_dict[sentence_id][start:end]
                token_dict[sentence_id][start:end] = [representative_mention_name, ''*(end-start-1)]
        
    # concatenate the tokens back to sentences
    replaced_sentences = []
    for sentence_id, sentence in enumerate(tokens_per_sentence):
        replaced_sentences.append(' '.join(token_dict[sentence_id]))
    replaced_sentences = ' '.join(replaced_sentences)
    return replaced_sentences

In [38]:
for index, row in plot_summaries.iterrows():
    if str(row['wiki_id']) == "3217":
        FILE_PATH = FOLDER_PATH + str(row['wiki_id']) + ".txt.xml"
        tree = ET.parse(FILE_PATH)
        parsed_xml = core2sent.convert_sentences(tree)
        parsed_corref = core2sent.convert_coref(tree)
        mapped_sentences = map_mentions_to_first_mention(parsed_corref, parsed_xml)

0 13 41 AD , where he is almost immediately captured by Lord Arthur 's men , who suspect him to be an agent for Duke Henry , with whom Arthur
0 13 14 AD
0 33 41 an agent for Duke Henry , with whom
0 16 17 where
0 29 30 who
1 0 1 He
1 9 10 ,
0 23 25 by Lord
2 17 19 Arthur 's


TypeError: sequence item 4: expected str instance, list found

In [32]:
tree = ET.parse('../data/corenlp_plot_summaries/3217.txt.xml')
parsed_corref = core2sent.convert_coref(tree)
parsed_corref

[{'mentions': [{'sentence': 0, 'start': 12, 'end': 41, 'head': 13},
   {'sentence': 0, 'start': 12, 'end': 14, 'head': 13}],
  'first_mention': (0, 13),
  'num': 0,
  'id': 'E0'},
 {'mentions': [{'sentence': 0, 'start': 32, 'end': 41, 'head': 33},
   {'sentence': 0, 'start': 16, 'end': 17, 'head': 16},
   {'sentence': 0, 'start': 29, 'end': 30, 'head': 29},
   {'sentence': 1, 'start': 0, 'end': 1, 'head': 0},
   {'sentence': 1, 'start': 9, 'end': 10, 'head': 9}],
  'first_mention': (0, 16),
  'num': 1,
  'id': 'E1'},
 {'mentions': [{'sentence': 0, 'start': 22, 'end': 25, 'head': 23},
   {'sentence': 2, 'start': 17, 'end': 19, 'head': 17}],
  'first_mention': (0, 23),
  'num': 2,
  'id': 'E2'},
 {'mentions': [{'sentence': 0, 'start': 22, 'end': 41, 'head': 25},
   {'sentence': 0, 'start': 22, 'end': 26, 'head': 25},
   {'sentence': 3, 'start': 4, 'end': 6, 'head': 5}],
  'first_mention': (0, 25),
  'num': 3,
  'id': 'E3'},
 {'mentions': [{'sentence': 1, 'start': 5, 'end': 14, 'head': 7}