In [1]:
XML_FILE_EXAMPLE = 'MovieSummaries/morethan100MB/corenlp_plot_summaries/4227.xml.gz'

In [2]:
import xml.etree.ElementTree as ET
import gzip

def extract_named_entities(xml_file):
    with gzip.open(xml_file, 'rt', encoding='utf-8') as f:
        tree = ET.parse(f)
        root = tree.getroot()

        entities = []
        for sentence in root.iter('sentence'):
            for token in sentence.iter('token'):
                ner = token.find('NER').text
                word = token.find('word').text
                if ner != 'O':  # Skip non-entities
                    entities.append((word, ner))
        return entities

entities = extract_named_entities(XML_FILE_EXAMPLE)


In [3]:
entities

[('Redmond', 'PERSON'),
 ('Barry', 'PERSON'),
 ('Barry', 'PERSON'),
 ('Lyndon', 'PERSON'),
 ('An', 'PERSON'),
 ('1750s', 'DATE'),
 ('Ireland', 'LOCATION'),
 ('Redmond', 'PERSON'),
 ('Barry', 'PERSON'),
 ('Barry', 'PERSON'),
 ('Nora', 'PERSON'),
 ('Brady', 'PERSON'),
 ('Barry', 'PERSON'),
 ('English', 'MISC'),
 ('John', 'PERSON'),
 ('Quin', 'PERSON'),
 ('Nora', 'PERSON'),
 ('Barry', 'PERSON'),
 ('Quin', 'PERSON'),
 ('Barry', 'PERSON'),
 ('Dublin', 'LOCATION'),
 ('Feeney', 'PERSON'),
 ('Barry', 'PERSON'),
 ('British', 'MISC'),
 ('Grogan', 'PERSON'),
 ('Quin', 'PERSON'),
 ('Barry', 'PERSON'),
 ('Nora', 'PERSON'),
 ('Barry', 'PERSON'),
 ('Nora', 'PERSON'),
 ('Quin', 'PERSON'),
 ('Barry', 'PERSON'),
 ('France', 'LOCATION'),
 ('the', 'DURATION'),
 ('Seven', 'DURATION'),
 ('Years', 'DURATION'),
 ("'", 'MISC'),
 ('War', 'MISC'),
 ('Grogan', 'PERSON'),
 ('French', 'MISC'),
 ('Minden', 'LOCATION'),
 ('Barry', 'PERSON'),
 ('Holland', 'LOCATION'),
 ('Potzdorf', 'PERSON'),
 ('British', 'LOCATION'),

In [4]:
all_tokens = set([entity[1] for entity in entities])
all_tokens

{'DATE',
 'DURATION',
 'LOCATION',
 'MISC',
 'ORDINAL',
 'ORGANIZATION',
 'PERSON',
 'TIME'}

In [5]:
import pandas as pd
df_entities = pd.DataFrame(entities, columns=["content", "token_type"])
df_entities = df_entities.groupby("token_type").agg(list).reset_index()

In [24]:
df_entities #.to_dict(orient="split")

Unnamed: 0,token_type,content
0,DATE,"[1750s, 1763, now]"
1,DURATION,"[the, Seven, Years, the, next, few, years]"
2,LOCATION,"[Ireland, Dublin, France, Minden, Holland, Bri..."
3,MISC,"[English, British, ', War, French, Prussians, ..."
4,ORDINAL,[second]
5,ORGANIZATION,"[Prussian, Ministry, of, Police, Prince, of, T]"
6,PERSON,"[Redmond, Barry, Barry, Lyndon, An, Redmond, B..."
7,TIME,"[the, night, morning]"


In [25]:
dict_ = {b:a for a , b in entities}

In [26]:
dict_.items()

dict_items([('PERSON', 'III'), ('DATE', 'now'), ('LOCATION', 'Belgium'), ('MISC', 'Prussians'), ('DURATION', 'years'), ('ORDINAL', 'second'), ('ORGANIZATION', 'T'), ('TIME', 'morning')])

In [27]:
from collections import defaultdict

entity_dict = defaultdict(list)

# Populate the dictionary
for name, entity_type in entities:
    entity_dict[entity_type].append(name)

# Convert to a regular dict if needed
entity_dict = dict(entity_dict)

print(entity_dict)

{'PERSON': ['Redmond', 'Barry', 'Barry', 'Lyndon', 'An', 'Redmond', 'Barry', 'Barry', 'Nora', 'Brady', 'Barry', 'John', 'Quin', 'Nora', 'Barry', 'Quin', 'Barry', 'Feeney', 'Barry', 'Grogan', 'Quin', 'Barry', 'Nora', 'Barry', 'Nora', 'Quin', 'Barry', 'Grogan', 'Barry', 'Potzdorf', 'Barry', 'Frederick', 'Potzdorf', 'Barry', 'Potzdorf', 'Chevalier', 'de', 'Balibari', 'Patrick', 'Magee', 'Barry', 'Barry', 'Chevalier', 'Chevalier', 'Chevalier', 'Chevalier', 'Barry', 'Chevalier', 'Chevalier', 'Barry', 'Chevalier', 'Barry', 'Chevalier', 'Barry', 'Chevalier', 'Barry', 'Barry', 'Countess', 'Lyndon', 'Charles', 'Lyndon', 'George', 'III'], 'DATE': ['1750s', '1763', 'now'], 'LOCATION': ['Ireland', 'Dublin', 'France', 'Minden', 'Holland', 'British', 'Prussian', 'Prussian', 'Prussian', 'Prussian', 'Europe', 'Belgium'], 'MISC': ['English', 'British', "'", 'War', 'French', 'Prussians', 'Prussians'], 'DURATION': ['the', 'Seven', 'Years', 'the', 'next', 'few', 'years'], 'ORDINAL': ['second'], 'ORGANIZAT

In [28]:
pd.DataFrame.from_dict(entity_dict, orient="index")

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,53,54,55,56,57,58,59,60,61,62
PERSON,Redmond,Barry,Barry,Lyndon,An,Redmond,Barry,Barry,Nora,Brady,...,Barry,Chevalier,Barry,Barry,Countess,Lyndon,Charles,Lyndon,George,III
DATE,1750s,1763,now,,,,,,,,...,,,,,,,,,,
LOCATION,Ireland,Dublin,France,Minden,Holland,British,Prussian,Prussian,Prussian,Prussian,...,,,,,,,,,,
MISC,English,British,',War,French,Prussians,Prussians,,,,...,,,,,,,,,,
DURATION,the,Seven,Years,the,next,few,years,,,,...,,,,,,,,,,
ORDINAL,second,,,,,,,,,,...,,,,,,,,,,
ORGANIZATION,Prussian,Ministry,of,Police,Prince,of,T,,,,...,,,,,,,,,,
TIME,the,night,morning,,,,,,,,...,,,,,,,,,,


In [32]:
row = { "ID_12" : entity_dict, "ID_13": entity_dict}

pd.DataFrame.from_dict(row, orient="index").reset_index("ID")

KeyError: 'Requested level (ID) does not match index name (None)'

In [13]:
import xml.etree.ElementTree as ET
import gzip

def extract_coreferences(xml_file):
    with gzip.open(xml_file, 'rt', encoding='utf-8') as f:
        tree = ET.parse(f)
        root = tree.getroot()

        corefs = []
        for coref in root.iter('coreference'):
            chain = []
            for mention in coref.iter('mention'):
                text_element = mention.find('text')
                if text_element is not None:  # Check if 'text' element exists
                    chain.append(text_element.text)
            if chain:  # Add chain only if it has valid mentions
                corefs.append(chain)
        return corefs

corefs = extract_coreferences(XML_FILE_EXAMPLE)
print(corefs)


[]


In [14]:
corefs

[]