In [6]:
# this script converts xml files to NER format
import os
from lxml import etree


class Entity:
    """Plain record describing one annotated entity.

    Attributes are stored exactly as given: ``id``, ``type`` and
    ``description``.
    """

    def __init__(self, _id, _type, _description):
        self.id, self.type, self.description = _id, _type, _description

        
        
class Token:
    """One token of a document together with its entity annotation.

    A freshly created token is outside of any entity (``ent_iob == 'O'`` and
    an empty ``ent_type``) unless other values are passed in.
    """

    def __init__(self, _id, _sent_id, _text, _ent_iob='O', _ent_type=''):
        self.id, self.sent_id, self.text = _id, _sent_id, _text
        self.ent_iob, self.ent_type = _ent_iob, _ent_type

    def json(self):
        """Return the token's fields as a plain dictionary."""
        keys = ('id', 'sentence_id', 'text', 'entity_iob', 'entity_type')
        values = (self.id, self.sent_id, self.text, self.ent_iob, self.ent_type)
        return dict(zip(keys, values))
    

def parse_xml_to_ner_format(doc_path, output_writer, verbose=False):
    """Convert one annotated XML document to token-per-line NER format.

    The document is read with ``etree.parse``. It is expected to contain
    ``token`` elements (attributes ``t_id`` and ``sentence``), a ``Markables``
    section holding ``ENTITY`` and ``ENTITY_MENTION`` elements, and a
    ``Relations`` section whose ``REFERS_TO`` elements link mention
    ``source`` elements to an entity ``target``. Missing ``Markables`` or
    ``Relations`` sections are treated as "no annotations".

    For every token one line ``<text> <tag>`` is written, where the tag is
    ``O`` or ``B-<type>`` / ``I-<type>``. A blank line is written before each
    sentence, including the first one.

    Args:
        doc_path: passed straight to ``etree.parse``.
        output_writer: object with a ``write(str)`` method.
        verbose: when true, print the tokens, entities and tagged tokens.
    """
    tree = etree.parse(doc_path)

    # parse tokens, remembering each token's position in document order
    token_map = {}
    token_pos = {}
    tokens = []
    for t in tree.findall('token'):
        sent_id = t.get('sentence')
        token_id = t.get('t_id')
        text = t.text
        if verbose:
            print(sent_id, token_id, text)
        token = Token(token_id, sent_id, text)
        token_map[token_id] = token
        token_pos[token_id] = len(tokens)
        tokens.append(token)

    # parse entities and entity-mentions
    markables = tree.find('Markables')
    entities = {}
    mentioned_tokens = {}
    if markables is not None:
        for m in markables:
            if m.tag == 'ENTITY':
                ent_type = m.get('ent_type')
                # entities without a type cannot produce a tag, skip them
                if ent_type is not None and ent_type != '':
                    ent_id = m.get('m_id')
                    entities[ent_id] = Entity(ent_id, ent_type, m.get('TAG_DESCRIPTOR'))
        for m in markables.findall('ENTITY_MENTION'):
            mentioned_tokens[m.get('m_id')] = [
                anchor.get('t_id') for anchor in m.findall('token_anchor')
            ]
    if verbose:
        print('Entities: ')
        for e_id in entities.keys():
            print(e_id, entities[e_id].type, entities[e_id].description)

    # update token ner tags
    relations = tree.find('Relations')
    referrals = relations.findall('REFERS_TO') if relations is not None else []
    for r in referrals:
        target = r.find('target')
        ent_id = target.get('m_id') if target is not None else None
        if ent_id not in entities:
            continue
        ent = entities[ent_id]
        if verbose:
            print(ent.type, ent.description)

        # collect the tokens of every mention that refers to this entity
        source_token_ids = set()
        for s in r.findall('source'):
            source_token_ids.update(mentioned_tokens.get(s.get('m_id'), ()))
        # token ids are numeric strings; order them numerically
        ordered_ids = [str(i) for i in sorted(int(i) for i in source_token_ids)]

        if verbose:
            print('--------------------------------------------------')
        prev_pos = None
        prev_sent_id = None
        for tid in ordered_ids:
            token = token_map[tid]
            pos = token_pos[tid]
            # A new chunk starts whenever this token does not directly follow
            # the previously tagged one, or lies in another sentence.
            # Checking only the sentence would glue separate mentions in the
            # same sentence together (B-X O I-X), which is not valid IOB.
            starts_chunk = (
                prev_pos is None
                or pos != prev_pos + 1
                or token.sent_id != prev_sent_id
            )
            token.ent_iob = 'B' if starts_chunk else 'I'
            token.ent_type = ent.type
            prev_pos, prev_sent_id = pos, token.sent_id
            if verbose:
                print(token.json())
        if verbose:
            print('--------------------------------------------------')

    # write tokens to output
    cur_sent_id = None
    for t in tokens:
        if cur_sent_id != t.sent_id:
            output_writer.write('\n')
            cur_sent_id = t.sent_id

        ent = t.ent_iob if t.ent_iob == 'O' else t.ent_iob + '-' + t.ent_type
        output_writer.write(t.text + ' ' + ent + '\n')
    

In [8]:
# testing: convert a single document with verbose output
doc_path = os.path.join('intra_cross_doc_annotations', 'corpus_stock', '124259_US_stocks_log_gains_for_fourth_week_in_a_row.xml')
# the context manager closes all.txt even when parsing raises
with open('all.txt', 'w') as output_writer:
    parse_xml_to_ner_format(doc_path, output_writer, True)

0 1 Amerikaanse
0 2 beurzen
0 3 stijgen
0 4 vierde
0 5 opeenvolgende
0 6 week
1 7 04-Apr-09
2 8 Op
2 9 vrijdag
2 10 noteerden
2 11 de
2 12 Amerikaanse
2 13 beursindexen
2 14 opnieuw
2 15 een
2 16 stijging
2 17 ,
2 18 waarmee
2 19 een
2 20 positieve
2 21 periode
2 22 van
2 23 vier
2 24 weken
2 25 is
2 26 neergezet
2 27 .
3 28 Dit
3 29 ondanks
3 30 een
3 31 rapport
3 32 dat
3 33 diezelfde
3 34 dag
3 35 door
3 36 het
3 37 Amerikaanse
3 38 ministerie
3 39 van
3 40 werkgelegenheid
3 41 werd
3 42 gepubliceerd
3 43 en
3 44 waarin
3 45 werd
3 46 gemeld
3 47 dat
3 48 de
3 49 Amerikaanse
3 50 economie
3 51 in
3 52 maart
3 53 663.000
3 54 banen
3 55 was
3 56 kwijtgeraakt
3 57 en
3 58 de
3 59 werkeloosheid
3 60 was
3 61 gestegen
3 62 tot
3 63 8,5
3 64 %
3 65 ,
3 66 het
3 67 hoogste
3 68 cijfer
3 69 sinds
3 70 1983
3 71 .
4 72 De
4 73 Dow
4 74 Jones
4 75 Industrial
4 76 Average
4 77 noteerde
4 78 een
4 79 lichte
4 80 stijging
4 81 van
4 82 een
4 83 half
4 84 procent
4 85 of
4 86 39
4 87 punten
4 88

In [9]:
# composing all.txt
import os

# Keep the output and input directories under separate names: later cells read
# all.txt back through `parent_dir`, so it has to keep pointing at the output
# directory after this cell has run.
parent_dir = 'with_original_labels'
annotations_dir = 'intra_cross_doc_annotations'
os.makedirs(parent_dir, exist_ok=True)

# the context manager closes all.txt even when a document fails to parse
with open(os.path.join(parent_dir, 'all.txt'), 'w') as output_writer:
    # sorted() makes the document order, and therefore all.txt, reproducible
    for sub in sorted(os.listdir(annotations_dir)):
        doc_annotations_path = os.path.join(annotations_dir, sub)
        if not os.path.isdir(doc_annotations_path):
            # stray files next to the corpus folders cannot be listed
            continue
        print('...in ', sub)
        for f in sorted(os.listdir(doc_annotations_path)):
            if f.endswith('.xml'):
                print('parsing ', f)
                doc_path = os.path.join(doc_annotations_path, f)
                parse_xml_to_ner_format(doc_path, output_writer)


...in  corpus_stock
parsing  124259_US_stocks_log_gains_for_fourth_week_in_a_row.xml
parsing  62405_Sub-prime_lenders_send_jitters_through_global_markets.xml
parsing  251007_Japanese_stocks_continue_to_fall_after_earthquake.xml
parsing  116834_Japan_enters_recession.xml
parsing  121416_US_stock_markets_fall_to_lowest_levels_since_1997.xml
parsing  113219_Stock_markets_worldwide_fall_dramatically.xml
parsing  96770_World_stocks_plunge_on_fears_on_US_recession.xml
parsing  114864_Global_markets_plunge.xml
parsing  82738_US_stock_markets_tumble.xml
parsing  113330_Shares_worldwide_surge_due_to_US_government_plan.xml
parsing  121225_Dow_Jones_Industrial_Average_closes_at_lowest_level_in_six_years.xml
parsing  131162_Stock_markets_worldwide_rise_on_hopes_of_US_economic_recovery.xml
parsing  112579_Dow_falls_340_points_amid_unemployment_and_retail_sales_rates_news.xml
parsing  115064_Global_markets_surge_in_value.xml
parsing  114904_Dow_Jones_recovers_hundreds_of_points,_before_losing_them_i

In [13]:
# available ner tags
# all.txt lives in the output directory; name it here instead of relying on
# whatever value an earlier cell left in `parent_dir`
parent_dir = 'with_original_labels'
s = set()
with open(os.path.join(parent_dir, 'all.txt'), 'r') as f:
    for l in f:
        fields = l.rstrip('\n').split(' ')
        # blank separator lines (and lines without a tag) have a single field
        if len(fields) > 1:
            s.add(fields[1])
print(s)

{'I-FIN', 'B-PER', 'I-PRO', 'I-ORG', 'B-PRO', 'B-FIN', 'B-ORG', 'O', 'I-LOC', 'I-PER', 'B-LOC'}


In [30]:
# split all.txt into train.txt (70%) and test.txt (30%)
import random

TRAIN_FRACTION = 0.7
SPLIT_SEED = 42  # fixed seed so re-running the cell yields the same split
# all.txt lives in the output directory; name it here instead of relying on
# whatever value an earlier cell left in `parent_dir`
parent_dir = 'with_original_labels'

with open(os.path.join(parent_dir, 'all.txt'), 'r') as f:
    lines = f.readlines()

# group the token lines into sentences; blank lines separate sentences
sentences = []
sent = []
for l in lines:
    l = l.rstrip('\n')
    if l:
        sent.append(l)
    elif sent:
        sentences.append(sent)
        sent = []
if sent:
    # all.txt has its blank line *before* each sentence, so the final
    # sentence is not followed by one and must be flushed explicitly
    sentences.append(sent)

# count the sentences that were actually parsed rather than the blank lines
num_sents = len(sentences)
num_train = int(num_sents * TRAIN_FRACTION)

print('num_sents: ', num_sents)
print('num_train: ', num_train)
print('num_test: ', num_sents - num_train)

random.Random(SPLIT_SEED).shuffle(sentences)
# exactly num_train sentences go to train, the rest to test
sents_train = sentences[:num_train]
sents_test = sentences[num_train:]

for split_name, split_sents in (('train.txt', sents_train), ('test.txt', sents_test)):
    with open(os.path.join(parent_dir, split_name), 'w') as w:
        for sent in split_sents:
            for t in sent:
                w.write(t + '\n')
            w.write('\n')

num_sents:  1786
num_train:  1250
num_test:  536


In [1]:
# write the files with original labels to new files with regular labels (PER, ORG, LOC, MISC)
import os
def map_to_regular_ner_tags(input_path, output_path):
    """Copy a token-per-line NER file, renaming FIN and PRO tags to MISC.

    Each non-blank line is expected to look like ``<word> <tag>``. A tag that
    contains ``FIN`` or ``PRO`` keeps its two-character prefix (``B-`` or
    ``I-``) and gets the type ``MISC``; every other line, including blank
    separator lines and lines without a tag, is copied unchanged.

    Args:
        input_path: file to read.
        output_path: file to write (overwritten if it exists).
    """
    # `with` closes both files even if reading or writing raises
    with open(input_path, 'r') as f, open(output_path, 'w') as w:
        for l in f:
            # strip only the newline: slicing off the last character would
            # damage the tag of a final line that has no trailing newline
            fields = l.rstrip('\n').split(' ')
            if len(fields) > 1 and ('FIN' in fields[1] or 'PRO' in fields[1]):
                word, ner = fields[0], fields[1]
                w.write(word + ' ' + ner[0:2] + 'MISC' + '\n')
            else:
                w.write(l)

In [3]:
# relabel the complete corpus; the result goes to the working directory
input_path, output_path = os.path.join('with_original_labels', 'all.txt'), 'all.txt'
map_to_regular_ner_tags(input_path=input_path, output_path=output_path)

In [4]:
# relabel the training split; the result goes to the working directory
input_path, output_path = os.path.join('with_original_labels', 'train.txt'), 'train.txt'
map_to_regular_ner_tags(input_path=input_path, output_path=output_path)

In [5]:
# relabel the test split; the result goes to the working directory
input_path, output_path = os.path.join('with_original_labels', 'test.txt'), 'test.txt'
map_to_regular_ner_tags(input_path=input_path, output_path=output_path)