In [418]:
import os
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import re

In [607]:
class Doc():
    """Container for one parsed document and its event annotations.

    Every attribute starts as ``None`` and is expected to be filled in by a
    parser (see ``Ace05Parser.parseFiles``) after construction.
    """

    # ANSI escape sequence that resets all text attributes.
    _ENDC = '\033[0m'

    # ANSI colour escape sequences available for highlighting mentions.
    # Built once at class level instead of on every highlight call.
    _COLORS = ('\033[1;34;41m','\033[1;34;42m','\033[1;34;43m','\033[1;34;44m',
               '\033[1;34;45m','\033[1;34;46m','\033[1;34;47m','\033[1;34;48m',
               '\033[1;32;41m','\033[1;32;42m','\033[1;32;43m','\033[1;32;44m',
               '\033[1;32;45m','\033[1;32;46m','\033[1;32;47m','\033[1;32;48m',
               '\033[1;33;41m','\033[1;33;42m','\033[1;33;43m','\033[1;33;44m',
               '\033[1;33;45m','\033[1;33;46m','\033[1;33;47m','\033[1;33;48m',
               '\033[1;36;41m','\033[1;36;42m','\033[1;36;43m','\033[1;36;44m',
               '\033[1;36;45m','\033[1;36;46m','\033[1;36;47m','\033[1;36;48m',
               '\033[1;37;41m','\033[1;37;42m','\033[1;37;43m','\033[1;37;44m',
               '\033[1;37;45m','\033[1;37;46m','\033[1;37;47m','\033[1;37;48m',
               '\033[1;30;41m','\033[1;30;42m','\033[1;30;43m','\033[1;30;44m',
               '\033[1;30;45m','\033[1;30;46m','\033[1;30;47m','\033[1;30;48m',
               '\033[1;35;41m','\033[1;35;42m','\033[1;35;43m','\033[1;35;44m',
               '\033[1;35;45m','\033[1;35;46m','\033[1;35;47m','\033[1;35;48m',
               '\033[1;31;41m','\033[1;31;42m','\033[1;31;43m','\033[1;31;44m',
               '\033[1;31;45m','\033[1;31;46m','\033[1;31;47m','\033[1;31;48m',
               '\033[1;31;40m','\033[1;31;40m','\033[1;31;40m','\033[1;31;40m',
               '\033[1;31;40m','\033[1;31;40m','\033[1;31;40m','\033[1;31;40m')

    def __init__(self):
        self.Files = None     # paths of the files the document was read from
        self.rawSGM = None    # full text of the SGM file with markup removed
        self.DocId = None     # text of the DOCID tag
        self.DocType = None   # text of the DOCTYPE tag
        # Text of the DATETIME tag.
        # NOTE(review): parseFiles has used the spelling `Datetime` for this
        # value - check both spellings when reading it.
        self.DateTime = None
        self.Headline = None  # text of the HEADLINE tag
        self.Text = None      # text of the TEXT tag
        # List of event dicts; showMentions reads event['MENTIONS'], a list
        # of dicts carrying at least 'CHARSEQ' ([start, end]) and 'TEXT'.
        self.Events = None

    def _colorit(self, t):
        """Return a mention's text wrapped in the colour of its event.

        ``t`` is a ``[charseq, text, event_index]`` triple as built by
        ``showMentions``.
        """
        # NOTE(review): only the first eight palette entries are ever
        # selected, so events whose indices differ by a multiple of 8 share
        # a colour. Kept as-is to preserve the existing colouring.
        return f"{self._COLORS[t[-1] % 8]}{t[1]}{self._ENDC}"

    def showMentions(self):
        """Print ``rawSGM`` with every event mention highlighted.

        The text is cut at each mention's ``CHARSEQ``; the mention's own
        ``TEXT`` is inserted in colour, with all mentions of one event sharing
        a colour. The pieces are joined with single spaces, then doubled
        spaces, spaces after newlines and ``` `` ```/``''`` quote pairs are
        normalised before printing.

        A document with no events (``Events`` is ``None`` or empty) is
        printed without highlighting.
        """
        text = self.rawSGM or ''

        # One [charseq, text, event_index] triple per mention.
        charseqs = []
        for i, event in enumerate(self.Events or []):
            for mention in event['MENTIONS']:
                charseqs.append([mention['CHARSEQ'], mention['TEXT'], i])

        parts = []
        lastindex = 0
        charseqs.sort()  # document order: by start offset, then end offset
        for c in charseqs:
            START, END = c[0][0], c[0][1]
            parts.append(text[lastindex:START])
            parts.append(self._colorit(c))
            # Never move backwards: a mention nested inside an earlier one
            # must not cause already-consumed text to be emitted again.
            lastindex = max(lastindex, END + 1)

        # Emit whatever follows the last mention; without this the end of
        # the document is lost.
        tail = text[lastindex:]
        if tail:
            parts.append(tail)

        print (' '.join(parts).replace('  ',' ').replace('\n ','\n').replace('``','"').replace("''",'"'))
    
    


In [608]:
class Ace05Parser():
    """
    Parser for a directory of per-document annotation files.

    Typical use: construct with the directory path, call ``groupFiles()`` and
    then ``parseFiles()``; the resulting ``Doc`` objects are appended to
    ``self.Docs``.
    """

    # Suffixes of the files that make up one document, in the order in which
    # they appear inside each entry of ``groupedFiles``.
    _SUFFIXES = ('.ag.xml', '.apf.xml', '.sgm', '.tab')

    def __init__(self,path):
        self.path = path
        self.Docs = []
        self.listofFiles = None   # raw directory listing, set by groupFiles
        self.groupedFiles = None  # one [ag.xml, apf.xml, sgm, tab] list per document

    def groupFiles(self):
        """
        Collect the four files that belong to each document.

        *.ag.xml  :
        *.apf.xml : This file has the annotated entities and events with arguments.
        *.sgm     : This file has the raw text and includes metadata such as DocID, DocType, Datetime, Headline, Text.
        *.tab     :

        Files are matched by their shared name stem rather than by their
        position in the directory listing, because ``os.listdir`` returns
        names in arbitrary order and the directory may hold unrelated files.
        Documents that do not have all four files are left out.

        Sets ``self.listofFiles`` (the raw listing) and ``self.groupedFiles``
        (a list, sorted by document stem, of ``[ag.xml, apf.xml, sgm, tab]``
        path lists).
        """
        self.listofFiles = os.listdir(self.path)

        by_doc = {}
        for name in self.listofFiles:
            for suffix in self._SUFFIXES:
                if name.endswith(suffix):
                    stem = name[:-len(suffix)]
                    # os.path.join works with or without a trailing separator on self.path.
                    by_doc.setdefault(stem, {})[suffix] = os.path.join(self.path, name)
                    break

        self.groupedFiles = [
            [by_doc[stem][suffix] for suffix in self._SUFFIXES]
            for stem in sorted(by_doc)
            if len(by_doc[stem]) == len(self._SUFFIXES)
        ]

    @staticmethod
    def _tagText(soup, name):
        """Return the text of the first ``name`` tag in ``soup``, or None if absent."""
        tag = soup.find(name)
        return tag.text if tag is not None else None

    @staticmethod
    def _readCharseq(node):
        """Return ``(text, [START-1, END])`` for a ``charseq`` element."""
        # START is shifted down by one while END is kept.
        # NOTE(review): presumably this aligns the offsets with Doc.rawSGM
        # as consumed by Doc.showMentions - confirm before changing.
        return node.text, [int(node.attrib['START'])-1, int(node.attrib['END'])]

    def _parseMention(self, mention):
        """Convert one ``event_mention`` element into a mention dict."""
        mention_text, mention_charseq = self._readCharseq(mention.find('anchor/charseq'))

        mention_arguments = {}
        for argument in mention.findall('event_mention_argument'):
            ROLE = argument.attrib['ROLE']
            TEXT, CHARSEQ = self._readCharseq(argument.find('./extent/charseq'))
            # NOTE(review): arguments are keyed by role, so a second argument
            # with the same ROLE replaces the first. Kept because callers may
            # rely on this dict shape.
            mention_arguments[ROLE] = {'refid' : argument.attrib['REFID'], 'role' : ROLE, 'text' : TEXT, 'charseq' : CHARSEQ}

        return {'ID':mention.attrib['ID'],'TEXT':mention_text,'CHARSEQ':mention_charseq,
                'ARGUMENTS':mention_arguments or None}  # None when the mention has no arguments

    def parseFiles(self):
        """
        Parse every grouped document and append a ``Doc`` to ``self.Docs``.

        For each document the ``.sgm`` file supplies the raw text and the
        DocId / DocType / DateTime / Headline / Text fields, and the
        ``.apf.xml`` file supplies the events. Each event is a dict of the
        ``event`` element's attributes plus a ``'MENTIONS'`` list.

        Calls ``groupFiles()`` first if it has not been called yet.
        """
        if self.groupedFiles is None:
            self.groupFiles()

        for files in self.groupedFiles:

            docObject = Doc()
            docObject.Files = files
            docObject.Events = []
            apfxml,sgm = files[1],files[2]

            # SGM Parse
            # NOTE(review): no parser is named, so BeautifulSoup picks the
            # best one installed; pin one if offsets must be reproducible
            # across machines.
            with open(sgm,'r') as handle:  # close the file once it has been read
                bs = BeautifulSoup(handle)
            docObject.rawSGM = bs.text
            # Look each tag up by name so the result does not depend on tag
            # order or on every tag being present exactly once.
            docObject.DocId = self._tagText(bs, 'docid')
            docObject.DocType = self._tagText(bs, 'doctype')
            docObject.DateTime = self._tagText(bs, 'datetime')
            docObject.Datetime = docObject.DateTime  # legacy spelling, kept for existing callers
            docObject.Headline = self._tagText(bs, 'headline')
            docObject.Text = self._tagText(bs, 'text')

            # APF.XML Parse
            root = ET.parse(apfxml).getroot()
            for event in root.findall('./document/event'):
                EVENT = event.attrib.copy()
                EVENT['MENTIONS'] = [self._parseMention(mention)
                                     for mention in event.findall('event_mention')]
                docObject.Events.append(EVENT)

            self.Docs.append(docObject)

In [623]:
# Build the parser for the corpus directory, pair up each document's
# files, then parse them into parser.Docs.
corpus_dir = 'Dataset/data/English/nw/timex2norm/'
parser = Ace05Parser(corpus_dir)
parser.groupFiles()
parser.parseFiles()

In [624]:
# Pick one parsed document by position and print its text with the
# event mentions highlighted.
index = 43
selected_doc = parser.Docs[index]
selected_doc.showMentions()


APW_ENG_20030310.0719 
NEWS STORY 
20030310 

Report: Hong Kong Jockey Club in talks to acquire its Macau rival


HONG KONG (AP)

The Hong Kong Jockey Club is in talks about [1;34;41mbuying out[0m the horse
racing club in neighboring Macau, a newspaper reported Tuesday.

The South China Morning Post quoted Hong Kong Jockey Club Chief
Executive Lawrence Wong as saying the discussions were underway, but
no price was reported.

The Hong Kong club is a charitable entity, while the Macau Jockey
Club is a for-profit concern controlled by gambling tycoon Stanley
Ho.

Hong Kong angered Macau's gaming industry last year when it banned
gamblers in Hong Kong from placing horse bets with anyone but the
Hong Kong Jockey Club. Macau had been taking off-track bets from
people in Hong Kong, but the offices handling those wagers were
closed.

The Macau club later threatened to start taking wagers on Hong Kong
races in Macau, at a better price for gamblers because less tax would
be levied, but the pl