In [1]:
import os
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import re

In [2]:
class Doc():
    def __init__(self):
        
        self.Files    = None
        self.rawSGM   = None
        self.rawAPF   = None
        self.DocId    = None
        self.DocType  = None 
        self.DateTime = None 
        self.Headline = None
        self.Text     = None
        self.Events   = None
    
    def showMentions(self):
        def colorit(t):
            ENDC = '\033[0m'
            colors = ['\033[1;34;41m','\033[1;34;42m','\033[1;34;43m','\033[1;34;44m',
                      '\033[1;34;45m','\033[1;34;46m','\033[1;34;47m','\033[1;34;48m',
                      '\033[1;32;41m','\033[1;32;42m','\033[1;32;43m','\033[1;32;44m',
                      '\033[1;32;45m','\033[1;32;46m','\033[1;32;47m','\033[1;32;48m',
                      '\033[1;33;41m','\033[1;33;42m','\033[1;33;43m','\033[1;33;44m',
                      '\033[1;33;45m','\033[1;33;46m','\033[1;33;47m','\033[1;33;48m',
                      '\033[1;36;41m','\033[1;36;42m','\033[1;36;43m','\033[1;36;44m',
                      '\033[1;36;45m','\033[1;36;46m','\033[1;36;47m','\033[1;36;48m',
                      '\033[1;37;41m','\033[1;37;42m','\033[1;37;43m','\033[1;37;44m',
                      '\033[1;37;45m','\033[1;37;46m','\033[1;37;47m','\033[1;37;48m',
                      '\033[1;30;41m','\033[1;30;42m','\033[1;30;43m','\033[1;30;44m',
                      '\033[1;30;45m','\033[1;30;46m','\033[1;30;47m','\033[1;30;48m',
                      '\033[1;35;41m','\033[1;35;42m','\033[1;35;43m','\033[1;35;44m',
                      '\033[1;35;45m','\033[1;35;46m','\033[1;35;47m','\033[1;35;48m',
                      '\033[1;31;41m','\033[1;31;42m','\033[1;31;43m','\033[1;31;44m',
                      '\033[1;31;45m','\033[1;31;46m','\033[1;31;47m','\033[1;31;48m',
                      '\033[1;31;40m','\033[1;31;40m','\033[1;31;40m','\033[1;31;40m',
                      '\033[1;31;40m','\033[1;31;40m','\033[1;31;40m','\033[1;31;40m']
            
            return f"{colors[t[-1]%8]}{t[1]}{ENDC}"
        
        text = self.rawSGM
        charseqs = []
        for i in range(len(self.Events)):
            event = self.Events[i]
            for mention in event['MENTIONS']:
                charseqs.append([mention['CHARSEQ'],mention['TEXT'],i])

        parts  = []
        lastindex = 0
        charseqs.sort()
        for c in charseqs:
            START,END = c[0][0],c[0][1]
            parts.append(text[lastindex:START])
            parts.append(colorit(c))
            lastindex=END
        parts.append(text[lastindex:])
        
        return str(len(self.Events))+'\n'+' '.join(parts).replace('  ',' ').replace('\n ','\n').replace('``','"').replace("''",'"')

In [3]:
class Ace05Parser():
    """
    PARSER CLASS
    """
    def __init__(self,path):
        self.path = path
        self.Docs = []
        
    def groupFiles(self):
        """
        There are 4 files, which has different information, for each document.
        *.ag.xml  :
        *.apf.xml : This file has the annotated entities and events with arguments.
        *.sgm     : This file has the raw text and includes metadata such as; DocID, DocType, Datetime, Headline, Text.
        *.tab     :
        """
        PATH_TO_SOURCES = self.path
        self.ALL_FILES = []
        for source in os.listdir(PATH_TO_SOURCES):
            for file in os.listdir(PATH_TO_SOURCES+source+'/timex2norm/'):
                self.ALL_FILES.append(PATH_TO_SOURCES+source+'/timex2norm/'+file)
                
        self.ALL_FILES.sort()
        self.groupedFiles = [self.ALL_FILES[doc_idx:doc_idx+4] for doc_idx in list(range(len(self.ALL_FILES)))[::4]]            
    
    def parseFiles(self):    
        
        for files in self.groupedFiles:
            
            docObject = Doc()
            docObject.Files = files
            docObject.Events = []
            agxml,apfxml,sgm,tab = files[0],files[1],files[2],files[3]
            
            # SGM Parse
            bs = BeautifulSoup(open(sgm,'r'))
            docObject.rawSGM = bs.text
            apf = open(apfxml,'r').read()
            docObject.rawAPF = apf
            docObject.DocId, docObject.DocType, \
            docObject.Datetime,docObject.Text = [attr.text for attr in bs.find_all(['docid','doctype','datetime','text'])]
            
            # APF.XML Parse
            tree = ET.parse(apfxml)
            root = tree.getroot()
            for event in root.findall('./document/event'):
                EVENT = event.attrib.copy()
                mentions = []
                for mention in event.findall('event_mention'):        
                    mention_ID = mention.attrib['ID']
                    mention_token = mention.find('anchor/charseq')
                    mention_text= mention_token.text
                    MSTART = int(mention_token.attrib['START'])-2
                    MEND = int(mention_token.attrib['END'])
                    mention_charseq = [MSTART,MEND]

                    mention_arguments = {}
                    for argument in mention.findall('event_mention_argument'):
                        REFID = argument.attrib['REFID']
                        ROLE = argument.attrib['ROLE']
                        TEXT = argument.find('./extent/charseq').text
                        ASTART = int(argument.find('./extent/charseq').attrib['START'])-2
                        AEND = int(argument.find('./extent/charseq').attrib['END'])
                        CHARSEQ = [ASTART,AEND]
                        mention_arguments[ROLE] = {'refid' : REFID, 'role' : ROLE, 'text' : TEXT, 'charseq' : CHARSEQ}
                    if mention_arguments == {}:
                        mention_arguments = None

                    mentions.append({'ID':mention_ID,'TEXT':mention_text,'CHARSEQ':mention_charseq,'ARGUMENTS':mention_arguments})
                EVENT['MENTIONS'] = mentions

                docObject.Events.append(EVENT)
                
            self.Docs.append(docObject)

In [5]:
parser = Ace05Parser('../../dataset/data/English/')
parser.groupFiles()
parser.parseFiles()

In [6]:
#325
index      = 325
sample_doc = parser.Docs[index]

In [7]:
#ie , im = -1,-1
#sample_doc.Events[ie]['MENTIONS'][im]['TEXT']
#t , c = sample_doc.Events[ie]['MENTIONS'][im]['TEXT'],sample_doc.Events[ie]['MENTIONS'][im]['CHARSEQ']
#print(t)
#print(c)
#sample_doc.rawSGM[c[0]:c[1]]

In [8]:
print(sample_doc.DocId)
print(sample_doc.showMentions())

 AFP_ENG_20030304.0250 
8

AFP_ENG_20030304.0250 
NEWS STORY 
20030304 

Death toll in Philippine blast could hit 30: radio ATTENTION - ADDS
fears of toll rising ///


DAVAO, Philippines, March 4 (AFP)

At least 19 people were [1;34;41mkilled[0m and 114 people were [1;34;42mwounded[0m in
Tuesday's southern Philippines airport [1;34;43mblast[0m , officials said, but
reports said the [1;34;44mdeath[0m toll could climb to 30.

Radio station DXDC placed the [1;34;47mdeath[0m toll at 30, without giving a
source for the figure, which officials could not immediately confirm.

The Davao Medical Center, a regional government hospital, recorded 19 [1;34;41mdeaths[0m with 50 [1;34;42mwounded[0m . Medical evacuation workers however said the [1;34;42minjured[0m list was around 114, spread out at various hospitals.

A powerful bomb [1;34;43mtore[0m through a waiting shed at the Davao City
international airport at about 5.15 pm (0915 GMT) while another [1;34;45mexplosion[0m hit a 

In [9]:
def check():
    NUMBERofMENTIONS = 5268
    NUMBERofCLUSTERS = 4046
    
    numberofmentions = 0
    numberofclusters = 0
        
    for doc in parser.Docs:
        numberofclusters+=len(doc.Events)

        for event in doc.Events:
            numberofmentions+=len(event['MENTIONS'])
        
    print ((NUMBERofCLUSTERS,numberofclusters),(NUMBERofMENTIONS,numberofmentions))

In [10]:
check()

(4046, 4090) (5268, 5349)


In [170]:
import pickle

docs = []
for doc in parser.Docs:
    docs.append(doc.__dict__)

with open('docs.pkl', 'wb') as p:
    pickle.dump(docs,p)

### Shape The Data

In [87]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import itertools
import pandas as pd

In [46]:
print(d.Text)


DAVAO, Philippines, March 4 (AFP)

At least 19 people were killed and 114 people were wounded in
Tuesday's southern Philippines airport blast, officials said, but
reports said the death toll could climb to 30.

Radio station DXDC placed the death toll at 30, without giving a
source for the figure, which officials could not immediately confirm.

The Davao Medical Center, a regional government hospital, recorded 19
deaths with 50 wounded. Medical evacuation workers however said the
injured list was around 114, spread out at various hospitals.

A powerful bomb tore through a waiting shed at the Davao City
international airport at about 5.15 pm (0915 GMT) while another
explosion hit a bus terminal at the city. There were no reports of
injuries in the second blast.

"It's a very powerful bomb. The waiting shed literally exploded,"
said Vice Mayor Luis Bongoyan, speaking to local radio station

Television footage showed medical teams carting away dozens of
wounded victims with fully armed t

In [169]:
def sentLevel(doc):
    text = doc.Text
    raw_text = doc.rawSGM
    
    dif = len(raw_text)-len(text)-1
    
    sents = [sent.strip().replace('\n',' ').replace(' ,',',') for sent in sent_tokenize(text)]
    
    boundaries = []
    lastidx = 0
    for sent in sents:
        lastidx += len(sent)
        boundaries.append(lastidx)
        
    coreferent_sentences = []
    
    for event in doc.Events:
        mentions = []
        for mention in event['MENTIONS']:
            index = mention['CHARSEQ'][0]-dif
            
            n_boundaries = boundaries.copy()
            n_boundaries.append(index)
            n_boundaries.sort()
            
            mentions.append(sents[n_boundaries.index(index)])
       
        coreferent_sentences+=([[comb[0].split('  ')[-1],comb[1].split('  ')[-1],1.0] for comb in list(itertools.combinations([i for i in mentions if len(i) != 1], 2))])
    return pd.DataFrame(coreferent_sentences,columns=['sent1','sent2','label'])

In [175]:
d= parser.Docs[378]
sentLevel(d)

Unnamed: 0,sent1,sent2,label
0,"``We mowed down'' the attackers, said Lt. Col....","There was fierce fighting in Kut, to the south...",1.0
1,A meeting to organize an interim government co...,while an ``interim administration'' is put in ...,1.0
2,"For the first time in the war, large parts of ...",on the battlefield and within an Iraqi populat...,1.0
3,and a medic who tried to save one of them sorr...,and a medic who tried to save one of them sorr...,1.0


In [176]:
print(d.showMentions())

46

APW_ENG_20030403.0862 
NEWS STORY 
20030403 

U.S. troops push toward Baghdad's southern outskirts; capital plunged
into darkness after explosions


SOUTH OF BAGHDAD, Iraq (AP)

U.S. soldiers [1;34;48mmoved[0m to capture Baghdad's Saddam International Airport
early Friday and [1;34;41mfought[0m running battles with Iraqi defenders along
the city's southern fringes. "A vise is closing on the regime,"
U.S. President George W. Bush said.

Some frontline units went on heightened alert against the threat of
chemical weapons, ordered to wear rubber boots and suits despite
temperatures that soared into the 90s.

There was fierce [1;34;42mfighting[0m in Kut, to the south of Baghdad, where
desperate Iraqis armed with rifles charged tanks in a [1;34;44msuicide[0m [1;34;42mraid[0m .
"We [1;34;45mmowed down[0m " the attackers, said Lt. Col. B.P. McCoy.

Despite declarations that more tough [1;34;43mfighting[0m lies ahead, the top
U.S. military official indicated there may not be 

In [165]:
sentLevel(d).sent1[192]

'"I don\'t think America will win this war, as our jihad (holy war) and our resistance will teach the Americans and British a lesson they will never forget," he said.'

In [164]:
print(d.showMentions())

17

AFP_ENG_20030320.0722 
NEWS STORY 
20030320 

Thousands take to streets across Mideast to protest war = (PICTURES)
=


CAIRO, March 20 (AFP)

Tens of thousands of people took to the streets across the Middle
East Thursday, [1;34;44mdemonstrating[0m against military [1;34;45mstrikes[0m on Iraq and
calling on Muslims to wage [1;34;46mjihad[0m against the United States and its
allies.

Lebanon and the Palestinian Authority issued fierce denunciations of
the US-led [1;34;45mattacks[0m , while the Egyptian government daily Al-Ahram
warned [1;34;45mit[0m marked "the beginning an era of US colonisation that will
benefit only Israel".
[1;34;44mProtests[0m against the [1;34;45maction[0m aimed at toppling Iraqi President Saddam
Hussein were held in cities across Libya, Egypt and Lebanon, as well
as in Amman, Damascus and the Gaza Strip.

However, there were no reports of [1;34;47mprotests[0m against the [1;34;45mwar[0m from the
Gulf states, many of whom are host to US milit