# re3d
This is an relation extraction dataset developed by Defence Science and Technology Laboratory, Ministry of Defence (United Kingdom) for the purposes of Cybersecurity. Although small in size, it seems to deliver relevant types of relations.

Documentation can be found here: https://github.com/dstl/re3d

In [235]:
import os
import json
import pandas as pd
import re
import textblob
import spacy

In [184]:
data_path = '../../data/re3d/'
dirs = os.listdir(data_path)
dirs.remove('annotation_process.png')
dirs.remove('README.md')
if '.git' in dirs:
    dirs.remove('.git')

# Sources
dirs

['Wikipedia',
 'BBC Online',
 'Australian Department of Foreign Affairs',
 'UK Government',
 'US State Department',
 'CENTCOM',
 'Delegation of the European Union to Syria']

In [40]:
# Exploring only the BBC Online Dataset but all have the same format
with open(data_path + dirs[1] + '/documents.json') as f:
    docs = json.load(f)
    
with open(data_path + dirs[1] + '/entities.json') as f:
    ents = json.load(f)

with open(data_path + dirs[1] + '/relations.json') as f:
    rels = json.load(f)

In [41]:
# sample document
docs[0]

{'_id': '3095FB15AFF7B552702E9EF46C083FDD',
 'sourceName': 'BBC Online',
 'sourceUrl': 'http://www.bbc.co.uk/news/world-middle-east-38605860',
 'wordCount': 214,
 'sentenceCount': 11,
 'title': 'Syria accuses Israel of bombardment',

In [42]:
# sample entities entry
ents[0]

{'_id': '3095FB15AFF7B552702E9EF46C083FDD-0-0-5-Location',
 'begin': 0,
 'end': 5,
 'type': 'Location',
 'value': 'Syria',
 'documentId': '3095FB15AFF7B552702E9EF46C083FDD',
 'confidence': 0}

In [43]:
# sample relation entry
rels[0]

{'_id': '3095FB15AFF7B552702E9EF46C083FDD-0-0-5-18-24-Dislikes',
 'begin': 6,
 'end': 17,
 'sourceBegin': 0,
 'sourceEnd': 5,
 'source': 'Syria',
 'targetBegin': 18,
 'targetEnd': 24,
 'target': 'Israel',
 'type': 'Dislikes',
 'value': 'has accused',
 'documentId': '3095FB15AFF7B552702E9EF46C083FDD',
 'confidence': 1}

In [44]:
# relations dataset
rels_df = pd.io.json.json_normalize(rels)
type(rels_df)

pandas.core.frame.DataFrame

In [45]:
rels_df.columns

Index(['_id', 'begin', 'confidence', 'documentId', 'end', 'source',
       'sourceBegin', 'sourceEnd', 'target', 'targetBegin', 'targetEnd',
       'type', 'value'],
      dtype='object')

In [46]:
# types of relations in BBC Online
rels_df.type.value_counts()

CoLocated          132
AlliesOf            26
CommWith            26
FightingAgainst     22
BelongsTo           19
InChargeOf          15
HasAttrOf           14
IsSynonymOf         13
Apart                8
Dislikes             5
Likes                1
Name: type, dtype: int64

In [47]:
# entities dataset
ents_df = pd.io.json.json_normalize(ents)
ents_df.columns

Index(['_id', 'begin', 'confidence', 'documentId', 'end', 'type', 'value'], dtype='object')

In [48]:
# types of entities in BBC ONline
ents_df.type.value_counts()

Organisation         298
Location             225
Person               215
Nationality           54
Temporal              43
Quantity              31
Weapon                20
Money                  9
DocumentReference      6
MilitaryPlatform       5
Vehicle                5
CommsIdentifier        2
Name: type, dtype: int64

In [49]:
# Dictionary implementation with range of values
class RangeDict(dict):
    def __getitem__(self, item):
        if type(item) != range:
            for key in self:
                if item in key:
                    return self[key]
            raise KeyError(item)
        else:
            return super().__getitem__(item)


In [166]:
# id of doc to RangeDicts of sentences
id2doc = dict()
id2doctext = dict()

for doc in docs:
    id2doc[doc['_id']] = RangeDict()
    id2doctext[doc['_id']] = doc['text']
    
    sents = doc['text'].split('.\n')[:-1]
    i = 0
    for sent in sents:
        id2doc[doc['_id']][range(i, i+len(sent))] = sent
        i += len(sent) + 1

In [53]:
id2doc['DDC23537DF2DA27754B9327683DCA114']

{range(0, 131): 'Iraqi forces have seized full control of Mosul University in their battle with so-called Islamic State, Iraqi state television said',
 range(132, 242): 'Military forces have been attempting to retake the city - the last major IS stronghold in Iraq - since October',
 range(243, 349): 'Iraqi officials said the IS group had used laboratories in the university complex to make chemical weapons',
 range(350, 453): "The head of the country's counter-terrorism operations said retaking the area was a significant victory",
 range(454, 852): '"""This great scientific and cultural building was liberated by the heroes,"" Lt Gen Talib Shaghati, said. ""The liberation of the University of Mosul means much, given its cultural, social and human symbolism."""\nElite troops had entered the compound on Friday as part of a major push into new territory. \nIraqi military officials claim they now control a large majority of the city\'s eastern half',
 range(853, 954): 'They also control thre

In [223]:
def add_sentence(rel):
    """
        For a relaton, locate its sentence.
    """
    try:
        doc_id = rel['_id'].split('-')[0]
        start = rel['sourceBegin']
        rel['sentence'] = id2doc[doc_id][start]
    
    except:
        rel['sentence'] = None
    
    return rel

In [17]:
add_sentence(rels[0])

{'_id': '3095FB15AFF7B552702E9EF46C083FDD-0-0-5-18-24-Dislikes',
 'begin': 6,
 'end': 17,
 'sourceBegin': 0,
 'sourceEnd': 5,
 'source': 'Syria',
 'targetBegin': 18,
 'targetEnd': 24,
 'target': 'Israel',
 'type': 'Dislikes',
 'value': 'has accused',
 'documentId': '3095FB15AFF7B552702E9EF46C083FDD',
 'confidence': 1,
 'sentence': 'Syria has accused Israel of bombarding an area west of Damascus, with reports of a military airport being hit'}

In [224]:
# returns documents and relations.
id2doc = dict()
def preprocess_re3d():
    """
        Basically gets the sentence for a particular relation and adds it to the data.
    """
    docs, rels = [], []
    
    for d in dirs:
        try:
            with open(data_path + '/' + d + '/documents.json') as f:
                a = json.loads(f.read())
                docs += a

            with open(data_path + '/' + d + '/relations.json') as f:
                a = json.loads(f.read())
#                 print(len(a))
                rels += a
        
        except Exception as e:
            print(e)
            print(d)

    global id2doc
    

    for doc in docs:
        id2doc[doc['_id']] = RangeDict()

        text = doc['text']
        def match(obj):
            return " "*len(obj.group(0))
        text = re.sub(r'\[\d+\]', match, text)
        b = textblob.TextBlob(text)
        sents = [str(s) for s in b.sentences]

        for sent in sents:
            ind = text.index(sent)
            id2doc[doc['_id']][range(ind, ind+len(sent))] = sent

    
    to_del = []
    for i, rel in enumerate(rels):

        doc_id = rel['documentId']
        start = rel['sourceBegin']
        rel['sentence'] = id2doc[doc_id][start]
            
            
    for i, r in enumerate(rels):
        if r['source'] not in r['sentence'] or r['target'] not in r['sentence']:
            to_del.append(i)
    for i in to_del:
        rels[i] = None
    rels = [i for i in rels if i is not None]
    
    
    return docs, rels

In [219]:
docs, rels = preprocess_re3d()

In [220]:
len(rels)

1426

In [221]:
i = 0
for r in rels:
    if r['source'] not in r['sentence'] or r['target'] not in r['sentence'] or r['value'] not in r['sentence']:
        print(r)
        i += 1
print(i)

{'_id': '051A4176702090F39056F34A9C29D574-3-21-55-152-222-InChargeOf', 'begin': 557, 'end': 583, 'sourceBegin': 453, 'sourceEnd': 487, 'source': 'the Swedish citizen Mohamed Moumou', 'targetBegin': 584, 'targetEnd': 654, 'target': 'an extremist group centered around the Brandbergen Mosque in Stockholm', 'type': 'InChargeOf', 'value': "an ''uncontested leader of", 'documentId': '051A4176702090F39056F34A9C29D574', 'confidence': 1, 'sentence': 'On December 7, 2006, the Swedish citizen Mohamed Moumou, who is described by the United States Department of the Treasury as an ""uncontested leader of an extremist group centered around the Brandbergen Mosque in Stockholm"", was put on the United Nations Security Council Committee 1267 list of foreign terrorists.   "'}
{'_id': '22BCFA25729AF03473435DAAAE67F5B8-1-129-151-86-101-IsSynonymOf', 'begin': 158, 'end': 186, 'sourceBegin': 186, 'sourceEnd': 208, 'source': 'the President of Syria', 'targetBegin': 143, 'targetEnd': 158, 'target': 'Bashar al-

In [188]:
data = pd.DataFrame.from_dict(rels)

In [222]:
data.type.value_counts()

CoLocated          476
FightingAgainst    206
BelongsTo          145
HasAttrOf          112
CommWith           107
InChargeOf          96
IsSynonymOf         93
AlliesOf            63
Likes               45
Dislikes            35
Apart               28
Name: type, dtype: int64

The relation classifications seem very relevant to our use case. We can train few shot relation classification model with these instances for more insights. 

# Converting into FewRel format

In [239]:
nlp_no_coref = spacy.load("en_core_web_sm")
def spacy_tokenize_no_coref(sentence):
        """
            Tokenizes the sentence using spacy
        """
        return list(map(str, nlp_no_coref(sentence)))

In [252]:
fewrel_dict = {}
for r in rels:
    if r['type'] not in fewrel_dict:
        fewrel_dict[r['type']] = []
    ind_dict = {}
    
    sentence = r['sentence']
    head = r['source']
    tail = r['target']
    tokens = spacy_tokenize_no_coref(sentence)
        
    tokenized_head = spacy_tokenize_no_coref(head)
    tokenized_tail = spacy_tokenize_no_coref(tail)

    head_indices = None
    tail_indices = None
    for i in range(len(tokens)):
        if tokens[i] == tokenized_head[0] and tokens[i:i+len(tokenized_head)] == tokenized_head:
            head_indices = list(range(i,i+len(tokenized_head)))
            break
    for i in range(len(tokens)):
        if tokens[i] == tokenized_tail[0] and tokens[i:i+len(tokenized_tail)] == tokenized_tail:
            tail_indices = list(range(i,i+len(tokenized_tail)))
            break
    
    if head_indices is None or tail_indices is None:
        # to handle a few edge cases
        continue
    
    ind_dict['tokens'] = tokens
    ind_dict['h'] = [head, "m", [head_indices]]
    ind_dict['t'] = [tail, "m", [tail_indices]]
    
    fewrel_dict[r['type']].append(ind_dict)

In [253]:
with open('re3d_fewrel_format.json', 'w') as f:
    json.dump(fewrel_dict, f)