# re3d
This is an relation extraction dataset developed by Defence Science and Technology Laboratory, Ministry of Defence (United Kingdom) for the purposes of Cybersecurity. Although small in size, it seems to deliver relevant types of relations.

Documentation can be found here: https://github.com/dstl/re3d

In [5]:
import os
import json
import codecs
import pandas as pd

In [3]:
data_path = '../../data/re3d/'
dirs = os.listdir(data_path)
dirs.remove('annotation_process.png')
dirs.remove('README.md')

# Sources
dirs

['Wikipedia',
 'US State Department',
 'UK Government',
 'Delegation of the European Union to Syria',
 'CENTCOM',
 'BBC Online',
 'Australian Department of Foreign Affairs']

In [8]:
# Exploring only the BBC Online Dataset but all have the same format
with codecs.open(data_path + dirs[1] + '/documents.json', encoding='utf-8') as f:
    docs = json.load(f)
    
with codecs.open(data_path + dirs[1] + '/entities.json', encoding='utf-8') as f:
    ents = json.load(f)

with codecs.open(data_path + dirs[1] + '/relations.json', encoding='utf-8') as f:
    rels = json.load(f)

JSONDecodeError: Extra data: line 2 column 1 (char 220)

In [4]:
# sample document
docs[0]

{'_id': '3095FB15AFF7B552702E9EF46C083FDD',
 'sourceName': 'BBC Online',
 'sourceUrl': 'http://www.bbc.co.uk/news/world-middle-east-38605860',
 'wordCount': 214,
 'sentenceCount': 11,
 'title': 'Syria accuses Israel of bombardment',

In [5]:
# sample entities entry
ents[0]

{'_id': '3095FB15AFF7B552702E9EF46C083FDD-0-0-5-Location',
 'begin': 0,
 'end': 5,
 'type': 'Location',
 'value': 'Syria',
 'documentId': '3095FB15AFF7B552702E9EF46C083FDD',
 'confidence': 0}

In [6]:
# sample relation entry
rels[0]

{'_id': '3095FB15AFF7B552702E9EF46C083FDD-0-0-5-18-24-Dislikes',
 'begin': 6,
 'end': 17,
 'sourceBegin': 0,
 'sourceEnd': 5,
 'source': 'Syria',
 'targetBegin': 18,
 'targetEnd': 24,
 'target': 'Israel',
 'type': 'Dislikes',
 'value': 'has accused',
 'documentId': '3095FB15AFF7B552702E9EF46C083FDD',
 'confidence': 1}

In [7]:
# relations dataset
rels_df = pd.io.json.json_normalize(rels)
type(rels_df)

pandas.core.frame.DataFrame

In [8]:
rels_df.columns

Index(['_id', 'begin', 'confidence', 'documentId', 'end', 'source',
       'sourceBegin', 'sourceEnd', 'target', 'targetBegin', 'targetEnd',
       'type', 'value'],
      dtype='object')

In [9]:
# types of relations in BBC Online
rels_df.type.value_counts()

CoLocated          132
AlliesOf            26
CommWith            26
FightingAgainst     22
BelongsTo           19
InChargeOf          15
HasAttrOf           14
IsSynonymOf         13
Apart                8
Dislikes             5
Likes                1
Name: type, dtype: int64

In [10]:
# entities dataset
ents_df = pd.io.json.json_normalize(ents)
ents_df.columns

Index(['_id', 'begin', 'confidence', 'documentId', 'end', 'type', 'value'], dtype='object')

In [11]:
# types of entities in BBC ONline
ents_df.type.value_counts()

Organisation         298
Location             225
Person               215
Nationality           54
Temporal              43
Quantity              31
Weapon                20
Money                  9
DocumentReference      6
Vehicle                5
MilitaryPlatform       5
CommsIdentifier        2
Name: type, dtype: int64

In [12]:
# Dictionary implementation with range of values
class RangeDict(dict):
    def __getitem__(self, item):
        if type(item) != range:
            for key in self:
                if item in key:
                    return self[key]
            raise KeyError(item)
        else:
            return super().__getitem__(item)


In [13]:
# id of doc to RangeDicts of sentences
id2doc = dict()

for doc in docs:
    id2doc[doc['_id']] = RangeDict()
    
    sents = doc['text'].split('.')[:-1]
    i = 0
    for sent in sents:
        id2doc[doc['_id']][range(i, i+len(sent))] = sent
        i += len(sent) + 1

In [14]:
id2doc['DDC23537DF2DA27754B9327683DCA114']

{range(0, 131): 'Iraqi forces have seized full control of Mosul University in their battle with so-called Islamic State, Iraqi state television said',
 range(132, 243): '\nMilitary forces have been attempting to retake the city - the last major IS stronghold in Iraq - since October',
 range(244, 351): '\nIraqi officials said the IS group had used laboratories in the university complex to make chemical weapons',
 range(352, 456): "\nThe head of the country's counter-terrorism operations said retaking the area was a significant victory",
 range(457, 563): '\n"""This great scientific and cultural building was liberated by the heroes,"" Lt Gen Talib Shaghati, said',
 range(564, 667): ' ""The liberation of the University of Mosul means much, given its cultural, social and human symbolism',
 range(668, 762): '"""\nElite troops had entered the compound on Friday as part of a major push into new territory',
 range(763, 856): " \nIraqi military officials claim they now control a large majority 

In [15]:
def add_sentence(rel):
    try:
        doc_id = rel['_id'].split('-')[0]
        start = rel['sourceBegin']
        rel['sentence'] = id2doc[doc_id][start]
    
    except:
        rel['sentence'] = None
    
    return rel

In [16]:
add_sentence(rels[0])

{'_id': '3095FB15AFF7B552702E9EF46C083FDD-0-0-5-18-24-Dislikes',
 'begin': 6,
 'end': 17,
 'sourceBegin': 0,
 'sourceEnd': 5,
 'source': 'Syria',
 'targetBegin': 18,
 'targetEnd': 24,
 'target': 'Israel',
 'type': 'Dislikes',
 'value': 'has accused',
 'documentId': '3095FB15AFF7B552702E9EF46C083FDD',
 'confidence': 1,
 'sentence': 'Syria has accused Israel of bombarding an area west of Damascus, with reports of a military airport being hit'}

In [19]:
# returns documents and relations.
def preprocess_re3d():
    docs, rels = [], []
    
    for d in dirs:
        try:
            with open(data_path + '/' + d + '/documents.json') as f:
                docs += json.loads(f.read())

            with open(data_path + '/' + d + '/relations.json') as f:
                rels += json.loads(f.read())
        
        except Exception as e:
            print(e)
            print(d)

    id2doc = dict()
    
    for doc in docs:
        id2doc[doc['_id']] = RangeDict()

        sents = doc['text'].replace('\xa0', ' ').split('.')[:-1]
        i = 0
        for sent in sents:
            id2doc[doc['_id']][range(i, i+len(sent))] = sent
            i += len(sent) + 1
    
    for rel in rels:
        try:
            doc_id = rel['_id'].split('-')[0]
            start = rel['sourceBegin']
            rel['sentence'] = id2doc[doc_id][start]

        except:
            rel['sentence'] = None
    
    return docs, rels

In [20]:
docs, rels = preprocess_re3d()

In [21]:
len(rels)

1430

In [22]:
rels[-1]

{'_id': '8BF4B60AA80B0601BE12F0164E2E7001-5-102-133-137-143-CoLocated',
 'begin': 1132,
 'end': 1134,
 'sourceBegin': 1100,
 'sourceEnd': 1131,
 'source': 'the UN Special Envoy de Mistura',
 'targetBegin': 1135,
 'targetEnd': 1141,
 'target': 'Geneva',
 'type': 'CoLocated',
 'value': 'in',
 'documentId': '8BF4B60AA80B0601BE12F0164E2E7001',
 'confidence': 1,
 'sentence': ' It is being conducted in full coordination and support of the intra-Syrian talks that will be held by the UN Special Envoy de Mistura in Geneva in February, and in view of the Brussels Conference on Syria and the region which will be hosted by the EU later in the spring'}

In [28]:
data = pd.DataFrame.from_dict(rels)

In [30]:
data.type.value_counts()

CoLocated          478
FightingAgainst    207
BelongsTo          147
CommWith           113
HasAttrOf          112
IsSynonymOf        105
InChargeOf          97
AlliesOf            63
Likes               45
Dislikes            35
Apart               28
Name: type, dtype: int64

The relation classifications seem very relevant to our use case. We can train few shot relation classification model with these instances for more insights. 