In [6]:
import json
import numpy as np
import os
from collections import defaultdict

In [2]:
# Directory holding the TACRED JSON splits. The default is the original local
# path; set the TACRED_DATA_DIR environment variable to point somewhere else
# without editing the notebook.
data_dir = os.environ.get(
    'TACRED_DATA_DIR', '/Volumes/External HDD/dataset/tacred/data/json'
)
train_split_file = os.path.join(data_dir, 'train_split.json')
test_split_file = os.path.join(data_dir, 'test_split.json')

In [3]:
def load_data(data_file):
    """Read a JSON file and return its decoded contents.

    Args:
        data_file: Path of the JSON file to read.

    Returns:
        The Python object decoded from the file.
    """
    with open(data_file, 'rb') as stream:
        raw_bytes = stream.read()
    # json.loads accepts bytes and works out the text encoding itself.
    return json.loads(raw_bytes)

def group_by_relation(data):
    """Bucket examples by the value of their 'relation' key.

    Args:
        data: Iterable of examples, each indexable by the key 'relation'.

    Returns:
        A defaultdict mapping each relation to a NumPy array of the examples
        carrying that relation, in the order they were encountered.
    """
    buckets = defaultdict(list)
    for item in data:
        buckets[item['relation']].append(item)

    # Swap every collected list for an array. Iterate over a snapshot of the
    # keys so the mapping is not traversed while it is being reassigned.
    for key in list(buckets):
        buckets[key] = np.array(buckets[key])
    return buckets

def relation2lengths(data1, data2=None):
    """Print the number of examples per relation for one or two splits.

    Args:
        data1: Mapping from relation name to a sized collection of examples.
            In the two-split report its counts appear under the ``Train``
            label.
        data2: Optional mapping of the same shape whose counts appear under
            the ``Test`` label. When omitted, only the counts of ``data1``
            are printed, one ``relation: count`` line per relation.

    In the two-split report, relations that occur only in ``data2`` are
    listed after those of ``data1`` with a ``Train`` count of 0. Neither
    mapping is modified and nothing is returned.
    """
    if data2 is None:
        for relation, samples1 in data1.items():
            print(f'Relation" {relation}: {len(samples1)}')
        return

    seen_relations = set()
    for relation, samples1 in data1.items():
        seen_relations.add(relation)
        # .get() rather than data2[relation]: indexing a defaultdict with a
        # missing key would silently insert an empty entry into the caller's
        # mapping (and a plain dict would raise KeyError).
        samples2 = data2.get(relation, ())
        print(f'Relation" {relation} | Train: {len(samples1)} | Test: {len(samples2)}')

    # Relations present in data2 but absent from data1, reported in data2's
    # own iteration order.
    for relation, samples2 in data2.items():
        if relation not in seen_relations:
            print(f'Relation" {relation} | Train: 0 | Test: {len(samples2)}')

In [4]:
# Decode both split files into memory, training split first.
train_data = load_data(data_file=train_split_file)
test_data = load_data(data_file=test_split_file)

In [7]:
# Index the examples of each split by their relation label.
train_rel2data = group_by_relation(data=train_data)
test_rel2data = group_by_relation(data=test_data)

In [8]:
# Per-relation example counts for the training split.
# NOTE(review): only one mapping is passed here, so this call requires
# relation2lengths to accept a single argument - confirm against its signature.
relation2lengths(data1=train_rel2data)

Relation" no_relation: 34816
Relation" per:title: 203
Relation" org:stateorprovince_of_headquarters: 35
Relation" org:member_of: 58
Relation" per:parents: 15
Relation" per:age: 39
Relation" per:employee_of: 187
Relation" per:date_of_death: 24
Relation" org:city_of_headquarters: 91
Relation" org:country_of_headquarters: 143
Relation" org:subsidiaries: 79
Relation" org:alternate_names: 111
Relation" per:countries_of_residence: 105
Relation" per:origin: 69
Relation" org:members: 71
Relation" per:cities_of_residence: 87
Relation" org:top_members/employees: 173
Relation" per:date_of_birth: 5
Relation" org:political/religious_affiliation: 34
Relation" per:stateorprovinces_of_residence: 47
Relation" per:other_family: 67
Relation" org:number_of_employees/members: 31
Relation" per:alternate_names: 43
Relation" per:spouse: 51
Relation" org:founded_by: 23
Relation" org:parents: 83
Relation" org:dissolved: 13
Relation" org:founded: 31
Relation" per:siblings: 35
Relation" per:children: 16
Relation"

In [9]:
# Per-relation example counts for the test split.
# NOTE(review): only one mapping is passed here, so this call requires
# relation2lengths to accept a single argument - confirm against its signature.
relation2lengths(data1=test_rel2data)

Relation" no_relation: 17405
Relation" per:title: 100
Relation" org:stateorprovince_of_headquarters: 16
Relation" org:member_of: 28
Relation" per:parents: 7
Relation" per:age: 19
Relation" per:employee_of: 92
Relation" per:date_of_death: 11
Relation" org:city_of_headquarters: 44
Relation" org:country_of_headquarters: 70
Relation" org:subsidiaries: 39
Relation" org:alternate_names: 55
Relation" per:countries_of_residence: 52
Relation" per:origin: 34
Relation" org:members: 34
Relation" per:cities_of_residence: 43
Relation" org:top_members/employees: 86
Relation" per:date_of_birth: 2
Relation" org:political/religious_affiliation: 16
Relation" per:stateorprovinces_of_residence: 22
Relation" per:other_family: 33
Relation" org:number_of_employees/members: 14
Relation" per:alternate_names: 21
Relation" per:spouse: 24
Relation" org:founded_by: 11
Relation" org:parents: 40
Relation" org:dissolved: 5
Relation" org:founded: 15
Relation" per:siblings: 17
Relation" per:children: 7
Relation" per:sch