In [None]:
from transformers import AutoTokenizer, AutoModel, AlbertTokenizer, AlbertModel, AutoModelForPreTraining
import json
import numpy as np

In [None]:
# Load the tokenizer of the `bert-base-cased` checkpoint; the tokenized
# statistics and the vocabulary lookup in the cells below are computed with it.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [None]:
# Load the pretrained `bert-base-cased` model.
# NOTE(review): `bert` is not referenced by any other cell visible in this
# notebook — confirm it is still needed before paying the load cost.
bert = AutoModel.from_pretrained("bert-base-cased")

In [None]:
# Load the dev, test and train splits of CONLL04 and pool them into one list.
# Each file is opened in a `with` block so its handle is closed as soon as the
# JSON has been parsed, instead of staying open for the life of the kernel.
with open('../data/CONLL04/dev_triples.json') as f:
    data_dev = json.load(f)

with open('../data/CONLL04/test_triples.json') as f:
    data_test = json.load(f)

with open('../data/CONLL04/train_triples.json') as f:
    data_train = json.load(f)

# Pooled in dev -> test -> train order; the statistics below are computed
# over all three splits together.
data_all = []
data_all.extend(data_dev)
data_all.extend(data_test)
data_all.extend(data_train)

In [None]:
# Word-level sentence lengths: one entry per sentence of the pooled data.
sent_len = [len(sentence['tokens']) for sentence in data_all]

# Summary statistics; the mean is rounded to two decimals for display.
for template, stat in (
    ('Average sentence length: {}', np.round(np.mean(sent_len), 2)),
    ('Max sentence length: {}', np.max(sent_len)),
    ('Min sentence length: {}', np.min(sent_len)),
):
    print(template.format(stat))

In [None]:
print('Tokenized version:')

# Sentence lengths measured in tokenizer pieces: the words of each sentence
# are joined with single spaces and passed through `tokenizer.tokenize`.
tokenized_sent_len = [
    len(tokenizer.tokenize(' '.join(sentence['tokens'])))
    for sentence in data_all
]

for template, stat in (
    ('Average sentence length: {}', np.round(np.mean(tokenized_sent_len), 2)),
    ('Max sentence length: {}', np.max(tokenized_sent_len)),
    ('Min sentence length: {}', np.min(tokenized_sent_len)),
):
    print(template.format(stat))

In [None]:
# Group every annotated entity mention by its type. A mention is stored as the
# list of words sliced from the sentence by the annotation's start/end offsets.
entities = {label: [] for label in ('Peop', 'Loc', 'Other', 'Org')}

for sentence in data_all:
    words = sentence['tokens']
    for annotation in sentence['entities']:
        span = words[annotation['start']:annotation['end']]
        entities[annotation['type']].append(list(span))

In [None]:
# Flatten every annotated entity mention into one list of its words.
# 'Outside' is skipped explicitly: that bucket is added to `entities` by a
# later cell, and without this guard re-running this cell afterwards would
# pull the non-entity words in as well, which would then empty 'Outside' the
# next time it is rebuilt. On a fresh top-to-bottom run the result is unchanged.
all_entities_flattened = [
    word
    for label, mentions in entities.items()
    if label != 'Outside'
    for mention in mentions
    for word in mention
]

In [None]:
# Collect every token occurrence whose word never appears inside an annotated
# entity; each occurrence is stored as a one-word "entity" under 'Outside'.
# NOTE(review): the check is on the word string across the whole pooled data,
# not on the token's position, so a word that is part of an entity anywhere is
# never counted as Outside — confirm this is the intended definition.
# A set gives a constant-time membership test; scanning the flattened list for
# every token made this cell quadratic in corpus size.
entity_words = set(all_entities_flattened)

entities['Outside'] = [
    [token]
    for sentence in data_all
    for token in sentence['tokens']
    if token not in entity_words
]

In [None]:
# Number of collected mentions per category, duplicates included.
for template, label in (
    ('Total Peop entities: {}', 'Peop'),
    ('Total Loc entities: {}', 'Loc'),
    ('Total Other entities: {}', 'Other'),
    ('Total Org entities: {}', 'Org'),
    ('Total Outside entities: {}', 'Outside'),
):
    print(template.format(len(entities[label])))

In [None]:
# De-duplicate the mentions of each category, keeping the first occurrence of
# every distinct word sequence in its original order.
unique_entities = {'Peop': [],
                   'Loc': [],
                   'Other': [],
                   'Org': [],
                   'Outside': []}
for label, mentions in entities.items():
    # Mentions are lists (unhashable), so their tuple form is tracked in a set.
    # This replaces a linear `in` scan of the growing result list per mention,
    # which was quadratic and slow for the large 'Outside' bucket.
    seen = set()
    for mention in mentions:
        key = tuple(mention)
        if key not in seen:
            seen.add(key)
            unique_entities[label].append(mention)

In [None]:
# Number of distinct mentions per category after de-duplication.
for template, label in (
    ('Total Peop entities: {}', 'Peop'),
    ('Total Loc entities: {}', 'Loc'),
    ('Total Other entities: {}', 'Other'),
    ('Total Org entities: {}', 'Org'),
    ('Total Outside entities: {}', 'Outside'),
):
    print(template.format(len(unique_entities[label])))

In [None]:
# Length in words of every distinct mention, grouped by category.
len_entities = {label: [] for label in ('Peop', 'Loc', 'Other', 'Org', 'Outside')}

for label, mentions in unique_entities.items():
    len_entities[label].extend(len(mention) for mention in mentions)

In [None]:
# Mean mention length in words per category, rounded to two decimals.
for template, label in (
    ('Average length of the Peop entity: {}', 'Peop'),
    ('Average length of the Loc entity: {}', 'Loc'),
    ('Average length of the Other entity: {}', 'Other'),
    ('Average length of the Org entity: {}', 'Org'),
    ('Average length of the Outside entity: {}', 'Outside'),
):
    print(template.format(np.round(np.mean(len_entities[label]), 2)))

In [None]:
# Tokenize every distinct mention: its words are joined with single spaces and
# the resulting string is passed through `tokenizer.tokenize`.
tokenized_unique_entities = {label: [] for label in ('Peop', 'Loc', 'Other', 'Org', 'Outside')}
for label, mentions in unique_entities.items():
    tokenized_unique_entities[label].extend(
        tokenizer.tokenize(' '.join(mention)) for mention in mentions
    )

# Length of each tokenized mention, measured in tokenizer pieces.
len_tokenized_entities = {label: [] for label in ('Peop', 'Loc', 'Other', 'Org', 'Outside')}
for label, tokenized_mentions in tokenized_unique_entities.items():
    len_tokenized_entities[label].extend(map(len, tokenized_mentions))

print('Tokenized version:')
for template, label in (
    ('Average length of the Peop entity: {}', 'Peop'),
    ('Average length of the Loc entity: {}', 'Loc'),
    ('Average length of the Other entity: {}', 'Other'),
    ('Average length of the Org entity: {}', 'Org'),
    ('Average length of the Outside entity: {}', 'Outside'),
):
    print(template.format(np.round(np.mean(len_tokenized_entities[label]), 2)))

In [None]:
# All entries of the tokenizer's vocabulary, in the order `get_vocab()` yields its keys.
vocabulary = list(tokenizer.get_vocab().keys())

In [None]:
# Display the number of entries collected from the tokenizer's vocabulary.
len(vocabulary)

In [None]:
# Distinct words per category: every word of every distinct mention, kept once
# in first-seen order.
flattened_unique_entities_words = {'Peop': [],
                                   'Loc': [],
                                   'Other': [],
                                   'Org': [],
                                   'Outside': []}
for label, mentions in unique_entities.items():
    # The set answers "seen already?" in constant time; checking membership in
    # the growing result list made this loop quadratic in the number of words.
    seen_words = set()
    for mention in mentions:
        for word in mention:
            if word not in seen_words:
                seen_words.add(word)
                flattened_unique_entities_words[label].append(word)

In [None]:
# Number of distinct words found in each category.
for template, label in (
    ('Total unique words, part of Peop entities: {}', 'Peop'),
    ('Total unique words, part of Loc entities: {}', 'Loc'),
    ('Total unique words, part of Other entities: {}', 'Other'),
    ('Total unique words, part of Org entities: {}', 'Org'),
    ('Total unique Outside words: {}', 'Outside'),
):
    print(template.format(len(flattened_unique_entities_words[label])))

In [None]:
# Tokenize each distinct word on its own, grouped by category.
tokenized_flattened_unique_entities_words = {
    label: [] for label in ('Peop', 'Loc', 'Other', 'Org', 'Outside')
}
for label, words in flattened_unique_entities_words.items():
    tokenized_flattened_unique_entities_words[label].extend(
        tokenizer.tokenize(word) for word in words
    )

# Number of tokenizer pieces each distinct word was split into.
len_tokenized_flattened_unique_entities_words = {
    label: [] for label in ('Peop', 'Loc', 'Other', 'Org', 'Outside')
}
for label, tokenized_words in tokenized_flattened_unique_entities_words.items():
    len_tokenized_flattened_unique_entities_words[label].extend(map(len, tokenized_words))

print('Tokenized version:')
for template, label in (
    ('Average length of Peop word: {}', 'Peop'),
    ('Average length of Loc word: {}', 'Loc'),
    ('Average length of Other word: {}', 'Other'),
    ('Average length of Org word: {}', 'Org'),
    ('Average length of Outside word: {}', 'Outside'),
):
    pieces_per_word = len_tokenized_flattened_unique_entities_words[label]
    print(template.format(np.round(np.mean(pieces_per_word), 2)))

In [None]:
# Distinct words of each category that are not an entry of the tokenizer's
# vocabulary, kept in their original order.
# The vocabulary list is converted to a set once so each lookup is constant
# time; testing membership in the list scanned the whole vocabulary per word.
vocabulary_lookup = set(vocabulary)

oov_words = {'Peop': [],
             'Loc': [],
             'Other': [],
             'Org': [],
             'Outside': []}
for label, words in flattened_unique_entities_words.items():
    oov_words[label].extend(word for word in words if word not in vocabulary_lookup)

In [None]:
def _oov_percentage(n_oov, n_total):
    """Return the share of out-of-vocabulary words as a percentage with one decimal.

    The ratio is scaled to a percentage *before* rounding. Rounding the ratio
    first and multiplying by 100 afterwards re-introduces binary floating-point
    error, so values such as 56.99999999999999 were printed instead of 57.0.

    Args:
        n_oov: number of distinct words not found in the vocabulary.
        n_total: number of distinct words in the category.

    Returns:
        The percentage rounded to one decimal, or NaN when `n_total` is 0
        (an empty category would otherwise raise ZeroDivisionError).
    """
    if n_total == 0:
        return float('nan')
    return np.round(100 * n_oov / n_total, 1)


# Share of distinct words per category that are missing from the vocabulary.
for template, label in (
    ('Percentage of OOV Peop words (words to be split in word-pieces): {}%', 'Peop'),
    ('Percentage of OOV Loc words (words to be split in word-pieces): {}%', 'Loc'),
    ('Percentage of OOV Other words (words to be split in word-pieces): {}%', 'Other'),
    ('Percentage of OOV Org words (words to be split in word-pieces): {}%', 'Org'),
    ('Percentage of OOV Outside words (words to be split in word-pieces): {}%', 'Outside'),
):
    print(template.format(
        _oov_percentage(len(oov_words[label]), len(flattened_unique_entities_words[label]))
    ))