In [None]:
from transformers import AutoTokenizer, AutoModel, AlbertTokenizer, AlbertModel, AutoModelForPreTraining
import json
import numpy as np

In [None]:
# Load the tokenizer of the "bert-base-cased" checkpoint; later cells call
# tokenizer.tokenize(...) and tokenizer.get_vocab() on this object.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [None]:
# Load the "bert-base-cased" model itself.
# NOTE(review): `bert` is not referenced by any other cell visible here —
# confirm it is needed before keeping this load in the notebook.
bert = AutoModel.from_pretrained("bert-base-cased")

In [None]:
# Load the test and train parts of ADE split 0 and merge them into one corpus.
# Context managers guarantee both file handles are closed (the previous version
# left them open), and the explicit encoding avoids the platform default.
with open('../data/ADE/raw/ade_split_0_test.json', encoding='utf-8') as f:
    data_test = json.load(f)

with open('../data/ADE/raw/ade_split_0_train.json', encoding='utf-8') as f:
    data_all = json.load(f)

# data_all holds the train samples followed by the test samples;
# extend() mutates the train list in place.
data_all.extend(data_test)

In [None]:
# Sentence lengths counted in the dataset's own tokens (the 'tokens' list of each sample).
sent_len = [len(sample['tokens']) for sample in data_all]

for template, stat in (
    ('Average sentence length: {}', np.round(np.mean(sent_len), 2)),
    ('Max sentence length: {}', np.max(sent_len)),
    ('Min sentence length: {}', np.min(sent_len)),
):
    print(template.format(stat))

In [None]:
print('Tokenized version:')

# Join each sentence's tokens with single spaces, run the tokenizer on the
# resulting string, and record how many pieces come back.
tokenized_sent_len = [
    len(tokenizer.tokenize(' '.join(sample['tokens'])))
    for sample in data_all
]

for template, stat in (
    ('Average sentence length: {}', np.round(np.mean(tokenized_sent_len), 2)),
    ('Max sentence length: {}', np.max(tokenized_sent_len)),
    ('Min sentence length: {}', np.min(tokenized_sent_len)),
):
    print(template.format(stat))

In [None]:
# Group every annotated mention by its entity type. A mention is stored as the
# list of tokens covered by the half-open span [start, end) of its sentence.
entities = {'Drug': [], 'Adverse-Effect': []}
true_entities_indexes = []  # initialised here but not filled in this cell
for sample in data_all:
    for mention in sample['entities']:
        bucket = entities[mention['type']]
        bucket.append(list(sample['tokens'][mention['start']:mention['end']]))

In [None]:
# Flatten all Drug mentions, then all Adverse-Effect mentions, into a single
# list of individual tokens (duplicates kept, order preserved).
all_entities_flattened = [
    token
    for entity_type in ('Drug', 'Adverse-Effect')
    for mention in entities[entity_type]
    for token in mention
]

In [None]:
# Record every token occurrence whose string never appears inside any
# Drug / Adverse-Effect mention as a one-token "Outside" entity.
# NOTE(review): the match is on the token string, not on its position, so a
# word that is part of a mention anywhere in the corpus is never counted as
# Outside — confirm this is the intended definition.
# A set makes each membership test constant-time; the previous version scanned
# the whole flattened list once per token.
entity_tokens = set(all_entities_flattened)
entities['Outside'] = [
    [t]
    for s in data_all
    for t in s['tokens']
    if t not in entity_tokens
]

In [None]:
# Number of collected mentions per category (duplicates included).
for template, category in (
    ('Total DRUG entities: {}', 'Drug'),
    ('Total Adverse-Effect entities: {}', 'Adverse-Effect'),
    ('Total Outside entities: {}', 'Outside'),
):
    print(template.format(len(entities[category])))

In [None]:
# De-duplicate the mentions of each category, keeping the first occurrence and
# the original order. Mentions are lists (unhashable), so their tuple form is
# tracked in a set; this replaces a linear `en not in list` scan per mention.
unique_entities = {'Drug': [],
                   'Adverse-Effect': [],
                   'Outside': []}
for k, mentions in entities.items():
    seen = set()
    for en in mentions:
        fingerprint = tuple(en)
        if fingerprint not in seen:
            seen.add(fingerprint)
            unique_entities[k].append(en)

In [None]:
# Number of distinct mentions per category.
for template, category in (
    ('Total unique DRUG entities: {}', 'Drug'),
    ('Total unique Adverse-Effect entities: {}', 'Adverse-Effect'),
    ('Total unique Outside entities: {}', 'Outside'),
):
    print(template.format(len(unique_entities[category])))

In [None]:
# Length, in dataset tokens, of every distinct mention per category.
len_entities = {'Drug': [], 'Adverse-Effect': [], 'Outside': []}
for category, mentions in unique_entities.items():
    len_entities[category].extend(len(mention) for mention in mentions)

In [None]:
# Mean mention length per category, rounded to two decimals.
for template, category in (
    ('Average length of the Drug entity: {}', 'Drug'),
    ('Average length of the Adverse-Effect entity: {}', 'Adverse-Effect'),
    ('Average length of the Outside entity: {}', 'Outside'),
):
    print(template.format(np.round(np.mean(len_entities[category]), 2)))

In [None]:
# Run the tokenizer on each distinct mention (tokens joined by single spaces)
# and keep the resulting pieces per category.
tokenized_unique_entities = {'Drug': [], 'Adverse-Effect': [], 'Outside': []}
for category, mentions in unique_entities.items():
    tokenized_unique_entities[category].extend(
        tokenizer.tokenize(' '.join(mention)) for mention in mentions
    )

# Number of pieces the tokenizer produced for each mention.
len_tokenized_entities = {'Drug': [], 'Adverse-Effect': [], 'Outside': []}
for category, pieces_per_mention in tokenized_unique_entities.items():
    len_tokenized_entities[category].extend(
        len(pieces) for pieces in pieces_per_mention
    )

print('Tokenized version:')
for template, category in (
    ('Average length of Drug entity: {}', 'Drug'),
    ('Average length of Adverse-Effect entity: {}', 'Adverse-Effect'),
    ('Average length of Outside entity: {}', 'Outside'),
):
    print(template.format(np.round(np.mean(len_tokenized_entities[category]), 2)))

In [None]:
# Keys of the tokenizer's vocabulary mapping, unmodified.
# NOTE(review): the following cell rebinds `vocabulary`, so this value is only
# observable if that cell is skipped — consider dropping one of the two.
vocabulary = list(tokenizer.get_vocab().keys())

In [None]:
# Vocabulary entries with a leading '##' marker removed; entries without the
# marker are kept as they are. Duplicates that arise from stripping are kept.
vocabulary = [
    piece[2:] if piece.startswith('##') else piece
    for piece in tokenizer.get_vocab().keys()
]

In [None]:
# Display the number of entries in `vocabulary`.
len(vocabulary)

In [None]:
# Distinct single words that occur in the distinct mentions of each category,
# in first-occurrence order. dict.fromkeys() de-duplicates while preserving
# insertion order, replacing a linear `not in list` scan for every word.
flattened_unique_entities_words = {'Drug': [],
                                   'Adverse-Effect': [],
                                   'Outside': []}
for k, mentions in unique_entities.items():
    flattened_unique_entities_words[k].extend(
        dict.fromkeys(s_en for en in mentions for s_en in en)
    )

In [None]:
# Run the tokenizer on every distinct word on its own and keep the pieces.
tokenized_flattened_unique_entities_words = {'Drug': [], 'Adverse-Effect': [], 'Outside': []}
for category, words in flattened_unique_entities_words.items():
    tokenized_flattened_unique_entities_words[category].extend(
        tokenizer.tokenize(word) for word in words
    )

# Number of pieces the tokenizer produced for each word.
len_tokenized_flattened_unique_entities_words = {'Drug': [], 'Adverse-Effect': [], 'Outside': []}
for category, pieces_per_word in tokenized_flattened_unique_entities_words.items():
    len_tokenized_flattened_unique_entities_words[category].extend(
        len(pieces) for pieces in pieces_per_word
    )

print('Tokenized version:')
for template, category in (
    ('Average length of Drug word: {}', 'Drug'),
    ('Average length of Adverse-Effect word: {}', 'Adverse-Effect'),
    ('Average length of Outside word: {}', 'Outside'),
):
    mean_pieces = np.mean(len_tokenized_flattened_unique_entities_words[category])
    print(template.format(np.round(mean_pieces, 2)))

In [None]:
# Number of distinct words per category.
for template, category in (
    ('Total unique words, part of DRUG entities: {}', 'Drug'),
    ('Total unique words, part of Adverse-Effect entities: {}', 'Adverse-Effect'),
    ('Total unique Outside words: {}', 'Outside'),
):
    print(template.format(len(flattened_unique_entities_words[category])))

In [None]:
# A word is counted as out-of-vocabulary when it has no exact match in
# `vocabulary`. The lookup goes through a set built once here; the previous
# version scanned the whole vocabulary list for every word.
# NOTE(review): this is a plain string match, not a call to the tokenizer —
# confirm it agrees with how the tokenizer actually splits these words.
oov_words = {'Drug': [],
             'Adverse-Effect': [],
             'Outside': []}
vocabulary_lookup = set(vocabulary)
for k, words in flattened_unique_entities_words.items():
    oov_words[k].extend(w for w in words if w not in vocabulary_lookup)

In [None]:
# Share of distinct words per category that have no exact vocabulary match.
# The ratio is scaled to percent *before* rounding: multiplying an already
# rounded float by 100 can surface binary-float noise in the printed value
# (e.g. 100 * 0.07 evaluates to 7.000000000000001).
for k in ('Drug', 'Adverse-Effect', 'Outside'):
    total_words = len(flattened_unique_entities_words[k])
    # An empty category has no defined percentage; report nan instead of
    # raising ZeroDivisionError.
    oov_share = 100 * len(oov_words[k]) / total_words if total_words else float('nan')
    print('Percentage of OOV {} words (words to be split in word-pieces): {}%' .format(k, np.round(oov_share, 1)))