In [None]:
#Import statements
import json
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer
import gc

# Fixed token-sequence length: used as the default `max_len` of `tokenize`,
# where it is the padding/truncation length passed to the tokenizer.
max_length = 200

# Report how many GPU devices TensorFlow can see in this environment.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


In [None]:
# Pretrained checkpoint name from which the tokenizer is loaded.
tokenizer_checkpoint = 'bert-base-cased'
bert_tokenizer = BertTokenizer.from_pretrained(tokenizer_checkpoint)
# Label ids are numbered starting at the tokenizer's vocabulary size (see
# `label_dict`) -- presumably so they never collide with real token ids.
offset = bert_tokenizer.vocab_size


In [None]:
# Load the pre-split MeDAL subsets.
train_df = pd.read_csv('../data/medal_smaller_train.csv')
val_df = pd.read_csv('../data/medal_smaller_validation.csv')

# NOTE(review): this cell previously read the *validation* CSV into test_df,
# which makes the "test" split identical to the validation split. That looks
# like a copy-paste slip. The test file name below is assumed from the naming
# pattern of the other two files -- confirm it matches what is on disk.
try:
    test_df = pd.read_csv('../data/medal_smaller_test.csv')
except FileNotFoundError:
    # Keep the notebook runnable (previous behaviour), but say so loudly.
    print("WARNING: '../data/medal_smaller_test.csv' not found; "
          "test_df falls back to the validation split.")
    test_df = pd.read_csv('../data/medal_smaller_validation.csv')

samples = train_df.label.value_counts().sort_values(ascending=False)[:6500].index # Using the top 6500 occurring acronyms
print(samples[:5])


In [None]:
# Restrict every split to rows whose expansion is one of the selected labels.
train_df = train_df[train_df['label'].isin(samples)]
val_df = val_df[val_df['label'].isin(samples)]
test_df = test_df[test_df['label'].isin(samples)]

# Sorted, de-duplicated label vocabulary drawn from train + validation.
label_names = sorted(set(train_df.label) | set(val_df.label))

# Label ids continue right after `offset`; the reverse map goes id -> label.
label_dict = dict(zip(label_names, range(offset, offset + len(label_names))))
reverse_label_dict = dict(zip(label_dict.values(), label_dict.keys()))
print(f"Size: {len(label_names)}", label_names)
print(f"Dict Item 1: {list(label_dict.items())[0]}")
print(f"Reverse Dict Item 1: {list(reverse_label_dict.items())[0]}")

print("Train dataset length:", len(train_df))
print("Validation dataset length:", len(val_df))

# Persist the vocabulary. JSON object keys are always strings, so the integer
# keys of `reverse_label_dict` are written out as strings.
for json_path, payload in (
    ('../vocabulary/dictionary.json', label_dict),
    ('../vocabulary/reverse_dictionary.json', reverse_label_dict),
    ('../vocabulary/label_names.json', label_names),
):
    with open(json_path, 'w') as json_file:
        json.dump(payload, json_file)
    

In [None]:
def tokenize(dataset, tokenizer=bert_tokenizer, max_len=max_length, label_dict=label_dict):
    """Encode acronym examples into fixed-length model inputs.

    Each row's text is encoded with ``tokenizer.encode_plus`` (padded and
    truncated to ``max_len``). The word index in ``location`` is converted to
    a token span ``[start, end)`` by tokenizing the words that precede the
    acronym and the acronym itself. Rows whose span end is not strictly below
    ``max_len`` are dropped.

    Args:
        dataset: Table with 'text', 'location', 'abbreviation' and 'label'
            columns (indexed by column name and iterated row-wise).
        tokenizer: Object providing ``tokenize``, ``encode_plus`` and
            ``decode``.
        max_len: Padding/truncation length, and the bound the span end must
            stay below for a row to be kept.
        label_dict: Mapping from label string to integer label id.

    Returns:
        Tuple ``(input_ids, token_type_ids, attention_masks, start_positions,
        end_positions, label_ids)`` of int32 arrays. The first three have
        shape ``(n_kept, max_len)``; the last three have shape ``(n_kept,)``.

    Raises:
        KeyError: If the label of a kept row is missing from ``label_dict``.
    """
    input_ids = []
    token_type_ids = []
    start_positions = []
    end_positions = []
    attention_masks = []
    label_ids = []
    # (text, location, label) of the first row that is actually kept, so the
    # sanity printout below describes the same example as index 0 of the arrays.
    first_kept = None

    for text, loc, abbreviation, label in zip(dataset['text'], dataset['location'], dataset['abbreviation'], dataset['label']):
        pre_tokens = tokenizer.tokenize(' '.join(text.split()[:loc]))
        # +1 assumes the tokenizer prepends exactly one special token
        # ([CLS] for BERT) when add_special_tokens=True.
        adjusted_loc_start = len(pre_tokens) + 1
        adjusted_loc_end = adjusted_loc_start + len(tokenizer.tokenize(abbreviation))

        # Compare against the `max_len` argument (not the module-level
        # default) and skip before paying for an encoding that would be dropped.
        if adjusted_loc_end >= max_len:
            continue

        encoded_input = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='tf'
        )

        input_ids.append(encoded_input['input_ids'])
        token_type_ids.append(encoded_input['token_type_ids'])
        start_positions.append(adjusted_loc_start)
        end_positions.append(adjusted_loc_end)
        attention_masks.append(encoded_input['attention_mask'])
        label_ids.append(label_dict[label])
        if first_kept is None:
            first_kept = (text, loc, label)

    # Each encoding carries a leading batch axis of size 1. Reshape explicitly
    # rather than calling a bare squeeze(), which would also drop the example
    # axis when exactly one row is kept.
    input_ids = np.array(input_ids, dtype=np.int32).reshape(-1, max_len)
    token_type_ids = np.array(token_type_ids, dtype=np.int32).reshape(-1, max_len)
    attention_masks = np.array(attention_masks, dtype=np.int32).reshape(-1, max_len)
    start_positions = np.array(start_positions, dtype=np.int32)
    end_positions = np.array(end_positions, dtype=np.int32)
    label_ids = np.array(label_ids, dtype=np.int32)

    if first_kept is None:
        print("No examples were kept; skipping the sanity-check printout.")
    else:
        first_text, first_loc, first_label = first_kept
        # Derive the id -> label map from the `label_dict` argument so the
        # check stays valid when a non-default mapping is passed in.
        id_to_label = {label_id: name for name, label_id in label_dict.items()}
        print("First text:\n", first_text)
        print("First location:", first_loc)
        print("First acronym:", first_text.split()[first_loc])
        print("First expansion:", first_label)
        print("First text decoded:\n", tokenizer.decode(input_ids[0]))
        print("Confirm adjusted location accuracy: \n",
              tokenizer.decode(input_ids[0][start_positions[0]:end_positions[0]]))
        print("Confirm label:", id_to_label[int(label_ids[0])])

    return input_ids, token_type_ids, attention_masks, start_positions, end_positions, label_ids


In [None]:
# Tokenize each split; `tokenize` prints a sanity check for each one.
print("Train-------------------------------------------------------------")
(train_input_ids, train_token_type_ids, train_attention_masks,
 train_start_positions, train_end_positions, train_labels) = tokenize(train_df)
print("Val---------------------------------------------------------------")
(val_input_ids, val_token_type_ids, val_attention_masks,
 val_start_positions, val_end_positions, val_labels) = tokenize(val_df)
print("Test---------------------------------------------------------------")
(test_input_ids, test_token_type_ids, test_attention_masks,
 test_start_positions, test_end_positions, test_labels) = tokenize(test_df)

# Destination file -> array, written in insertion order.
_arrays_by_path = {
    '../tokenized_medal_inputs//train_input_ids.npy': train_input_ids,
    '../tokenized_medal_inputs//train_token_type_ids.npy': train_token_type_ids,
    '../tokenized_medal_inputs//train_attention_masks.npy': train_attention_masks,
    '../tokenized_medal_inputs//train_start_positions.npy': train_start_positions,
    '../tokenized_medal_inputs//train_end_positions.npy': train_end_positions,
    '../tokenized_medal_inputs//train_labels.npy': train_labels,
    '../tokenized_medal_inputs//val_input_ids.npy': val_input_ids,
    '../tokenized_medal_inputs//val_token_type_ids.npy': val_token_type_ids,
    '../tokenized_medal_inputs//val_attention_masks.npy': val_attention_masks,
    '../tokenized_medal_inputs//val_start_positions.npy': val_start_positions,
    '../tokenized_medal_inputs//val_end_positions.npy': val_end_positions,
    '../tokenized_medal_inputs//val_labels.npy': val_labels,
    '../tokenized_medal_inputs//test_input_ids.npy': test_input_ids,
    '../tokenized_medal_inputs//test_token_type_ids.npy': test_token_type_ids,
    '../tokenized_medal_inputs//test_attention_masks.npy': test_attention_masks,
    '../tokenized_medal_inputs//test_start_positions.npy': test_start_positions,
    '../tokenized_medal_inputs//test_end_positions.npy': test_end_positions,
    '../tokenized_medal_inputs//test_labels.npy': test_labels,
}
for _npy_path, _array in _arrays_by_path.items():
    np.save(_npy_path, _array)
