# Token- & Sequence-Level BERT Embeddings

Partially adapted from https://github.com/huggingface/transformers/blob/3763f8944dc3fef8afb0c525a2ced8a04889c14f/examples/extract_features.py
(Apache License 2.0)

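The output is a TSV file without a header row. In token-level mode (`TOKEN_LVL=True`), each row holds the embedding of one token from one layer: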
```
sentence_id    layer_nr    token    embedding
```
Example (with `LAYERS = [-1, -2]`, `TOKEN_LVL = True`, and the input sentence "hello world embedding"; note that "embedding" is split into word pieces):
```
1    -1    hello    [0.123456789, 0.987654321, ...]
1    -2    hello    [0.111111111, 0.222222222, ...]
1    -1    world    ...
1    -2    world    ...
1    -1    em       ...
1    -2    em       ...
1    -1    ##bed    ...
1    -2    ##bed    ...
1    -1    ##ding    ...
1    -2    ##ding    ...
```

In [1]:
# Mount Google Drive at /content/gdrive; the output files of this notebook are
# written below that directory (see OUT_PREFIX in the configuration cell).
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [2]:
!pip install pytorch_pretrained_bert 

Collecting pytorch_pretrained_bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |████████████████████████████████| 133kB 4.4MB/s 
Installing collected packages: pytorch-pretrained-bert
Successfully installed pytorch-pretrained-bert-0.6.2


In [0]:
import torch
import urllib
import pandas as pd
from pytorch_pretrained_bert import BertTokenizer, BertModel
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
import numpy as np

In [4]:
BATCH_SIZE = 32
ROUNDING_ACC = 9  # number of decimal places kept for each embedding value
LAYERS = [-1]  # [-1, -2]; indices into the list of encoder layer outputs
TOKEN_LVL = True  # token-based or sequence-based

# NOTE(review): the '?token=...' query parameters in the URLs below look like
# access tokens committed in plain text -- TODO confirm whether they are still
# needed and, if so, load them from a secret instead of hard-coding them.
if TOKEN_LVL:
    # Task 1: Span identification
    MAX_LEN = 95
    TRAIN_URL = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/si-train.tsv?token=AD7GEDMEHQSUS34AOSIHGF26Q4WYK'
    DEV_URL = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/si-dev.tsv?token=AD7GEDI3J6KMIKA6XXTKT6S6Q4WYI'
    TEST_URL = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/si-test.tsv?token=AD7GEDM7A3GFIAEZHHESFO26LP4BQ'
    # Make sure you have enough free space in your Drive
    # train_bert.tsv is 4.3 GB (dev_bert.tsv is only 730 MB)
else:
    # Task 2: Technique classification
    MAX_LEN = 128
    TRAIN_URL = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/tc-train.tsv?token=AD7GEDOCX5E6S5RBB5T5YPS6NJMGI'
    DEV_URL = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/tc-dev.tsv?token=AD7GEDIJLXNFOELYLNANKFK6NJMGK'
    TEST_URL = ''
    TOKEN_INDICES = list(range(1, 11))  # [] if you only want the [CLS] token

OUT_PREFIX = '/content/gdrive/My Drive/colab_projects/data/'
# MODEL = 'bert-base-uncased'
MODEL = 'bert-base-cased'
# MODEL = 'bert-large-uncased'

# Toggle this!
# MODE = 'train'
MODE = 'dev'
# MODE = 'test'

# Resolve the input URL and fail right here (instead of with a NameError or an
# obscure URL error in a later cell) if MODE is misspelled or has no data.
_MODE_TO_URL = {'train': TRAIN_URL, 'dev': DEV_URL, 'test': TEST_URL}
if MODE not in _MODE_TO_URL:
    raise ValueError("MODE must be 'train', 'dev' or 'test', got " + repr(MODE))
IN_URL = _MODE_TO_URL[MODE]
if not IN_URL:
    raise ValueError("No input URL configured for MODE=" + repr(MODE)
                     + " with TOKEN_LVL=" + repr(TOKEN_LVL))

# Lower-casing is only applied for the '...-uncased' models.
UNCASED = 'uncased' in MODEL

EXTRA_FILE_INFO = ''

# Sequence-level (task 2) output files carry an additional 'tc_' prefix.
OUT_FILE = (OUT_PREFIX + ('' if TOKEN_LVL else 'tc_') + MODE + '_' + MODEL
            + EXTRA_FILE_INFO + '.tsv')
print(OUT_FILE)

/content/gdrive/My Drive/colab_projects/data/dev_bert-base-cased.tsv


In [5]:
# Use the GPU when one is available; otherwise fall back to the CPU instead of
# crashing right after printing the warning.
use_cuda = torch.cuda.is_available()
if not use_cuda:
    print("WARNING: GPU not available!!")
device = torch.device("cuda" if use_cuda else "cpu")
n_gpu = torch.cuda.device_count()
if use_cuda:
    # Only the last expression of a cell is displayed, so print explicitly.
    print(torch.cuda.get_device_name(0))

tokenizer = BertTokenizer.from_pretrained(MODEL, do_lower_case=UNCASED)
model = BertModel.from_pretrained(MODEL)
# Same device as the one the input tensors are moved to in the extraction loop.
model.to(device)

100%|██████████| 213450/213450 [00:00<00:00, 1089760.47B/s]
100%|██████████| 404400730/404400730 [00:11<00:00, 33907561.17B/s]


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Lin

In [0]:
def get_comments(filename, url=True):
    """Return the leading comment lines (starting with '#') of a file.

    Reading stops at the first line that does not start with '#'; the
    returned lines keep their trailing newline characters.

    Args:
        filename: URL of the file if `url` is true, else a local file path.
        url: Whether `filename` is a URL (opened with urllib) or a local
            path (opened as a UTF-8 text file).

    Returns:
        A list of strings, one per leading comment line (possibly empty).
    """
    # Imported locally: `takewhile` was never imported by the notebook, and a
    # bare `import urllib` does not guarantee that `urllib.request` is loaded.
    import urllib.request
    from itertools import takewhile

    if url:
        with urllib.request.urlopen(filename) as f:
            # The response yields bytes, hence the bytes prefix and decoding.
            leading = takewhile(lambda line: line.startswith(b'#'), f)
            return [line.decode("utf-8") for line in leading]
    with open(filename, 'r', encoding='utf8') as f:
        return list(takewhile(lambda line: line.startswith('#'), f))

# The leading '#' comment lines are counted so that pandas can skip them.
# quoting=3 is csv.QUOTE_NONE: quote characters are read as ordinary text.
comments = get_comments(IN_URL)
full_df = pd.read_csv(IN_URL, sep='\t', quoting=3, skiprows=len(comments))
if not TOKEN_LVL:
    # Sequence level: one raw text string per row.
    sentences = full_df['text'].tolist()
else:
    # Token level: collect the tokens of each sentence into one list.
    sent_df = full_df.groupby('sent_id')['token'].apply(list).to_frame()
    sentences = sent_df['token'].tolist()

In [0]:
class InputFeatures(object):
    """Container for one tokenized and padded input sequence."""

    def __init__(self, tokens, sent_idx, input_ids, input_mask, sentence):
        # Word-piece tokens including '[CLS]' and '[SEP]' (without padding).
        self.tokens = tokens
        # 1-based position of the sentence in the input list.
        self.sent_idx = sent_idx
        # Vocabulary IDs of `tokens`, padded with 0 up to the sequence length.
        self.input_ids = input_ids
        # 1 for real tokens, 0 for padding positions.
        self.input_mask = input_mask
        # The sentence string that was passed to the tokenizer.
        self.sentence = sentence


def convert_sentences_to_features(sentences, seq_length, tokenizer,
                                  token_lvl=None, uncased=None):
    """Tokenize sentences and turn them into padded model inputs.

    Args:
        sentences: In token-level mode a list of token lists (tokens are
            converted with str() and joined by single spaces); otherwise a
            list of sentence strings.
        seq_length: Length of every returned ID/mask list, including the
            '[CLS]' and '[SEP]' tokens. Longer sentences are truncated (and
            reported via print), shorter ones are padded with 0.
        tokenizer: Object providing `tokenize(str)` and
            `convert_tokens_to_ids(list)`.
        token_lvl: Whether `sentences` contains token lists. Defaults to the
            notebook-level TOKEN_LVL setting.
        uncased: Whether to lower-case the text before tokenizing. Defaults
            to the notebook-level UNCASED setting.

    Returns:
        A tuple (number of sentences, list of InputFeatures).
    """
    if token_lvl is None:
        token_lvl = TOKEN_LVL
    if uncased is None:
        uncased = UNCASED

    features = []
    for idx, entry in enumerate(sentences, start=1):
        if token_lvl:
            words = [str(tok) for tok in entry]
            if uncased:
                words = [word.lower() for word in words]
            sentence = ' '.join(words)
        else:
            # Previously `sentence` was only assigned for uncased models, so
            # sequence-level extraction with a cased model failed.
            sentence = entry.lower() if uncased else entry
        tokens = tokenizer.tokenize(sentence)

        # +2 = [CLS] and [SEP]
        if len(tokens) + 2 > seq_length:
            print('Sentence ' + str(idx) + ' will be truncated; original length:', len(tokens))
            print(sentence)
            print(tokens)
            tokens = tokens[0:(seq_length - 2)]

        tokens = ['[CLS]'] + list(tokens) + ['[SEP]']
        input_ids = list(tokenizer.convert_tokens_to_ids(tokens))

        # Pad IDs and mask with zeros up to seq_length (no-op if already full).
        n_pad = seq_length - len(input_ids)
        input_mask = [1] * len(input_ids) + [0] * n_pad
        input_ids = input_ids + [0] * n_pad

        features.append(
            InputFeatures(
                tokens=tokens,
                sent_idx=idx,
                input_ids=input_ids,
                input_mask=input_mask,
                sentence=sentence))
    # len(features) instead of the loop variable, which is unbound for empty
    # input.
    return len(features), features

# MAX_LEN counts word pieces only; the extra 2 positions hold [CLS] and [SEP].
n_sents, features = convert_sentences_to_features(
    sentences, MAX_LEN + 2, tokenizer)

all_input_ids = torch.tensor([feat.input_ids for feat in features],
                             dtype=torch.long)
all_input_mask = torch.tensor([feat.input_mask for feat in features],
                              dtype=torch.long)
# Position of each sentence in `features`, so that every batch row can be
# mapped back to its tokens.
all_indices = torch.arange(n_sents, dtype=torch.long)

data = TensorDataset(all_input_ids, all_input_mask, all_indices)
sampler = SequentialSampler(data)
dataloader = DataLoader(data, batch_size=BATCH_SIZE, sampler=sampler)

In [8]:
# The loops are nested batch > sentence > token > layer so that the output is
# sorted by token first (instead of by layer): all requested layers of one
# token end up on consecutive lines.

model.eval()
# torch.no_grad(): this is inference only, so no autograd graph is needed.
with open(OUT_FILE, 'w', encoding='utf-8') as f, torch.no_grad():
    for input_ids, input_mask, index_batch in dataloader:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)

        all_encoder_layers, _ = model(input_ids, token_type_ids=None,
                                      attention_mask=input_mask)

        # Copy every requested layer to host memory once per batch instead of
        # once per token.
        layer_outputs = {
            layer: all_encoder_layers[int(layer)].detach().cpu().numpy()
            for layer in LAYERS}

        for b, idx in enumerate(index_batch):
            feature = features[idx.item()]
            sent_idx = feature.sent_idx
            if TOKEN_LVL:
                out = None  # last line written for this sentence, if any
                for (tok_idx, token) in enumerate(feature.tokens):
                    if token in ['[CLS]', '[SEP]']:
                        continue
                    for layer in LAYERS:
                        token_vector = layer_outputs[layer][b][tok_idx]
                        values = [round(x.item(), ROUNDING_ACC) for x in token_vector]
                        out = str(sent_idx) + '\t' + str(layer) + '\t' + token + '\t' + str(values)
                        f.write(out + '\n')
                # Progress report; skipped when nothing was written for this
                # sentence (e.g. it produced no word pieces).
                if sent_idx % 1000 == 0 and out is not None:
                    print(out)
            else:
                CLS_IDX = 0  # [CLS] as representation of the entire sequence
                for layer in LAYERS:
                    f.write(str(sent_idx) + '\t' + str(layer) + '\t' + feature.sentence + '\t')
                    layer_output = layer_outputs[layer][b]
                    # Concatenate the vectors of [CLS] and the selected token
                    # positions into one flat, bracketed list.
                    parts = []
                    for tok_idx in [CLS_IDX] + TOKEN_INDICES:
                        values = [round(x.item(), ROUNDING_ACC) for x in layer_output[tok_idx]]
                        parts.append(str(values)[1:-1])
                    f.write('[' + ', '.join(parts) + ']\n')
                if (sent_idx % 100 == 0):
                    print(sent_idx)
                

1000	-1	,	[0.207205608, 0.029325726, 0.016545981, 0.10657455, -0.18901737, -0.226870328, 0.012841736, 0.283367097, 0.1029239, -0.450918615, 0.035501119, -0.101360753, 0.399461418, 0.072172798, -0.312558293, -0.075272128, 0.106708281, 0.457835495, 0.557505071, -0.007306874, -0.238188878, 0.164919376, 0.275595397, 0.175105855, 0.344778776, 0.102511525, -0.55417031, 0.274397671, -0.18575637, 0.240854248, -0.052412242, 0.20503594, -0.235072896, -0.212708801, -0.096643202, -0.526881933, 0.215610102, 0.074587375, -0.421681345, -0.111438155, -0.006917406, -0.24822025, 0.109819621, 0.11593879, 0.025734231, 0.068005942, -0.40861553, 0.384155929, -0.011950441, -0.030471647, -0.427182525, -0.566701591, -0.45510754, -0.009329192, -0.198395967, -0.02807932, -0.035992071, 0.00301259, 0.120096117, -0.413646013, 0.156792104, -0.161967203, 0.418425381, 0.194126636, -0.279747725, 0.087508023, 0.255667448, 0.596190214, 0.022677585, 0.087372832, 0.126892865, -0.057125125, 0.085050561, -0.193847582, 0.4104

In [9]:
# Show the path plus the first and last lines of the output file.
# The path contains a space ('My Drive'), so it is wrapped in single quotes
# before being substituted into the shell commands below.
OUT_FILE_CMD = "'" + OUT_FILE + "'"
!echo $OUT_FILE_CMD
!head $OUT_FILE_CMD
!tail $OUT_FILE_CMD

/content/gdrive/My Drive/colab_projects/data/dev_bert-base-cased.tsv
1	-1	Police	[0.458445191, -0.899525166, 0.053096779, 0.097589806, -0.421768248, -0.315354496, -0.017581193, -0.212217957, 0.183210865, -0.022954516, 0.434597075, 0.800242543, 0.476972282, -0.463722199, -0.645279646, 0.048812132, 0.192717791, 0.449927151, 0.464656651, -0.276453078, 0.342477798, -0.804492533, -0.242095649, 0.085211344, -0.238981396, -0.067337714, -0.24630715, 0.30714643, 0.294420928, 0.655085742, 0.230818927, 0.296558738, 0.183663562, -0.130662888, 0.188334048, 0.018924966, 0.180001602, 0.747597516, 0.284552515, 0.351651073, -0.078291491, 0.046472702, -0.137535661, 0.106007151, -0.043341417, 0.159632772, 0.631503999, 1.065459728, 0.209501624, -0.587553918, 0.248702526, 0.38276875, -0.225292832, 0.317952305, 0.478018373, -0.468436003, -0.136917815, -0.595710158, -0.066029213, -0.063653171, 0.412006795, 0.479769111, 0.167953983, -0.252480626, 0.406169266, -0.142041728, 0.07373748, -0.130843118, -0.3274572

In [10]:
# Deliberate ZeroDivisionError: the exception stops 'Run all' here, so the
# optional post-processing sections below only run when started by hand.
1/0  # To avoid running the following sections if selecting 'Run all'

ZeroDivisionError: ignored

# Appending the extracted embeddings to pre-softmax predictions


In [0]:
def combine(predictions, tokens, combined):
    """Merge two line-aligned TSV files into one.

    For every line of `predictions`, the next line of `tokens` is read. The
    output line consists of the first three columns of the prediction line
    followed by one list column: the fourth column of the `tokens` line
    without its final character, then ', ', then the fourth column of the
    prediction line without its first character.

    Args:
        predictions: Path of the TSV file with the predictions.
        tokens: Path of the TSV file with the embeddings.
        combined: Path of the TSV file to write (overwritten if it exists).
    """
    with open(predictions, encoding='utf8') as pred_file, \
            open(tokens, encoding='utf8') as emb_file, \
            open(combined, 'w', encoding='utf8') as out_file:
        for pred_line in pred_file:
            fields = pred_line.split('\t')
            out_file.write('\t'.join(fields[:3]) + '\t')
            # The prediction line is not stripped, so its line break ends the
            # output line.
            prediction = fields[3]
            embedding = next(emb_file).strip().split('\t')[3]
            # Drop the closing bracket of the embedding list and the opening
            # bracket of the prediction list to fuse them into one list.
            out_file.write(embedding[:-1] + ', ' + prediction[1:])

# The three splits follow the same file naming scheme, so loop over them.
# All paths are absolute: the input paths used to be relative ('gdrive/...')
# and therefore only worked when the working directory was /content.
TC_DATA_DIR = '/content/gdrive/My Drive/colab_projects/data/'
for split in ('train', 'dev', 'test'):
    combine(TC_DATA_DIR + 'tc_' + split + '_20200308-221011_2.tsv',
            TC_DATA_DIR + 'tc_' + split + '_bert-base-uncased_10.tsv',
            TC_DATA_DIR + 'full_bert_' + split + '.tsv')

# Appending dev_bert-base-uncased to train_bert-base-uncased

(for task 1)

Only run if necessary!! This is slow and the resulting file is very large.

In [0]:
# It's a lot faster to use the Google Drive GUI to copy & rename the file instead of doing this:
# !cp '/content/gdrive/My Drive/colab_projects/data/train_bert-base-uncased.tsv' '/content/gdrive/My Drive/colab_projects/data/train+dev_bert-base-uncased.tsv'

In [0]:
LAST_TRAIN_SENT_ID = 21501
START_SENT_ID = 22

# Append the dev rows to the copy of the training file. Dev sentences with an
# ID below START_SENT_ID are left out; the remaining ones are renumbered so
# that START_SENT_ID becomes LAST_TRAIN_SENT_ID + 1.
with open('/content/gdrive/My Drive/colab_projects/data/train+dev_bert-base-uncased.tsv', 'a', encoding='utf8') as f_out, \
        open('/content/gdrive/My Drive/colab_projects/data/dev_bert-base-uncased.tsv', encoding='utf8') as f_in:
    for line in f_in:
        # Split off the sentence ID only; the rest of the row is copied as is.
        cells = line.split('\t', 1)
        sent_id = int(cells[0])
        if sent_id >= START_SENT_ID:
            shifted_id = sent_id - (START_SENT_ID - 1) + LAST_TRAIN_SENT_ID
            f_out.write('\t'.join((str(shifted_id), cells[1])))

In [0]:
# Sanity check: show the beginning and the end of the merged train+dev file.
!head '/content/gdrive/My Drive/colab_projects/data/train+dev_bert-base-uncased.tsv'
!tail '/content/gdrive/My Drive/colab_projects/data/train+dev_bert-base-uncased.tsv'

In [0]:
# Run this in case Google Drive doesn't appear to update the files.
# (Takes very long...)
# `drive` is the google.colab module imported in the first code cell.
drive.flush_and_unmount()