# Token-level BERT embeddings

Partially adapted from https://github.com/huggingface/transformers/blob/3763f8944dc3fef8afb0c525a2ced8a04889c14f/examples/extract_features.py
(Apache License 2.0)

In [0]:
# Mount Google Drive at /content/gdrive (Colab-only module).
# force_remount=True is passed so that the mount is redone on every run
# of this cell.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

In [0]:
# Provides BertTokenizer and BertModel, which are imported below.
!pip install pytorch_pretrained_bert
# NOTE(review): seqeval is not referenced in the cells visible here --
# confirm that it is still needed.
!pip install seqeval    

In [0]:
import torch
import urllib
import pandas as pd
from pytorch_pretrained_bert import BertTokenizer, BertModel
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
import numpy as np

In [0]:
# Maximum number of word pieces kept per sentence; the two slots for
# [CLS] and [SEP] are added on top of this when the features are built.
MAX_LEN = 85
# Number of sentences passed through the model at once.
BATCH_SIZE = 32
# Indices into the list of encoder layers whose outputs are written out
# (-1 is the last layer).
LAYERS = [-1]
# NOTE(review): these URLs embed access tokens. Such tokens usually expire
# and should not be committed -- consider loading the files from a local
# path or passing the token in at runtime.
TRAIN_URL = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/train-improved-sentiwordnet-arguingfullindiv-pos.tsv?token=AD7GEDPOUJFOQS3HTDRWMOS6KZP62'
DEV_URL = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/dev-improved-sentiwordnet-arguingfullindiv-pos.tsv?token=AD7GEDNCXBQCYYC5ZKLNIWC6KZP6Y'
TEST_URL = ''
# OUT_PREFIX = '/content/gdrive/My Drive/colab_projects/data/'
OUT_PREFIX = ''

# Toggle this!
MODE = 'train'
# MODE = 'dev'
# MODE = 'test'

# Resolve the input URL for the selected mode. Failing here gives a clear
# message instead of a NameError (unknown mode) or an obscure download
# error (empty URL) further down.
_URLS_BY_MODE = {'train': TRAIN_URL, 'dev': DEV_URL, 'test': TEST_URL}
if MODE not in _URLS_BY_MODE:
    raise ValueError('Unknown MODE %r, expected one of %s'
                     % (MODE, sorted(_URLS_BY_MODE)))
IN_URL = _URLS_BY_MODE[MODE]
if not IN_URL:
    raise ValueError('No input URL configured for MODE %r' % MODE)

OUT_FILE = OUT_PREFIX + MODE + '_bert.tsv'

In [0]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Only query the device name when CUDA is usable; the call raises on a
# CPU-only runtime.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
model = BertModel.from_pretrained('bert-base-uncased')
# Move the model to the same device the input batches are sent to, rather
# than unconditionally to CUDA.
model.to(device)

In [0]:
def get_comments(filename, url=True):
    """Return the leading comment lines of a file.

    Reading stops at the first line that does not start with '#'; later
    comment lines are not collected.

    Args:
        filename: URL (if ``url`` is true) or local path of the file.
        url: Whether ``filename`` should be opened via ``urllib`` instead
            of as a local UTF-8 text file.

    Returns:
        List of the leading comment lines as strings, line endings
        included.
    """
    if url:
        # A bare `import urllib` does not guarantee that the `request`
        # submodule is loaded, so import it explicitly.
        import urllib.request

        comments = []
        with urllib.request.urlopen(filename) as f:
            # The response yields bytes; decode only the lines we keep.
            for line in f:
                if not line.startswith(b'#'):
                    break
                comments.append(line.decode("utf-8"))
        return comments

    from itertools import takewhile

    with open(filename, 'r', encoding='utf8') as f:
        return list(takewhile(lambda s: s.startswith('#'), f))

# Count the leading '#' comment lines so that pandas can skip them.
comments = get_comments(IN_URL)
# quoting=3 is csv.QUOTE_NONE: quote characters in the data are treated as
# ordinary characters.
full_df = pd.read_csv(IN_URL, sep='\t', skiprows=len(comments), quoting=3)
# Collect the tokens of every sentence into one list per 'sent_id' group.
sent_df = full_df.groupby('sent_id')['token'].apply(list).to_frame()
# List of token lists, one entry per sentence.
sentences = sent_df['token'].tolist()

In [0]:
class InputFeatures:
    """Plain record holding the model inputs built for one sentence.

    Attributes are stored exactly as passed in:
        tokens: the sentence's token strings.
        sent_idx: index of the sentence.
        input_ids: vocabulary IDs for the tokens.
        input_mask: mask values aligned with ``input_ids``.
    """

    def __init__(self, tokens, sent_idx, input_ids, input_mask):
        self.tokens, self.sent_idx = tokens, sent_idx
        self.input_ids, self.input_mask = input_ids, input_mask


def convert_sentences_to_features(sentences, seq_length, tokenizer):
    """Turn tokenized sentences into fixed-length model inputs.

    Each sentence is lower-cased, joined with spaces and re-tokenized with
    ``tokenizer``, so the resulting tokens need not line up one-to-one
    with the input tokens. Sequences that do not fit are truncated (and
    reported on stdout); shorter ones are padded with zeros.

    Args:
        sentences: Iterable of sentences, each an iterable of tokens
            (converted with ``str``).
        seq_length: Length of every produced sequence, including the
            [CLS] and [SEP] markers.
        tokenizer: Object providing ``tokenize(str)`` and
            ``convert_tokens_to_ids(list)``.

    Returns:
        Tuple ``(n_sentences, features)`` where ``features`` is a list of
        ``InputFeatures`` whose ``sent_idx`` starts at 1. For empty input
        this is ``(0, [])``.
    """
    features = []
    for (idx, tok_list) in enumerate(sentences, start=1):
        sentence = ' '.join(str(tok).lower() for tok in tok_list)
        tokens = tokenizer.tokenize(sentence)

        # +2 = [CLS] and [SEP]
        if len(tokens) + 2 > seq_length:
            print('Sentence will be truncated', len(tokens), idx)
            print(sentence)
            print(tokens)
            tokens = tokens[0:(seq_length - 2)]

        tokens = ['[CLS]'] + tokens + ['[SEP]']
        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # Real tokens get mask value 1, zero-padding gets mask value 0.
        padding = [0] * (seq_length - len(input_ids))
        input_mask = [1] * len(input_ids) + padding
        input_ids = input_ids + padding

        assert len(input_ids) == seq_length
        assert len(input_mask) == seq_length

        features.append(
            InputFeatures(
                tokens=tokens,
                sent_idx=idx,
                input_ids=input_ids,
                input_mask=input_mask))
    # Count the features instead of returning the loop variable, which is
    # unbound when `sentences` is empty.
    return len(features), features

# Sequences get MAX_LEN + 2 slots: MAX_LEN word pieces plus [CLS] and [SEP].
n_sents, features = convert_sentences_to_features(sentences, MAX_LEN + 2, tokenizer)

all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
# 0-based positions into `features`, carried through the batches so that
# each model output can be matched to its sentence again.
all_indices = torch.arange(n_sents, dtype=torch.long)

data = TensorDataset(all_input_ids, all_input_mask, all_indices)
# Sequential sampling keeps the sentences in their original order.
sampler = SequentialSampler(data)
dataloader = DataLoader(data, sampler=sampler, batch_size=BATCH_SIZE)

In [0]:
# Run the model batch by batch and write one line per word piece and layer:
#   sent_idx <TAB> layer <TAB> word piece <TAB> list of embedding values
# The nesting (batch -> sentence -> word piece -> layer) follows from the
# batching.

model.eval()
# no_grad: this is pure inference, so skip building autograd graphs.
with open(OUT_FILE, 'w', encoding='utf-8') as f, torch.no_grad():
    for input_ids, input_mask, index_batch in dataloader:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)

        all_encoder_layers, _ = model(input_ids, token_type_ids=None,
                                      attention_mask=input_mask)

        # Copy each requested layer to host memory once per batch instead
        # of once per token.
        layer_outputs = [
            (layer, all_encoder_layers[int(layer)].detach().cpu().numpy())
            for layer in LAYERS
        ]

        for b, idx in enumerate(index_batch):
            feature = features[idx.item()]
            sent_idx = feature.sent_idx
            for (tok_idx, token) in enumerate(feature.tokens):
                if token in ('[CLS]', '[SEP]'):
                    continue
                for layer, layer_output in layer_outputs:
                    values = [round(x, 9)
                              for x in layer_output[b][tok_idx].tolist()]
                    out = '\t'.join(
                        (str(sent_idx), str(layer), token, str(values)))
                    f.write(out + '\n')
                    # Progress sample: first word piece of every 1000th
                    # sentence.
                    if sent_idx % 1000 == 0 and tok_idx == 1:
                        print(out)

In [0]:
# !head '/content/gdrive/My Drive/colab_projects/data/train_bert.tsv'
!mv -v train_bert.tsv '/content/gdrive/My Drive/colab_projects/data/