In [23]:
import numpy as np
import transformers as tfs

In [6]:
tokenizer = tfs.AutoTokenizer.from_pretrained("allenai/scibert_scivocab_cased")
tokenizer

PreTrainedTokenizerFast(name_or_path='allenai/scibert_scivocab_cased', vocab_size=31116, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [34]:
outs = tokenizer(
    [
        "Alabaster is a word with multiple syllables, multi-word dataset".split(), 
        "America is a country with multiple syllables.".split(),
    ], 
    return_tensors="pt", 
    padding=True, 
    truncation=True, 
    max_length=512,
    is_split_into_words=True
)
outs

{'input_ids': tensor([[  101,   197, 18814, 17597,   163,   105,  4257,   188,  1745, 27604,
         30111,   430,   955,   578,  4257,  5240,   102],
        [  101,   568,  1915, 30108,   163,   105,  5649,   188,  1745, 27604,
         30111,   211,   102,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]])}

In [69]:
list(map(
    tokenizer.convert_ids_to_tokens,
    outs.input_ids
))

[['[CLS]',
  'al',
  '##aba',
  '##ster',
  'is',
  'a',
  'word',
  'with',
  'multiple',
  'syllable',
  '##s',
  ',',
  'multi',
  '-',
  'word',
  'dataset',
  '[SEP]'],
 ['[CLS]',
  'am',
  '##eric',
  '##a',
  'is',
  'a',
  'country',
  'with',
  'multiple',
  'syllable',
  '##s',
  '.',
  '[SEP]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]']]

In [35]:
from functools import reduce
from typing import List

def replace_chain(text:str, tokens_to_drop:List[str]) -> str:
    return reduce(
        lambda t, token: t.replace(token, ""), 
        tokens_to_drop, 
        text
    )

def detokenize(tokenizer, input_ids):
    text = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(input_ids)
    )

    return replace_chain(text, ["[CLS]", "[SEP]", "[PAD]"])


detokenize(tokenizer, outs["input_ids"][0])

' alabaster is a word with multiple syllables, multi - word dataset '

In [18]:
tokenizer.decode(outs["input_ids"][0], skip_special_tokens=True)

'alabaster is a word with multiple syllables.'

In [36]:
example = tokenizer.convert_ids_to_tokens(outs["input_ids"][0])
example

['[CLS]',
 'al',
 '##aba',
 '##ster',
 'is',
 'a',
 'word',
 'with',
 'multiple',
 'syllable',
 '##s',
 ',',
 'multi',
 '-',
 'word',
 'dataset',
 '[SEP]']

In [43]:
example_classification = np.array(
    [0.8, 0.9, 0.1, 0.5, 0.2, 0.3, 0.4, 0.5, 0.7, 0.1, 0.1, 0.1, 0.95, 0.93, 0.9, 0.9, 0.95],
)

In [44]:
for c, t in zip(example_classification, example):
    print(f"{t}: {c}")

[CLS]: 0.8
al: 0.9
##aba: 0.1
##ster: 0.5
is: 0.2
a: 0.3
word: 0.4
with: 0.5
multiple: 0.7
syllable: 0.1
##s: 0.1
,: 0.1
multi: 0.95
-: 0.93
word: 0.9
dataset: 0.9
[SEP]: 0.95


In [63]:
from itertools import filterfalse
from typing import Tuple
from typing import Tuple
from itertools import takewhile


def merge_tokens_w_classifications(
    tokens:List[str], 
    classifications:List[float]
) -> List[Tuple[str, float]]:
    merged = []
    for token, classification in zip(tokens, classifications):
        if token.startswith("##"):
            merged[-1] = (merged[-1][0] + token[2:], merged[-1][1])
        else:
            merged.append((token, classification))
    return merged


def is_special_token(token):
    return token.startswith("[") and token.endswith("]")


def high_probablity_token_groups(
    tokens_classifications: List[Tuple[str, float]], 
    threshold=0.5,
) -> List[List[Tuple[str, float]]]:

    datasets = []
    dataset = []
    for token, score in tokens_classifications:
        if score >= threshold:
            dataset.append((token, score))
        else:
            if len(dataset) > 0:
                datasets.append(dataset)
                dataset = []
    if len(dataset) > 0:
        datasets.append(dataset)

    return datasets


tokens_classifications = list(filterfalse(
    lambda x: is_special_token(x[0]),
    merge_tokens_w_classifications(example, example_classification)
))
high_probablity_token_groups(tokens_classifications, threshold=0.9)

[[('alabaster', 0.9)],
 [('multi', 0.95), ('-', 0.93), ('word', 0.9), ('dataset', 0.9)]]

In [58]:
tokens_classifications

[('alabaster', 0.9),
 ('is', 0.2),
 ('a', 0.3),
 ('word', 0.4),
 ('with', 0.5),
 ('multiple', 0.7),
 ('syllables', 0.1),
 (',', 0.1),
 ('multi', 0.95),
 ('-', 0.93),
 ('word', 0.9),
 ('dataset', 0.9)]

In [52]:
from itertools import groupby


for k, g in  list(groupby(tokens_classifications, lambda x: int(x[1]>0.9))):
    print(k, list(g))


0 []
1 []
0 []


In [62]:


high_probablity_token_groups(tokens_classifications, threshold=0.9)

[[('alabaster', 0.9)],
 [('multi', 0.95), ('-', 0.93), ('word', 0.9), ('dataset', 0.9)]]

In [72]:
np.array([[1, 2, 3, 4]])[np.newaxis, ...].shape

(1, 1, 4)