In [5]:
import collections

import torch
from transformers import AutoModel, AutoTokenizer

In [6]:
# Load the MacBERTh checkpoint as a bare `AutoModel`. The warning printed
# below lists the `cls.*` checkpoint weights that are not used by this model.
m = AutoModel.from_pretrained('emanjavacas/MacBERTh')

Some weights of the model checkpoint at emanjavacas/MacBERTh were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Tokenizer loaded from the same checkpoint name as the model `m`.
tok = AutoTokenizer.from_pretrained('emanjavacas/MacBERTh')

In [8]:
import pandas as pd

In [7]:
# Tab-separated subset of OED quotations; other cells read its `quote` and
# `keyword` columns. The path is relative to the notebook's working directory.
data = pd.read_csv('./Leiden/Datasets/OED/data/oed-quotes-subset.tsv', sep='\t')

In [13]:
# Keep only the first two quotes: a small batch for inspecting the
# tokenizer and model outputs.
sents = data['quote'].values[:2]

In [27]:
# Encode both sentences as one batch of PyTorch tensors. `padding=True` pads
# the shorter sentence (trailing 0 ids in the second row of the output below).
ids = tok(list(sents), return_tensors='pt', padding=True)

In [28]:
# Inspect the encoded batch: input_ids, token_type_ids and attention_mask.
ids

{'input_ids': tensor([[    2,   960,  1148,   859,  9690,  1529,   873, 11036,   842,    16,
           878,   905,  3333,   549,   839,   828, 13716,   833, 17292,  2870,
           549,   839, 14307,    30,   911,   924,   905,   914,  1466,   844,
          1049, 27431,   565,    18,     3],
        [    2,   869,   915,   944,   868,   881,  5267,   549,   844, 17091,
            16,   911,  1200,   549,   844,    43, 27431,  6477,    18,     3,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1

In [52]:
# Forward pass with the default outputs. This is inference only, so disable
# autograd to avoid building (and keeping alive) a gradient graph.
with torch.no_grad():
    output = m(**ids)
output.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [55]:
# Same forward pass, additionally requesting the per-layer hidden states.
# Inference only, so autograd is disabled to save memory.
with torch.no_grad():
    output = m(**ids, output_hidden_states=True)
output.keys()

odict_keys(['last_hidden_state', 'pooler_output', 'hidden_states'])

In [61]:
# Print the shape of every returned hidden-state tensor. The output below
# shows 13 tensors of shape [2, 35, 768] (2 sentences, 35 token positions).
# NOTE(review): presumably index 0 is the embedding output and 1-12 are the
# encoder layers -- confirm against the transformers documentation.
for idx, out in enumerate(output['hidden_states']):
    print(idx, out.shape)

0 torch.Size([2, 35, 768])
1 torch.Size([2, 35, 768])
2 torch.Size([2, 35, 768])
3 torch.Size([2, 35, 768])
4 torch.Size([2, 35, 768])
5 torch.Size([2, 35, 768])
6 torch.Size([2, 35, 768])
7 torch.Size([2, 35, 768])
8 torch.Size([2, 35, 768])
9 torch.Size([2, 35, 768])
10 torch.Size([2, 35, 768])
11 torch.Size([2, 35, 768])
12 torch.Size([2, 35, 768])


In [45]:
# Hand-edited variant of the first quote in which 'texte' is replaced by
# 'membres', so that 'membres' occurs twice in one sentence.
sent2 = "God hath no suche bodyly membres, as this membres to the lettre dothe pretende to shewe: but all this was done in great mistery."

In [46]:
# Echo the edited sentence.
sent2

'God hath no suche bodyly membres, as this membres to the lettre dothe pretende to shewe: but all this was done in great mistery.'

In [47]:
# Encode the single edited sentence as a batch of size one.
ids2 = tok([sent2], return_tensors='pt', padding=True)

In [48]:
# Map every word of `sent2` to the token positions of its occurrences; the
# repeated word 'membres' gets two position lists.
# NOTE(review): `subwords_to_token_ids` is defined in a cell further down this
# notebook, so a top-to-bottom run on a fresh kernel fails here with NameError.
subwords_to_token_ids(ids2['input_ids'][0], tok)

defaultdict(list,
            {'god': [[1]],
             'hath': [[2]],
             'no': [[3]],
             'suche': [[4]],
             'bodyly': [[5, 6]],
             'membres': [[7, 8], [12, 13]],
             ',': [[9]],
             'as': [[10]],
             'this': [[11], [26]],
             'to': [[14], [21]],
             'the': [[15]],
             'lettre': [[16, 17]],
             'dothe': [[18]],
             'pretende': [[19, 20]],
             'shewe': [[22]],
             ':': [[23]],
             'but': [[24]],
             'all': [[25]],
             'was': [[27]],
             'done': [[28]],
             'in': [[29]],
             'great': [[30]],
             'mistery': [[31, 32]],
             '.': [[33]]})

In [23]:
# Show the first quote as it appears in the dataset.
sents[0]

'God hath no suche bodyly membres, as this texte to the lettre dothe pretende to shewe: but all this was done in great mistery.'

In [22]:
# List the word pieces of the first encoded sentence. In the output below the
# text is lower-cased and continuation pieces carry a '##' prefix.
tok.convert_ids_to_tokens(ids['input_ids'][0])

['[CLS]',
 'god',
 'hath',
 'no',
 'suche',
 'body',
 '##ly',
 'membr',
 '##es',
 ',',
 'as',
 'this',
 'text',
 '##e',
 'to',
 'the',
 'lett',
 '##re',
 'dothe',
 'pretend',
 '##e',
 'to',
 'shewe',
 ':',
 'but',
 'all',
 'this',
 'was',
 'done',
 'in',
 'great',
 'mister',
 '##y',
 '.',
 '[SEP]']

In [41]:
def subwords_to_token_ids(ids, tokenizer, prefix='##'):
    """Group the token positions of one encoded sentence by reconstructed word.

    Consecutive word pieces are merged back into words: a piece starting with
    ``prefix`` continues the current word, any other piece starts a new one.
    Special tokens listed in ``tokenizer.special_tokens_map`` are skipped, but
    they still count towards the positions, which index into ``ids``.

    Parameters
    ----------
    ids :
        Token ids of a single sentence, in any form accepted by
        ``tokenizer.convert_ids_to_tokens``.
    tokenizer :
        Object providing ``special_tokens_map`` and ``convert_ids_to_tokens``.
    prefix : str
        Marker that flags a word piece as a continuation of the previous one.

    Returns
    -------
    collections.defaultdict
        Maps each reconstructed word to a list with one entry per occurrence;
        every entry is the list of positions of the pieces of that occurrence.
        Because the result is a ``defaultdict(list)``, indexing it with a word
        that does not occur returns ``[]`` and inserts that key.
    """
    # Values of `special_tokens_map` are usually strings, but an entry such as
    # `additional_special_tokens` holds a list; flatten so the set can be built.
    special = set()
    for value in tokenizer.special_tokens_map.values():
        if isinstance(value, (list, tuple)):
            special.update(str(token) for token in value)
        else:
            special.add(str(value))

    output = collections.defaultdict(list)
    # `positions` / `word` accumulate the occurrence currently being built.
    positions, word = [], ''
    for idx, subword in enumerate(tokenizer.convert_ids_to_tokens(ids)):
        if subword in special:
            continue
        if subword.startswith(prefix):
            # Continuation piece: extend the current word.
            word += subword[len(prefix):]
            positions.append(idx)
        else:
            # A new word starts here; store the one that just ended, if any.
            if word:
                output[word].append(positions)
            positions, word = [idx], subword
    # Store the last word of the sentence.
    if word:
        output[word].append(positions)
    return output

In [63]:
# Peek at the first quote/keyword pairs of the dataset.
data[['quote', 'keyword']].head()

Unnamed: 0,quote,keyword
0,"God hath no suche bodyly membres, as this text...",mistery
1,"For we do it not actuallye in dede, but onlye ...",misterye
2,Whiche place is to be vnderstande in a mistery .,mistery
3,At welles five licour I shal drawe Where al my...,mysteryes
4,The glorius modir Quhilk of hir natur consavit...,misteris


In [65]:
# NOTE(review): this rebinds `data` to a different CSV file, replacing the
# frame loaded earlier in the notebook. The cells that follow read its
# 'Context before', 'Query term' and 'Context after' columns.
data = pd.read_csv('./Downloads/thesis final data.utf.csv')

In [72]:
# Split the three context columns into parallel tuples: left context,
# query term, right context.
a, b, c = zip(*data[['Context before', 'Query term', 'Context after']].values)

# Join each row back into one space-separated sentence; the query term of the
# row is also its keyword.
sents = [' '.join(parts) for parts in zip(a, b, c)]
keywords = list(b)

In [75]:
# Check the first rebuilt sentence together with its keyword.
sents[0], keywords[0]

('a yellow colour . It was filtered and evaporated , when it left a quantity of emulsine . The yellow mass on the filter was treated with boiling alcohol , which became yellow , while the residue lost almost the whole',
 'mass')

In [79]:
# Encode the first five rebuilt sentences as one padded batch.
# NOTE(review): this rebinds `ids`, replacing the batch encoded earlier.
ids = tok(list(sents[:5]), return_tensors='pt', padding=True)

In [80]:
# Forward pass over the five-sentence batch. This is inference only, so
# disable autograd to avoid building (and keeping alive) a gradient graph.
with torch.no_grad():
    output = m(**ids)
output.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [86]:
# For each of the first five keywords, look up the token positions of the
# keyword inside its own encoded sentence and print them next to the text.
# The slice size (5) has to match the number of sentences encoded into `ids`.
for idx, keyword in enumerate(keywords[:5]):
    mapping = subwords_to_token_ids(ids['input_ids'][idx], tok)
    # NOTE(review): `mapping` is a defaultdict, so a keyword that is not found
    # yields [] (and inserts the key) instead of raising. The tokenizer output
    # shown earlier is lower-cased ('God' -> 'god'), so a keyword containing
    # capitals would silently miss -- confirm keywords are lower-case.
    target = mapping[keyword]
    print(target, "--", keyword, "--", sents[idx])

[[25]] -- mass -- a yellow colour . It was filtered and evaporated , when it left a quantity of emulsine . The yellow mass on the filter was treated with boiling alcohol , which became yellow , while the residue lost almost the whole
[[21]] -- mass -- . In my paper on the Physical Properties of Ice this promise is fulfilled ; I have shown how a mass of compact ice may be liquefied by pressure , in parallel planes perpendicular to the direction of the force ,
[[22]] -- mass -- cone in 7 minutes after the commencement of filling , which may be taken as the time in which the mass of iron in the cone had reached 21 Fahr. It was also found that the firm surrounding solid crust had
[[22], [38]] -- mass -- perpendicularly towards that surface , saving an abatement that must be made for the inequality of pressure upon the central mass , when that is not in equilibrium . But if the central mass be infinitely , small , whether it
[[22]] -- mass -- took place ; and , when this had subsided , the 

In [83]:
# Inspect the word-to-positions mapping left in `mapping` by the last
# iteration of the keyword loop (the fifth sentence, starting 'took place').
mapping

defaultdict(list,
            {'took': [[1]],
             'place': [[2]],
             ';': [[3], [41]],
             'and': [[4], [42]],
             ',': [[5], [11], [23], [26], [48]],
             'when': [[6], [24]],
             'this': [[7]],
             'had': [[8]],
             'subsided': [[9, 10]],
             'the': [[12], [21], [43]],
             'whole': [[13]],
             'was': [[14], [27], [37]],
             'poured': [[15], [38]],
             'into': [[16]],
             'a': [[17]],
             'proper': [[18]],
             'vessel': [[19]],
             '.': [[20], [32]],
             'mass': [[22]],
             'cold': [[25]],
             'grayish': [[28, 29]],
             '-': [[30]],
             'brown': [[31], [44]],
             'boiling': [[33]],
             'distilled': [[34, 35]],
             'water': [[36]],
             'upon': [[39]],
             'it': [[40]],
             'residuum': [[45, 46, 47]]})