# Sentence vector examples using Huggingface BERT models

Reference:

- https://github.com/BramVanroy/bert-for-inference/blob/master/introduction-to-bert.ipynb
- https://qiita.com/sugulu_Ogawa_ISID/items/e522a38b812b8edb8a54


In [2]:
import torch
from transformers.modeling_bert import BertModel
from transformers.tokenization_bert_japanese import BertJapaneseTokenizer

tokenizer = BertJapaneseTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')
model = BertModel.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking', output_hidden_states=True, return_dict=True)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

In [4]:
type(tokenizer)

transformers.tokenization_bert_japanese.BertJapaneseTokenizer

In [5]:
sentences = ['吾輩は猫である', '本日は晴天なり']
encoded = tokenizer(sentences, return_tensors='pt', padding=True)
encoded


{'input_ids': tensor([[    2,  7184, 30046,     9,  6040,    12,    31,     3],
        [    2,   108, 28486,     9,  4798, 28849,   297,     3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1]])}

In [6]:
tokenizer.convert_ids_to_tokens(encoded.input_ids.flatten().tolist())

['[CLS]',
 '吾',
 '##輩',
 'は',
 '猫',
 'で',
 'ある',
 '[SEP]',
 '[CLS]',
 '本',
 '##日',
 'は',
 '晴',
 '##天',
 'なり',
 '[SEP]']

In [7]:
# Got samples from https://qiita.com/sugulu_Ogawa_ISID/items/e522a38b812b8edb8a54
sentences = ["会社をクビになった。", "テレワークばかりでクビが痛い。", "会社を解雇された。", "すもももももももものうち"]
encoded = tokenizer(sentences, return_tensors='pt', padding=True)
tokenizer.convert_ids_to_tokens(encoded.input_ids.flatten().tolist())

['[CLS]',
 '会社',
 'を',
 'クビ',
 'に',
 'なっ',
 'た',
 '。',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[CLS]',
 'テレ',
 '##ワーク',
 'ばかり',
 'で',
 'クビ',
 'が',
 '痛',
 '##い',
 '。',
 '[SEP]',
 '[CLS]',
 '会社',
 'を',
 '解雇',
 'さ',
 'れ',
 'た',
 '。',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[CLS]',
 'す',
 '##も',
 '##も',
 'も',
 'もも',
 'も',
 'もも',
 'の',
 'うち',
 '[SEP]']

In [13]:
encoded.to(device)
with torch.no_grad():
    outputs = model(**encoded)
len(outputs.last_hidden_state)

4

In [14]:
sent_vec_cls1 = outputs.last_hidden_state[0][0][:]
sent_vec_cls2 = outputs.last_hidden_state[1][0][:]
sent_vec_cls3 = outputs.last_hidden_state[2][0][:]
sent_vec_cls4 = outputs.last_hidden_state[3][0][:]

cos_sim_12 = cos(sent_vec_cls1, sent_vec_cls2)
cos_sim_13 = cos(sent_vec_cls1, sent_vec_cls3)
cos_sim_14 = cos(sent_vec_cls1, sent_vec_cls4)
print(cos_sim_12)
print(cos_sim_13)
print(cos_sim_14)


NameError: name 'cos' is not defined

In [10]:
hidden_states1 = result1[2]
print(len(hidden_states1))

sent_vec_avg1 = torch.mean(hidden_states1[-1], dim=1).squeeze()
print(sent_vec_avg1.shape)

hidden_states2 = result2[2]
hidden_states3 = result3[2]
hidden_states4 = result4[2]
sent_vec_avg2 = torch.mean(hidden_states2[-1], dim=1).squeeze()
sent_vec_avg3 = torch.mean(hidden_states3[-1], dim=1).squeeze()
sent_vec_avg4 = torch.mean(hidden_states4[-1], dim=1).squeeze()

cos_sim_12 = cos(sent_vec_avg1, sent_vec_avg2)
cos_sim_13 = cos(sent_vec_avg1, sent_vec_avg3)
cos_sim_14 = cos(sent_vec_avg1, sent_vec_avg4)
print(cos_sim_12)
print(cos_sim_13)
print(cos_sim_14)

13
torch.Size([768])
tensor(0.8348, device='cuda:0')
tensor(0.9370, device='cuda:0')
tensor(0.6194, device='cuda:0')
