# Sentence vector examples

References:

- https://github.com/BramVanroy/bert-for-inference/blob/master/introduction-to-bert.ipynb
- https://qiita.com/sugulu_Ogawa_ISID/items/e522a38b812b8edb8a54


In [1]:
import torch
# Import both classes from the package root. The flat module paths
# `transformers.modeling_bert` and `transformers.tokenization_bert_japanese`
# were moved under `transformers.models.*` in transformers v4, so importing
# from them fails on current releases; the top-level names work on old and
# new versions alike.
from transformers import BertJapaneseTokenizer, BertModel

# Tokenizer published with the pretrained Japanese BERT checkpoint; it must
# come from the same checkpoint as the model so the token ids line up with
# the model's vocabulary.
tokenizer = BertJapaneseTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')

In [2]:
# Load the pretrained Japanese BERT encoder. output_hidden_states=True makes
# the forward pass also return the per-layer hidden states, which a later
# cell reads from index 2 of the model output.
model = BertModel.from_pretrained(
    'cl-tohoku/bert-base-japanese-whole-word-masking',
    output_hidden_states=True,
)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
model = model.to(device)

# Inference only: switch the module to evaluation mode. Leaving the call as
# the cell's last expression also renders the module tree as cell output.
model.eval()


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(32000, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [3]:
# Print the module tree of the loaded BERT model to inspect its embedding
# and encoder layers.
print(model)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(32000, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [4]:
from transformers import BertConfig

# Load the configuration of the same pretrained checkpoint and print it to
# inspect its hyperparameters (hidden size, number of layers and attention
# heads, vocabulary size, ...).
config_japanese = BertConfig.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')
print(config_japanese)

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 32000
}



In [5]:
# Example sentences taken from
# https://qiita.com/sugulu_Ogawa_ISID/items/e522a38b812b8edb8a54
# text1 and text2 both contain the word クビ but in different senses, text3
# paraphrases text1 with a different word, and text4 is unrelated to the others.
text1, text2, text3, text4 = (
    "会社をクビになった。",            # "I got fired from the company."
    "テレワークばかりでクビが痛い。",  # "My neck hurts from doing nothing but telework."
    "会社を解雇された。",              # "I was dismissed from the company."
    "すもももももももものうち",        # tongue twister: "plums and peaches are both kinds of peach"
)

In [6]:
def run_bert(text: str, show_tokens: bool = True):
    """Tokenize ``text`` and run it through ``model`` without tracking gradients.

    Args:
        text: Sentence to encode. The tokenizer adds the [CLS] / [SEP] markers.
        show_tokens: When True, print the token strings and the id tensor so
            the position of each token in the sequence can be read off.

    Returns:
        The output of ``model(input_ids)``. For text1 the recorded run shows
        index 0 with shape (1, 9, 768), i.e. one vector per token, and
        index 1 with shape (1, 768); index 2 holds the per-layer hidden
        states requested via ``output_hidden_states=True``.
    """
    # Encode as a batch of one and move the ids to the same device as the model.
    input_ids = tokenizer.encode(text, return_tensors='pt').to(device)
    if show_tokens:
        print(tokenizer.convert_ids_to_tokens(input_ids[0].tolist()))
        print(input_ids)
    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        return model(input_ids)


result1 = run_bert(text1)
print(result1[0].shape)
print(result1[1].shape)

result2 = run_bert(text2)
result3 = run_bert(text3)
# text4 is only used for sentence-level comparison, so its tokens are not shown.
result4 = run_bert(text4, show_tokens=False)


['[CLS]', '会社', 'を', 'クビ', 'に', 'なっ', 'た', '。', '[SEP]']
tensor([[    2,   811,    11, 13700,     7,    58,    10,     8,     3]],
       device='cuda:0')
torch.Size([1, 9, 768])
torch.Size([1, 768])
['[CLS]', 'テレ', '##ワーク', 'ばかり', 'で', 'クビ', 'が', '痛', '##い', '。', '[SEP]']
tensor([[    2,  5521,  3118,  4027,    12, 13700,    14,  4897, 28457,     8,
             3]], device='cuda:0')
['[CLS]', '会社', 'を', '解雇', 'さ', 'れ', 'た', '。', '[SEP]']
tensor([[   2,  811,   11, 7279,   26,   20,   10,    8,    3]],
       device='cuda:0')


In [7]:
# Contextual vector of one token per sentence, taken from index 0 of the
# model output (one vector per token). The positions come from the token
# lists printed above, counting the leading [CLS] as position 0.
word_vec1 = result1[0][0, 3]  # クビ in text1
word_vec2 = result2[0][0, 5]  # クビ in text2
word_vec3 = result3[0][0, 3]  # 解雇 in text3

In [8]:
# Cosine similarity along dim 0, i.e. between two 1-D vectors.
# `cos` is reused by the sentence-vector cells below.
cos = torch.nn.CosineSimilarity(dim=0)

# クビ (text1) vs クビ (text2): same surface form in two different sentences.
cos_sim_12 = cos(word_vec1, word_vec2)
# クビ (text1) vs 解雇 (text3): different words in near-identical sentences.
cos_sim_13 = cos(word_vec1, word_vec3)

print(cos_sim_12, cos_sim_13, sep='\n')

tensor(0.6647, device='cuda:0')
tensor(0.7841, device='cuda:0')


In [9]:
# Sentence vector, variant 1: the last-layer vector at position 0, which is
# the [CLS] token in every encoded sentence.
sent_vec_cls1 = result1[0][0, 0]
sent_vec_cls2 = result2[0][0, 0]
sent_vec_cls3 = result3[0][0, 0]
sent_vec_cls4 = result4[0][0, 0]

# Compare text1 against each of the other sentences.
cos_sim_12 = cos(sent_vec_cls1, sent_vec_cls2)
cos_sim_13 = cos(sent_vec_cls1, sent_vec_cls3)
cos_sim_14 = cos(sent_vec_cls1, sent_vec_cls4)
print(cos_sim_12, cos_sim_13, cos_sim_14, sep='\n')


tensor(0.7645, device='cuda:0')
tensor(0.9015, device='cuda:0')
tensor(0.6147, device='cuda:0')


In [10]:
# Sentence vector, variant 2: average the last entry of the hidden states
# over the token axis. Index 2 of the model output is the hidden-state tuple
# enabled by output_hidden_states=True.
hidden_states1 = result1[2]
# Prints 13 for this 12-layer model (presumably the embedding output plus
# one entry per encoder layer; confirm against the BertModel documentation).
print(len(hidden_states1))

# Mean over dim=1 (the token axis, special tokens included), then drop the
# batch axis of size 1 to obtain a single 768-dimensional vector.
sent_vec_avg1 = hidden_states1[-1].mean(dim=1).squeeze()
print(sent_vec_avg1.shape)

hidden_states2, hidden_states3, hidden_states4 = result2[2], result3[2], result4[2]
sent_vec_avg2 = hidden_states2[-1].mean(dim=1).squeeze()
sent_vec_avg3 = hidden_states3[-1].mean(dim=1).squeeze()
sent_vec_avg4 = hidden_states4[-1].mean(dim=1).squeeze()

# Compare text1 against each of the other sentences.
cos_sim_12 = cos(sent_vec_avg1, sent_vec_avg2)
cos_sim_13 = cos(sent_vec_avg1, sent_vec_avg3)
cos_sim_14 = cos(sent_vec_avg1, sent_vec_avg4)
print(cos_sim_12, cos_sim_13, cos_sim_14, sep='\n')

13
torch.Size([768])
tensor(0.8348, device='cuda:0')
tensor(0.9370, device='cuda:0')
tensor(0.6194, device='cuda:0')
