#  Bert Install

+ https://github.com/huggingface/transformers
+ [教學1](https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/)
+ [教學2](https://pypi.org/project/pytorch-pretrained-bert/)

In [1]:
import torch
from transformers import *

In [2]:
MODELS = [(BertModel,BertTokenizer,'bert-base-uncased')]

In [3]:
for model_class,tokenizer_class,pretrained_weights in MODELS:
    # Load pretrained model/tokenizer
    tokenizer  = tokenizer_class.from_pretrained(pretrained_weights)
    model = model_class.from_pretrained(pretrained_weights)
    # Encode text
    input_ids = torch.tensor([tokenizer.encode("Here is some text ot encode",add_special_tokens=True)])
    with torch.no_grad():
        last_hidden_states = model(input_ids)[0]
    

    

# Bert code

In [27]:
# 描述用
def describe(x):
    print('>>Type: {}'.format(x.type()))
    print('>>Shape/size: {}'.format(x.shape))
    print('>>Values: \n{}'.format(x))

In [28]:
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
import torch

device = torch.device("cuda:0")

# 拿到bert預訊練模型
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [29]:
# 確認一下單字量
vocab = tokenizer.vocab
print("字典大小：", len(vocab))

字典大小： 30522


In [30]:
# 挑出來看看
import random
random_tokens = random.sample(list(vocab), 10)
random_ids = [vocab[t] for t in random_tokens]

print("{0:20}{1:15}".format("token", "index"))
print("-" * 25)
for t, id in zip(random_tokens, random_ids):
    print("{0:15}{1:10}".format(t, id))


token               index          
-------------------------
italian              3059
cocky               24995
##tia               10711
envoy               19918
mel                 11463
paula               13723
republic             3072
gunnery             27919
##ʷ                 29709
sheridan            13243


In [31]:
# 單字可對應到字典內
text = "Today is good right"
tokens = tokenizer.tokenize(text)
ids = tokenizer.convert_tokens_to_ids(tokens)

print(text)
print(tokens[:10], '...')
print(ids[:10], '...')

Today is good right
['today', 'is', 'good', 'right'] ...
[2651, 2003, 2204, 2157] ...


In [32]:
#試試分段，注意句子頭尾加上[CLS] [SEP]
stext = 'Here is the sentence I want embeddings for.'
smarked_text = "[CLS] " + text + " [SEP]"
stokenized_text = tokenizer.tokenize(smarked_text)
print(stokenized_text)

['[CLS]', 'today', 'is', 'good', 'right', '[SEP]']


In [49]:
text = "After stealing money from the bank vault, the bank robber was seen fishing on the Mississippi river bank."
# Add the special tokens.
marked_text = "[CLS] " + text + " [SEP]"

# Split the sentence into tokens.
tokenized_text = tokenizer.tokenize(marked_text)

# Map the token strings to their vocabulary indeces.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Display the words with their indeces.
for tup in zip(tokenized_text, indexed_tokens):
    print('{:<12} {:>6,}'.format(tup[0], tup[1]))

[CLS]           101
the           1,996
good          2,204
good          2,204
is            2,003
good          2,204
,             1,010
right         2,157
?             1,029
[SEP]           102


In [50]:
print(type(tokenized_text))
segments_ids = [1] * len(tokenized_text)
segments_ids

<class 'list'>


[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [51]:
# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased')

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Lin

In [52]:
print(torch.cuda.is_available())
device = torch.device('cuda')

# move to cpu
#tokens_tensor = tokens_tensor.to(device)
#segments_tensors = segments_tensors.to(device)
#model.to(device)
#print(tokens_tensor)
#print(segments_tensors)

True


In [53]:
# Predict hidden states features for each layer
with torch.no_grad():
    encoded_layers,_= model(tokens_tensor,segments_tensors)

In [54]:
print ("Number of layers:", len(encoded_layers))
layer_i = 0

print ("Number of batches:", len(encoded_layers[layer_i]))
batch_i = 0

print ("Number of tokens:", len(encoded_layers[layer_i][batch_i]))
token_i = 0

print ("Number of hidden units:", len(encoded_layers[layer_i][batch_i][token_i]))

Number of layers: 12
Number of batches: 1
Number of tokens: 10
Number of hidden units: 768


In [55]:
# current dimentions:
# [# layers, # batches, # tokens, # features]
# 把12層hidden濃縮一起
token_embeddings = torch.stack(encoded_layers, dim=0)
token_embeddings.size()

torch.Size([12, 1, 10, 768])

In [56]:
token_embeddings = torch.squeeze(token_embeddings,dim=1)
token_embeddings.size()

torch.Size([12, 10, 768])

In [57]:
# current dimentions:
# [# layers, # batches, # tokens, # features]

# what we want
# [# tokens, # layers, # features]]

token_embeddings = token_embeddings.permute(1,0,2)
token_embeddings.size()

torch.Size([10, 12, 768])

In [58]:
# concat vector
token_vec_cat = []
for token in token_embeddings:
    cat = torch.cat((token[-1],token[-2],token[-3],token[-4]),dim=0)
    token_vec_cat.append(cat)
print(len(token_vec_cat),' x ',len(token_vec_cat[0]))

10  x  3072


In [59]:
#sum last 4
token_vec_sum = []
for token in token_embeddings:
    sm = torch.sum(token[-4:],dim=0)
    token_vec_sum.append(sm)

![](http://jalammar.github.io/images/bert-feature-extraction-contextualized-embeddings.png)

In [60]:
# this is the sentence vector
token_vecs = encoded_layers[11][0]

# Calculate the average of all 22 token vectors.
sentence_embedding = torch.mean(token_vecs, dim=0)

In [61]:
for i,token_str in enumerate(tokenized_text):
    print(i,token_str)

0 [CLS]
1 the
2 good
3 good
4 is
5 good
6 ,
7 right
8 ?
9 [SEP]


In [46]:
# chech the 5 value of each 'bank' vector
print(token_vec_cat[6][0:5],token_vec_cat[6].size())
print(token_vec_cat[10][0:5],token_vec_cat[10].size())
print(token_vec_cat[19][0:5],token_vec_cat[19].size())

tensor([ 0.6338, -0.1551, -0.1847,  0.1380,  1.0491]) torch.Size([3072])
tensor([ 0.5344, -0.0954, -0.1938,  0.1298,  0.9821]) torch.Size([3072])
tensor([ 0.2947, -0.2835, -0.0351,  0.3193,  0.8034]) torch.Size([3072])


In [47]:
print(token_vec_sum[6][0:5],token_vec_sum[6].size())
print(token_vec_sum[10][0:5],token_vec_sum[10].size())
print(token_vec_sum[19][0:5],token_vec_sum[19].size())

tensor([ 2.1319, -2.1413, -1.6260,  0.8638,  3.3173]) torch.Size([768])
tensor([ 1.1868, -1.5298, -1.3770,  1.0648,  3.1446]) torch.Size([768])
tensor([ 1.1295, -1.4724, -0.7296, -0.0901,  2.4970]) torch.Size([768])


In [48]:
from scipy.spatial.distance import cosine

diff_bank = 1-cosine(token_vec_cat[10],token_vec_cat[6])
same_bank = 1-cosine(token_vec_cat[10],token_vec_cat[19])

print('this is for cat')
print('Vector simary: {}'.format(diff_bank))
print('Vector simary: {}'.format(same_bank)) 

diff_bank = 1-cosine(token_vec_sum[10],token_vec_sum[6])
same_bank = 1-cosine(token_vec_sum[10],token_vec_sum[19])
print('this is for sum')
print('Vector simary: {}'.format(diff_bank))
print('Vector simary: {}'.format(same_bank)) 

# 代表 bank robber的 bank 跟 bank vault是比較像的


this is for cat
Vector simary: 0.9463571310043335
Vector simary: 0.6720850467681885
this is for sum
Vector simary: 0.945675253868103
Vector simary: 0.6797333359718323
