In [17]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
import json
import numpy as np
#logging.basicConfig(level=logging.INFO)

import matplotlib.pyplot as plt

# Load pre-trained model tokenizer (vocabulary)

In [3]:
# Load the pre-trained tokenizer (vocabulary) for the multilingual cased BERT checkpoint.
# The checkpoint is cased, so lower-casing is switched off explicitly rather than
# relying on the library to override the default (which makes it emit a warning).
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-multilingual-cased',
    do_lower_case=False,
)

The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


In [4]:
# Example sentence, wrapped in the [CLS] / [SEP] markers (space-separated)
# before it is handed to the tokenizer.
text = "Олег зашел в банк тинькофф и наорал на себя"
marked_text = " ".join(["[CLS]", text, "[SEP]"])

print(marked_text)

[CLS] Олег зашел в банк тинькофф и наорал на себя [SEP]


In [5]:
# Split the marked sentence into the tokenizer's sub-word pieces and show them.
tokenized_text = tokenizer.tokenize(marked_text)
print(tokenized_text)

['[CLS]', 'Олег', 'за', '##шел', 'в', 'банк', 'ти', '##нь', '##ко', '##ф', '##ф', 'и', 'на', '##ора', '##л', 'на', 'себя', '[SEP]']


In [6]:
# Peek at 40 vocabulary entries (positions 30000-30039) to get a feel for
# what the stored tokens look like. Iterating the mapping yields its keys.
list(tokenizer.vocab)[30000:30040]

['faz',
 'Bild',
 'Kenny',
 'Bourgogne',
 'lanean',
 '##보',
 '##lí',
 'jogos',
 'північний',
 'natura',
 '##яти',
 '##ต',
 'Malmö',
 '##vert',
 'quinto',
 'Pau',
 'Maison',
 '##oen',
 'Henderson',
 '##xar',
 '##ーム',
 '##nner',
 '##ank',
 '##নে',
 '##yal',
 '##II',
 '##ائية',
 'propose',
 '##tno',
 'Gay',
 'Grad',
 'maka',
 'Willy',
 'något',
 'compétition',
 '##tatea',
 'Italiana',
 'nahe',
 'самых',
 'soldados']

In [7]:
# Look up the integer vocabulary id of every token.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Show each token next to its id, one (token, id) pair per line.
for token, token_id in zip(tokenized_text, indexed_tokens):
    print((token, token_id))

('[CLS]', 101)
('Олег', 43360)
('за', 10234)
('##шел', 42171)
('в', 543)
('банк', 65727)
('ти', 30176)
('##нь', 15266)
('##ко', 11623)
('##ф', 13582)
('##ф', 13582)
('и', 549)
('на', 10122)
('##ора', 19553)
('##л', 10517)
('на', 10122)
('себя', 17900)
('[SEP]', 102)


In [8]:
# One segment id per token; the whole input is a single sentence, so every
# token receives the same id.
# NOTE(review): single-sentence BERT inputs are conventionally given segment
# id 0 rather than 1 -- confirm that 1 is intended here.
segments_ids = [1 for _ in tokenized_text]
print(segments_ids)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [9]:
# Load the pre-trained weights of the multilingual cased BERT model.
model = BertModel.from_pretrained('bert-base-multilingual-cased')

# Wrap each id list in an outer list so the resulting tensors carry a
# leading batch dimension of size 1.
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

# Switch the model to evaluation (inference) mode, which disables dropout.
# As the last expression of the cell, this also displays the architecture.
model.eval()

100%|██████████| 662804195/662804195 [01:46<00:00, 6201409.79B/s] 


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Li

In [10]:
# Run a forward pass to get the hidden states produced by each encoder layer.
# Gradient tracking is disabled because this is inference only.
# The second return value is bound to a real name instead of `_`: assigning to
# `_` at the top level of an IPython session shadows the automatic
# "last output" variable and stops it from being updated.
with torch.no_grad():
    encoded_layers, pooled_output = model(
        tokens_tensor,
        token_type_ids=segments_tensors,
    )

In [21]:
# Shape of the first layer's output, followed by the shape of the segment-id
# tensor, each on its own line.
print(encoded_layers[0].shape, segments_tensors.shape, sep="\n")

torch.Size([1, 18, 768])
torch.Size([1, 18])


In [23]:
# Stack the per-layer outputs along a new leading axis and inspect the shape.
torch.stack(encoded_layers, dim=0).shape

torch.Size([12, 1, 18, 768])