In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Display the installed PyTorch version so the environment is recorded with the notebook.
torch.__version__

Using TensorFlow backend.


'1.5.0'

In [2]:
# Count the CUDA devices PyTorch can see, then run on the GPU whenever CUDA is
# usable; otherwise everything stays on the CPU.
n_gpu = torch.cuda.device_count()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# WordPiece tokenizer for the cased BERT vocabulary; lower-casing is switched off
# so the original capitalisation of the text is kept.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-cased',
    do_lower_case=False,
)

In [4]:
import transformers
from transformers import BertForTokenClassification, AdamW

# Display the installed transformers version so the environment is recorded with the notebook.
transformers.__version__


'2.11.0'

In [5]:
# Build the token-classification model on top of the pretrained cased BERT encoder.
# num_labels sizes the classification head (one score per label for every token);
# it has to agree with the checkpoint whose weights are loaded into this model
# later in the notebook. Attention maps and hidden states are not returned.
model = BertForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=18,
    output_attentions=False,
    output_hidden_states=False,
)

In [6]:
# File name of the fine-tuned NER checkpoint and its location under the app/ directory.
model_save_name = 'NER_classifier.pt'
path = f"app/{model_save_name}"

In [7]:
# Inspect the concrete class of the object returned by from_pretrained.
type(model)

transformers.modeling_bert.BertForTokenClassification

In [8]:
# Restore the fine-tuned NER weights into the freshly built model.
# map_location=device remaps the stored tensors onto the device selected earlier
# in the notebook, so a checkpoint that contains CUDA tensors also loads on a
# CPU-only machine instead of failing during deserialisation.
# Kept as the cell's last expression so the key-matching report is displayed.
model.load_state_dict(torch.load(path, map_location=device))

<All keys matched successfully>

In [9]:
# Print the module tree of the model to inspect its layers.
print(model)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [10]:
# Sample sentence to run through the NER model. The triple-quoted literal keeps
# the newline before and after the text.
test_sentence = """
In Beirut , a string of officials voiced their anger , while at the United Nations summit in New York , Prime Minister Fouad Siniora said the Lebanese people are resolute in preventing such attempts from destroying their spirit.
"""

In [17]:
# Move the model's parameters to the device chosen near the top of the notebook
# (GPU when CUDA is available, CPU otherwise) instead of calling .cuda()
# unconditionally, which fails on machines without CUDA.
# Module.to() returns the module itself, so the cell still displays the model.
model.to(device)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [18]:
# Encode the sentence into BERT vocabulary ids; the [CLS] and [SEP] markers seen
# in the printed result further down come from this call.
tokenized_sentence = tokenizer.encode(test_sentence)
# Wrap the ids in a list to add a batch dimension of 1, then place the tensor on
# `device` (GPU when available, CPU otherwise) rather than hard-coding .cuda(),
# which fails on machines without CUDA.
input_ids = torch.tensor([tokenized_sentence]).to(device)

In [19]:
# Switch the model to evaluation mode before inference; this turns off the
# Dropout layers listed in the model printout.
model.eval()

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [20]:
# Inference only: run the forward pass with gradient tracking disabled.
with torch.no_grad():
    output = model(input_ids=input_ids)

In [21]:
# Display the raw model output: a tuple whose first element holds the per-token
# label scores that are arg-maxed in the next cell.
output

(tensor([[[ 1.3198e+00,  8.4605e-01,  3.0260e-02,  8.0380e+00, -1.3658e+00,
           -4.9560e-01, -1.8437e+00, -2.0520e+00, -8.1012e-01, -1.3026e+00,
           -2.6056e+00,  9.5538e-01, -1.9251e+00, -1.3876e+00, -1.5094e+00,
           -2.4208e+00, -5.5662e-01, -3.3630e+00],
          [ 9.6614e-01, -9.9160e-02,  7.3338e-01,  9.3002e+00, -1.2412e+00,
            3.9952e-01, -9.8899e-01, -2.0781e+00, -6.8062e-01, -1.3913e+00,
           -2.5543e+00,  2.9436e-01, -1.7450e+00, -1.3245e+00, -2.0120e+00,
           -2.6262e+00, -7.9022e-01, -3.2465e+00],
          [ 6.7351e+00,  1.5594e+00,  1.8720e+00, -7.9920e-01, -5.3024e-01,
           -3.9077e-02, -9.0184e-01, -2.5240e+00, -6.6130e-01, -8.2713e-01,
           -2.7073e+00,  2.7239e+00, -2.3971e+00, -1.1473e+00, -1.6230e+00,
           -2.9950e+00,  6.3140e-01, -2.4996e+00],
          [-9.5647e-01, -1.1961e+00, -1.1453e-01,  1.0114e+01, -4.0637e-01,
            1.3349e+00, -1.6491e+00, -1.4772e+00,  3.9034e-01,  5.6417e-03,
           

In [22]:
# Pick the highest-scoring label id at every word-piece position.
label_indices = output[0].cpu().numpy().argmax(axis=2)

# Map the ids of the single encoded sentence back to their word-piece strings.
tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu().numpy())

# Glue "##" continuation pieces back onto the word they continue. A merged word
# keeps the label predicted for its first piece; labels of continuation pieces
# are dropped.
new_tokens, new_labels = [], []
for token, label_idx in zip(tokens, label_indices[0]):
    if not token.startswith("##"):
        new_tokens.append(token)
        new_labels.append(label_idx)
        continue
    new_tokens[-1] += token[2:]

# One line per reconstructed word: predicted label id, a tab, then the word.
for token, label in zip(new_tokens, new_labels):
    print("{}\t{}".format(label, token))

3	[CLS]
3	In
0	Beirut
3	,
3	a
3	string
3	of
3	officials
3	voiced
3	their
3	anger
3	,
3	while
3	at
3	the
11	United
5	Nations
3	summit
3	in
0	New
9	York
3	,
1	Prime
3	Minister
1	Fouad
4	Siniora
3	said
3	the
16	Lebanese
3	people
3	are
3	resolute
3	in
3	preventing
3	such
3	attempts
3	from
3	destroying
3	their
3	spirit
3	.
3	[SEP]


In [23]:
# Record the Python version via a shell escape. This reports whichever `python`
# the shell resolves, which may not be the kernel's own interpreter.
!python --version

Python 3.7.7
