In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import DistilBertTokenizerFast, DistilBertModel
import pytorch_lightning as pl
import spacy
import numpy as np
import json

# Load model & tokenizer

In [2]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-multilingual-cased')

In [3]:
with open('index.json', 'r') as file:
    index_tag = json.loads(file.read())

In [4]:
class NerClassifier(pl.LightningModule):
    def __init__(self, n_classes=38):
        super(NerClassifier, self).__init__()
        self.bert_model = DistilBertModel.from_pretrained('distilbert-base-multilingual-cased')
        self.classifier = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(768, 256),
            nn.GELU(),
            nn.Linear(256, n_classes)
        )
        
        self.n_classes = n_classes
        self.criterion = nn.CrossEntropyLoss()
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert_model(input_ids, attention_mask)
        return self.classifier(outputs[0])

In [5]:
model = NerClassifier()
model.load_state_dict(torch.load('weights/ontotnote_model.pth'))
model.to('cuda');

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Inference

In [12]:
sentence = 'Today is the 5th of the year 2020'
sent = sentence.split(' ')

tokenized_input = tokenizer(
    [sent], is_split_into_words=True, return_offsets_mapping=True, padding='max_length', truncation=True,
    return_tensors='pt', max_length=128
)

preds = model(tokenized_input['input_ids'].to('cuda'), tokenized_input['attention_mask'].to('cuda'))
preds = preds.squeeze(0).detach().cpu().numpy()

predictions = np.argmax(preds, axis=1)

entities = []
for index, pred in enumerate(predictions[:len(sentence.split(' '))]):
    tag = index_tag[str(pred)]
    if tag != 'O':
        entities.append([sent[index - 1], tag])
        
entities

[['Today', 'B-DATE'],
 ['5th', 'B-ORDINAL'],
 ['the', 'I-DATE'],
 ['year', 'I-DATE']]

In [11]:
nlp = spacy.blank('en')
doc = nlp(sentence)

ents = []
# for entity in entities:
#     span_start ,span_end, label = entity
ent = doc.char_span(0, 5, label='Date')
# if ent is None:
#     continue

ents.append(ent)

doc.ents = ents
spacy.displacy.render(doc, style="ent", jupyter=True)