In [1]:
import numpy as np

import joblib
import torch

import config
import dataset
import engine
from model import EntityModel



In [2]:
# Restore the artefacts persisted alongside the demo model.
# NOTE: joblib.load unpickles its input, so only point it at trusted files.
meta_data = joblib.load("demo_meta.bin")

# The tag encoder maps tag strings <-> integer ids; the number of
# distinct classes it knows sizes the model's output layer below.
enc_tag = meta_data["enc_tag"]
num_tag = len(enc_tag.classes_)





In [3]:
# Choose the device first so the checkpoint can be remapped onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = EntityModel(num_tag=num_tag)
# map_location makes a checkpoint that was saved from a GPU loadable on a
# CPU-only machine; without it torch.load tries to restore CUDA tensors and
# raises when no GPU is present.
model.load_state_dict(torch.load(config.MODEL_PATH, map_location=device))
# .to() is a no-op when the model already lives on `device`, so no guard is needed.
model.to(device)


In [4]:
# Demo input: a single news sentence, lower-cased before tokenisation.
sentence = """
President Donald Trump may have broken a U.S. federal law and a Georgia law against election tampering by pressuring the state's top election official to "find" enough votes to overturn his loss to President-elect Joe Biden in the state, according to some legal experts.
""".lower()

# Encode the raw string to token ids first, then split the same text into
# whitespace-delimited words (the form the dataset class is given).
tokenized_sentence = config.TOKENIZER.encode(sentence)
sentence = sentence.split()

print(sentence)
print(tokenized_sentence)
print(config.TOKENIZER.convert_ids_to_tokens(tokenized_sentence))

# The sentence has no gold labels, so every word gets the placeholder tag 0.
test_dataset = dataset.EntityDataset(
    texts=[sentence],
    tags=[[0 for _word in sentence]],
    O_tag_id=enc_tag.transform(["O"])[0],
)



['president', 'donald', 'trump', 'may', 'have', 'broken', 'a', 'u.s.', 'federal', 'law', 'and', 'a', 'georgia', 'law', 'against', 'election', 'tampering', 'by', 'pressuring', 'the', "state's", 'top', 'election', 'official', 'to', '"find"', 'enough', 'votes', 'to', 'overturn', 'his', 'loss', 'to', 'president-elect', 'joe', 'biden', 'in', 'the', 'state,', 'according', 'to', 'some', 'legal', 'experts.']
[101, 2343, 6221, 8398, 2089, 2031, 3714, 1037, 1057, 1012, 1055, 1012, 2976, 2375, 1998, 1037, 4108, 2375, 2114, 2602, 17214, 4842, 2075, 2011, 2811, 12228, 1996, 2110, 1005, 1055, 2327, 2602, 2880, 2000, 1000, 2424, 1000, 2438, 4494, 2000, 2058, 22299, 2010, 3279, 2000, 2343, 1011, 11322, 3533, 7226, 2368, 1999, 1996, 2110, 1010, 2429, 2000, 2070, 3423, 8519, 1012, 102]
['[CLS]', 'president', 'donald', 'trump', 'may', 'have', 'broken', 'a', 'u', '.', 's', '.', 'federal', 'law', 'and', 'a', 'georgia', 'law', 'against', 'election', 'tam', '##per', '##ing', 'by', 'press', '##uring', 'the', 

## Custom Data

In [5]:
import train

In [6]:
from sklearn import preprocessing
from sklearn import model_selection

In [7]:
# Load the custom corpus: sentences, their tags, and the tag encoder.
sentences, tag, enc_tag = train.process_data_conll("../input/train.tsv")
num_tag = len(enc_tag.classes_)

# Hold out 10% of the sentences for evaluation; the seed comes from config
# so the split is reproducible.
train_sentences, test_sentences, train_tag, test_tag = model_selection.train_test_split(
    sentences,
    tag,
    test_size=0.1,
    random_state=config.RANDOM_STATE,
)

In [8]:
# Wrap the held-out sentences in the project's dataset class; the id of the
# "O" tag is looked up from the encoder and handed to the dataset.
valid_dataset = dataset.EntityDataset(
    texts=test_sentences,
    tags=test_tag,
    O_tag_id=enc_tag.transform(["O"])[0],
)

# Batched iteration over the validation set.
valid_data_loader = torch.utils.data.DataLoader(
    dataset=valid_dataset,
    batch_size=config.VALID_BATCH_SIZE,
    num_workers=1,
)

In [9]:
# Flat, token-level lists of predicted and gold tag strings across the whole
# validation set (padding positions excluded).
y_pred = []
y_ground = []

model.eval()
# Inference only: without no_grad every forward pass keeps its autograd graph
# alive, which needlessly inflates memory use for a large model.
with torch.no_grad():
    for data in valid_data_loader:
        # Read the true lengths before the batch is moved to the device.
        sentence_lengths = data["length"].tolist()
        data = {key: value.to(device) for key, value in data.items()}

        # `loss` is returned by the model but not needed for the report.
        tags, loss = model(
            data["ids"], data["mask"], data["token_type_ids"], data["target_tag"]
        )

        # `tag_scores` (not `tag`) so the label list loaded earlier in the
        # notebook is not overwritten by the loop variable.
        for ix, tag_scores in enumerate(tags):
            length = sentence_lengths[ix]

            # Trim to the real length first so padding is never decoded.
            predicted_ids = tag_scores[:length].argmax(1).cpu().numpy()
            y_pred.extend(enc_tag.inverse_transform(predicted_ids))

            ground_truth_seq = data["target_tag"][ix][:length].tolist()
            y_ground.extend(enc_tag.inverse_transform(ground_truth_seq))


In [10]:
from sklearn.metrics import classification_report

In [15]:
# Per-tag precision / recall / F1 over all validation tokens, with one row
# for every class known to the encoder.
print(
    classification_report(
        y_true=y_ground,
        y_pred=y_pred,
        labels=enc_tag.classes_,
    )
)

              precision    recall  f1-score   support

       B-art       0.00      0.05      0.00        41
       B-eve       0.00      0.00      0.00        32
       B-geo       0.78      0.29      0.43      3572
       B-gpe       1.00      0.00      0.00      1360
       B-nat       0.00      0.00      0.00        22
       B-org       0.21      0.12      0.15      2166
       B-per       0.45      0.05      0.10      1765
       B-tim       0.87      0.28      0.43      1578
       I-art       0.00      0.00      0.00        14
       I-eve       0.00      0.00      0.00        28
       I-geo       0.72      0.15      0.24       527
       I-gpe       0.00      0.00      0.00        13
       I-nat       0.00      0.00      0.00         3
       I-org       0.07      0.00      0.01      1408
       I-per       0.76      0.03      0.06      1582
       I-tim       0.85      0.47      0.60       434
           O       0.87      0.94      0.90     76937

    accuracy              

In [12]:
# Quick look at the flat list of predicted tags as an array.
np.asarray(y_pred)

array(['B-art', 'O', 'O', ..., 'O', 'O', 'O'], dtype='<U5')

In [9]:
bpe_tok_sent = config.TOKENIZER.convert_ids_to_tokens(tokenized_sentence)

# Score the demo sentence explicitly instead of relying on whatever `tag`
# happens to hold in the kernel (in file order that is a 2-D per-sentence
# tensor left by the evaluation loop, for which argmax(2) is invalid).
# A batch-size-1 loader reuses exactly the batch layout the model is fed
# during validation.
model.eval()
with torch.no_grad():
    demo_batch = next(iter(torch.utils.data.DataLoader(test_dataset, batch_size=1)))
    demo_batch = {key: value.to(device) for key, value in demo_batch.items()}
    tag, demo_loss = model(
        demo_batch["ids"],
        demo_batch["mask"],
        demo_batch["token_type_ids"],
        demo_batch["target_tag"],
    )

# Most likely tag id per position, flattened and decoded back to tag strings;
# everything past the real token count is padding and is cut off.
result = enc_tag.inverse_transform(
    tag.argmax(2).cpu().numpy().reshape(-1)
)[:len(tokenized_sentence)]

['president', 'donald', 'trump', 'may', 'have', 'broken', 'a', 'u.s.', 'federal', 'law', 'and', 'a', 'georgia', 'law', 'against', 'election', 'tampering', 'by', 'pressuring', 'the', "state's", 'top', 'election', 'official', 'to', '"find"', 'enough', 'votes', 'to', 'overturn', 'his', 'loss', 'to', 'president-elect', 'joe', 'biden', 'in', 'the', 'state,', 'according', 'to', 'some', 'legal', 'experts.']

[101, 2343, 6221, 8398, 2089, 2031, 3714, 1037, 1057, 1012, 1055, 1012, 2976, 2375, 1998, 1037, 4108, 2375, 2114, 2602, 17214, 4842, 2075, 2011, 2811, 12228, 1996, 2110, 1005, 1055, 2327, 2602, 2880, 2000, 1000, 2424, 1000, 2438, 4494, 2000, 2058, 22299, 2010, 3279, 2000, 2343, 1011, 11322, 3533, 7226, 2368, 1999, 1996, 2110, 1010, 2429, 2000, 2070, 3423, 8519, 1012, 102]

['[CLS]', 'president', 'donald', 'trump', 'may', 'have', 'broken', 'a', 'u', '.', 's', '.', 'federal', 'law', 'and', 'a', 'georgia', 'law', 'against', 'election', 'tam', '##per', '##ing', 'by', 'press', '##uring', 'the', 'state', "'", 's', 'top', 'election', 'official', 'to', '"', 'find', '"', 'enough', 'votes', 'to', 'over', '##turn', 'his', 'loss', 'to', 'president', '-', 'elect', 'joe', 'bid', '##en', 'in', 'the', 'state', ',', 'according', 'to', 'some', 'legal', 'experts', '.', '[SEP]']

### Merge sub-word tokens back into words, tagging each word with its first sub-word token's tag

In [39]:
def merge_wordpieces(tokens, tags):
    """Collapse "##"-prefixed continuation pieces back into whole words.

    Each rebuilt word keeps the tag that belongs to its first piece; the tags
    of the continuation pieces are discarded.

    Args:
        tokens: sub-word token strings in sentence order.
        tags: one tag per token, aligned with ``tokens``. If the two differ in
            length, the extra items of the longer one are ignored.

    Returns:
        A ``(words, word_tags)`` pair of equal-length lists.
    """
    words = []
    word_tags = []
    for piece, piece_tag in zip(tokens, tags):
        is_continuation = piece.startswith("##")
        if is_continuation and words:
            # Glue the continuation onto the word currently being built.
            words[-1] += piece[2:]
        else:
            # A new word starts here. A continuation piece with nothing before
            # it is treated as a word of its own rather than raising.
            words.append(piece[2:] if is_continuation else piece)
            word_tags.append(piece_tag)
    return words, word_tags


# Every word is emitted, including the final one (the previous loop only
# flushed a word when the *next* word began, so the last group was lost).
concatenated_bpes, concatenated_tags = merge_wordpieces(bpe_tok_sent, result)

### Sklearn sample

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [2]:
# Load the NER corpus and keep only the first 10,000 token rows so the
# sample stays small.
df = pd.read_csv("data/ner_dataset.csv", encoding="ISO-8859-1").iloc[:10000]
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [3]:
# Forward-fill missing cells: in the preview above "Sentence #" is only
# present on the first token of a sentence, so this propagates it to the
# following rows.
# DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated in
# recent pandas releases.
df = df.ffill()

In [4]:
# Preview the first rows again to confirm the forward-fill took effect.
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [5]:
# Number of token rows per tag, as a two-column frame (Tag, counts).
(
    df.groupby("Tag")
    .size()
    .reset_index(name="counts")
)

Unnamed: 0,Tag,counts
0,B-art,28
1,B-eve,10
2,B-geo,244
3,B-gpe,303
4,B-nat,5
5,B-org,176
6,B-per,160
7,B-tim,149
8,I-art,20
9,I-eve,10


In [6]:
# Features: every column except the target, turned into one dict per row
# and vectorised into a dense matrix.
v = DictVectorizer(sparse=False)
X = v.fit_transform(df.drop(columns="Tag").to_dict("records"))

# Target: the tag of each token row.
y = df["Tag"].values

# Sorted list of the distinct tags present in the sample.
classes = np.unique(y).tolist()

# Reproducible 67/33 split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)
X_train.shape, y_train.shape

((6700, 3242), (6700,))

In [10]:
# Inspect the held-out labels produced by train_test_split.
y_test

array(['O', 'O', 'O', ..., 'O', 'B-geo', 'O'], dtype=object)

In [None]:
# Inspect the full label array taken from the Tag column.
y

In [None]:
# Shape of the first row of the vectorised feature matrix.
X[0].shape

In [14]:
# Label of the first token row.
y[0]

'O'