In [1]:
from torch.utils.data import DataLoader
from model import SimpleSequenceTagger
from ner_dataset import NERDataset
import numpy as np

In [2]:
# Locations of the three `.conll` data splits.
train_file = './data/train.conll'  # path to training data
dev_file = './data/dev.conll'  # path to validation (dev) data
test_file = './data/test.conll'  # path to test data
num_epochs = 1

# Only the training loader is shuffled. The dev/test loaders keep the file
# order so that evaluation runs iterate the data in a reproducible order.
train_dataset = NERDataset(file=train_file)
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)
dev_dataset = NERDataset(file=dev_file)
dev_dataloader = DataLoader(dev_dataset, batch_size=1, shuffle=False)
test_dataset = NERDataset(file=test_file)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [3]:
# Instantiate the tagger; input_dim=50 matches the 50-d embedding file
# loaded further down in this notebook.
seq_tagger = SimpleSequenceTagger(
    input_dim=50,
    hidden_dim=100,
    num_layers=1,
    class_size=9,
)

for epoch in range(num_epochs):
    # train() is called once per epoch; its return value is reported as the loss.
    loss = seq_tagger.train(data=train_dataloader, learning_rate=0.01)
    print("Iteration:", epoch, "Loss:", loss)

    # Score the current model on the dev split and report macro/micro F1.
    metrics = seq_tagger.evaluate(dev_dataloader)
    dev_f1 = metrics['f1_scores']
    print('DEV-Data', 'macro', dev_f1['macro'], 'micro', dev_f1['micro'])

Iteration: 0 Loss: 3.7049267292022705
r [0.8258029395753946, 0.588937093275488, 0.6032811334824758, 0.7736156351791531, 0.08949416342412451, 0.2543352601156069, 0.1278295605858855, 0.35960214231063503, 0.9851727121775533]
p [0.7364077669902913, 0.8510971786833855, 0.6059925093632959, 0.5818701510820743, 0.5227272727272727, 0.5569620253164557, 0.37209302325581395, 0.5696969696969697, 0.9662805367588027]
r 4.608070640126317
p 5.763127433874361
classes 9
macro_recall 0.5120078489029241
macro_precison 0.6403474926527069
tp 47096.0
fn 51362.0
fp 51362.0
DEV-Data macro 0.5690309758462043 micro 0.9169424866632919


In [5]:
# Evaluate on the test split and report macro/micro F1.
metrics = seq_tagger.evaluate(test_dataloader)
test_f1 = metrics['f1_scores']
print('TEST-Data', 'macro', test_f1['macro'], 'micro', test_f1['micro'])

# Entries are indexed as (word, count); keep only those with count >= 8
# so the printed list stays focused on the most frequent failures.
failed_words = metrics['word_statistics']['fail']
wrong_word_predictions = [entry for entry in failed_words if entry[1] >= 8]
print('TEST-Data', 'wrong_predicted_words', wrong_word_predictions)

context_statistics = metrics['context_statistics']
print('TEST-Data', 'context_statistics', context_statistics)

[0.7889688249400479, 0.688034188034188, 0.6742925948223961, 0.7136672850958565, 0.0, 0.0, 0.15449101796407186, 0.2378892733564014, 0.984187041724291]
[0.706766917293233, 0.7134416543574594, 0.5530864197530864, 0.6151385927505331, 0.0, 0.0, 0.3467741935483871, 0.46928327645051193, 0.9664334947600379]
tp 42194.0
fn 46435.0
fp 46435.0
TEST-Data macro 0.4783617079158998 micro 0.908668030580381
TEST-Data wrong_predicted_words [('cup', 84), ('new', 74), ('of', 36), ('national', 36), ('york', 31), ('real', 25), ('open', 24), ('united', 24), ('international', 24), ('indies', 23), ('world', 22), ('states', 21), ('south', 20), ('fe', 20), ('coast', 19), ('santa', 19), ('wto', 19), ('league', 18), ('bre-x', 15), ('chicago', 15), ('steers', 15), ('city', 15), ('state', 15), ('west', 14), ('korea', 14), ('san', 14), ('and', 13), ('east', 13), ('albright', 13), ('super', 12), ('zealand', 12), ('jersey', 12), ('lara', 12), ('major', 12), ('american', 12), ('van', 12), ('bay', 11), ('newsroom', 11), (

In [8]:
# Confusion matrix from whichever evaluate() result is currently stored in `metrics`.
# NOTE(review): which axis holds gold vs. predicted labels is not visible here —
# confirm against SimpleSequenceTagger.evaluate.
c_m = metrics['confusion_matrix']
c_m

array([[1.3160e+03, 9.0000e+00, 1.6700e+02, 2.7000e+01, 0.0000e+00,
        0.0000e+00, 2.3000e+01, 3.0000e+01, 9.6000e+01],
       [2.5000e+01, 4.8300e+02, 3.9000e+01, 1.6000e+01, 0.0000e+00,
        0.0000e+00, 5.0000e+00, 4.0000e+00, 1.3000e+02],
       [1.0100e+02, 3.8000e+01, 1.1200e+03, 5.0000e+01, 0.0000e+00,
        0.0000e+00, 6.5000e+01, 1.4000e+01, 2.7300e+02],
       [3.6000e+01, 7.0000e+00, 8.8000e+01, 1.1540e+03, 0.0000e+00,
        0.0000e+00, 3.0000e+00, 1.8200e+02, 1.4700e+02],
       [1.0900e+02, 7.0000e+00, 4.5000e+01, 7.0000e+00, 0.0000e+00,
        0.0000e+00, 4.4000e+01, 4.0000e+00, 4.1000e+01],
       [7.0000e+00, 1.2000e+01, 8.8000e+01, 2.0000e+00, 0.0000e+00,
        0.0000e+00, 1.2000e+01, 1.0000e+00, 9.4000e+01],
       [1.1300e+02, 2.1000e+01, 2.0700e+02, 1.9000e+01, 4.0000e+00,
        0.0000e+00, 1.2900e+02, 6.0000e+00, 3.3600e+02],
       [1.9000e+01, 1.1000e+01, 1.1300e+02, 5.3700e+02, 2.0000e+00,
        0.0000e+00, 6.0000e+00, 2.7500e+02, 1.9300e+02],


In [10]:
# For every class index, print the total of its confusion-matrix row
# followed by the total of the column with the same index.
for class_idx, row in enumerate(c_m):
    row_total = np.sum(row)
    column_total = np.sum(c_m[:, class_idx])
    print('row', row_total)
    print('column', column_total)

row 1668.0
column 1862.0
row 702.0
column 677.0
row 1661.0
column 2025.0
row 1617.0
column 1876.0
row 257.0
column 8.0
row 216.0
column 2.0
row 835.0
column 372.0
row 1156.0
column 586.0
row 38323.0
column 39027.0


In [9]:
# Number of items in the training split (presumably one item per sentence — TODO confirm in NERDataset).
len(train_dataset)

14041

In [None]:
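# Error-analysis notes:
# - "world": ambiguous token (it appears among the frequently mispredicted test words)
# - "1996-12-06": date token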

In [5]:
def _generate_vocab(embeddings_file):
    # generating vocab by loading embeddings
    vocab = {}
    f = open(embeddings_file, encoding='utf-8')
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        vocab[word] = coefs
    f.close()
    return vocab

In [6]:
# Load the embedding table (token -> float32 vector) from the GloVe text file;
# the file name indicates 50-dimensional vectors.
vocab = _generate_vocab('./embeddings/glove.6B.50d.txt')

In [18]:
# Inspect the embedding of 'indies', one of the frequently mispredicted test words.
vocab['indies']

array([-7.1778e-01, -9.6705e-01, -9.9380e-01,  2.3378e-01, -3.3539e-01,
       -1.0643e+00, -6.3922e-01,  6.3635e-01,  7.8194e-01, -4.6878e-01,
        5.0464e-04,  9.9005e-01, -8.0727e-01, -6.3993e-01, -3.1274e-01,
       -3.0091e-02,  5.1863e-01, -5.9698e-01, -1.8351e+00,  3.1572e-02,
       -5.3637e-01,  9.2632e-01,  5.0294e-01,  1.1958e-01,  5.4456e-01,
       -4.2571e-01,  8.1545e-01, -5.9448e-01, -4.7707e-01,  5.3242e-01,
        1.7212e+00,  6.6461e-01,  4.9676e-01,  1.0170e+00,  1.2407e+00,
        5.7347e-02,  5.6608e-01,  1.7260e-02, -1.0164e+00, -9.2046e-01,
       -7.1847e-02,  2.4281e-01,  1.8554e-01, -1.8488e-01, -9.0554e-01,
        2.4608e+00, -3.0118e-01, -1.8025e-01,  1.0110e-01, -8.5820e-01],
      dtype=float32)

In [None]:
# 2022 https://arxiv.org/pdf/2204.04391.pdf
# https://arxiv.org/pdf/1910.02403.pdf

# https://kiarashk76.github.io/docs/DL4NLP.pdf