In [74]:
pip install pytorch-lightning

Collecting pytorch-lightning
  Downloading pytorch_lightning-1.5.8-py3-none-any.whl (526 kB)
[K     |████████████████████████████████| 526 kB 5.5 MB/s eta 0:00:01
[?25hCollecting pyDeprecate==0.3.1
  Downloading pyDeprecate-0.3.1-py3-none-any.whl (10 kB)
Collecting fsspec[http]!=2021.06.0,>=2021.05.0
  Downloading fsspec-2022.1.0-py3-none-any.whl (133 kB)
[K     |████████████████████████████████| 133 kB 55.0 MB/s eta 0:00:01
Collecting tensorboard>=2.2.0
  Downloading tensorboard-2.7.0-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 47.0 MB/s eta 0:00:01
[?25hCollecting torchmetrics>=0.4.1
  Downloading torchmetrics-0.7.0-py3-none-any.whl (396 kB)
[K     |████████████████████████████████| 396 kB 31.3 MB/s eta 0:00:01
Collecting future>=0.17.1
  Downloading future-0.18.2.tar.gz (829 kB)
[K     |████████████████████████████████| 829 kB 57.5 MB/s eta 0:00:01
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp38-cp38-macosx_10_9_x86_64.whl (574 kB)
[K

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import torch
from tqdm import tqdm
import numpy as np

## Data Processing

In [2]:
# Load only the three columns used downstream. Malformed rows are dropped with
# a warning instead of aborting the load: `on_bad_lines="warn"` is the
# pandas >= 1.3 replacement for the deprecated `error_bad_lines=False`.
df = pd.read_csv(
    "../../data/ner.csv",
    encoding="ISO-8859-1",
    on_bad_lines="warn",
    usecols=["sentence_idx", "word", "tag"],
)



  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# Keep only real token rows: discard rows whose sentence_idx holds the literal
# string 'prev-lemma' (presumably stray header/misaligned rows -- verify against
# the CSV) and rows with no sentence id at all, then renumber the index.
is_token_row = df["sentence_idx"].ne("prev-lemma")
df = (
    df.loc[is_token_row]
    .dropna(subset=["sentence_idx"])
    .reset_index(drop=True)
)

In [36]:
def get_data(df):
    """Split a token-level frame into per-sentence word and tag lists.

    Rows are grouped by the ``sentence_idx`` column; for every group the
    ``word`` and ``tag`` columns are collected as plain Python lists.

    Args:
        df: DataFrame with ``sentence_idx``, ``word`` and ``tag`` columns.

    Returns:
        A ``(sentences, sentence_tags)`` tuple of parallel lists, one entry per
        group, in the iteration order of ``df.groupby("sentence_idx")``.
    """
    all_words, all_tags = [], []

    for _, rows in tqdm(df.groupby("sentence_idx")):
        sentence_words = rows["word"].tolist()
        sentence_labels = rows["tag"].tolist()

        # Both lists come from the same group, so they must line up one-to-one.
        assert len(sentence_words) == len(sentence_labels)

        all_words.append(sentence_words)
        all_tags.append(sentence_labels)

    return all_words, all_tags

In [5]:
sentences, tags = get_data(df)

100%|██████████| 36684/36684 [00:03<00:00, 10078.43it/s]


In [6]:
print(
    list(zip(sentences[0], tags[0]))
)

[('Thousands', 'O'), ('of', 'O'), ('demonstrators', 'O'), ('have', 'O'), ('marched', 'O'), ('through', 'O'), ('London', 'B-geo'), ('to', 'O'), ('protest', 'O'), ('the', 'O'), ('war', 'O'), ('in', 'O'), ('Iraq', 'B-geo'), ('and', 'O'), ('demand', 'O'), ('the', 'O'), ('withdrawal', 'O'), ('of', 'O'), ('British', 'B-gpe'), ('troops', 'O'), ('from', 'O'), ('that', 'O'), ('country', 'O'), ('.', 'O')]


## Tokenization

In [37]:
def tokenize(data):
    """Map every distinct item in ``data`` to an integer id.

    Ids are assigned in order of first appearance, starting at 0.

    Args:
        data: Iterable of rows, each row an iterable of hashable items
            (e.g. words or tag strings).

    Returns:
        tuple: ``(tokenized_data, word_2_idx, idx_2_word)`` where
        ``tokenized_data`` is a list of id lists mirroring ``data``,
        ``word_2_idx`` maps item -> id and ``idx_2_word`` maps id -> item.
    """
    word_2_idx, idx_2_word = {}, {}
    tokenized_data = []

    for row in tqdm(data):
        tokenized_row = []
        for word in row:
            # Single lookup per token; ids are never None, so None means "new".
            word_id = word_2_idx.get(word)
            if word_id is None:
                word_id = len(word_2_idx)
                word_2_idx[word] = word_id
                idx_2_word[word_id] = word
            tokenized_row.append(word_id)
        tokenized_data.append(tokenized_row)

    return tokenized_data, word_2_idx, idx_2_word

In [38]:
tokenized_sentences, word_2_idx, idx_2_word = tokenize(sentences)

100%|██████████| 36684/36684 [00:00<00:00, 103819.52it/s]


In [39]:
tokenized_tags, tag_2_idx, idx_2_tag = tokenize(tags)

100%|██████████| 36684/36684 [00:00<00:00, 217283.76it/s]


In [40]:
list(zip(tokenized_sentences[0], sentences[0], tags[0], tokenized_tags[0]))

[(0, 'Thousands', 'O', 0),
 (1, 'of', 'O', 0),
 (2, 'demonstrators', 'O', 0),
 (3, 'have', 'O', 0),
 (4, 'marched', 'O', 0),
 (5, 'through', 'O', 0),
 (6, 'London', 'B-geo', 1),
 (7, 'to', 'O', 0),
 (8, 'protest', 'O', 0),
 (9, 'the', 'O', 0),
 (10, 'war', 'O', 0),
 (11, 'in', 'O', 0),
 (12, 'Iraq', 'B-geo', 1),
 (13, 'and', 'O', 0),
 (14, 'demand', 'O', 0),
 (9, 'the', 'O', 0),
 (15, 'withdrawal', 'O', 0),
 (1, 'of', 'O', 0),
 (16, 'British', 'B-gpe', 2),
 (17, 'troops', 'O', 0),
 (18, 'from', 'O', 0),
 (19, 'that', 'O', 0),
 (20, 'country', 'O', 0),
 (21, '.', 'O', 0)]

In [41]:
# Reserve one extra vocabulary id for padding. `setdefault` keeps this cell
# idempotent: re-running it reuses the existing id instead of moving "<PAD>" to
# index len(word_2_idx) (out of range for an embedding sized len(word_2_idx))
# and leaving a stale "<PAD>" entry behind in idx_2_word.
PAD_IDX = word_2_idx.setdefault("<PAD>", len(word_2_idx))
idx_2_word[PAD_IDX] = "<PAD>"

In [42]:
len(word_2_idx), len(idx_2_word), len(tag_2_idx), len(idx_2_tag)

(30173, 30173, 17, 17)

In [43]:
tag_2_idx

{'O': 0,
 'B-geo': 1,
 'B-gpe': 2,
 'B-per': 3,
 'I-geo': 4,
 'B-org': 5,
 'I-org': 6,
 'B-tim': 7,
 'B-art': 8,
 'I-art': 9,
 'I-per': 10,
 'I-gpe': 11,
 'I-tim': 12,
 'B-nat': 13,
 'B-eve': 14,
 'I-eve': 15,
 'I-nat': 16}

In [44]:
# Flatten the per-sentence tag ids and count how often each tag id occurs.
# A named loop variable is used instead of `_`, which IPython reserves for the
# last cell output (assigning to it stops IPython from updating it).
t = [tag_id for sentence_tag_ids in tokenized_tags for tag_id in sentence_tag_ids]
np.bincount(t)

array([889973,  37525,  16392,  17011,   7409,  20184,  16537,  20193,
          434,    280,  17382,    229,   6298,    226,    348,    297,
           76])

## Train Test datasets

In [45]:
from sklearn.model_selection import train_test_split

In [46]:
# Fix the split seed so the train/test partition (and every metric computed
# from it) is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    tokenized_sentences, tokenized_tags, test_size=0.15, random_state=42
)

In [47]:
len(X_train), len(y_train), len(X_test), len(y_test)

(31181, 31181, 5503, 5503)

In [48]:
list(zip(X_train[0], y_train[0]))

[(16303, 8),
 (59, 0),
 (14807, 0),
 (31, 0),
 (296, 0),
 (297, 0),
 (1343, 0),
 (1891, 0),
 (254, 0),
 (18, 0),
 (1839, 3),
 (8893, 10),
 (93, 0),
 (45, 0),
 (8592, 0),
 (1, 0),
 (9, 0),
 (8513, 5),
 (6223, 6),
 (21, 0),
 (16303, 8),
 (59, 0),
 (14807, 0),
 (31, 0),
 (296, 0),
 (297, 0),
 (1343, 0),
 (1891, 0),
 (254, 0),
 (18, 0),
 (1839, 3),
 (8893, 10),
 (93, 0),
 (45, 0),
 (8592, 0),
 (1, 0),
 (9, 0),
 (8513, 5),
 (6223, 6),
 (21, 0)]

In [49]:
class NERDataset(torch.utils.data.Dataset):
    """Dataset of pre-tokenized sentences and their per-token tag ids.

    Args:
        x: Sequence of sentences, each a list of integer word ids.
        y: Sequence of tag-id lists, parallel to ``x``.
    """

    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        """Return the number of sentences."""
        return len(self.x)

    def __getitem__(self, i):
        """Return ``(word_ids, tag_ids)`` for sentence ``i`` as 1-D int64 tensors."""
        # Build integer tensors directly. torch.Tensor(list) would produce
        # float32 tensors, which cannot represent every integer above 2**24
        # exactly and need a later cast back to long.
        words = torch.tensor(self.x[i], dtype=torch.long)
        tags = torch.tensor(self.y[i], dtype=torch.long)

        return words, tags

def collate(batch):
    """Pad a list of ``(word_ids, tag_ids)`` pairs into two batch tensors.

    Args:
        batch: Sequence of ``(sentence, tags)`` tensor pairs, one per sample.

    Returns:
        ``(sentences, tags)`` long tensors in batch-first layout. Sentences are
        padded with the ``"<PAD>"`` id looked up in the global ``word_2_idx``;
        tags are padded with ``-1``.
    """
    pad_sequence = torch.nn.utils.rnn.pad_sequence
    word_seqs, tag_seqs = zip(*batch)

    padded_words = pad_sequence(
        word_seqs, batch_first=True, padding_value=word_2_idx["<PAD>"]
    )
    padded_tags = pad_sequence(tag_seqs, batch_first=True, padding_value=-1)

    return padded_words.long(), padded_tags.long()

In [50]:
# Both loaders share the batch size and the padding collate function; only the
# training loader shuffles its samples.
BATCH_SIZE = 64

train_dataset = NERDataset(X_train, y_train)
test_dataset = NERDataset(X_test, y_test)

train_dataloader = torch.utils.data.DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate
)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate
)

## CNN

In [51]:
import sys
sys.path.append("..")

In [52]:
from models.cnn import CnnNER
import torch.nn as nn
from sklearn.metrics import f1_score

In [53]:
model_config = {
    "n_classes": len(tag_2_idx),
    "n_embeddings": len(word_2_idx),
    "embed_dims": 50,
    "n_cnn_layers": 4,
    "n_cnn_channels": 64,
    "cnn_kernel_size": 5,
    "cnn_padding": 2,
}

In [54]:
model = CnnNER(**model_config)

In [55]:
model

CnnNER(
  (embedding_layer): Embedding(30173, 50)
  (cnn_layers): Sequential(
    (0): Sequential(
      (0): Conv1d(50, 64, kernel_size=(5,), stride=(1,), padding=(2,))
      (1): ReLU()
      (2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (3): Dropout(p=0.5, inplace=False)
    )
    (1): Sequential(
      (0): Conv1d(64, 64, kernel_size=(5,), stride=(1,), padding=(2,))
      (1): ReLU()
      (2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (3): Dropout(p=0.5, inplace=False)
    )
    (2): Sequential(
      (0): Conv1d(64, 64, kernel_size=(5,), stride=(1,), padding=(2,))
      (1): ReLU()
      (2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (3): Dropout(p=0.5, inplace=False)
    )
    (3): Sequential(
      (0): Conv1d(64, 64, kernel_size=(5,), stride=(1,), padding=(2,))
      (1): ReLU()
      (2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, tra

In [56]:
loss_fn = nn.CrossEntropyLoss(ignore_index=-1)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)

In [None]:
def train(epochs, model):
    """Train ``model`` and evaluate it after every epoch.

    Relies on notebook-level globals: ``optimizer`` and ``loss_fn`` (which must
    have been created for this ``model``), plus ``train_dataloader`` and
    ``test_dataloader``. After each pass it prints the mean batch loss, token
    accuracy (%) and weighted F1 for the training and validation data.

    Args:
        epochs: Number of passes over ``train_dataloader``.
        model: Module that maps a batch of token ids to per-token logits with
            the class dimension last.
    """

    def train_batch(x, y, training):
        """Run one batch; weights are updated only when ``training`` is true.

        Returns ``(preds, targets, n_correct, n_tokens, loss, macro_f1)`` where
        predictions and targets are restricted to non-padded positions.
        """
        if training:
            model.train()
            optimizer.zero_grad()
        else:
            model.eval()

        # Track gradients only when we are going to backpropagate, so
        # evaluation batches do not build an autograd graph.
        with torch.set_grad_enabled(training):
            preds = model(x)
            # The loss expects the class dimension second, hence the transpose.
            loss = loss_fn(preds.transpose(1, 2), y)

        if training:
            loss.backward()
            optimizer.step()

        preds = preds.argmax(dim=-1)

        # Negative targets mark padded positions; exclude them from metrics.
        mask = (y >= 0)
        y_masked = y[mask]
        preds_masked = preds[mask]

        corr = (preds_masked == y_masked).sum().item()
        f1 = f1_score(y_masked, preds_masked, average='macro')

        return preds_masked, y_masked, corr, y_masked.numel(), loss.item(), f1

    def run_epoch(loader, epoch_number, training):
        """Iterate once over ``loader``; return (mean loss, accuracy %, weighted F1)."""
        n_correct = 0
        n_tokens = 0
        loss_sum = 0.
        all_preds = []
        all_true = []
        loop = tqdm(loader, leave=False, position=0)
        loop.set_description(f"Epoch {epoch_number}")
        for x, y in loop:
            batch_preds, batch_true, batch_corr, batch_total, batch_loss, f1 = train_batch(
                x, y, training
            )

            n_correct += batch_corr
            n_tokens += batch_total
            loss_sum += batch_loss
            all_preds.extend(batch_preds.tolist())
            all_true.extend(batch_true.tolist())

            loop.set_postfix(acc=(batch_corr/batch_total) * 100., loss=batch_loss, f1=f1)

        return (
            loss_sum/len(loader),
            n_correct/n_tokens * 100,
            f1_score(all_true, all_preds, average='weighted'),
        )

    for e in range(epochs):
        loss, acc, f1 = run_epoch(train_dataloader, e + 1, training=True)
        print(f"Train Epoch {e+1} loss={loss} acc={acc} f1={f1}")

        loss, acc, f1 = run_epoch(test_dataloader, e + 1, training=False)
        print(f"Val Epoch {e+1} loss={loss} acc={acc} f1={f1}")

In [59]:
train(5, model)

                                                                                          

Train Epoch 0 loss=0.31728164698989664 acc=90.92238642632468 f1=0.900732996794532


Epoch 1:   0%|          | 0/86 [00:01<?, ?it/s, acc=91.5, f1=0.412, loss=0.303]

Val Epoch 0 loss=0.2591884900317636 acc=92.59051825639786 f1=0.921351201632474


                                                                                          

Train Epoch 1 loss=0.2906424852668262 acc=91.74808178848815 f1=0.9111680432421218


Epoch 2:   0%|          | 0/86 [00:01<?, ?it/s, acc=92, f1=0.429, loss=0.275]  

Val Epoch 1 loss=0.2375396994310756 acc=93.24541919387995 f1=0.9286728527694147


                                                                                          

Train Epoch 2 loss=0.26774571258880075 acc=92.47071909013496 f1=0.9196284950723157


Epoch 3:   0%|          | 0/86 [00:01<?, ?it/s, acc=92.7, f1=0.447, loss=0.251]

Val Epoch 2 loss=0.22309148744788282 acc=93.72275223869198 f1=0.9340931831578947


                                                                                          

Train Epoch 3 loss=0.2485033975089671 acc=93.08168218505469 f1=0.9266667654721753


Epoch 4:   0%|          | 0/86 [00:01<?, ?it/s, acc=93.4, f1=0.483, loss=0.234]

Val Epoch 3 loss=0.20792901931807053 acc=94.01678939429618 f1=0.9378462032257242


                                                                                          

Train Epoch 4 loss=0.23263662888622674 acc=93.56351498482104 f1=0.9321187519208465


Epoch 5:   0%|          | 0/86 [00:01<?, ?it/s, acc=93.8, f1=0.502, loss=0.213]

Val Epoch 4 loss=0.19837765066429627 acc=94.37638028805459 f1=0.9419144090111531


## LSTM

In [60]:
from models.lstm import LSTMNER

In [61]:
model_config = {
    "n_classes": len(tag_2_idx),
    "n_embeddings": len(word_2_idx),
    "embed_dims": 50,
    "n_lstm_layers": 2,
    "lstm_dims": 128,
}

In [62]:
model = LSTMNER(**model_config)

In [63]:
model

LSTMNER(
  (embedding_layer): Embedding(30173, 50)
  (lstm_layers): LSTM(50, 128, num_layers=2, batch_first=True, bidirectional=True)
  (output_layer): Linear(in_features=256, out_features=17, bias=True)
)

In [66]:
loss_fn = nn.CrossEntropyLoss(ignore_index=-1)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)

In [None]:
train(10, model)

                                                                                           

Train Epoch 1 loss=0.6473111892577077 acc=85.97481623550502 f1=0.8138471382900698


Epoch 1:   0%|          | 0/86 [00:13<?, ?it/s, acc=88.8, f1=0.319, loss=0.423]

Val Epoch 1 loss=0.3863755727923194 acc=89.6660578018495 f1=0.8736550118635578


                                                                                          

Train Epoch 2 loss=0.3016435936337612 acc=91.9036200122864 f1=0.9092482044492394


Epoch 2:   0%|          | 0/86 [00:14<?, ?it/s, acc=93.5, f1=0.529, loss=0.247]

Val Epoch 2 loss=0.2482122607355894 acc=93.31033648797438 f1=0.9282904077231984


                                                                                           

Train Epoch 3 loss=0.2102910633519536 acc=94.16921887361232 f1=0.9375413813972469


Epoch 3:   0%|          | 0/86 [00:13<?, ?it/s, acc=95, f1=0.575, loss=0.189]  

Val Epoch 3 loss=0.19839633221543113 acc=94.46739178859875 f1=0.9420542942193333


                                                                                           

Train Epoch 4 loss=0.16648826364916366 acc=95.21759126121357 f1=0.949519244822825


Epoch 4:   0%|          | 0/86 [00:14<?, ?it/s, acc=94.1, f1=0.663, loss=0.219]

## CNN+LSTM