In [1]:
import torch
import pandas as pd
from transformers import BertTokenizerFast, BertForTokenClassification

from tqdm.notebook import tqdm
import random

In [68]:
class DataSequence(torch.utils.data.Dataset):

    def __init__(self, lines, labels, labels_to_ids):
        tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
        self.texts = [tokenizer(line, padding='max_length', max_length=25, truncation=True, return_tensors="pt", is_split_into_words=True) for line in lines]
        self.labels = [create_label_array(self.texts[j].word_ids(), labels[j], labels_to_ids) for j in range(len(lines))]    

    def __len__(self):

        return len(self.labels)

    def __getitem__(self, idx):

        batch_data = self.texts[idx]
        batch_labels = torch.LongTensor(self.labels[idx])

        return {
            'input_ids': batch_data['input_ids'].flatten(),
            'attention_mask': batch_data['attention_mask'].flatten(),
            'labels': batch_labels
        }
    
    
def clean_line(line, unique_labels):
    line = line.strip().lower()
    line_data = [i.split('/') for i in line.split("\t")[2].split(' ') if 'fsep' not in i]
    line_data = [list(i) for i in zip(*line_data)]
    unique_labels.update(line_data[1])
    
    return line_data

def create_label_array(word_ids, original_labels, labels_to_ids):
    try:
        t = [original_labels[i] if i is not None else -100 for i in word_ids]
        return [labels_to_ids[tt] if tt in labels_to_ids else tt for tt in t]
    except IndexError:
        print(f"Error for index {idx}")
        raise
        

def get_data_sequences(fh, num_lines, train_percent, seed=0):
    random.seed(seed)
    unique_labels = set()
    
    train_lines = []
    train_labels = []
    test_lines = []
    test_labels = []
    for i in range(num_lines):
        clean_lines, clean_labels = clean_line(fh.readline(), unique_labels)
        if random.random() < train_percent:
            target_lines = train_lines
            target_labels = train_labels
        else:
            target_lines = test_lines
            target_labels = test_labels
            
        target_lines.append(clean_lines)
        target_labels.append(clean_labels)

        
    labels_to_ids = {k: v for v, k in enumerate(sorted(unique_labels))}
    ids_to_labels = {v: k for v, k in enumerate(sorted(unique_labels))}

    return len(unique_labels), labels_to_ids, DataSequence(train_lines, train_labels, labels_to_ids), DataSequence(test_lines, test_labels, labels_to_ids)

In [69]:
fh = open("data/uk_openaddresses_formatted_addresses_tagged.random.tsv", "r")
num_labels, labels_to_ids, dseq_train, dseq_test = get_data_sequences(fh, 5000, 0.9, seed=20220807)


In [70]:
print(f"dseq_test: {len(dseq_test.texts)}")
print(f"dseq_train: {len(dseq_train.texts)}")
print(dseq_test.labels[0])
print(num_labels)
print(labels_to_ids)

dseq_test: 532
dseq_train: 4468
[-100, 3, 5, 5, 5, 4, 4, 4, 4, 1, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]
7
{'': 0, 'city': 1, 'country': 2, 'house_number': 3, 'postcode': 4, 'road': 5, 'sep': 6}


In [72]:
class BertModel(torch.nn.Module):

    def __init__(self, num_labels):

        super(BertModel, self).__init__()

        self.bert = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

    def forward(self, input_id, mask, label):

        output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)

        return output

In [None]:
def train_loop(model, train_dataset, val_dataset):

    train_dataloader = torch.utils.data.DataLoader(train_dataset, num_workers=0, batch_size=4, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, num_workers=0, batch_size=1)

    use_cuda = False # out of memory :(  torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    if use_cuda:
        model = model.cuda()

    best_acc = 0
    best_loss = 1000

    for epoch_num in range(EPOCHS):

        total_acc_train = 0
        total_loss_train = 0

        model.train()

        for sample_batch in tqdm(train_dataloader, total=len(train_dataloader)):

            train_label = sample_batch['labels'].to(device)
            mask = sample_batch['attention_mask'].to(device)
            input_id = sample_batch['input_ids'].to(device)

            optimizer.zero_grad()
            loss, logits = model(input_id, mask, train_label)

            # todo better loss - consider counting bad tokens
                
            loss.sum().backward()
            optimizer.step()
            total_loss_train += loss

        # todo - save model
            
        model.eval()

        total_acc_val = 0
        total_loss_val = 0

        for test_batch in train_dataloader:

            val_label = test_batch['labels'].to(device)
            mask = test_batch['attention_mask'].to(device)
            input_id = test_batch['input_ids'].to(device)

            loss, logits = model(input_id, mask, val_label)

            logits_clean = logits[val_label != -100]
            label_clean = val_label[val_label != -100]

            predictions = logits_clean.argmax(dim=1)          

            acc = (predictions == label_clean).float().mean()
            total_acc_val += acc
            total_loss_val += loss.item()

        val_accuracy = total_acc_val / len(dseq_test.texts)
        val_loss = total_loss_val / len(dseq_test.texts)

        print(
            f'Epochs: {epoch_num + 1} | Loss: {total_loss_train / len(dseq_train.texts): .3f} | Accuracy: {total_acc_train / len(dseq_train.texts): .3f} | Val_Loss: {total_loss_val / len(dseq_test.texts): .3f} | Accuracy: {total_acc_val / len(dseq_test.texts): .3f}')

LEARNING_RATE = 1e-2
EPOCHS = 5

model = BertModel(num_labels)
train_loop(model, dseq_train, dseq_test)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

HBox(children=(FloatProgress(value=0.0, max=1117.0), HTML(value='')))


Epochs: 1 | Loss:  0.385 | Accuracy:  0.000 | Val_Loss:  3.140 | Accuracy:  0.757


HBox(children=(FloatProgress(value=0.0, max=1117.0), HTML(value='')))


Epochs: 2 | Loss:  0.377 | Accuracy:  0.000 | Val_Loss:  3.148 | Accuracy:  0.759


HBox(children=(FloatProgress(value=0.0, max=1117.0), HTML(value='')))


Epochs: 3 | Loss:  0.376 | Accuracy:  0.000 | Val_Loss:  3.145 | Accuracy:  0.759


HBox(children=(FloatProgress(value=0.0, max=1117.0), HTML(value='')))

In [None]:
# below here is debug/diagnose

In [96]:
model = BertModel(num_labels)
train_dataloader = torch.utils.data.DataLoader(dseq_train, num_workers=0, batch_size=32, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(dseq_test, num_workers=0, batch_size=1)

use_cuda = False # out of memory :(  torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

if use_cuda:
    model = model.cuda()

best_acc = 0
best_loss = 1000

for epoch_num in range(EPOCHS):

    total_acc_train = 0
    total_loss_train = 0

    model.train()

    for sample_batch in tqdm(train_dataloader, total=len(train_dataloader)):

        train_label = sample_batch['labels'].to(device)
        mask = sample_batch['attention_mask'].to(device)
        input_id = sample_batch['input_ids'].to(device)

        optimizer.zero_grad()
        loss, logits = model(input_id, mask, train_label)
        print(logits)

        # todo better loss - consider counting bad tokens

        loss.sum().backward()
        optimizer.step()
        total_loss_train += loss
        break
    # todo - save model

    model.eval()

    total_acc_val = 0
    total_loss_val = 0

    for test_batch in train_dataloader:

        val_label = test_batch['labels'].to(device)
        val_mask = test_batch['attention_mask'].to(device)
        val_input_id = test_batch['input_ids'].to(device)

        test_loss, test_logits = model(val_input_id, val_mask, val_label)

        logits_clean = test_logits[val_label != -100]
        label_clean = val_label[val_label != -100]

        predictions = logits_clean.argmax(dim=1)          

        acc = (predictions == label_clean).float().mean()
        total_acc_val += acc
        total_loss_val += loss.item()
        break

    break
    val_accuracy = total_acc_val / len(dseq_test.texts)
    val_loss = total_loss_val / len(dseq_test.texts)
    
    print(
        f'Epochs: {epoch_num + 1} | Loss: {total_loss_train / len(dseq_train.texts): .3f} | Accuracy: {total_acc_train / len(dseq_train.texts): .3f} | Val_Loss: {total_loss_val / len(dseq_test.texts): .3f} | Accuracy: {total_acc_val / len(dseq_test.texts): .3f}')


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

HBox(children=(FloatProgress(value=0.0, max=140.0), HTML(value='')))

tensor([[[ 0.5702,  0.2096,  0.2330,  ...,  0.1489,  0.5878,  0.0542],
         [ 0.2619, -0.3364, -0.6031,  ...,  0.2050,  0.1321, -0.3279],
         [ 0.6245,  0.1645, -0.1261,  ...,  0.1423,  0.1530, -0.9259],
         ...,
         [ 0.2028, -0.2921, -0.1160,  ...,  0.0211,  0.2855, -0.4436],
         [ 0.1668, -0.3710, -0.2973,  ..., -0.1429, -0.1355, -0.4453],
         [ 0.1527, -0.0837, -0.1774,  ...,  0.0602, -0.0205, -0.3852]],

        [[ 0.0346, -0.1563,  0.1150,  ...,  0.1650,  0.5020, -0.1938],
         [ 0.1976,  0.2642, -0.4011,  ..., -0.0282,  0.4597, -0.4557],
         [ 0.4485,  0.3500, -0.1816,  ..., -0.0381,  0.1853, -0.4056],
         ...,
         [ 0.0510, -0.1547, -0.3990,  ...,  0.0501,  0.0459, -0.2227],
         [ 0.1806,  0.0454, -0.2764,  ...,  0.0999, -0.1085, -0.1984],
         [ 0.3257, -0.0235, -0.0421,  ..., -0.0959,  0.1986, -0.1690]],

        [[-0.1253, -0.1298,  0.3417,  ..., -0.0546,  0.2912, -0.1931],
         [ 0.2762,  0.0596,  0.1098,  ..., -0

In [130]:
test_logits[val_label != -100].shape

torch.Size([303, 7])

In [133]:
test_logits

tensor([[[ 2.2821, -0.2896, -0.9865,  ..., -1.4229, -1.7006,  0.7171],
         [ 2.2823, -0.2896, -0.9865,  ..., -1.4231, -1.7007,  0.7172],
         [ 2.2822, -0.2896, -0.9864,  ..., -1.4229, -1.7007,  0.7171],
         ...,
         [ 2.2821, -0.2896, -0.9866,  ..., -1.4228, -1.7006,  0.7171],
         [ 2.2821, -0.2896, -0.9866,  ..., -1.4228, -1.7006,  0.7171],
         [ 2.2821, -0.2896, -0.9865,  ..., -1.4228, -1.7006,  0.7171]],

        [[ 2.1944, -0.2764, -0.9519,  ..., -1.3060, -1.6817,  0.6570],
         [ 2.1943, -0.2764, -0.9519,  ..., -1.3060, -1.6817,  0.6570],
         [ 2.1943, -0.2764, -0.9519,  ..., -1.3060, -1.6817,  0.6570],
         ...,
         [ 2.1943, -0.2764, -0.9519,  ..., -1.3060, -1.6817,  0.6570],
         [ 2.1943, -0.2764, -0.9519,  ..., -1.3060, -1.6817,  0.6570],
         [ 2.1943, -0.2764, -0.9519,  ..., -1.3060, -1.6817,  0.6570]],

        [[ 2.2161, -0.2807, -0.9698,  ..., -1.3319, -1.6847,  0.6717],
         [ 2.2162, -0.2807, -0.9699,  ..., -1

In [124]:
val_label[val_label != -100].shape

torch.Size([303])

In [103]:
logits.argmax(dim=2)

tensor([[5, 0, 0, 4, 0, 0, 0, 4, 3, 5, 0, 5, 3, 0, 0, 4, 0, 0, 3, 5, 3, 3, 5, 0,
         0],
        [5, 5, 0, 0, 6, 0, 6, 0, 0, 4, 3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3, 0,
         0],
        [2, 0, 0, 0, 0, 6, 3, 0, 4, 4, 4, 0, 4, 0, 3, 0, 0, 0, 0, 0, 0, 5, 0, 0,
         4],
        [2, 0, 3, 1, 0, 0, 3, 4, 0, 0, 5, 3, 3, 3, 2, 0, 3, 3, 0, 0, 0, 5, 3, 4,
         0],
        [5, 5, 0, 1, 0, 2, 0, 3, 5, 0, 6, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0],
        [6, 2, 6, 6, 3, 4, 4, 1, 1, 4, 1, 0, 4, 0, 0, 4, 2, 4, 2, 0, 4, 0, 4, 4,
         0],
        [2, 0, 0, 4, 6, 3, 0, 3, 2, 5, 2, 4, 4, 0, 0, 3, 2, 0, 3, 5, 0, 0, 0, 3,
         2],
        [5, 5, 0, 0, 0, 1, 0, 5, 4, 4, 0, 1, 0, 3, 5, 5, 0, 5, 0, 5, 5, 0, 0, 0,
         0],
        [2, 0, 0, 6, 4, 0, 0, 0, 6, 4, 0, 0, 0, 0, 3, 0, 2, 0, 0, 0, 3, 0, 0, 0,
         3],
        [4, 5, 4, 4, 6, 4, 3, 4, 4, 4, 5, 4, 0, 4, 4, 4, 4, 4, 4, 4, 0, 4, 4, 4,
         4],
        [5, 5, 5, 0, 0, 0, 0, 4, 4, 0, 0, 4, 3, 0, 0, 0, 0, 

In [105]:
loss

tensor(2.0226, grad_fn=<NllLossBackward0>)

In [90]:
train_label = sample_batch['labels'].to(device)
mask = sample_batch['attention_mask'].to(device)
input_id = sample_batch['input_ids'].to(device)
loss, logits = model(input_id, mask, train_label)

In [91]:
logits[0, :, :].argmax(dim=1)

tensor([4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4])

In [75]:
logits[0, :, :]

tensor([[-0.0345,  0.3386,  0.1161,  0.0342,  0.1419, -0.4926, -0.0822],
        [ 0.1287,  0.1278, -0.2769, -0.0328,  0.2805, -0.2797, -0.2588],
        [-0.1182,  0.0642,  0.2909, -0.1873, -0.0376, -0.5870,  0.6340],
        [ 0.2929,  0.1122,  0.3456, -0.5560, -0.1414, -0.6846,  0.0358],
        [-0.0971, -0.3623,  0.3397,  0.3219,  0.0487, -0.5433,  0.0436],
        [-0.0797, -0.0513, -0.1123,  0.4865,  0.1557, -0.3265,  0.1303],
        [ 0.1666, -0.1134,  0.0131, -0.5321, -0.2246, -0.1194,  0.0015],
        [-0.1751,  0.1675,  0.0526, -0.0153,  0.0076, -0.3112, -0.3186],
        [-0.2348,  0.1760,  0.0915,  0.1199,  0.1011, -0.3250, -0.3097],
        [-0.1697,  0.1996,  0.0621,  0.0056, -0.0347, -0.2210, -0.3963],
        [-0.2471,  0.1525,  0.0947,  0.1459,  0.1054, -0.3120, -0.2813],
        [-0.1382,  0.1749,  0.0549,  0.0837,  0.0231, -0.2560, -0.2628],
        [ 0.1957,  0.1232,  0.1402,  0.0383, -0.1520, -0.0341, -0.0914],
        [-0.0515,  0.1926, -0.0053, -0.0053, -0.031

In [76]:
print(labels_to_ids)

{'': 0, 'city': 1, 'country': 2, 'house_number': 3, 'postcode': 4, 'road': 5, 'sep': 6}


In [110]:
input_id[:5, :]

tensor([[  101, 15876,  2509, 26424,  2100,  6738,  2142,  2983,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0],
        [  101,  1015,  3614,  3702,  2346, 11503,  5677,  3051,  2142,  2983,
         19739, 16576,  1014,  3240,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0],
        [  101,  1022,  2146,  4644,  6174, 15154,  5172,  2142,  2983,  1059,
          2094,  2509,  1022,  6672,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0],
        [  101,  6146, 17916,  2395,  3393,  2509,  1019,  3022, 11258,  2142,
          2983,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0],
        [  101, 15355,  2015,  2346, 22545,  4642, 16147,  1019,  2546,  2078,
          2142,  2983,   102,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0, 

In [111]:
mask[:5, :]

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0]])

In [112]:
train_label[:5, :]

tensor([[-100,    4,    4,    4,    4,    1,    2,    2, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100],
        [-100,    3,    5,    5,    5,    1,    1,    1,    2,    2,    4,    4,
            4,    4, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100],
        [-100,    3,    5,    5,    1,    1,    1,    2,    2,    4,    4,    4,
            4,    4, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100],
        [-100,    3,    5,    5,    4,    4,    4,    4,    1,    2,    2, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100],
        [-100,    5,    5,    5,    1,    4,    4,    4,    4,    4,    2,    2,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100]])