In [1]:
import torch
import pandas as pd
from transformers import BertTokenizerFast, BertForTokenClassification, DistilBertTokenizer, DistilBertForTokenClassification

from tqdm.notebook import tqdm
import random

In [2]:
class DataSequence(torch.utils.data.Dataset):

    def __init__(self, lines, labels, labels_to_ids):
        tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
        self.texts = [tokenizer(line, padding='max_length', max_length=25, truncation=True, return_tensors="pt", is_split_into_words=True) for line in lines]
        self.labels = [create_label_array(self.texts[j].word_ids(), labels[j], labels_to_ids) for j in range(len(lines))]    

    def __len__(self):

        return len(self.labels)

    def __getitem__(self, idx):

        batch_data = self.texts[idx]
        batch_labels = torch.LongTensor(self.labels[idx])

        return {
            'input_ids': batch_data['input_ids'].flatten(),
            'attention_mask': batch_data['attention_mask'].flatten(),
            'labels': batch_labels
        }
    
    
def clean_line(line, unique_labels):
    line = line.strip().lower()
    line_data = [i.split('/') for i in line.split("\t")[2].split(' ') if 'fsep' not in i]
    line_data = [list(i) for i in zip(*line_data)]
    unique_labels.update(line_data[1])
    
    return line_data

def create_label_array(word_ids, original_labels, labels_to_ids):
    try:
        t = [original_labels[i] if i is not None else -100 for i in word_ids]
        return [labels_to_ids[tt] if tt in labels_to_ids else tt for tt in t]
    except IndexError:
        print(f"Error for index {idx}")
        raise
        

def get_data_sequences(fh, num_lines, train_percent, seed=0):
    random.seed(seed)
    unique_labels = set()
    
    train_lines = []
    train_labels = []
    test_lines = []
    test_labels = []
    for i in range(num_lines):
        clean_lines, clean_labels = clean_line(fh.readline(), unique_labels)
        if random.random() < train_percent:
            target_lines = train_lines
            target_labels = train_labels
        else:
            target_lines = test_lines
            target_labels = test_labels
            
        target_lines.append(clean_lines)
        target_labels.append(clean_labels)

        
    labels_to_ids = {k: v for v, k in enumerate(sorted(unique_labels))}
    ids_to_labels = {v: k for v, k in enumerate(sorted(unique_labels))}

    return len(unique_labels), labels_to_ids, DataSequence(train_lines, train_labels, labels_to_ids), DataSequence(test_lines, test_labels, labels_to_ids)

In [3]:
fh = open("data/uk_openaddresses_formatted_addresses_tagged.random.tsv", "r")
num_labels, labels_to_ids, dseq_train, dseq_test = get_data_sequences(fh, 5000, 0.9, seed=20220807)


In [4]:
print(f"dseq_test: {len(dseq_test.texts)}")
print(f"dseq_train: {len(dseq_train.texts)}")
print(dseq_test.labels[0])
print(num_labels)
print(labels_to_ids)

dseq_test: 532
dseq_train: 4468
[-100, 3, 5, 5, 5, 4, 4, 4, 4, 1, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]
7
{'': 0, 'city': 1, 'country': 2, 'house_number': 3, 'postcode': 4, 'road': 5, 'sep': 6}


In [5]:
class BertModel(torch.nn.Module):

    def __init__(self, num_labels):

        super(BertModel, self).__init__()

        self.bert = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

    def forward(self, input_id, mask, label):

        output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)

        return output

In [6]:
def train_loop(model, train_dataset, val_dataset, batch_size):

    train_dataloader = torch.utils.data.DataLoader(train_dataset, num_workers=0, batch_size=batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, num_workers=0, batch_size=batch_size)

    use_cuda = False # out of memory :(  torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    if use_cuda:
        model = model.cuda()

    best_acc = 0
    best_loss = 1000

    for epoch_num in range(EPOCHS):

        total_acc_train = 0
        total_loss_train = 0

        model.train()

        for sample_batch in tqdm(train_dataloader, total=len(train_dataloader)):

            train_label = sample_batch['labels'].to(device)
            mask = sample_batch['attention_mask'].to(device)
            input_id = sample_batch['input_ids'].to(device)

            optimizer.zero_grad()
            loss, logits = model(input_id, mask, train_label)

            # todo better loss - consider counting bad tokens
                
            loss.sum().backward()
            optimizer.step()
            total_loss_train += loss

        # todo - save model
            
        model.eval()

        total_acc_val = 0
        total_loss_val = 0

        for test_batch in train_dataloader:

            val_label = test_batch['labels'].to(device)
            mask = test_batch['attention_mask'].to(device)
            input_id = test_batch['input_ids'].to(device)

            loss, logits = model(input_id, mask, val_label)

            logits_clean = logits[val_label != -100]
            label_clean = val_label[val_label != -100]

            predictions = logits_clean.argmax(dim=1)          

            acc = (predictions == label_clean).float().mean() / batch_size
            total_acc_val += acc
            total_loss_val += loss.item()

        val_accuracy = total_acc_val / len(dseq_test.texts)
        val_loss = total_loss_val / len(dseq_test.texts)

        print(
            f'Epochs: {epoch_num + 1} | Loss: {total_loss_train / len(dseq_train.texts): .3f} | Accuracy: {total_acc_train / len(dseq_train.texts): .3f} | Val_Loss: {total_loss_val / len(dseq_test.texts): .3f} | Accuracy: {total_acc_val / len(dseq_test.texts): .3f}')

LEARNING_RATE = 1e-3
EPOCHS = 5

model = BertModel(num_labels)
train_loop(model, dseq_train, dseq_test, 4)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

HBox(children=(FloatProgress(value=0.0, max=1117.0), HTML(value='')))


Epochs: 1 | Loss:  0.380 | Accuracy:  0.000 | Val_Loss:  3.151 | Accuracy:  0.189


HBox(children=(FloatProgress(value=0.0, max=1117.0), HTML(value='')))




KeyboardInterrupt: 

In [159]:
# below here is debug/diagnose

model = BertModel(num_labels)
train_loop(model, dseq_train, dseq_test, 2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

HBox(children=(FloatProgress(value=0.0, max=2234.0), HTML(value='')))


Epochs: 1 | Loss:  0.774 | Accuracy:  0.000 | Val_Loss:  6.834 | Accuracy:  1.097


HBox(children=(FloatProgress(value=0.0, max=2234.0), HTML(value='')))


Epochs: 2 | Loss:  0.759 | Accuracy:  0.000 | Val_Loss:  6.339 | Accuracy:  1.100


HBox(children=(FloatProgress(value=0.0, max=2234.0), HTML(value='')))


Epochs: 3 | Loss:  0.758 | Accuracy:  0.000 | Val_Loss:  6.314 | Accuracy:  1.475


HBox(children=(FloatProgress(value=0.0, max=2234.0), HTML(value='')))


Epochs: 4 | Loss:  0.759 | Accuracy:  0.000 | Val_Loss:  6.361 | Accuracy:  1.479


HBox(children=(FloatProgress(value=0.0, max=2234.0), HTML(value='')))


Epochs: 5 | Loss:  0.759 | Accuracy:  0.000 | Val_Loss:  6.349 | Accuracy:  1.477


In [9]:
model = BertModel(num_labels)
train_dataloader = torch.utils.data.DataLoader(dseq_train, num_workers=0, batch_size=1, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(dseq_test, num_workers=0, batch_size=1)

use_cuda = False # out of memory :(  torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

if use_cuda:
    model = model.cuda()

best_acc = 0
best_loss = 1000
model.train()

i= 0
for sample_batch in tqdm(train_dataloader, total=len(train_dataloader)):
    i+=1
    if i > 1:
        break
    train_label = sample_batch['labels'].to(device)
    mask = sample_batch['attention_mask'].to(device)
    input_id = sample_batch['input_ids'].to(device)

    optimizer.zero_grad()
    loss, logits = model(input_id, mask, train_label)

    # todo better loss - consider counting bad tokens

    loss.sum().backward()
    optimizer.step()



model.eval()

total_acc_val = 0
total_loss_val = 0

for test_batch in train_dataloader:

    val_label = test_batch['labels'].to(device)
    val_mask = test_batch['attention_mask'].to(device)
    val_input_id = test_batch['input_ids'].to(device)

    test_loss, test_logits = model(val_input_id, val_mask, val_label)

    logits_clean = test_logits[val_label != -100]
    label_clean = val_label[val_label != -100]

    predictions = logits_clean.argmax(dim=1)          

    acc = (predictions == label_clean).float().mean()
    total_acc_val += acc
    total_loss_val += loss.item()
    break

val_accuracy = total_acc_val / len(dseq_test.texts)
val_loss = total_loss_val / len(dseq_test.texts)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

HBox(children=(FloatProgress(value=0.0, max=4468.0), HTML(value='')))




In [11]:
test_logits.argmax(dim=2)

tensor([[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
         4]])

In [146]:
val_label[val_label != -100].shape

torch.Size([328])

In [173]:
test_logits.argmax(dim=2)

tensor([[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
         5]])

In [105]:
loss

tensor(2.0226, grad_fn=<NllLossBackward0>)

In [138]:
train_label = sample_batch['labels'].to(device)
mask = sample_batch['attention_mask'].to(device)
input_id = sample_batch['input_ids'].to(device)
loss, logits = model(input_id, mask, train_label)

In [142]:
logits[0, :, :]

tensor([[-9.2377,  0.0756, -0.0279, -0.8616,  0.8167,  0.4795, -6.1726, -9.0814],
        [-9.2377,  0.0756, -0.0279, -0.8616,  0.8167,  0.4795, -6.1726, -9.0814],
        [-9.2377,  0.0756, -0.0279, -0.8616,  0.8167,  0.4795, -6.1726, -9.0814],
        [-9.2377,  0.0756, -0.0279, -0.8616,  0.8167,  0.4795, -6.1726, -9.0814],
        [-9.2377,  0.0756, -0.0279, -0.8616,  0.8167,  0.4795, -6.1726, -9.0814],
        [-9.2377,  0.0756, -0.0279, -0.8616,  0.8167,  0.4795, -6.1726, -9.0814],
        [-9.2377,  0.0756, -0.0279, -0.8616,  0.8167,  0.4795, -6.1726, -9.0814],
        [-9.2377,  0.0756, -0.0279, -0.8616,  0.8167,  0.4795, -6.1726, -9.0814],
        [-9.2377,  0.0756, -0.0279, -0.8616,  0.8167,  0.4795, -6.1726, -9.0814],
        [-9.2377,  0.0756, -0.0279, -0.8616,  0.8167,  0.4795, -6.1726, -9.0814],
        [-9.2377,  0.0756, -0.0279, -0.8616,  0.8167,  0.4795, -6.1726, -9.0814],
        [-9.2377,  0.0756, -0.0279, -0.8616,  0.8167,  0.4795, -6.1726, -9.0814],
        [-9.2377

In [75]:
logits[0, :, :]

tensor([[-0.0345,  0.3386,  0.1161,  0.0342,  0.1419, -0.4926, -0.0822],
        [ 0.1287,  0.1278, -0.2769, -0.0328,  0.2805, -0.2797, -0.2588],
        [-0.1182,  0.0642,  0.2909, -0.1873, -0.0376, -0.5870,  0.6340],
        [ 0.2929,  0.1122,  0.3456, -0.5560, -0.1414, -0.6846,  0.0358],
        [-0.0971, -0.3623,  0.3397,  0.3219,  0.0487, -0.5433,  0.0436],
        [-0.0797, -0.0513, -0.1123,  0.4865,  0.1557, -0.3265,  0.1303],
        [ 0.1666, -0.1134,  0.0131, -0.5321, -0.2246, -0.1194,  0.0015],
        [-0.1751,  0.1675,  0.0526, -0.0153,  0.0076, -0.3112, -0.3186],
        [-0.2348,  0.1760,  0.0915,  0.1199,  0.1011, -0.3250, -0.3097],
        [-0.1697,  0.1996,  0.0621,  0.0056, -0.0347, -0.2210, -0.3963],
        [-0.2471,  0.1525,  0.0947,  0.1459,  0.1054, -0.3120, -0.2813],
        [-0.1382,  0.1749,  0.0549,  0.0837,  0.0231, -0.2560, -0.2628],
        [ 0.1957,  0.1232,  0.1402,  0.0383, -0.1520, -0.0341, -0.0914],
        [-0.0515,  0.1926, -0.0053, -0.0053, -0.031

In [76]:
print(labels_to_ids)

{'': 0, 'city': 1, 'country': 2, 'house_number': 3, 'postcode': 4, 'road': 5, 'sep': 6}


In [110]:
input_id[:5, :]

tensor([[  101, 15876,  2509, 26424,  2100,  6738,  2142,  2983,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0],
        [  101,  1015,  3614,  3702,  2346, 11503,  5677,  3051,  2142,  2983,
         19739, 16576,  1014,  3240,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0],
        [  101,  1022,  2146,  4644,  6174, 15154,  5172,  2142,  2983,  1059,
          2094,  2509,  1022,  6672,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0],
        [  101,  6146, 17916,  2395,  3393,  2509,  1019,  3022, 11258,  2142,
          2983,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0],
        [  101, 15355,  2015,  2346, 22545,  4642, 16147,  1019,  2546,  2078,
          2142,  2983,   102,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0, 

In [154]:
labels = {}
for l in dseq_train.labels:
    for ll in l:
        if ll == -100:
            continue
        if ll not in labels:
            labels[ll] = 0
        labels[ll] += 1

In [155]:
labels

{3: 3088, 5: 10157, 4: 15250, 1: 6371, 2: 6760, 6: 18}