In [184]:
import torch
import pandas as pd
from transformers import BertTokenizerFast, BertForTokenClassification

from tqdm.notebook import tqdm
import random

In [419]:
class DataSequence(torch.utils.data.Dataset):

    def __init__(self, lines, labels, labels_to_ids):
        tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
        self.texts = [tokenizer(line, padding='max_length', max_length=25, truncation=True, return_tensors="pt", is_split_into_words=True) for line in lines]
        self.labels = [create_label_array(self.texts[j].word_ids(), labels[j], labels_to_ids) for j in range(len(lines))]    

    def __len__(self):

        return len(self.labels)

    def __getitem__(self, idx):

        batch_data = self.texts[idx]
        batch_labels = torch.LongTensor(self.labels[idx])

        return {
            'input_ids': batch_data['input_ids'].flatten(),
            'attention_mask': batch_data['attention_mask'].flatten(),
            'labels': batch_labels
        }
    
    
def clean_line(line, unique_labels):
    line = line.strip().lower()
    line_data = [i.split('/') for i in line.split("\t")[2].split(' ') if 'fsep' not in i]
    line_data = [list(i) for i in zip(*line_data)]
    unique_labels.update(line_data[1])
    
    return line_data

def create_label_array(word_ids, original_labels, labels_to_ids):
    try:
        t = [original_labels[i] if i is not None else -100 for i in word_ids]
        return [labels_to_ids[tt] if tt in labels_to_ids else tt for tt in t]
    except IndexError:
        print(f"Error for index {idx}")
        raise
        

def get_data_sequences(fh, num_lines, train_percent, seed=0):
    random.seed(seed)
    unique_labels = set()
    
    train_lines = []
    train_labels = []
    test_lines = []
    test_labels = []
    for i in range(num_lines):
        clean_lines, clean_labels = clean_line(fh.readline(), unique_labels)
        if random.random() < train_percent:
            target_lines = train_lines
            target_labels = train_labels
        else:
            target_lines = test_lines
            target_labels = test_labels
            
        target_lines.append(clean_lines)
        target_labels.append(clean_labels)

        
    labels_to_ids = {k: v for v, k in enumerate(sorted(unique_labels))}
    ids_to_labels = {v: k for v, k in enumerate(sorted(unique_labels))}

    return len(unique_labels), labels_to_ids, DataSequence(train_lines, train_labels, labels_to_ids), DataSequence(test_lines, test_labels, labels_to_ids)

In [420]:
fh = open("data/uk_openaddresses_formatted_addresses_tagged.random.tsv", "r")
num_labels, labels_to_ids, dseq_train, dseq_test = get_data_sequences(fh, 10000, 0.9, seed=20220807)


In [421]:
print(f"dseq_test: {len(dseq_test.texts)}")
print(f"dseq_train: {len(dseq_train.texts)}")
print(dseq_test.labels[0])
print(num_labels)
print(labels_to_ids)
print(unique_labels)

dseq_test: 1048
dseq_train: 8952
[-100, 3, 5, 5, 5, 4, 4, 4, 4, 1, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]
7
{'': 0, 'city': 1, 'country': 2, 'house_number': 3, 'postcode': 4, 'road': 5, 'sep': 6}
{'', 'road', 'country', 'house_number', 'postcode', 'sep', 'city'}


In [427]:
class BertModel(torch.nn.Module):

    def __init__(self, num_labels):

        super(BertModel, self).__init__()

        self.bert = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

    def forward(self, input_id, mask, label):

        output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)

        return output

In [428]:
def train_loop(model, train_dataset, val_dataset):

    train_dataloader = torch.utils.data.DataLoader(train_dataset, num_workers=0, batch_size=32, shuffle=False) # switch to true
    val_dataloader = torch.utils.data.DataLoader(val_dataset, num_workers=0, batch_size=32)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    if use_cuda:
        model = model.cuda()

    best_acc = 0
    best_loss = 1000

    for epoch_num in range(EPOCHS):

        total_acc_train = 0
        total_loss_train = 0

        model.train()

        for sample_batch in tqdm(train_dataloader, total=len(train_dataloader)):

            train_label = sample_batch['labels'].to(device)
            mask = sample_batch['attention_mask'].to(device)
            input_id = sample_batch['input_ids'].to(device)

            optimizer.zero_grad()
            try:
                loss, logits = model(input_id, mask, train_label)
            except:
                print(f"Label: {train_label}")
                print(f"Input Ids: {input_id.shape}")
                print(f"Mask: {mask.shape}")
                raise

            # todo better loss
                
                
            loss.sum().backward()
            optimizer.step()
            total_loss_train += loss

        # todo - save model
            
        model.eval()

        total_acc_val = 0
        total_loss_val = 0

        for val_data, val_label in val_dataloader:

            val_label = val_label[0].to(device)
            mask = val_data['attention_mask'][0].to(device)

            input_id = val_data['input_ids'][0].to(device)

            loss, logits = model(input_id, mask, val_label)

            logits_clean = logits[0][val_label != -100]
            label_clean = val_label[val_label != -100]

            predictions = logits_clean.argmax(dim=1)          

            acc = (predictions == label_clean).float().mean()
            total_acc_val += acc
            total_loss_val += loss.item()

        val_accuracy = total_acc_val / len(df_val)
        val_loss = total_loss_val / len(df_val)

        print(
            f'Epochs: {epoch_num + 1} | Loss: {total_loss_train / len(df_train): .3f} | Accuracy: {total_acc_train / len(df_train): .3f} | Val_Loss: {total_loss_val / len(df_val): .3f} | Accuracy: {total_acc_val / len(df_val): .3f}')

LEARNING_RATE = 1e-2
EPOCHS = 5

model = BertModel(num_labels+1) # include -100
train_loop(model, dseq_train, dseq_test)

HBox(children=(HTML(value='Downloading pytorch_model.bin'), FloatProgress(value=0.0, max=440473133.0), HTML(va…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=280.0), HTML(value='')))




ValueError: too many values to unpack (expected 2)

In [384]:
train_dataloader = torch.utils.data.DataLoader(dseq_train, num_workers=0, batch_size=32, shuffle=False) # switch to true

In [385]:
f = next(iter(train_dataloader))

In [387]:
f.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [426]:
f['input_ids'].max()

tensor(29504)

In [410]:
model = BertModel(num_labels+1) # include -100


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas