In [1]:
import math
from tokenizers import Tokenizer
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm.notebook import tqdm
import random

C:\Users\riguy\Anaconda3\envs\work37\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll
C:\Users\riguy\Anaconda3\envs\work37\lib\site-packages\numpy\.libs\libopenblas.XWYDX2IKJW2NMTWSFYNGFUWKQU3LYTCZ.gfortran-win_amd64.dll
  stacklevel=1)


In [2]:
tokenizer = Tokenizer.from_file("tokenizers/ukonly.json")       

In [3]:
class DataSequence(torch.utils.data.Dataset):

    def __init__(self, lines, labels, labels_to_ids):
        self.texts = [tokenizer.encode(line, is_pretokenized=True) for line in lines]
        self.labels = [create_label_array(self.texts[j].word_ids, labels[j], labels_to_ids) for j in range(len(lines))]    

    def __len__(self):

        return len(self.labels)

    def __getitem__(self, idx):

        batch_data = self.texts[idx]
        batch_labels = torch.tensor(self.labels[idx])

        return {
            'input_ids': torch.tensor(batch_data.ids).flatten(),
            'attention_mask': torch.tensor(batch_data.attention_mask),
            'labels': batch_labels
        }


def clean_line(line, unique_labels):
    line = line.strip().lower()
    line_data = [i.split('/') for i in line.split("\t")[2].split(' ') if 'fsep' not in i]
    line_data = [list(i) for i in zip(*line_data)]
    unique_labels.update(line_data[1])
    
    return line_data

def create_label_array(word_ids, original_labels, labels_to_ids):
    try:
        t = [original_labels[i] if i is not None else -100 for i in word_ids]
        return [labels_to_ids[tt] if tt in labels_to_ids else tt for tt in t]
    except IndexError:
        print(f"Error for index {idx}")
        raise
        

def get_data_sequences(fh, num_lines, train_percent, seed=0):
    random.seed(seed)
    unique_labels = set()
    
    train_lines = []
    train_labels = []
    test_lines = []
    test_labels = []
    for i in range(num_lines):
        clean_lines, clean_labels = clean_line(fh.readline(), unique_labels)
        if random.random() < train_percent:
            target_lines = train_lines
            target_labels = train_labels
        else:
            target_lines = test_lines
            target_labels = test_labels
            
        target_lines.append(clean_lines)
        target_labels.append(clean_labels)

        
    labels_to_ids = {k: v for v, k in enumerate(sorted(unique_labels))}

    return len(unique_labels), labels_to_ids, DataSequence(train_lines, train_labels, labels_to_ids), DataSequence(test_lines, test_labels, labels_to_ids)

In [4]:
fh = open("data/uk_openaddresses_formatted_addresses_tagged.random.tsv", "r")
num_labels, labels_to_ids, train_dataset, test_dataset = get_data_sequences(fh, 5000, 0.9, seed=20220807)

In [5]:
train_dataset[3]

{'input_ids': tensor([   0, 2201,   63, 8311,   57,   56, 8322, 6525,    1,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0]),
 'labels': tensor([-100,    5,    5,    1,    2,    2,    4,    4, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100])}

In [6]:
class PositionalEncoding(nn.Module):
    r"""Inject some information about the relative or absolute position of the tokens in the sequence.
        The positional encodings have the same dimension as the embeddings, so that the two can be summed.
        Here, we use sine and cosine functions of different frequencies.
    .. math:
        \text{PosEncoder}(pos, 2i) = sin(pos/10000^(2i/d_model))
        \text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model))
        \text{where pos is the word position and i is the embed idx)
    Args:
        d_model: the embed dim (required).
        dropout: the dropout value (default=0.1).
        max_len: the max. length of the incoming sequence (default=5000).
    Examples:
        >>> pos_encoder = PositionalEncoding(d_model)
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        r"""Inputs of forward function
        Args:
            x: the sequence fed to the positional encoder model (required).
        Shape:
            x: [sequence length, batch size, embed dim]
            output: [sequence length, batch size, embed dim]
        Examples:
            >>> output = pos_encoder(x)
        """

        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class TransformerModel(nn.Module):
    """Container module with an encoder, a recurrent or transformer module, and a decoder."""

    def __init__(self, ntoken, nlabels, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        try:
            from torch.nn import TransformerEncoder, TransformerEncoderLayer
        except:
            raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.')
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, nlabels)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask
    
    def init_weights(self):
        initrange = 0.1
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, src, has_mask=True):
        if has_mask:
            device = src.device
            if self.src_mask is None or self.src_mask.size(0) != len(src):
                mask = self._generate_square_subsequent_mask(len(src)).to(device)
                self.src_mask = mask
        else:
            self.src_mask = None

        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        output = self.decoder(output)
        return F.log_softmax(output, dim=-1)

In [8]:
def train_loop(model, train_dataset, val_dataset):

    train_dataloader = torch.utils.data.DataLoader(train_dataset, num_workers=0, batch_size=4, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, num_workers=0, batch_size=1)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    cel = nn.CrossEntropyLoss()

    if use_cuda:
        model = model.cuda()

    best_acc = 0
    best_loss = 1000

    for epoch_num in range(EPOCHS):

        total_acc_train = 0
        total_loss_train = 0

        model.train()

        for sample_batch in tqdm(train_dataloader, total=len(train_dataloader)):

            train_label = sample_batch['labels'].to(device)
            mask = sample_batch['attention_mask'].to(device)
            input_id = sample_batch['input_ids'].to(device)

            optimizer.zero_grad()
            outputs = model(input_id)
            loss = cel(outputs.permute(0, 2, 1), train_label)

            loss.sum().backward()
            optimizer.step()
            total_loss_train += loss

        # todo - save model
            
        model.eval()

        total_acc_val = 0
        total_loss_val = 0

        for test_batch in val_dataloader:

            val_label = test_batch['labels'].to(device)
            val_mask = test_batch['attention_mask'].to(device)
            val_input_id = test_batch['input_ids'].to(device)

            output = model(val_input_id)

            logits_clean = output[val_label != -100]
            label_clean = val_label[val_label != -100]

            predictions = logits_clean.argmax(dim=1)

            acc = (predictions == label_clean).float().mean() / val_dataloader.batch_size
            total_acc_val += acc
            loss = cel(outputs.permute(0, 2, 1), train_label)
            total_loss_val += loss.item()

        val_accuracy = total_acc_val / len(test_dataset.texts)
        val_loss = total_loss_val / len(test_dataset.texts)
        
        desc_str = f'Epochs: {epoch_num + 1} | Loss: {total_loss_train / len(train_dataset.texts): .3f} | Accuracy: {total_acc_train / len(train_dataset.texts): .3f} | Val_Loss: {total_loss_val / len(test_dataset.texts): .3f} | Accuracy: {total_acc_val / len(test_dataset.texts): .3f}'
        
        if val_loss < best_loss:
            print(f"Best new loss found {val_loss}")
            best_loss = val_loss
            torch.save(model, f"models/simpler_model_{epoch_num}.pt")
            fh = open(f"models/simpler_model_{epoch_num}_meta.txt")
            fh.write(f"desc_str\n")
            fh.write(f"Learning Rate: {LEARNING_RATE}\n")

        print(desc_str)
            

LEARNING_RATE = 1e-3
EPOCHS = 25

In [9]:
model = TransformerModel(ntoken=tokenizer.get_vocab_size(), nlabels=num_labels, nhid=2048, ninp=512, nhead=8, nlayers=2)
model.init_weights()
train_loop(model, train_dataset, test_dataset)

  return torch._C._cuda_getDeviceCount() > 0


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1117.0), HTML(value='')))


Best new loss found 0.5842580199241638


FileNotFoundError: [Errno 2] No such file or directory: 'models/simpler_model_0.pt'

In [20]:
val_dataloader = torch.utils.data.DataLoader(test_dataset, num_workers=0, batch_size=1)
test_batch = next(iter(val_dataloader))    

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
if use_cuda:
    model = model.cuda()

val_label = test_batch['labels'].to(device)
val_mask = test_batch['attention_mask'].to(device)
val_input_id = test_batch['input_ids'].to(device)
output = model(val_input_id)


In [21]:
output.argmax(dim=2)

tensor([[4, 3, 5, 5, 5, 4, 4, 1, 2, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
         4]], device='cuda:0')

In [22]:
val_label

tensor([[-100,    3,    5,    5,    5,    4,    4,    1,    2, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100]], device='cuda:0')

In [18]:
labels_to_ids

{'': 0,
 'city': 1,
 'country': 2,
 'house_number': 3,
 'postcode': 4,
 'road': 5,
 'sep': 6}