# Train model

In [1]:
import numpy as np
import pandas as pd
import torch
from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification
from transformers import XLMRobertaTokenizerFast, XLMRobertaForTokenClassification
from transformers import TrainingArguments, Trainer
from datasets import load_metric

In [2]:
train_df = pd.read_json("train_processed.json").sample(1000)
val_df = pd.read_json("val_processed.json")
train_df

Unnamed: 0,id,tokens,label
176939,176939,"[bank, mand, ir, haji, jua, ,, kota, baru]","[street, street, street, O, O, O, O, O]"
10853,10853,"[perumahan, taman, setiabudi, blok, d23]","[POI, POI, POI, O, O]"
35892,35892,"[batung, taba, nan, xx, kamp, jua, 5, 25155, l...","[O, O, O, O, street, street, O, O, O, O]"
150941,150941,"[d, &amp;, k, batteray, ,, pasar, hitam, ,]","[POI, POI, POI, POI, O, street, street, O]"
133188,133188,"[gemi, (, dea, huper, optik, ), ,, haji, nawi,...","[POI, POI, POI, POI, POI, POI, O, street, stre..."
...,...,...,...
28569,28569,"[zae, zakse, i, kotalama, kedungkandang]","[street, street, street, O, O]"
192368,192368,"[gal, antik, cv, jati, permata, indah, raya, c...","[O, O, O, O, O, O, O, O, O]"
289922,289922,"[sekar, jepun, iii, 11, kesiman, kertalangu, d...","[street, street, street, O, O, O, O, O]"
251185,251185,"[taman, licin]","[street, O]"


Visualize the tokens and their corresponding labels

In [3]:
train_texts = train_df["tokens"].to_list()
train_tags = train_df["label"].to_list()

unique_tags = set(tag for label in train_tags for tag in label)
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}

train_tag_ids = [[tag2id[tag] for tag in sample] for sample in train_tags]

print(unique_tags)
df = pd.DataFrame()
df["text"] = train_texts[0]
df["tag"] = train_tags[0]
df["tag_id"] = train_tag_ids[0]
df.T

{'POI', 'O', 'street'}


Unnamed: 0,0,1,2,3,4,5,6,7
text,bank,mand,ir,haji,jua,",",kota,baru
tag,street,street,street,O,O,O,O,O
tag_id,2,2,2,1,1,1,1,1


## Create the tokenizer

In [4]:
tokenizer = XLMRobertaTokenizerFast.from_pretrained('xlm-roberta-base')

train_encodings = tokenizer(train_texts, is_split_into_words=True, padding=True, truncation=True)

Visualize the results of tokenizer

In [11]:
idx = 0

df = pd.DataFrame()
df["input ids"] = train_encodings["input_ids"][idx]
df["attention mask"] = train_encodings["attention_mask"][idx]
df["tokens"] = tokenizer.convert_ids_to_tokens(train_encodings["input_ids"][idx])
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,37,38,39,40,41
input ids,0,4620,14134,193,117790,1129,11,6,4,20553,...,1,1,1,1,1,1,1,1,1,1
attention mask,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
tokens,<s>,▁bank,▁mand,▁ir,▁haji,▁ju,a,▁,",",▁kota,...,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>


## Align the labels

The tokenizer use sub-word tokenization, thus we need to re-align the labels with the tokens.
- Use `.word_ids()`

In [9]:
print(train_texts[idx])
print(tokenizer.convert_ids_to_tokens(train_encodings["input_ids"][idx]))
print(tokenizer(train_texts, is_split_into_words=True).word_ids())

['bank', 'mand', 'ir', 'haji', 'jua', ',', 'kota', 'baru']
['<s>', '▁bank', '▁mand', '▁ir', '▁haji', '▁ju', 'a', '▁', ',', '▁kota', '▁baru', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
[None, 0, 1, 2, 3, 4, 4, 5, 5, 6, 7, None]


In [12]:
def tokenize_and_align_labels(texts, tags, tokenizer, label_all_tokens=True):
    tokenized_inputs = tokenizer(texts, padding=True, truncation=True, is_split_into_words=True)

    labels = []

    for i, label in enumerate(tags):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
                
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

train_encodings = tokenize_and_align_labels(train_texts, train_tag_ids, tokenizer)

Visualize the re-aligned labels

In [13]:
df = pd.DataFrame()
df["input_ids"] = train_encodings.input_ids[0]
df["tokens"] = tokenizer.convert_ids_to_tokens(train_encodings.input_ids[0])
df["labels"] = train_encodings.labels[0]
df[:20].T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
input_ids,0,4620,14134,193,117790,1129,11,6,4,20553,3510,2,1,1,1,1,1,1,1,1
tokens,<s>,▁bank,▁mand,▁ir,▁haji,▁ju,a,▁,",",▁kota,▁baru,</s>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>
labels,-100,2,2,2,1,1,1,1,1,1,1,-100,-100,-100,-100,-100,-100,-100,-100,-100


## Create the dataset

In [14]:
class AddressDataset(torch.utils.data.Dataset):
    def __init__(self, df, tag2id, tokenizer, tokenize_and_align_labels):
        tokens = df["tokens"].to_list()
        labels = df["label"].to_list()
        tags = [[tag2id[x] for x in sample] for sample in labels]
        
        self.encodings = tokenize_and_align_labels(tokens, tags, tokenizer)
    
    def __getitem__(self, idx):
        return {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}

    def __len__(self):
        return len(self.encodings["labels"])
    
train_dataset = AddressDataset(train_df, tag2id, tokenizer, tokenize_and_align_labels)
val_dataset = AddressDataset(val_df, tag2id, tokenizer, tokenize_and_align_labels)

Make sure the dataset is correct

In [15]:
sample = train_dataset[0]
df = pd.DataFrame()
df["input_ids"] = sample["input_ids"]
df["tokens"] = tokenizer.convert_ids_to_tokens(sample["input_ids"])
df["labels"] = sample["labels"]
df[:20].T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
input_ids,0,4620,14134,193,117790,1129,11,6,4,20553,3510,2,1,1,1,1,1,1,1,1
tokens,<s>,▁bank,▁mand,▁ir,▁haji,▁ju,a,▁,",",▁kota,▁baru,</s>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>
labels,-100,2,2,2,1,1,1,1,1,1,1,-100,-100,-100,-100,-100,-100,-100,-100,-100


## Model fine-tuning

In [10]:
model = DistilBertForTokenClassification.from_pretrained('distilbert-base-uncased', num_labels=len(unique_tags))

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN t

Define evaluation metrics

In [16]:
from seqeval.metrics import accuracy_score, precision_score, recall_score, f1_score

class ComputeMetrics:
    def __init__(self, id2tag):
        self.id2tag = id2tag
    
    def compute(self, p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [
            [self.id2tag[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        true_labels = [
            [self.id2tag[l] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]

        return {
            "precision": precision_score(true_labels, true_predictions),
            "recall": recall_score(true_labels, true_predictions),
            "f1":f1_score(true_labels, true_predictions),
            "accuracy": accuracy_score(true_labels, true_predictions),
        }

compute_metrics = ComputeMetrics(id2tag).compute

Use `Trainer` to train the model

In [26]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=64,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,             # evaluation dataset
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

Step,Training Loss
10,1.2735


TrainOutput(global_step=12, training_loss=1.2702716588974, metrics={'train_runtime': 2.7208, 'train_samples_per_second': 4.41, 'total_flos': 50173244100000.0, 'epoch': 3.0, 'init_mem_cpu_alloc_delta': 66156536, 'init_mem_gpu_alloc_delta': 0, 'init_mem_cpu_peaked_delta': 283842, 'init_mem_gpu_peaked_delta': 544821248, 'train_mem_cpu_alloc_delta': 164478, 'train_mem_gpu_alloc_delta': 534162944, 'train_mem_cpu_peaked_delta': 786081, 'train_mem_gpu_peaked_delta': 983777792})