# Token Classification Finetuner

In [1]:
%load_ext tensorboard
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')
import logging
logging.basicConfig(level=logging.INFO)

#misc
import math
import csv
import numpy as np
import pandas as pd
import re
import glob
import argparse
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm.notebook import tqdm
tqdm.pandas()

#torch 
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

#lightning
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

#huggingface; only works with tokenizers==0.7.0 on mac now
from transformers import (
    AdamW, 
    get_linear_schedule_with_warmup, 
    AutoTokenizer, 
    AutoModel,
    AutoModelForSequenceClassification, 
    AutoConfig,
    Trainer, 
    TrainingArguments
)
from transformers.data.processors.utils import InputFeatures

#thai2transformers
from thai2transformers.datasets import TokenClassificationDataset
from thai2transformers.finetuners import TokenClassificationFinetuner
from thai2transformers.metrics import classification_metrics

In [2]:
# !wget https://www.dropbox.com/s/1n919nldzt2aste/ner_newmm.zip; unzip ner_newmm.zip
# !mkdir data; mv ner_newmm data; ls

## Data Preparation

In [3]:
#tags
with open('data/ner_newmm/tags.txt','r') as f:
    tags = [i.strip() for i in f.readlines()]
tags_dict = {v:k for k,v in enumerate(tags)}
len(tags),tags[:10]

(42,
 ['pad',
  'O',
  'I-person',
  'I-time',
  'I-organisation',
  'E-person',
  'B-person',
  'E-organisation',
  'B-organisation',
  'B-time'])

In [4]:
def get_df(fname):
    with open(f'{fname}.src','r') as f:
        src = [i.strip().replace('||','| |') for i in f.readlines()]
    with open(f'{fname}.trg','r') as f:
        trg = [i.strip() for i in f.readlines()]
    lab = []
    for l in trg:
        lab.append('|'.join([str(tags_dict[i]) for i in l.split('|')]))
    df = pd.DataFrame({'src':src,'label':lab,'trg':trg})
    df['nb_src'] = df.src.map(lambda x: len(x.split('|')))
    df['nb_label'] = df.label.map(lambda x: len(x.split('|')))
    assert (df.nb_src==df.nb_label).sum() == df.shape[0]
    return df

In [5]:
train_df = get_df('data/ner_newmm/train')
valid_df = get_df('data/ner_newmm/valid')
test_df = get_df('data/ner_newmm/test')
train_df.shape, valid_df.shape, test_df.shape, train_df.nb_src.max(), valid_df.nb_src.max(), test_df.nb_src.max()

((1077, 5), (359, 5), (360, 5), 516, 521, 517)

In [6]:
def trunc_df(df,n=40):
    df['src'] = df.src.map(lambda x: '|'.join(x.split('|')[:n]))
    df['label'] = df.label.map(lambda x: '|'.join(x.split('|')[:n]))
    return df
train_df = trunc_df(train_df)
valid_df = trunc_df(valid_df)
test_df = trunc_df(test_df)

In [7]:
# #save
# !rm -r data/train_ner; rm -r data/valid_ner; rm -r data/test_ner;
# !mkdir data/train_ner; mkdir data/valid_ner; mkdir data/test_ner; 
# train_df.iloc[:,:2].to_csv('data/train_ner/train.csv',index=False)
# valid_df.iloc[:,:2].to_csv('data/valid_ner/valid.csv',index=False)
# test_df.iloc[:,:2].to_csv('data/test_ner/test.csv',index=False)

## Dataset

In [8]:
tokenizer = AutoTokenizer.from_pretrained(
    'xlm-roberta-base',
)

In [9]:
%%time
train_dataset = TokenClassificationDataset(tokenizer,'data/train_ner')
valid_dataset = TokenClassificationDataset(tokenizer,'data/valid_ner')
test_dataset = TokenClassificationDataset(tokenizer,'data/test_ner')
len(train_dataset), len(valid_dataset), len(test_dataset)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

◎| |การเมือง|เรื่อง|ชนชั้น| |​| |เลว|หมด|คนโง่|เข้า|ประณต| |​​​| |กราบไหว้|สูงส่ง|ช่าง|งาม|งด| |​​​| |พิ|โธ่| |พิ|ถัง|ลา|งั่ง|ถูก|หลอก|ใช้| |​​​| |ไป่|รู้|  1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1
เป็น|​| |Hotel| |Buffet| |ราคา|กลางๆ|​|หัว|ละ|​| |198| |232| |HKD| |Nett| |ดังนั้น|อาหาร|ให้|เลือก|มี|น้อย|มาก|​| |ดิ่ม|ซำ|มี|แค่|2|อย่าง| |ขนมจีบ|เนื้อ 1|1|1|1|1|1|1|1|1|1|1|1|1|1|18|12|12|12|17|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|13|13|1|1|1
วันนี้|พา|มา|ร้าน|ชิล|ๆ| |ชิวๆ|​| |กับ|โรตี|และ|นม|ต่างๆ| |วันนี้|ได้|สั่ง| |โอ|ดิบ|นมสด| |อพอลโล|ชาเขียว| |และ|ก็|ทิชชู|โอวัน|ติ|ล|รสชาติ|จะ|ไล่|ตามลำดับ|เนอะ| |โอ 1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…





HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…



CPU times: user 10.1 s, sys: 184 ms, total: 10.2 s
Wall time: 10 s


(1074, 359, 360)

## Model

In [10]:
args_dict = {'label_pad_token': '0',
             'label_first_subword': False,
             'num_labels':42,
             'num_hidden':768,
             'train_dir': 'data/train_ner',
             'valid_dir': 'data/valid_ner',
             'test_dir': 'data/test_ner',
             'output_dir': './results',
             'model_name_or_path':'xlm-roberta-base',
             'max_length':128,
             'drop_p': 0.1,
             'learning_rate': 5e-5,
             'weight_decay': 0.01,
             'adam_epsilon': 1e-8,
             'warmup_steps': 100,
             'per_device_train_batch_size':32,
             'per_device_eval_batch_size':64,
             'num_train_epochs': 2,
             'gradient_accumulation_steps':1,
             'max_grad_norm': 1.0,
             'n_gpu': torch.cuda.device_count(),
             'fp_16': False,
             'opt_level': 'O1',
             'seed': 1412,
             'save_total_limit': 1,
             'early_stopping': True,
             'patience': 3
            }

args = argparse.Namespace(**args_dict)
model = TokenClassificationFinetuner(args)

In [11]:
dl = DataLoader(train_dataset, batch_size=7)
batch = next(iter(dl))
label = batch['label']

In [12]:
preds = model(
    input_ids=batch["input_ids"], attention_mask=batch["attention_mask"],
)
pred_labs = preds.argmax(2).numpy()
pred_labs.shape

(7, 128)

In [13]:
i=0
dfs = []
for i in range(batch['input_ids'].shape[0]):
    df = pd.DataFrame({'word_ids':batch['word_ids'][i].numpy(),'label':batch['label'][i].numpy(),'pred_labs':pred_labs[i]})
    df = df[df.label!=0].groupby('word_ids').max().reset_index(drop=True)
    dfs.append(df)
df_batch = pd.concat(dfs)
df_batch

Unnamed: 0,label,pred_labs
0,30,25
1,25,18
2,25,23
3,29,37
4,30,37
...,...,...
28,1,18
29,1,2
30,1,37
31,1,29


In [14]:
pred = argparse.Namespace(
    label_ids=df_batch.label, predictions=df_batch.pred_labs
)
classification_metrics(pred,pred_labs=True)

{'accuracy': 0.004149377593360996,
 'f1_micro': 0.004149377593360996,
 'precision_micro': 0.004149377593360996,
 'recall_micro': 0.004149377593360996,
 'f1_macro': 0.0024691358024691358,
 'precision_macro': 0.0015873015873015873,
 'recall_macro': 0.005555555555555555,
 'nb_samples': 241}

In [15]:
batch['word_ids'].shape

torch.Size([7, 128])

## Lightning

In [16]:
args_dict = {'label_pad_token': '0',
             'label_first_subword': False,
             'num_labels':42,
             'num_hidden':768,
             'train_dir': 'data/train_ner',
             'valid_dir': 'data/valid_ner',
             'test_dir': 'data/test_ner',
             'output_dir': './results',
             'model_name_or_path':'xlm-roberta-base',
             'max_length':128,
             'drop_p': 0.1,
             'learning_rate': 5e-5,
             'weight_decay': 0.01,
             'adam_epsilon': 1e-8,
             'warmup_steps': 100,
             'per_device_train_batch_size':32,
             'per_device_eval_batch_size':64,
             'num_train_epochs': 2,
             'gradient_accumulation_steps':1,
             'max_grad_norm': 1.0,
             'n_gpu': torch.cuda.device_count(),
             'fp_16': False,
             'opt_level': 'O1',
             'seed': 1412,
             'save_total_limit': 1,
             'early_stopping': True,
             'patience': 3
            }

args = argparse.Namespace(**args_dict)

checkpoint_callback = ModelCheckpoint(
    filepath=args.output_dir,
    save_top_k=args.save_total_limit,
    verbose=True,
    monitor='val_loss',
    mode='min'
)

early_stop_callback = EarlyStopping(
   monitor='val_loss',
   min_delta=0.00,
   patience=args.patience,
   verbose=False,
   mode='min'
)
train_params = dict(
    accumulate_grad_batches=args.gradient_accumulation_steps,
    gpus=args.n_gpu,
    max_epochs=args.num_train_epochs,
    precision=16 if args.fp_16 else 32,
    amp_level=args.opt_level,
    gradient_clip_val=args.max_grad_norm,
    checkpoint_callback = checkpoint_callback,
    early_stop_callback=early_stop_callback if args.early_stopping else None
)

In [17]:
model = TokenClassificationFinetuner(args)
trainer = pl.Trainer(**train_params)

GPU available: True, used: True
INFO:lightning:GPU available: True, used: True
TPU available: False, using: 0 TPU cores
INFO:lightning:TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]
INFO:lightning:CUDA_VISIBLE_DEVICES: [0]


In [18]:
!rm -r lightning_logs; rm -r output; mkdir output
trainer.fit(model)


  | Name    | Type             | Params
---------------------------------------------
0 | model   | XLMRobertaModel  | 278 M 
1 | head    | Sequential       | 622 K 
2 | loss_fn | CrossEntropyLoss | 0     
INFO:lightning:
  | Name    | Type             | Params
---------------------------------------------
0 | model   | XLMRobertaModel  | 278 M 
1 | head    | Sequential       | 622 K 
2 | loss_fn | CrossEntropyLoss | 0     


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…





HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

◎| |การเมือง|เรื่อง|ชนชั้น| |​| |เลว|หมด|คนโง่|เข้า|ประณต| |​​​| |กราบไหว้|สูงส่ง|ช่าง|งาม|งด| |​​​| |พิ|โธ่| |พิ|ถัง|ลา|งั่ง|ถูก|หลอก|ใช้| |​​​| |ไป่|รู้|  1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1
เป็น|​| |Hotel| |Buffet| |ราคา|กลางๆ|​|หัว|ละ|​| |198| |232| |HKD| |Nett| |ดังนั้น|อาหาร|ให้|เลือก|มี|น้อย|มาก|​| |ดิ่ม|ซำ|มี|แค่|2|อย่าง| |ขนมจีบ|เนื้อ 1|1|1|1|1|1|1|1|1|1|1|1|1|1|18|12|12|12|17|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|13|13|1|1|1
วันนี้|พา|มา|ร้าน|ชิล|ๆ| |ชิวๆ|​| |กับ|โรตี|และ|นม|ต่างๆ| |วันนี้|ได้|สั่ง| |โอ|ดิบ|นมสด| |อพอลโล|ชาเขียว| |และ|ก็|ทิชชู|โอวัน|ติ|ล|รสชาติ|จะ|ไล่|ตามลำดับ|เนอะ| |โอ 1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…





HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…


Epoch 00000: val_loss reached 2.12986 (best 2.12986), saving model to /home/cstorm125/thai2transformers/_ckpt_epoch_0.ckpt as top 1
INFO:lightning:
Epoch 00000: val_loss reached 2.12986 (best 2.12986), saving model to /home/cstorm125/thai2transformers/_ckpt_epoch_0.ckpt as top 1


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…


Epoch 00001: val_loss reached 1.37551 (best 1.37551), saving model to /home/cstorm125/thai2transformers/_ckpt_epoch_1_v2.ckpt as top 1
INFO:lightning:
Epoch 00001: val_loss reached 1.37551 (best 1.37551), saving model to /home/cstorm125/thai2transformers/_ckpt_epoch_1_v2.ckpt as top 1
Saving latest checkpoint..
INFO:lightning:Saving latest checkpoint..





1

In [19]:
trainer.test()

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…





HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'avg_test_acc': 0.72392429771705,
 'avg_test_f1_macro': 0.05039030268977899,
 'avg_test_f1_micro': 0.72392429771705,
 'avg_test_precision_macro': 2.0562163293594474e-05,
 'avg_test_precision_micro': 0.72392429771705,
 'avg_test_recall_macro': 0.07322287732988887,
 'avg_test_recall_micro': 0.72392429771705,
 'test_loss': 1.4044627986293814,
 'total_samples': 11783}
--------------------------------------------------------------------------------



[{'test_loss': 1.4044627986293814,
  'avg_test_acc': 0.72392429771705,
  'avg_test_f1_micro': 0.72392429771705,
  'avg_test_precision_micro': 0.72392429771705,
  'avg_test_recall_micro': 0.72392429771705,
  'avg_test_f1_macro': 0.05039030268977899,
  'avg_test_precision_macro': 2.0562163293594474e-05,
  'avg_test_recall_macro': 0.07322287732988887,
  'total_samples': 11783}]

In [20]:
# %load_ext tensorboard
# %tensorboard --logdir lightning_logs

In [24]:
test_df.shape

(360, 5)