# Document Classification Using KR-SBERT via Transformers

- Check the accuracy of model to apply our pre-trained KoRean S-BERT model to a document classification task, using HuggingFace's `transformers` library.

In [1]:
!nvidia-smi

Tue Jul 18 01:59:53 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02              Driver Version: 530.30.02    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-PCIE-40GB           Off| 00000000:41:00.0 Off |                    0 |
| N/A   35C    P0               38W / 250W|      0MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                         

## 0. Preparation

In [2]:
!pip install -U transformers sentence-transformers kss accelerate 

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting kss
  Downloading kss-4.5.4.tar.gz (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting accelerate
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1
  Downloading huggingface_hub-

### BNC dataset

Download the Balanced News Corpus for a sentiment classification task.

In [3]:
"""
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1Lg2jL89n3lqkKCulAnk4WwmI8G1hNfIA' -O BalancedNewsCorpusShuffled.zip
!unzip BalancedNewsCorpusShuffled.zip

SyntaxError: EOF while scanning triple-quoted string literal (3846666902.py, line 3)

## 1. Setting on Python

In [3]:
import torch
import pandas as pd
import numpy as np

# For Transformer models
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from sentence_transformers import SentenceTransformer

# For train/dev/test datasets
from torch.utils.data import Dataset
from torch.utils.data import random_split
from torch.nn.functional import pad

# For evaluation
from torch import manual_seed
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

Let us load a `SentenceTransformer` model for sentence embddings and a `BertForSequenceClassification` for classification.

In [40]:
sbert_model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
sbert_model = SentenceTransformer(sbert_model_name)
config = sbert_model._first_module().auto_model.config # for bert token embeddings
#from transformers import BertConfig
#config = BertConfig()
config.num_labels=9
config.max_position_embeddings = sbert_model.max_seq_length    #128
model = BertForSequenceClassification(config)
model.main_input_name = 'inputs_embeds'
max_seq_length = sbert_model.max_seq_length
manual_seed(1234)

<torch._C.Generator at 0x7f6297cef870>

In [41]:
print(len(model.main_input_name))

13


In [42]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

## 2. Building the BNC datasets

We define a new `Dataset` class loading the Balanced News Corpus dataset for the `BertForSequenceClassification`model.

In [43]:
import re

def clean(text:str):
  # https://github.com/YongWookHa/kor-text-preprocess/blob/master/src/clean.py
    not_used = re.compile('[^ .?!/@$%~|0-9|ㄱ-ㅣ가-힣]+')
    dup_space = re.compile('[ \t]+')  # white space duplicate
    dup_stop = re.compile('[\.]+')  # full stop duplicate

    cleaned = not_used.sub('', text.strip())
    cleaned = dup_space.sub(' ', cleaned)
    cleaned = dup_stop.sub('.', cleaned)

    return cleaned

In [44]:
# from kss import split_sentences # Sentence segmentation for the Korean Language
# sent_tokenize = split_sentences

import nltk
nltk.download('punkt')
from nltk import sent_tokenize

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [45]:
def get_sentence_embeddings(text:str, cls_token='[CLS]', sep_token='[SEP]', padding=True, truncate=True, max_len=128):
    sentences = [cls_token] + sent_tokenize(text) + [sep_token]
    embeddings = sbert_model.encode(sentences, convert_to_tensor=True)
    d = sbert_model.get_sentence_embedding_dimension()
    n = len(sentences)

    seq_len = n

    if padding:
        seq_len = max(n, max_len)

    if truncate:
        seq_len = min(seq_len, max_len)

    output = torch.zeros((seq_len, d), dtype=torch.float32).to(device)
    for i in range(min(n, seq_len)):
        output[i] = embeddings[i]

    return output

In [46]:
class BNCDataset(Dataset):

    labels = ['IT/과학', '경제', '문화', '미용/건강', '사회', '생활', '스포츠', '연예', '정치']

    def __init__(self, data_file='BalancedNewsCorpus_train.csv'):
        data = pd.read_csv(data_file)
        self.text = data['News'].apply(lambda text: text.replace('<p>', '\n').replace('</p>', '\n'))
        self.text = self.text.apply(clean).tolist()
        self.label = data['Topic'].apply(lambda label: self.labels.index(label)).tolist()

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        text = self.text[idx]
        label = torch.tensor(self.label[idx]).to(device)
        feature = {'inputs_embeds': get_sentence_embeddings(text), 'labels': label}
        return feature

Load the BNC dataset files we have downloaded.

In [47]:
train_df = pd.read_csv("./BalancedNewsCorpus_train.csv")

In [48]:
train_df.head()

Unnamed: 0,filename,date,NewsPaper,Topic,News
0,NLRW1900000141,20170324,부산일보,스포츠,"<p> 야구 종가, 마침내 정상에 서다 </p> <p> '야구 종가' 미국이 푸에르..."
1,NPRW1900000003,20110209,한국경제신문사,정치,"<p> 외통위 27명중 15명 ""FTA 추가협상안만 처리"" </p> <p> 국회 외..."
2,NLRW1900000144,20100406,영남일보,사회,"<p> 한나라 ""地選후보, 희망연대 당원 구함"" 공천변수 작용 주목 </p> <p>..."
3,NLRW1900000064,20100804,광주매일신문,스포츠,<p> 모처럼 살아난 ‘CK포’ 7타점 합작 </p> <p> KIA 12 3 LG ...
4,NLRW1900000070,20160615,광주매일신문,문화,<p> 亞문화전당서 동방의 등불 만나다 </p> <p> “일찍이 아시아의 황금 시기...


In [31]:
test_df = pd.read_csv("./BalancedNewsCorpus_test.csv")

In [32]:
len(test_df)

1800

In [33]:
train_dataset = BNCDataset('BalancedNewsCorpus_train.csv')
test_dataset = BNCDataset('BalancedNewsCorpus_test.csv')

In [34]:
train_dataset, val_dataset = random_split(train_dataset, [8100, 900], generator=manual_seed(1234))

## 3. Training

In [35]:
#!pip install accelerate -U

In [54]:
args = TrainingArguments(
    output_dir="./bnc-results",
    save_strategy="epoch",
    evaluation_strategy="epoch",
    # eval_steps=10,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    num_train_epochs=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    dataloader_pin_memory=False, # False for GPU
)

We will evaluate our classifier using Accuracy, F1, Precision, and Recall scores. This should be defined as the following.

In [55]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    print(confusion_matrix(labels, preds))
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }

Instantiate the `Trainer`.

In [56]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

Let's train!

In [57]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.978445,0.66,0.632912,0.675187,0.65861
2,No log,0.744105,0.756667,0.752039,0.756107,0.756831
3,No log,0.680357,0.775556,0.767813,0.774262,0.771219
4,No log,0.626381,0.792222,0.787378,0.792799,0.790302
5,No log,0.651894,0.771111,0.768801,0.772233,0.774228
6,No log,0.584599,0.806667,0.799083,0.801566,0.801908
7,No log,0.586437,0.808889,0.806225,0.807693,0.808273
8,0.733000,0.573891,0.817778,0.813671,0.813909,0.814402
9,0.733000,0.576083,0.808889,0.803079,0.804306,0.806536
10,0.733000,0.571981,0.811111,0.805833,0.80661,0.808552


[[ 42   5  10   4   2   3   2   0   7]
 [  9  84   0   2   1   1   2   0  11]
 [  1   1  87   1   1   1   0   4   3]
 [  4   3   2  70   1   0   1   0   1]
 [  1  16   4   9  19   4   3   2  47]
 [  6  18   3  13   5  29   9   1   6]
 [  0   0   0   1   0   4  92   1   3]
 [  1   0  45   1   2   0   4  70   3]
 [  0   8   1   0   2   0   0   0 101]]
[[ 60   2   4   1   2   2   0   2   2]
 [  7  82   0   3   7   3   1   0   7]
 [  1   0  84   1   2   1   0   8   2]
 [  5   2   2  68   2   1   1   0   1]
 [  4   6   4   5  49   7   2   0  28]
 [  6   9   4   5   8  52   1   1   4]
 [  0   0   0   1   1   4  92   1   2]
 [  5   0  16   0   3   0   1 100   1]
 [  1   6   1   0  10   0   0   0  94]]
[[ 60   1   4   0   1   4   1   2   2]
 [  7  91   0   2   2   2   3   0   3]
 [  3   1  72   1   3   5   0  12   2]
 [  6   2   1  69   2   1   1   0   0]
 [  5  15   3   1  54   5   2   4  16]
 [  6  14   3   3   4  52   4   2   2]
 [  0   1   0   1   0   2  95   0   2]
 [  3   0   1   0   1  

TrainOutput(global_step=640, training_loss=0.6814454078674317, metrics={'train_runtime': 1787.9271, 'train_samples_per_second': 45.304, 'train_steps_per_second': 0.358, 'total_flos': 5.12290727165952e+17, 'train_loss': 0.6814454078674317, 'epoch': 10.0})

## 4. Evaluation

In [53]:
trainer.evaluate(test_dataset)

[[137   0  27   9   0  16   0   0  11]
 [ 16  50   2   2   0  53   0   0  77]
 [  2   0 183   0   0   4   0   0  11]
 [ 11   2  26 111   0  25   1   1  23]
 [ 24   2  22   3   1  28   5   1 114]
 [ 14   3  39   8   0  90   2   1  43]
 [  3   0  24   0   0  14 143   2  14]
 [  2   0 145   0   0   1   3  45   4]
 [  0   0   1   0   0   1   0   1 197]]


{'eval_loss': 1.9156763553619385,
 'eval_accuracy': 0.5316666666666666,
 'eval_f1': 0.49262213079186995,
 'eval_precision': 0.706123840883612,
 'eval_recall': 0.5316666666666667,
 'eval_runtime': 33.2014,
 'eval_samples_per_second': 54.215,
 'eval_steps_per_second': 0.452,
 'epoch': 1.0}