In [1]:
!git clone https://github.com/dpquoc/Vietnamese-Text-Classification

Cloning into 'Vietnamese-Text-Classification'...
remote: Enumerating objects: 31, done.[K
remote: Counting objects: 100% (31/31), done.[K
remote: Compressing objects: 100% (25/25), done.[K
remote: Total 31 (delta 4), reused 25 (delta 1), pack-reused 0 (from 0)[K
Receiving objects: 100% (31/31), 2.38 MiB | 4.62 MiB/s, done.
Resolving deltas: 100% (4/4), done.


In [2]:
import os
import re

from sklearn.model_selection import train_test_split
from typing import Optional, Union
import pandas as pd, numpy as np, torch

from datasets import Dataset
from dataclasses import dataclass

from transformers import AutoTokenizer
from transformers import EarlyStoppingCallback
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from transformers import  AutoModelForSequenceClassification, TrainingArguments, Trainer
from torch.optim import AdamW

# Configs

In [3]:
# Working directory at the time this cell runs; the repository, tokenizer and
# saved-model paths used in later cells are all built from it.
CUR_DIR = os.getcwd()

In [4]:
# Switches read by the model-building cells further down.
USE_PEFT = False  # when True, the model is wrapped with LoRA adapters via `peft`
FREEZE_LAYERS = 0 # number of leading encoder layers to freeze (0 = none). NOTE(review): the previous "DeBERTa large has 24 layers" remark does not describe the RoBERTa-architecture MODEL below; confirm its layer count before raising this
FREEZE_EMBEDDINGS = True # when True, the embedding parameters get requires_grad=False before training
MAX_INPUT = 128 # presumably the maximum tokenized input length (preprocess() also works with 128 tokens); the old "context plus question answer" wording came from a different task
# MODEL = 'deberta_w_phobert_embed'  # disabled alternative model id
MODEL = 'vinai/phobert-base-v2'  # Hugging Face model id passed to from_pretrained
TOKENIZER = f'{CUR_DIR}/Vietnamese-Text-Classification/tokenizer'  # tokenizer folder inside the cloned repository

# Data Loader

In [5]:
# Read the training and validation splits shipped with the cloned repository.
train_csv_path = f'{CUR_DIR}/Vietnamese-Text-Classification/data/train.csv'
valid_csv_path = f'{CUR_DIR}/Vietnamese-Text-Classification/data/val.csv'
train_data, valid_data = (pd.read_csv(csv_path) for csv_path in (train_csv_path, valid_csv_path))

In [6]:
# Peek at the first five rows to confirm the expected columns are present.
train_data.head(n=5)

Unnamed: 0,id,content,index_spans,toxic
0,0,Dừa lắm :)),[],False
1,1,Bấp bênh vl thế,"[9, 10]",True
2,2,Chắc cũng biết ko tồn tại đc bao lâu nữa nên c...,"[53, 54, 55]",True
3,3,Thấy chán ad page này kiến thức thì nông cản c...,"[5, 6, 7, 8, 36, 37, 38, 39, 40, 41, 42, 43, 6...",True
4,4,Giang Giang Đỗ Thị Ngọc Hà trend mới kìa kìa,[],False


In [7]:
def preprocess(example, max_length=None):
    """Tokenize one dataset row and attach its integer class label.

    Args:
        example: Mapping with a 'content' entry (the comment text) and a
            'toxic' entry (truthy for toxic comments).
        max_length: Token budget used for both truncation and padding. When
            omitted, the notebook-level ``MAX_INPUT`` setting is used so the
            sequence length is configured in one place.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the text, with an
        extra 'label' entry holding 1 for toxic rows and 0 otherwise.
    """
    if max_length is None:
        max_length = MAX_INPUT

    text = example['content']
    if text is None or (isinstance(text, float) and text != text):
        # A missing value (None/NaN, e.g. an empty CSV cell) would otherwise
        # crash the string concatenation below; treat it as empty text.
        text = ''
    elif not isinstance(text, str):
        text = str(text)

    # NOTE(review): the explicit "<s>"/"</s>" markers are kept exactly as the
    # model was trained with them; confirm whether the tokenizer also adds its
    # own special tokens, in which case they end up duplicated.
    tokenized_example = tokenizer(
        "<s> " + text + "</s>",
        truncation=True,
        max_length=max_length,
        padding='max_length',
    )

    # Convert the 'toxic' column (True/False) into integer labels (1/0)
    tokenized_example['label'] = int(example['toxic'])

    return tokenized_example


@dataclass
class DataCollatorForClassification:
    """Pad a list of tokenized examples into one batch and attach the labels.

    Attributes:
        tokenizer: Object whose ``pad`` method assembles the batch.
        padding: Forwarded to ``tokenizer.pad``.
        max_length: Forwarded to ``tokenizer.pad``.
        pad_to_multiple_of: Forwarded to ``tokenizer.pad``.
    """

    tokenizer: "PreTrainedTokenizerBase"
    padding: Union[bool, str, "PaddingStrategy"] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, input_batch):
        """Collate ``input_batch`` into padded tensors plus a 'labels' tensor.

        Args:
            input_batch: List of per-example mappings. Each one carries its
                class id under 'label' (or 'labels') next to the tokenizer
                outputs.

        Returns:
            The result of ``tokenizer.pad`` (PyTorch tensors) with an int64
            'labels' tensor added.

        Raises:
            KeyError: If an example has neither a 'label' nor a 'labels' entry.
        """
        label_keys = ('label', 'labels')

        # Read the labels without popping them: the caller's dicts stay
        # intact, so the same examples can be collated more than once.
        labels = [
            example['label'] if 'label' in example else example['labels']
            for example in input_batch
        ]
        features = [
            {key: value for key, value in example.items() if key not in label_keys}
            for example in input_batch
        ]

        # Tokenizer padding (make sure all sequences are the same length)
        batch = self.tokenizer.pad(
            features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt'  # Return tensors (PyTorch format)
        )

        # Add labels to the batch
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)

        return batch

In [8]:
# Load the tokenizer from the folder named in the config cell, so the path is
# defined in one place rather than repeated as a literal here.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)

# CHECK TOKENIZER
# Tokenize a sample sentence and show the resulting sub-word pieces
tokens = tokenizer.tokenize("<s> xin chào, bạn thế nào ?")
print(tokens)

# Map the pieces to ids and decode them back to text as a round-trip check
decoded_text = tokenizer.decode(tokenizer.convert_tokens_to_ids(tokens))
print(decoded_text)

## Save the tokenizer to a local folder
# tokenizer.save_pretrained("./tokenizer")

['<s>', 'xin', 'ch@@', 'à@@', 'o@@', ',', 'bạn', 'thế', 'nào', '?']
<s> xin chào, bạn thế nào ?


In [9]:
# Wrap both pandas frames as `datasets.Dataset` objects for the map step.
train_dataset, valid_dataset = (
    Dataset.from_pandas(frame) for frame in (train_data, valid_data)
)

In [10]:
# Raw CSV columns to drop once each row has been tokenized and labelled.
raw_columns = ('id', 'content', 'index_spans', 'toxic')

train_tokenized_dataset = train_dataset.map(preprocess, remove_columns=list(raw_columns))
valid_tokenized_dataset = valid_dataset.map(preprocess, remove_columns=list(raw_columns))

# train_tokenized_dataset.save_to_disk('train_tokenized_dataset')
# valid_tokenized_dataset.save_to_disk('valid_tokenized_dataset')

Map:   0%|          | 0/8844 [00:00<?, ? examples/s]

Map:   0%|          | 0/1106 [00:00<?, ? examples/s]

# Build Model

In [11]:
# Tokenizer comes from the repository folder; the sequence-classification
# model is instantiated from the checkpoint id set in the config cell.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

config.json:   0%|          | 0.00/678 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
if USE_PEFT:
    print('We are using PEFT.')
    from peft import LoraConfig, get_peft_model, TaskType

    # Last path component of every sub-module in the loaded model,
    # e.g. "query", "dense", "classifier".
    leaf_names = {name.rsplit('.', 1)[-1] for name, _ in model.named_modules()}

    # The previous hard-coded "query_proj"/"value_proj" targets are DeBERTa
    # attention names. The RoBERTa-architecture model loaded above names its
    # projections "query"/"value", so choose whichever pair actually exists.
    if {'query_proj', 'value_proj'} <= leaf_names:
        lora_targets = ['query_proj', 'value_proj']
    else:
        lora_targets = ['query', 'value']

    peft_config = LoraConfig(
        r=8, lora_alpha=4, task_type=TaskType.SEQ_CLS, lora_dropout=0.1,
        bias="none", inference_mode=False,
        target_modules=lora_targets,
        # Only request full fine-tuning for heads this model really has.
        modules_to_save=[name for name in ('classifier', 'pooler') if name in leaf_names],
    )
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

In [13]:
# Resolve the backbone through the model's own `base_model_prefix` instead of
# a hard-coded attribute. The layer-freezing branch used to read
# `model.deberta`, which does not exist on the RoBERTa-architecture model
# loaded above and would fail as soon as FREEZE_LAYERS > 0.
backbone = getattr(model, model.base_model_prefix)

if FREEZE_EMBEDDINGS:
    print('Freezing embeddings.')
    backbone.embeddings.requires_grad_(False)

if FREEZE_LAYERS > 0:
    print(f'Freezing {FREEZE_LAYERS} layers.')
    # Freeze the first FREEZE_LAYERS encoder blocks; later blocks stay trainable.
    for layer in backbone.encoder.layer[:FREEZE_LAYERS]:
        layer.requires_grad_(False)

Freezing embeddings.


# Metric

In [14]:
from sklearn.metrics import f1_score, accuracy_score

def compute_metrics(p):
    """Score a prediction bundle with binary F1 and accuracy.

    Args:
        p: Object exposing `predictions` (per-class scores, one row per
            example) and `label_ids` (the reference class ids).

    Returns:
        Dict with the keys "f1" and "accuracy".
    """
    y_true = p.label_ids
    # The predicted class is the index of the highest score in each row.
    y_pred = np.argmax(p.predictions, axis=1)

    return {
        "f1": f1_score(y_true, y_pred, average="binary"),
        "accuracy": accuracy_score(y_true, y_pred),
    }


# Train and Save

In [15]:
# Training configuration. A single constant learning rate applies to every
# trainable parameter; the per-group optimizer sketched below is disabled.
step_interval = 150  # shared cadence for logging, evaluation and checkpointing

training_args = TrainingArguments(
    # Where checkpoints go and how many are kept
    output_dir='./checkpoints',
    overwrite_output_dir=True,
    save_strategy="steps",
    save_steps=step_interval,
    save_total_limit=2,
    # Optimisation
    learning_rate=3e-5,
    lr_scheduler_type='constant',  # 'cosine' or 'linear' are possible alternatives
    weight_decay=0.01,
    num_train_epochs=2,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    fp16=True,
    # Evaluation and logging
    per_device_eval_batch_size=8,
    eval_strategy='steps',
    eval_steps=step_interval,
    logging_steps=step_interval,
    report_to='none',
    # NOTE(review): with load_best_model_at_end=False the model saved after
    # training is the last one, not the best by accuracy. Confirm this is intended.
    load_best_model_at_end=False,
    metric_for_best_model='accuracy',
)

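# # Create the optimizer with parameter-specific learning rates
# # NOTE: disabled sketch. It addresses `model.deberta`, whereas the model loaded in this
# # notebook exposes `model.roberta`, and it relies on `embedding_lr` / `other_lr`, which
# # are not defined in any visible cell. Adapt both before re-enabling.
# def get_optimizer(model):
#     optimizer_params = [
#         {'params': model.deberta.embeddings.parameters(), 'lr': embedding_lr},  # Learning rate for embedding layer
#         {'params': model.deberta.encoder.parameters(), 'lr': other_lr},  # Learning rate for encoder layers
#         {'params': model.classifier.parameters(), 'lr': other_lr},  # Learning rate for the classifier
#     ]
#     return AdamW(optimizer_params, weight_decay=0.01)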


In [16]:
# Wire model, data, collator and metric function into the Trainer.
classification_collator = DataCollatorForClassification(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=valid_tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=classification_collator,
    compute_metrics=compute_metrics,
    # optimizers=(get_optimizer(model), None)
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [17]:
# Fine-tune, then save to the same CUR_DIR-based location that the
# "Test Saved Model" section loads from, so saving and loading agree even if
# the working directory changes in between.
trainer.train()
trainer.save_model(f'{CUR_DIR}/trained_model')

Step,Training Loss,Validation Loss,F1,Accuracy
150,0.4959,0.342996,0.832487,0.850814
300,0.3854,0.302915,0.867308,0.875226
450,0.3724,0.323963,0.885553,0.889693
600,0.3569,0.414926,0.842105,0.861664
750,0.3747,0.288019,0.878002,0.885172
900,0.3344,0.306521,0.892532,0.893309
1050,0.3439,0.314579,0.889693,0.889693
1200,0.2903,0.361345,0.878906,0.887884
1350,0.2482,0.358582,0.888483,0.889693
1500,0.2806,0.362647,0.891221,0.896926


In [18]:
# Ask PyTorch to hand back cached CUDA memory before the saved model is reloaded.
# NOTE(review): `model` and `trainer` from training are still referenced at this
# point, so the memory they hold is presumably not released by this call.
torch.cuda.empty_cache()

# Test Saved Model

In [19]:
# Re-create the tokenizer from the configured TOKENIZER folder (the same path
# used before training) for the evaluation cells that follow.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)

In [20]:
saved_model_dir = f'{CUR_DIR}/trained_model'

if USE_PEFT:
    # Load the trained adapter from disk on top of a fresh base model.
    # Wrapping the reloaded model with get_peft_model(), as this cell did
    # before, creates a new adapter instead of loading the trained one.
    from peft import PeftModel

    base_model = AutoModelForSequenceClassification.from_pretrained(MODEL)
    model = PeftModel.from_pretrained(base_model, saved_model_dir)
else:
    model = AutoModelForSequenceClassification.from_pretrained(saved_model_dir)

In [21]:
# Inference-only Trainer for the reloaded model. Only the settings that affect
# prediction are kept; the training hyper-parameters copied from the training
# cell had no role here.
# NOTE(review): some transformers releases reject eval_strategy='steps' when no
# eval_dataset is supplied, which is another reason to leave it out here.
# Confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./checkpoints',
    per_device_eval_batch_size=8,
    fp16=True,
    report_to='none',
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=DataCollatorForClassification(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [22]:
# Held-out split: read the CSV, wrap it as a Dataset, then tokenize it with
# the same preprocess function used for the training data.
test_csv_path = f'{CUR_DIR}/Vietnamese-Text-Classification/data/test.csv'
test_data = pd.read_csv(test_csv_path)
test_dataset = Dataset.from_pandas(test_data)
test_tokenized_dataset = test_dataset.map(
    preprocess,
    remove_columns=['id', 'content', 'index_spans', 'toxic'],
)

Map:   0%|          | 0/1106 [00:00<?, ? examples/s]

In [23]:
# Run the reloaded model over the test split and report both metrics.
test_predictions = trainer.predict(test_tokenized_dataset)
res = compute_metrics(test_predictions)
for heading, metric_key in (('Accuracy: ', 'accuracy'), ('F1: ', 'f1')):
    print(heading, res[metric_key])

Accuracy:  0.9014466546112115
F1:  0.8948891031822565
