## Reading data

In [1]:
import os
import pandas as pd
    

In [3]:
folder_paths = {
    'train': '../data/processed/train.csv',
    'valid': '../data/processed/val.csv',
    'test': '../data/processed/test.csv'
}

train_df = pd.read_csv(folder_paths['train'])
valid_df = pd.read_csv(folder_paths['valid'])
test_df = pd.read_csv(folder_paths['test'])

In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import classification_report, accuracy_score  
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import  Dataset
base_model = 'vinai/phobert-base-v2'



In [5]:
model = AutoModelForSequenceClassification.from_pretrained(base_model, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(base_model)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from peft import get_peft_model, LoraConfig, TaskType

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_CLS
)
model = get_peft_model(model, lora_config)

In [7]:
model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(64001, 768, padding_idx=1)
          (position_embeddings): Embedding(258, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): Mod

In [8]:
def tokenize_function(examples):
    return tokenizer(examples["sentence"], truncation=True, padding="max_length", max_length=256, return_tensors="pt")

In [9]:
train_df['sentence'] = train_df['sentence'].astype(str)
train_df['label'] = train_df['label'].astype(int)

train_dataset = Dataset.from_pandas(train_df)
valid_df['sentence'] = valid_df['sentence'].astype(str)
valid_df['label'] = valid_df['label'].astype(int)

valid_dataset = Dataset.from_pandas(valid_df)

test_df['sentence'] = test_df['sentence'].astype(str)
test_df['label'] = test_df['label'].astype(int)

test_dataset = Dataset.from_pandas(test_df)

In [10]:
train_dataset = train_dataset.map(tokenize_function, batched=True)
valid_dataset = valid_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/29737 [00:00<?, ? examples/s]

Map:   0%|          | 0/9921 [00:00<?, ? examples/s]

Map:   0%|          | 0/9921 [00:00<?, ? examples/s]

In [11]:
train_dataset

Dataset({
    features: ['sentence', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 29737
})

In [12]:
train_dataset.set_format(
    type='torch',
    columns=['input_ids', 'token_type_ids', 'attention_mask', 'label']
)

In [13]:
valid_dataset.set_format(
    type='torch',
    columns=['input_ids', 'token_type_ids', 'attention_mask', 'label']
)

In [14]:
test_dataset.set_format(
    type='torch',
    columns=['input_ids', 'token_type_ids', 'attention_mask', 'label']
)

In [15]:
train_dataset = train_dataset.remove_columns(["sentence"])
valid_dataset = valid_dataset.remove_columns(["sentence"])
test_dataset = test_dataset.remove_columns(["sentence"])

In [16]:
train_dataset

Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 29737
})

In [17]:
train_dataset = train_dataset.rename_column("label", "labels")
valid_dataset = valid_dataset.rename_column("label", "labels")
test_dataset = test_dataset.rename_column("label", "labels")

In [18]:
from transformers import TrainingArguments, Trainer


In [19]:
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=1000,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,
)

In [20]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1)
    accuracy = accuracy_score(labels, predictions)
    report = classification_report(labels, predictions, output_dict=True)
    return {
        'accuracy': accuracy,
        'f1': report['macro avg']['f1-score'],
        'precision': report['macro avg']['precision'],
        'recall': report['macro avg']['recall']
    }

In [21]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,  # You can define a custom compute_metrics function if needed
)
    

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [22]:
trainer.train()

wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: vohoanghuy8811 (vohoanghuy8811-ho-chi-minh-city-university-of-technology) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2356,0.250287,0.907267,0.907265,0.907286,0.907261
2,0.2839,0.235435,0.914726,0.914673,0.915888,0.914777
3,0.2707,0.229969,0.914122,0.914096,0.914705,0.914158
4,0.1776,0.227133,0.915432,0.915418,0.915778,0.91546
5,0.229,0.229715,0.915331,0.915305,0.915942,0.915368


TrainOutput(global_step=9295, training_loss=0.23764666366346296, metrics={'train_runtime': 2607.3084, 'train_samples_per_second': 57.026, 'train_steps_per_second': 3.565, 'total_flos': 1.983026847673344e+16, 'train_loss': 0.23764666366346296, 'epoch': 5.0})

In [23]:
# test with test dataset
trainer.evaluate(test_dataset)


{'eval_loss': 0.22713260352611542,
 'eval_accuracy': 0.9154319121056345,
 'eval_f1': 0.9154178325856118,
 'eval_precision': 0.9157779143955743,
 'eval_recall': 0.9154599786153804,
 'eval_runtime': 59.1459,
 'eval_samples_per_second': 167.738,
 'eval_steps_per_second': 10.499,
 'epoch': 5.0}

In [25]:
# Save the model
model.save_pretrained('../models/finetuned_model')
# Save the tokenizer
tokenizer.save_pretrained('../models/finetuned_model')
# Save the training arguments
trainer.save_model('../models/finetuned_model')


In [26]:
# push to hub
hub = 'edith81/phobert_vietnamese_sentiment_analysis'
model.push_to_hub(hub)
tokenizer.push_to_hub(hub)



adapter_model.safetensors:   0%|          | 0.00/4.74M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


CommitInfo(commit_url='https://huggingface.co/edith81/phobert_vietnamese_sentiment_analysis/commit/d506d794b4d6f55c1d1911d1d9267849d6b6d426', commit_message='Upload tokenizer', commit_description='', oid='d506d794b4d6f55c1d1911d1d9267849d6b6d426', pr_url=None, repo_url=RepoUrl('https://huggingface.co/edith81/phobert_vietnamese_sentiment_analysis', endpoint='https://huggingface.co', repo_type='model', repo_id='edith81/phobert_vietnamese_sentiment_analysis'), pr_revision=None, pr_num=None)