# Prepare Libraries

In [1]:
!pip install transformers
!pip install datasets

Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 11.1 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 44.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 578 kB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 40.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 47.7 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Atte

In [2]:
import numpy as np
import torch
from datasets import load_dataset
from datasets import load_metric
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import Trainer
from transformers import TrainingArguments

# Prepare model

**Note**

To skip the training process and use the pre-trained model right away, run the import cell above and then scroll down to the **Evaluation** section.

In [None]:
#Load pre-trained phoBERT for finetuning on our dataset
# num_labels=3 requests a 3-way classification head; the evaluation cells name
# the three classes negative / neutral / positive.
model = AutoModelForSequenceClassification.from_pretrained("vinai/phobert-base", num_labels=3)
# The tokenizer is loaded from the same "vinai/phobert-base" identifier as the model.
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

Downloading:   0%|          | 0.00/557 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/518M [00:00<?, ?B/s]

Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['

Downloading:   0%|          | 0.00/874k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Prepare data

In [None]:
#Load word-segmented training set and dev set
# Both calls request split="train", including the dev file: presumably a bare
# `data_files` path exposes its rows under a single "train" split — TODO confirm.
train_dataset = load_dataset("csv", data_files='/content/drive/MyDrive/DeepLearningProject_Group3/segmented_data/train_segmented.csv', split="train")
dev_dataset = load_dataset("csv", data_files='/content/drive/MyDrive/DeepLearningProject_Group3/segmented_data/dev_segmented.csv', split="train")

Using custom data configuration default-ebcf23a169f69d09


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-ebcf23a169f69d09/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-ebcf23a169f69d09/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e. Subsequent calls will reuse this data.


Using custom data configuration default-67ab5485aedff044


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-67ab5485aedff044/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-67ab5485aedff044/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e. Subsequent calls will reuse this data.


In [None]:
# Tokenizing function
def tokenize_function(examples):
    """Tokenize the "sents" column of a batch of examples.

    Args:
        examples: mapping that provides the raw sentences under the key "sents".

    Returns:
        Whatever the notebook-level `tokenizer` returns when called with
        padding="max_length" and truncation=True.
    """
    sentences = examples["sents"]
    encoded = tokenizer(sentences, padding="max_length", truncation=True)
    return encoded

In [None]:
#Map the tokenizer to the data
# batched=True passes batches of rows to tokenize_function instead of single rows.
# Only the training set is shuffled, with a fixed seed (101); the dev set keeps its order.
train_dataset = train_dataset.map(tokenize_function, batched=True).shuffle(seed=101)
dev_dataset = dev_dataset.map(tokenize_function, batched=True)

  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

# Setup training hyperparameters

In [None]:
#Below is the best set of hyperparameters after tuning
# The recorded training log reports 11426 examples, batch size 8, 3 epochs and
# 4287 optimization steps for this configuration.
training_args = TrainingArguments(
  output_dir="checkpoint",  # checkpoints are written to checkpoint/checkpoint-<step>
  group_by_length=False,
  per_device_train_batch_size=8,
  evaluation_strategy="steps",  # evaluate on a step schedule (see eval_steps)
  num_train_epochs=3,
  save_steps=500,  # save, evaluate and log on the same 500-step cadence
  eval_steps=500,
  logging_steps=500,
  dataloader_num_workers=6,  # NOTE(review): the run log shows a truncated DataLoader warning ("cpuset_checked") — possibly more workers than the host has CPUs; confirm
  learning_rate=3e-5,
  warmup_steps=0,
  save_total_limit=15,
  eval_accumulation_steps=2000,  # presumably controls how often eval outputs are moved off the accelerator — TODO confirm
  report_to='tensorboard'
)

PyTorch: setting up devices


In [None]:
#Load weighted f1 for assessing the training process
# The metric loaded here is plain "f1"; the weighted averaging is selected
# later, when compute_metrics calls metric.compute(..., average='weighted').
metric = load_metric("f1")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the notebook-level `metric`.

    Args:
        eval_pred: a pair that unpacks into (logits, reference labels).

    Returns:
        The result of `metric.compute` with average='weighted', using the
        argmax over the last logits axis as the predicted class ids.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    result = metric.compute(predictions=predicted, references=gold, average='weighted')
    return result

# Model training

In [None]:
# Wire the model, hyperparameters, tokenized train/dev sets and the
# weighted-F1 callback into a single Trainer instance.
trainer = Trainer(
    model=model, args=training_args, train_dataset=train_dataset, eval_dataset=dev_dataset, compute_metrics=compute_metrics
)

In [None]:
# Start fine-tuning. Note: the output recorded below ends with a
# KeyboardInterrupt after the step-1500 evaluation, i.e. that run was stopped
# manually before reaching the 4287 planned steps.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: topics, sents.
  cpuset_checked))
***** Running training *****
  Num examples = 11426
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 4287


Step,Training Loss,Validation Loss,F1
500,0.3001,0.322796,0.915159
1000,0.2617,0.275466,0.932645
1500,0.2154,0.279629,0.944194


The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: topics, sents.
***** Running Evaluation *****
  Num examples = 1583
  Batch size = 8
Saving model checkpoint to checkpoint/checkpoint-500
Configuration saved in checkpoint/checkpoint-500/config.json
Model weights saved in checkpoint/checkpoint-500/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: topics, sents.
  cpuset_checked))
***** Running Evaluation *****
  Num examples = 1583
  Batch size = 8
Saving model checkpoint to checkpoint/checkpoint-1000
Configuration saved in checkpoint/checkpoint-1000/config.json
Model weights saved in checkpoint/checkpoint-1000/pytorch_model.bin
  cpuset_checked))
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` 

KeyboardInterrupt: ignored

# Evaluation

In [3]:
import pandas as pd
from sklearn.metrics import classification_report
from tqdm import tqdm

In [4]:
# Run on the GPU when one is available; otherwise fall back to the CPU so the
# evaluation cells still work on a runtime without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

We have the following models which can be used for evaluation:

- **phobert-base**: Pre-trained model with PhoBERT-base.
- **phobert-large**: Pre-trained model with PhoBERT-large.
- **phobert-base-RemoveName**: Pre-trained model with PhoBERT-base and removing names in the training data.
- **phobert-base-ReplaceName**: Pre-trained model with PhoBERT-base and replacing names in the training data with `<name>` tokens.

In [None]:
#Please change the path accordingly
# Load a checkpoint from the project folder, move it to `device` and switch
# it to eval mode for inference.
model = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/DeepLearningProject_Group3/checkpoint/BERT/phobert-base", num_labels=3).to(device).eval()
# The tokenizer is loaded from the hub identifier, not from the checkpoint folder.
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


For evaluating the models, one should use the appropriate preprocessed test set:

- `/DeepLearningProject_Group3/segmented_data/test_segmented.csv`: for evaluating phobert-base and phobert-large, which are trained with segmented data.

- `/DeepLearningProject_Group3/checkpoint/BERT/bert-external-testset/test_name.csv`: for evaluating phobert-base-ReplaceName.

- `/DeepLearningProject_Group3/checkpoint/BERT/bert-external-testset/test_remove_name.csv`: for evaluating phobert-base-RemoveName.

In [None]:
#Please change the path accordingly
# The evaluation loop reads the 'sents' and 'labels' columns of this frame.
data = pd.read_csv('/content/drive/MyDrive/DeepLearningProject_Group3/segmented_data/test_segmented.csv')

# Accumulators filled by the evaluation loop: predicted and gold class ids.
predictions = []
labels = []

In [None]:
# Start from empty accumulators so that re-running this cell does not append
# a second copy of the results to lists filled by a previous run.
predictions = []
labels = []

# Inference only: disable autograd so no graph is built for each forward pass.
with torch.no_grad():
    for i in tqdm(range(data.shape[0])):
        input_str = data['sents'][i]

        # truncation=True mirrors tokenize_function (used for training), so an
        # over-long sentence is cut to the tokenizer's maximum length instead
        # of being passed to the model at full length.
        input_values = tokenizer(input_str, return_tensors="pt", truncation=True).input_ids.to(device)
        logits = model(input_values).logits

        # One sentence per forward pass, so take the argmax of the single row.
        prediction = np.argmax(logits.cpu().numpy(), axis=-1)[0]

        labels.append(int(data['labels'][i]))
        predictions.append(int(prediction))

100%|██████████| 3166/3166 [02:12<00:00, 23.87it/s]


In [None]:
# Display names for the classes; assumes label ids 0/1/2 correspond to
# negative/neutral/positive in this order — TODO confirm against the dataset.
target_names = ['negative', 'neutral', 'positive']
# digits=4 prints the scores with four decimal places.
print(classification_report(labels, predictions, target_names=target_names, digits=4))

              precision    recall  f1-score   support

    negative     0.9491    0.9659    0.9574      1409
     neutral     0.6911    0.5090    0.5862       167
    positive     0.9466    0.9579    0.9522      1590

    accuracy                         0.9378      3166
   macro avg     0.8622    0.8109    0.8319      3166
weighted avg     0.9342    0.9378    0.9352      3166



## Ensemble evaluation

In [5]:
#Please change the path accordingly
# The two ensemble members: the RemoveName and ReplaceName checkpoints, both
# moved to `device` and put in eval mode.
model1 = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/DeepLearningProject_Group3/checkpoint/BERT/phobert-base-RemoveName", num_labels=3).to(device).eval()
model2 = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/DeepLearningProject_Group3/checkpoint/BERT/phobert-base-ReplaceName", num_labels=3).to(device).eval()


In [6]:
# A single tokenizer is shared by both ensemble members.
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

Downloading:   0%|          | 0.00/557 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/874k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
#Please change the path accordingly
# data1 feeds model1 (RemoveName) and data2 feeds model2 (ReplaceName); the
# ensemble loop reads row i from both frames, so it presumably relies on the
# two files being row-aligned — verify.
data1 = pd.read_csv('/content/drive/MyDrive/DeepLearningProject_Group3/checkpoint/BERT/bert-external-testset/test_remove_name.csv')
data2 = pd.read_csv('/content/drive/MyDrive/DeepLearningProject_Group3/checkpoint/BERT/bert-external-testset/test_name.csv')

# Accumulators filled by the ensemble loop: predicted and gold class ids.
predictions = []
labels = []

In [12]:
# Row i of data1 and row i of data2 are treated as the same test sentence, so
# the two frames must have the same number of rows.
assert data1.shape[0] == data2.shape[0], "data1 and data2 must contain the same number of rows"

# Start from empty accumulators so that re-running this cell does not append
# a second copy of the results to lists filled by a previous run.
predictions = []
labels = []

# Inference only: disable autograd so no graph is built for each forward pass.
with torch.no_grad():
    for i in tqdm(range(data1.shape[0])):
        # truncation=True mirrors tokenize_function (used for training), so an
        # over-long sentence is cut to the tokenizer's maximum length.
        input_str1 = data1['sents'][i]
        input_values1 = tokenizer(input_str1, return_tensors="pt", truncation=True).input_ids.to(device)

        input_str2 = data2['sents'][i]
        input_values2 = tokenizer(input_str2, return_tensors="pt", truncation=True).input_ids.to(device)

        logits1 = model1(input_values1).logits
        logits2 = model2(input_values2).logits

        # Ensemble by averaging the two models' logits before taking the argmax.
        logits = (logits1 + logits2) / 2

        prediction = np.argmax(logits.cpu().numpy(), axis=-1)[0]
        # Gold labels are read from data1 only.
        labels.append(int(data1['labels'][i]))
        predictions.append(int(prediction))

100%|██████████| 3166/3166 [01:44<00:00, 30.40it/s]


In [13]:
# Display names for the classes; assumes label ids 0/1/2 correspond to
# negative/neutral/positive in this order — TODO confirm against the dataset.
target_names = ['negative', 'neutral', 'positive']
# digits=4 prints the scores with four decimal places.
print(classification_report(labels, predictions, target_names=target_names, digits=4))

              precision    recall  f1-score   support

    negative     0.9436    0.9744    0.9588      1409
     neutral     0.7679    0.5150    0.6165       167
    positive     0.9543    0.9597    0.9570      1590

    accuracy                         0.9428      3166
   macro avg     0.8886    0.8164    0.8441      3166
weighted avg     0.9397    0.9428    0.9399      3166

