In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; the later
# cells read the dataset from, and save the model to, paths under that mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -U --q transformers datasets accelerate sentencepiece wandb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━

In [None]:
# Work from the preprocessed-data folder so the CSV splits can be opened by
# bare filename in the loading cell.
# NOTE(review): absolute, user-specific Drive path — adjust when running elsewhere.
%cd '/content/drive/MyDrive/NLU_NCKH/notebook/res_data_preprocesed/'

/content/drive/.shortcut-targets-by-id/1btr8FZI1SJ1bZdaS-BqFfYlWxPaXfQOB/NLU_NCKH/notebook/res_data_preprocesed


# Import libraries

In [None]:
import numpy as np
import os
import random
import pandas as pd
import matplotlib.pyplot as plt
import re
import torch
import datasets
from sklearn.metrics import *

In [None]:
def _seed_everything(value):
    """Seed Python's `random`, NumPy and PyTorch (CPU and every CUDA device) with `value`."""
    # Exported for child processes; the hash seed of the already-running
    # interpreter was fixed at start-up and is not changed by this assignment.
    os.environ['PYTHONHASHSEED'] = str(value)
    random.seed(value)
    np.random.seed(value)
    for seeder in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seeder(value)


seed = 42
_seed_everything(seed)

# Ask cuDNN for deterministic kernels and switch off its autotuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Load data

In [None]:
# Read the three preprocessed splits from the current working directory.
df_train, df_dev, df_test = map(
    pd.read_csv,
    ("train_res_preprocesed.csv", "dev_res_preprocesed.csv", "test_res_preprocesed.csv"),
)

# Print (rows, columns) of every split as a quick sanity check.
for split_label, split_frame in (("Train: ", df_train), ("Dev: ", df_dev), ("Test: ", df_test)):
    print(split_label, split_frame.shape)

Train:  (7028, 13)
Dev:  (771, 13)
Test:  (1938, 13)


# Define the customized `XLM_Roberta_base` model

In [None]:
import torch
from torch import nn
from transformers import AutoModel, AutoTokenizer, Trainer, DataCollatorWithPadding, TrainingArguments
from sklearn.preprocessing import OneHotEncoder
import warnings

class XML_Roberta_Class(torch.nn.Module):
    """Pretrained encoder with one independent linear head per aspect.

    Each head maps the encoder's pooled output to ``num_classes`` logits.
    The heads are laid out aspect-major, so ``logits[:, i, :]`` is exactly the
    output of ``self.classifier[i]``; flattening the last two axes therefore
    matches labels that are one-hot encoded per aspect and then flattened
    (as ``CustomDataset`` does with ``label.reshape(-1)``).

    NOTE(review): the logits used to be assembled as
    ``(batch, num_classes, num_labels)``; state dicts trained with that layout
    associate head weights with different output positions — retrain rather
    than reload such checkpoints.

    Args:
        MODEL_NAME: identifier/path forwarded to ``AutoModel.from_pretrained``.
        num_labels: number of aspects, i.e. number of classification heads.
        load_dict: currently unused; kept so existing call sites keep working.
        num_classes: number of classes predicted per aspect (default 4).
    """

    def __init__(self, MODEL_NAME, num_labels, load_dict=None, num_classes=4):
        super(XML_Roberta_Class, self).__init__()
        # `output_hidden_states=True` is kept from the original configuration
        # although forward() currently reads only `pooler_output`.
        self.xmlr_pretrained = AutoModel.from_pretrained(MODEL_NAME, output_hidden_states=True)
        self.dropout = nn.Dropout(0.1)
        self.num_labels = num_labels
        self.num_classes = num_classes
        self.classifier = nn.ModuleList(
            [nn.Linear(self.xmlr_pretrained.config.hidden_size, num_classes)
             for _ in range(num_labels)]
        )
        self.bce = torch.nn.BCEWithLogitsLoss()

    def forward(self, input_ids, attention_mask, token_type_ids=None, labels=None):
        """Run the encoder and every aspect head.

        Args:
            input_ids, attention_mask, token_type_ids: forwarded unchanged to
                the encoder.
            labels: optional float tensor broadcastable against the flattened
                logits of shape ``(batch, num_labels * num_classes)``.

        Returns:
            ``logits`` of shape ``(batch, num_labels, num_classes)`` when
            ``labels`` is None, otherwise the tuple ``(loss, logits)`` where
            ``loss`` is the BCE-with-logits loss over the flattened logits.
        """
        outputs = self.xmlr_pretrained(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        pooler_output = outputs['pooler_output']
        x = self.dropout(pooler_output)

        # Concatenate along a new axis 1 so the result is
        # (batch, num_labels, num_classes): head i owns logits[:, i, :].
        logits = torch.cat(
            [linear(x).unsqueeze(1) for linear in self.classifier], dim=1
        )

        if labels is not None:
            loss = self.bce(logits.view(-1, self.num_labels * self.num_classes), labels)
            return loss, logits
        return logits

In [None]:
# Suppress every FutureWarning for the rest of the session so that
# deprecation notices do not flood the cell outputs.
warnings.filterwarnings("ignore", category=FutureWarning)

class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one review per item.

    Args:
        df: DataFrame with a ``'review'`` text column and, unless
            ``type_model == "test"``, one integer column per aspect.
        aspects: list of aspect column names to read labels from.
        tokenizer: callable tokenizer returning ``input_ids``,
            ``attention_mask`` and ``token_type_ids``.
        MAX_LEN: length every sequence is padded/truncated to.
        type_model: ``"test"`` yields inputs only; any other value also
            yields ``labels``.
    """

    # Number of label values per aspect; valid ids are 0 .. NUM_CLASSES - 1.
    NUM_CLASSES = 4

    def __init__(self, df, aspects, tokenizer, MAX_LEN, type_model="train") -> None:
        self.df = df
        self.aspects = aspects
        self.tokenizer = tokenizer
        self.MAX_LEN = MAX_LEN
        self.type_model = type_model

    def __len__(self):
        """Return the number of reviews."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the tensors for row ``idx``.

        Returns:
            dict with long tensors ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` (each of length ``MAX_LEN``) and, outside
            test mode, a float tensor ``labels`` of length
            ``len(aspects) * NUM_CLASSES`` (per-aspect one-hot, flattened).
        """
        row = self.df.iloc[idx]

        # Calling the tokenizer directly replaces the deprecated
        # `encode_plus` while producing the same fields.
        encoded = self.tokenizer(
            row['review'],
            add_special_tokens=True,
            max_length=self.MAX_LEN,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
        )

        item = {
            key: torch.tensor(encoded[key], dtype=torch.long)
            for key in ('input_ids', 'attention_mask', 'token_type_ids')
        }

        if self.type_model == "test":
            return item

        # Label columns are only touched when labels are actually requested.
        label = self.label_one_hot(row[self.aspects]).reshape(-1)
        item['labels'] = torch.tensor(label, dtype=torch.float)
        return item

    def label_one_hot(self, label):
        """One-hot encode per-aspect class ids.

        Args:
            label: 1-D array-like (e.g. a pandas Series) of class ids.

        Returns:
            ``uint8`` array of shape ``(len(label), NUM_CLASSES)``.

        Raises:
            ValueError: if any value is not one of ``0 .. NUM_CLASSES - 1``
                (this includes NaN and non-integral values).
        """
        values = np.asarray(label, dtype=np.float64).reshape(-1)
        if not np.isin(values, np.arange(self.NUM_CLASSES)).all():
            raise ValueError(
                f"labels must be integers in [0, {self.NUM_CLASSES - 1}], got {values.tolist()}"
            )
        # Row k of the identity matrix is the one-hot vector for class k.
        return np.eye(self.NUM_CLASSES, dtype='uint8')[values.astype(np.int64)]


# Initialize tokenizer
MODEL_NAME = "FacebookAI/xlm-roberta-base"
SAVE_MODEL = "/content/drive/MyDrive/NLU_NCKH/notebook/model/xlmr_base/"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Hyperparameters
# NOTE(review): in the visible cells only MAX_LEN is consumed; the training
# cell passes its own literal batch sizes, epoch count and learning rate, and
# the save cell uses its own path instead of SAVE_MODEL — confirm which values
# are intended.
MAX_LEN = 256
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 16
EPOCHS = 12
LEARNING_RATE = 1e-05

# Every column except the review text is treated as an aspect label column.
# Assuming df_train, df_test, df_dev are already defined
aspects = df_train.drop(columns='review').columns.tolist()

# Create dataset objects (the test split is built in "test" mode, so it
# yields model inputs without labels).
dataset_train = CustomDataset(df_train, aspects, tokenizer, MAX_LEN, type_model="train")
dataset_test = CustomDataset(df_test, aspects, tokenizer, MAX_LEN, type_model="test")
dataset_dev = CustomDataset(df_dev, aspects, tokenizer, MAX_LEN, type_model="train")

# Sanity-check one encoded dev example; index once instead of re-tokenizing
# the same row for every print.
dev_sample = dataset_dev[1]
print("Encoded: ", dev_sample['input_ids'].size())
print("Decoded: ", tokenizer.decode(dev_sample['input_ids'], skip_special_tokens=True))
print("Label: ", dev_sample['labels'].size())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Encoded:  torch.Size([256])
Decoded:  <s> nay đi uống mới biết giá_thành hơi cao nhưng thật_sự đi_đôi với chất_lượng</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>

In [None]:
from transformers import TrainingArguments

# One classification head per aspect column.
model = XML_Roberta_Class(MODEL_NAME, len(aspects))

# Fine-tuning configuration: evaluate on the dev split after every epoch and
# write no intermediate checkpoints.
training_options = dict(
    output_dir='./xlmr_base_v2',
    num_train_epochs=4,
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    warmup_steps=500,
    eval_strategy="epoch",
    save_strategy='no',
)
training_args = TrainingArguments(**training_options)

# Initialize Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=dataset_dev,
    train_dataset=dataset_train,
)

# Train the model
# NOTE(review): the recorded run of this cell stopped with
# "CUDA error: device-side assert triggered"; rerun with CUDA_LAUNCH_BLOCKING=1
# (or on CPU) to locate the failing operation.
trainer.train()

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Raw logits for the unlabeled test split.
predictions = trainer.predict(dataset_test).predictions

# Regroup the logits as (n_samples, n_aspects, 4) — the aspect-major layout in
# which the one-hot training labels are flattened — and take the highest
# scoring class id per aspect. The aspect count comes from `aspects` instead
# of a hard-coded 12 so the cell follows the dataset's columns.
y_pred = np.argmax(predictions.reshape(-1, len(aspects), 4), axis=-1)

In [None]:
from sklearn.metrics import classification_report

# Gold class ids, one row per review and one column per aspect.
y_test_argmax = df_test[aspects].to_numpy()

# One (aspect name, gold id, predicted id) triple per review/aspect cell,
# visited row by row.
aspect_cells = [
    (aspects[position], gold_id, pred_id)
    for gold_row, pred_row in zip(y_test_argmax, y_pred)
    for position, (gold_id, pred_id) in enumerate(zip(gold_row, pred_row))
]

# Aspect detection target: the aspect name when the class id is non-zero,
# the empty string otherwise.
aspect_test = [name if gold_id else '' for name, gold_id, _ in aspect_cells]
aspect_pred = [name if pred_id else '' for name, _, pred_id in aspect_cells]

# Same report twice: once as a dict for later use, once printed as text.
aspect_report_options = dict(digits=4, zero_division=1)
aspect_report = classification_report(aspect_test, aspect_pred, output_dict=True, **aspect_report_options)
print(classification_report(aspect_test, aspect_pred, **aspect_report_options))

In [None]:
# Flatten to one class id per review/aspect cell.
y_test_flat = y_test_argmax.flatten()
y_pred_flat = y_pred.flatten()

# Class id -> display name (id 0 is rendered as the string 'None').
replacements = {0: None, 1: 'positive', 2: 'negative', 3: 'neutral'}
target_names = list(map(str, replacements.values()))

# Pass the label ids explicitly: without `labels`, classification_report
# infers them from the data and raises when fewer classes occur than there
# are entries in `target_names`.
polarity_labels = list(replacements)

polarity_report = classification_report(y_test_flat, y_pred_flat, labels=polarity_labels, digits=4, output_dict=True)
print(classification_report(y_test_flat, y_pred_flat, labels=polarity_labels, target_names=target_names, digits=4))

In [None]:
# One (aspect position, gold id, predicted id) triple per review/aspect cell,
# visited row by row.
polarity_cells = [
    (position, gold_id, pred_id)
    for gold_row, pred_row in zip(y_test_argmax, y_pred)
    for position, (gold_id, pred_id) in enumerate(zip(gold_row, pred_row))
]

# Joint target "<aspect>,<polarity name>" for gold and predicted ids.
aspect_polarity_test = [
    f'{aspects[position]},{replacements[gold_id]}' for position, gold_id, _ in polarity_cells
]
aspect_polarity_pred = [
    f'{aspects[position]},{replacements[pred_id]}' for position, _, pred_id in polarity_cells
]

# Same report twice: once as a dict for later use, once printed as text.
joint_report_options = dict(digits=4, zero_division=1)
aspect_polarity_report = classification_report(
    aspect_polarity_test, aspect_polarity_pred, output_dict=True, **joint_report_options
)
print(classification_report(aspect_polarity_test, aspect_polarity_pred, **joint_report_options))

In [None]:
# Persist the trained model through the Trainer to Google Drive.
# NOTE(review): this directory (.../model/xlmr_base_v2/) differs from the
# SAVE_MODEL constant defined earlier (.../model/xlmr_base/) — confirm which
# location is intended.
save_path = "/content/drive/MyDrive/NLU_NCKH/notebook/model/xlmr_base_v2/"
trainer.save_model(save_path)

In [None]:
import gc

# Release the model's memory at the end of the session.
# `trainer` keeps its own reference to the model, so it has to be dropped as
# well or the parameters stay alive after `del model`.
del trainer, model, tokenizer

# Collect unreachable objects first so their CUDA blocks are returned to
# PyTorch's caching allocator, then hand the cached blocks back to the driver.
gc.collect()
torch.cuda.empty_cache()