In [None]:
from torch import nn
from transformers import Trainer,AutoTokenizer, RobertaTokenizer, RobertaForTokenClassification, RobertaConfig, RobertaModel, RobertaPreTrainedModel
from transformers.trainer_pt_utils import nested_detach


tokenizer = AutoTokenizer.from_pretrained('vinai/phobert-base-v2')

# Tạo mô hình RoBERTa cho token classification
class RobertaForSequenceAndTokenClassification(RobertaPreTrainedModel):
    def __init__(self, config, num_labels_seq_cls, num_labels_token_cls):
        super().__init__(config)
        self.roberta = RobertaModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier_seq_cls = nn.Linear(config.hidden_size, num_labels_seq_cls)
        self.classifier_token_cls = nn.Linear(config.hidden_size, num_labels_token_cls)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None,selabel=None):
        outputs = self.roberta(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds)

        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)

        logits_seq_cls = self.classifier_seq_cls(pooled_output)

        last_hidden_states = outputs[0]
        logits_token_cls = self.classifier_token_cls(last_hidden_states)

        outputs = (logits_seq_cls, logits_token_cls,) + outputs[2:]
        return outputs
# Tạo mô hình multitask
config = RobertaConfig.from_pretrained('vinai/phobert-base-v2')
model = RobertaForSequenceAndTokenClassification.from_pretrained('vinai/phobert-base-v2',config=config, num_labels_seq_cls=9, num_labels_token_cls=7)



class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        selabel = inputs.pop("selabel")
        # forward pass
        outputs = model(**inputs)
        logits_seq_cls, logits_token_cls = outputs[:2]  # Unpack the tuple of outputs
        # compute custom loss for sequence classification
        loss_fct = nn.CrossEntropyLoss()

        loss_seq_cls = loss_fct(logits_seq_cls, selabel)
        
        # compute custom loss for token classification
        loss_fct = nn.CrossEntropyLoss()
        loss_token_cls = loss_fct(logits_token_cls.permute((0,2,1)), labels)
        
        # You might want to adjust how you combine these losses if needed
        total_loss = loss_seq_cls + loss_token_cls
        
        return (total_loss, outputs) if return_outputs else total_loss
    def prediction_step(
        self,
        model,
        inputs,
        prediction_loss_only,
        ignore_keys= None):
        labels = inputs.pop("labels")
        selabel = inputs.pop("selabel")
        with torch.no_grad():
            outputs = model(**inputs)
            logits_seq_cls, logits_token_cls = outputs[:2]  # Unpack the tuple of outputs
            # compute custom loss for sequence classification
            loss_fct = nn.CrossEntropyLoss()
            loss_seq_cls = loss_fct(logits_seq_cls, selabel)

            # compute custom loss for token classification
            loss_fct = nn.CrossEntropyLoss()
            loss_token_cls = loss_fct(logits_token_cls.permute((0,2,1)), labels)

            # You might want to adjust how you combine these losses if needed
            total_loss = loss_seq_cls + loss_token_cls
        outputs=nested_detach(outputs)
        return (total_loss,outputs,(labels,selabel))
        
        


In [5]:
import numpy as np
import torch
inputs=['chào em yêu']
encodeing = tokenizer(inputs,return_tensors="pt", padding=True, truncation=True)
with torch.no_grad():
  outputs = model(**encodeing)
print(outputs)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


(tensor([[-0.0214, -0.0570, -0.1141,  0.0656, -0.0336,  0.0468,  0.0734,  0.1910,
          0.0282]]), tensor([[[ 0.0793,  0.0848, -0.1656, -0.2289, -0.1791, -0.0817,  0.0384],
         [ 0.2342,  0.0806,  0.2049, -0.0243, -0.0028, -0.1689,  0.1099],
         [-0.2590, -0.0626,  0.0434,  0.0167, -0.0590, -0.2817, -0.1183],
         [-0.1058,  0.0391,  0.0480, -0.1536, -0.0940, -0.0569, -0.2356],
         [-0.0773, -0.0123, -0.1489, -0.1227, -0.1786,  0.0435,  0.1126]]]))


In [None]:
!pip install datasets

In [None]:
!pip install gdown

In [3]:
import gdown

file_id = "your_file_id"
url = f"https://drive.google.com/uc?id=1iiaUl1mmY9vZFNGjfaXxSmIb0aSfOLmv"

output = "frchat.xlsx"
  # Specify the name of the downloaded file

gdown.download(url, output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1iiaUl1mmY9vZFNGjfaXxSmIb0aSfOLmv
To: /kaggle/working/frchat.xlsx
100%|██████████| 381k/381k [00:00<00:00, 105MB/s]


'frchat.xlsx'

In [None]:
import pandas as pd

# Đọc dữ liệu từ tệp Excel
df = pd.read_excel('/kaggle/working/frchat.xlsx')

# Lấy những cột nhất định, ví dụ: cột 'cột_1', 'cột_2'
columns_of_interest = ['Text', 'address', 'product', 'phone','Intent']
selected_columns_df = df[columns_of_interest]

# In một số dòng đầu tiên của DataFrame mới
print(selected_columns_df.head())
data_dict = selected_columns_df.to_dict(orient='list')

In [None]:
import numpy as np
def addSpecialToken(special_token, A, B):
    B=str(B)
    start_index = A.find(B)
    end_index = start_index + len(B)

    # Add the special token to both ends of string B within string A
    result = A[:start_index] + special_token + B + special_token + A[end_index:]

    return result
for index, row in selected_columns_df.iterrows():
    print(f"Index: {index}")
    if pd.notnull(row['address']):
        selected_columns_df.loc[index, 'Text'] = addSpecialToken('<FR>', row['Text'], row['address'])
    if pd.notnull(row['product']):
        selected_columns_df.loc[index, 'Text'] = addSpecialToken('<TT>', row['Text'], row['product'])
    if pd.notnull(row['phone']):
        selected_columns_df.loc[index, 'Text'] = addSpecialToken('<D>', row['Text'], row['phone'])
    print(f"Text: {row['Text']}, address: {row['address']}, product: {row['product']}, phone: {row['phone']}")

In [9]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
# create an example list of labels
import pickle
import numpy as np

y=data_dict['Intent']

labels = y
# create a LabelEncoder object
le = LabelEncoder()
# fit the encoder to the labels and transform the labels
y = le.fit_transform(labels)
print(len(le.classes_))
# print the original labels and the encoded labels
print("Encoded labels:", y)
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(le, f)
# Save the encoded data as a NumPy array
np.save('encoded_data.npy', y)

9
Encoded labels: [2 1 1 ... 4 4 2]


In [None]:
!pip install datasets

In [10]:
import datasets
x = data_dict['Text']
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2, random_state=42)
dataset_train = datasets.Dataset.from_dict({
  'text' :x_train,
    'label':y_train
})
dataset_test = datasets.Dataset.from_dict({
  'text' :x_test,
    'label':y_test
})

In [12]:
import transformers
special_tokens_dict = {"additional_special_tokens": ["<FR>", "<TT>", "<D>"]}
tokenizer.add_special_tokens(special_tokens_dict)
token = tokenizer("<FR><TT><D>")
input_ids = token['input_ids']
input_ids

[0, 64001, 64002, 64003, 2]

In [13]:
def collator_fr(features):
    from torch.nn.utils.rnn import pad_sequence
# batch["labels"] = torch.tensor([f["Rext"] for f in features], dtype=int)
# encodings = fe([f['audio']['array'] for f in features], sampling_rate=fe.sampling_rate, return_tensors="pt", padding='max_length', truncation=True, max_length=80000 )
    encodings = tokenizer([i['text'] for i in features],return_tensors = 'pt',padding=True,max_length=258, truncation=True)
    encoding_new={
        'input_ids': [],
        'attention_mask': [],
        'labels':[],
        'selabel':[],
    }
    for input_ids, attention_mask in zip(encodings['input_ids'], encodings['attention_mask']):
        mask = torch.logical_and(torch.logical_and(input_ids !=64001 , input_ids !=64003), input_ids !=64002)
        encoding_new['input_ids'].append(input_ids[mask])
        encoding_new['attention_mask'].append(attention_mask[mask])
        address = 0
        phone = 0
        product = 0
        labels = []
        for i in input_ids :
            if(i==64001 and address==0): 
                address+=1
                continue
            if(i==64001 and address!=0):
                address = 0
                continue
            if(i==64003 and phone==0): 
                phone+=1
                continue
            if(i==64003 and phone!=0):
                phone = 0
                continue   
            if(i==64002 and product==0): 
                product+=1
                continue
            if(i==64002 and product!=0):
                product = 0
                continue              
            if (address == 1): 
                labels.append(1)
                address +=1
            elif(address ==2):labels.append(2)
            elif (phone==1): 
                labels.append(3)
                phone+=1
            elif(phone ==2): labels.append(4)
            elif (product==1): 
                labels.append(5)
                product+=1
            elif (product==2):labels.append(6)
            else: labels.append(0)

        encoding_new['labels'].append(torch.tensor(labels))
    encoding_new['labels'] = pad_sequence(encoding_new['labels'], batch_first = True, padding_value = -100)
    encoding_new['selabel'] = torch.tensor([i['label'] for i in features])

    return tokenizer.pad(encoding_new)

#     return encoding_new

In [None]:
import torch
collator_fr([dataset_train[i] for i in range(6)])

In [None]:
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments("test-trainer",remove_unused_columns = False, num_train_epochs=5, logging_steps=200,per_device_train_batch_size=4,evaluation_strategy="steps", eval_steps=200)
training_args

In [28]:
label_names=['0','1','2','3','4','5','6']

In [32]:
from sklearn.metrics import accuracy_score
from transformers import Trainer
def compute_metrics(eval_preds):
    (logit1,logit2), (labels,selabel) = eval_preds
    predictions = np.argmax(logit2,axis=-1)
    preds = logit1.argmax(-1)
#     return {"accuracy": accuracy_score(selabel, preds)}

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
#         "precision": all_metrics["overall_precision"],
#         "recall": all_metrics["overall_recall"],
#         "f1": all_metrics["overall_f1"],
        "accuracyNamed": all_metrics["overall_accuracy"],
        "accuracyClass": accuracy_score(selabel, preds),
    }

In [40]:
from sklearn.metrics import accuracy_score
from transformers import Trainer
def compute_accuracy(pred):
    (logit1,logit2), (labels,selabel) = pred
#     print(logit1.shape,logit2.shape,selabel.shape)
#     print(labels.shape)
    preds = logit1.argmax(-1)
    return {"accuracy": accuracy_score(selabel, preds)}


In [None]:
!pip install evaluate

In [None]:
!pip install seqeval

In [22]:
import evaluate

metric = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [33]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    compute_metrics = compute_metrics,
    
    data_collator=collator_fr,
    tokenizer=tokenizer,
)
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,Accuracynamed,Accuracyclass
200,0.2844,2.004216,0.977472,0.701325
400,0.3858,1.710316,0.973454,0.713558
600,0.4317,1.339239,0.979054,0.769623
800,0.4484,1.29876,0.980609,0.752294
1000,0.3949,1.511183,0.981983,0.753313
1200,0.2786,1.762034,0.977602,0.749235
1400,0.4984,1.494216,0.978017,0.751274
1600,0.58,1.208013,0.980531,0.740061
1800,0.5448,1.428088,0.98092,0.753313
2000,0.4938,1.265898,0.980557,0.767584


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier,

TrainOutput(global_step=4905, training_loss=0.33773533999008504, metrics={'train_runtime': 450.7913, 'train_samples_per_second': 43.49, 'train_steps_per_second': 10.881, 'total_flos': 291440859534432.0, 'train_loss': 0.33773533999008504, 'epoch': 5.0})