In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
!pip install datasets
!pip install -U accelerate
!pip install -U transformers



# Preprocess input


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
from transformers import Trainer, TrainingArguments

In [None]:
def map_input_feature(batch):
  """Pack the four numeric URL features into a single 'input' tensor.

  Args:
    batch: mapping with the keys 'entropy', 'length', 'num_Percent' and
      'have_Special'. Despite the name, the Dataset.map calls in this notebook
      do not pass batched=True, so this receives one record at a time.

  Returns:
    A dict with the single key 'input' holding a torch tensor whose entries
    follow the key order listed above.
  """
  feature_keys = ('entropy', 'length', 'num_Percent', 'have_Special')
  feature_values = [batch[key] for key in feature_keys]
  return {'input': torch.tensor(feature_values)}

In [None]:
# Load data.
# Both CSV files are read with explicit column names. The first 14,000 rows of
# the training file form the training split; the remaining rows are used for
# validation. The test file is used in full.
csv_columns = ["link", "entropy", "length", "num_Percent", "have_Special", "label"]
train_csv_path = "/content/data_train_nn.csv"
test_csv_path = "/content/data_test_nn.csv"

train_data = load_dataset("csv", data_files=train_csv_path, sep=",", names=csv_columns, split="train[0:14000]")
val_data = load_dataset("csv", data_files=train_csv_path, sep=",", names=csv_columns, split="train[14000:]")
test_data = load_dataset("csv", data_files=test_csv_path, sep=",", names=csv_columns, split="train[0:]")

for raw_split in (train_data, val_data, test_data):
  print(raw_split)

# map_input_feature merges the numeric features into one 'input' vector; the
# individual feature columns are dropped because the model does not need them.
feature_columns = ['entropy', 'length', 'num_Percent', 'have_Special']
train_data_pre = train_data.map(map_input_feature, remove_columns=feature_columns)
val_data_pre = val_data.map(map_input_feature, remove_columns=feature_columns)
test_data_pre = test_data.map(map_input_feature, remove_columns=feature_columns)

# Return the 'label' and 'input' columns as torch tensors.
for prepared_split in (train_data_pre, val_data_pre, test_data_pre):
  prepared_split.set_format(type="torch", columns=['label', 'input'])

for prepared_split in (train_data_pre, val_data_pre, test_data_pre):
  print(prepared_split)


Dataset({
    features: ['link', 'entropy', 'length', 'num_Percent', 'have_Special', 'label'],
    num_rows: 14000
})
Dataset({
    features: ['link', 'entropy', 'length', 'num_Percent', 'have_Special', 'label'],
    num_rows: 1000
})
Dataset({
    features: ['link', 'entropy', 'length', 'num_Percent', 'have_Special', 'label'],
    num_rows: 3105
})
Dataset({
    features: ['link', 'label', 'input'],
    num_rows: 14000
})
Dataset({
    features: ['link', 'label', 'input'],
    num_rows: 1000
})
Dataset({
    features: ['link', 'label', 'input'],
    num_rows: 3105
})


In [None]:
# Preview the first five preprocessed training rows.
train_data_pre[0:5]

{'label': tensor([1, 1, 1, 1, 1]),
 'input': tensor([[ 3.0000,  8.0000,  0.0000,  1.0000],
         [ 3.3083, 18.0000,  0.0000,  0.0000],
         [ 2.6416,  9.0000, 22.2222,  0.0000],
         [ 3.0958, 11.0000, 18.1818,  0.0000],
         [ 1.9219,  5.0000,  0.0000,  0.0000]])}



```
# This is formatted as code
```

# Chuẩn bị model

In [None]:
from transformers.modeling_outputs import SequenceClassifierOutput

# Neural network
class NN(nn.Module):
  """Feed-forward classifier: two 256-unit hidden layers with ReLU, then a linear output layer.

  Args:
    input_size: number of features in each input vector.
    num_classes: number of output classes (size of the last logits dimension).
  """

  def __init__(self, input_size, num_classes):
    super().__init__()
    self.num_classes = num_classes
    self.fc1 = nn.Linear(input_size, 256)
    self.fc2 = nn.Linear(256, 256)
    self.out = nn.Linear(256, num_classes)

  def forward(self, input, labels=None):
    """Compute class logits and, when labels are given, the cross-entropy loss.

    Args:
      input: feature tensor whose last dimension is ``input_size``. The name
        shadows the builtin but matches the dataset column 'input', so it is
        kept as is.
      labels: optional tensor of class indices; it is flattened to 1-D before
        the loss is computed. When omitted, no loss is computed.

    Returns:
      SequenceClassifierOutput with ``logits`` and ``loss`` (``None`` when
      ``labels`` is ``None``).
    """
    # Follow the device the layers live on rather than a notebook-level global,
    # so the module works regardless of cell execution order.
    x = input.to(self.fc1.weight.device)
    x = F.relu(self.fc1(x))
    x = F.relu(self.fc2(x))
    logits = self.out(x)

    loss = None
    if labels is not None:
      loss_fct = nn.CrossEntropyLoss()
      targets = labels.view(-1).to(logits.device)
      loss = loss_fct(logits.view(-1, self.num_classes), targets)

    # Returned as a model-output object carrying both loss and logits, which is
    # the form the Trainer used in this notebook consumes.
    return SequenceClassifierOutput(loss=loss, logits=logits)

# Device: use the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(device)


# Hyperparameters
input_size, num_classes = 4, 2  # input_size is fixed at 4
learning_rate = 1e-3
batch_size = 32
num_epochs = 10

cuda


In [None]:
# Build the neural network, move it to the selected device, and display its layers.
model = NN(input_size=input_size, num_classes=num_classes)
model = model.to(device)
model

NN(
  (fc1): Linear(in_features=4, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=256, bias=True)
  (out): Linear(in_features=256, out_features=2, bias=True)
)

In [None]:
def compute_metrics(pred):
 """Compute accuracy and weighted F1 from a prediction object.

 Args:
  pred: object exposing ``label_ids`` (true labels) and ``predictions``
   (per-class scores; the predicted class is the argmax over the last axis).

 Returns:
  Dict with the keys "accuracy" and "f1".
 """
 y_true = pred.label_ids
 y_pred = pred.predictions.argmax(-1)
 weighted_f1 = f1_score(y_true, y_pred, average="weighted")
 return {"accuracy": accuracy_score(y_true, y_pred), "f1": weighted_f1}



In [None]:
# Number of batches in one pass over the training set; used as the logging
# interval so that, on a single device, metrics are logged about once per epoch.
logging_steps = len(train_data) // batch_size
model_name = "nn_URL_model"
training_args = TrainingArguments(
    output_dir=model_name,                    # directory where checkpoints are written
    num_train_epochs=num_epochs,              # number of passes over the training set
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    # Evaluate at the end of every epoch. The previous eval_steps=40 was dropped:
    # it only applies to the "steps" strategy and had no effect here.
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy`; confirm against the installed version, since the install
    # cell upgrades transformers without pinning it.
    evaluation_strategy="epoch",
    disable_tqdm=False,
    logging_steps=logging_steps,
    save_total_limit=5,                       # cap on the number of checkpoints kept on disk
    logging_dir='./logs',
    log_level="error",
    save_strategy="epoch",                    # must match the evaluation strategy for load_best_model_at_end
    # Reload the best checkpoint when training ends. metric_for_best_model is
    # not set, so the library default decides what "best" means.
    load_best_model_at_end=True,
)


In [None]:
# class MyTrainer(Trainer):
#   def compute_loss(self , model , inputs , return_outputs = False):


In [None]:
# Assemble the Trainer from the model, the training arguments, the metric
# function and the preprocessed train/validation splits, then run training.
# The trailing semicolon suppresses the cell's return-value display.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data_pre,
    eval_dataset=val_data_pre,
    compute_metrics=compute_metrics,
)
trainer.train();

# Lưu model và đánh giá


In [None]:
# Save the trained model to Google Drive, then reload it from the same file.
# Previously the model was written to the local file 'model' while a different
# file was read back from Drive, so the reloaded object was not the model that
# had just been trained (and the cell failed when the Drive file did not exist).
model_path = '/content/drive/MyDrive/model_url_nn'
torch.save(model, model_path)
# torch.save(model, ...) pickles the whole module, so it must be loaded with
# weights_only=False on recent PyTorch releases. Unpickling executes arbitrary
# code: only load files you created yourself.
the_model = torch.load(model_path, weights_only=False)

In [21]:
from transformers import AutoTokenizer, RobertaModel
import torch
# Load the fine-tuned PhoBERT encoder and run one sample sentence through it.
# The encoder is bound to `phobert_model` instead of `model` so that it does not
# overwrite the NN classifier trained earlier in this notebook, and so that the
# `phobert_model.config` cell further down refers to a defined name.
checkpoint_name = "gechim/phobert-base-v2-finetuned"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
phobert_model = RobertaModel.from_pretrained(checkpoint_name)
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
# Inspection only: no gradients are needed for this forward pass.
with torch.no_grad():
    outputs = phobert_model(**inputs)
outputs

Some weights of RobertaModel were not initialized from the model checkpoint at gechim/phobert-base-v2-finetuned and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0146, -0.0700, -0.1169,  ...,  0.1103,  0.1094,  0.1948],
         [-0.1185,  0.1188, -0.4532,  ...,  0.0472,  0.3281,  0.2066],
         [-0.1195,  0.0243, -0.5375,  ...,  0.4129,  0.2689,  0.3015],
         ...,
         [ 0.0432,  0.1896, -0.2310,  ..., -0.3814,  0.5556, -0.5428],
         [ 0.0560,  0.0153, -0.0598,  ...,  0.3561,  0.2030, -0.0266],
         [ 0.0307, -0.0421, -0.0879,  ...,  0.0820,  0.0647,  0.2323]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[ 1.8114e-01,  1.5267e-01,  2.6328e-01,  2.3842e-01,  8.7790e-02,
         -1.5624e-01,  1.8803e-02,  4.6023e-02,  2.1273e-01, -8.7220e-02,
          8.1869e-02,  1.8275e-01, -4.7798e-02,  3.2150e-02, -1.4706e-01,
         -7.0745e-02, -2.4476e-01,  9.9409e-02,  1.0493e-01,  7.0500e-02,
         -1.7365e-01, -2.4417e-02, -1.2071e-01, -7.3499e-02,  2.0065e-01,
          7.9973e-02,  2.9614e-01, -3.0005e-01,  1.6484e-01,  2.869

In [25]:
import torch.nn as nn
from transformers import RobertaConfig

from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import  RobertaPreTrainedModel
from transformers.models.roberta.modeling_roberta import  RobertaModel

class RobertaForTokenClassification(RobertaPreTrainedModel):
    """RoBERTa/PhoBERT encoder followed by a dense + dropout + linear classification head.

    Despite the class name, the head reads only the hidden state of the first
    token (index 0) of each sequence, so it produces one row of logits per
    input sequence rather than one per token.
    """
    config_class = RobertaConfig

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        # Layers of the PhoBERT branch (Bảo's part).
        # NOTE(review): RobertaPreTrainedModel uses the base-model prefix
        # "roberta"; because this attribute has a different name,
        # from_pretrained may not map a checkpoint's encoder weights onto it.
        # Confirm before relying on pretrained weights.
        self.phoBertFineTunedURL = RobertaModel(config, add_pooling_layer=False)

        # Layers of the second branch (Sang's part): not implemented yet.

        # Classification head.
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialise the weights of the layers created above. Pretrained
        # weights, if any, are loaded separately by from_pretrained.
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None):
        """Encode the input, classify from the first token, and optionally compute the loss.

        Args:
            input_ids: token ids passed to the encoder.
            attention_mask: optional attention mask passed to the encoder.
            token_type_ids: optional token type ids passed to the encoder.
            labels: optional tensor of class indices; flattened to 1-D for the
                cross-entropy loss. When ``None``, no loss is computed.

        Returns:
            TokenClassifierOutput with ``loss`` (``None`` without labels),
            ``logits``, and the encoder's ``hidden_states`` and ``attentions``.
        """
        # return_dict=True guarantees attribute access on the encoder output below.
        encoder_outputs = self.phoBertFineTunedURL(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=True,
        )

        # Output of the second branch (Sang's part): not implemented yet.

        # Classification head. Indexing [:, 0, :] keeps the first token of every
        # sequence, giving a (batch, hidden_size) tensor.
        first_token_state = encoder_outputs[0][:, 0, :]
        # The second branch's output is meant to be combined with this one here.
        dense_output = self.dense(first_token_state)
        sequence_output = self.dropout(dense_output)
        logits = self.classifier(sequence_output)

        # Compute the loss only when labels are supplied.
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        # Previously this read `outputs.hidden_states`, but `outputs` is not a
        # local of this method; the encoder result is the intended source.
        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

In [28]:
import torch.nn as nn
from transformers import AutoConfig , AutoModel , AutoModelForPreTraining
from transformers.modeling_outputs import TokenClassifierOutput

class PhoBERT_NN_Classifier(AutoModelForPreTraining):
    """Placeholder for the combined PhoBERT + NN classifier; it adds no behaviour yet.

    The class previously had no body, which is a syntax error.

    NOTE(review): from_pretrained on the auto class appears to build the
    concrete architecture for the checkpoint rather than an instance of this
    subclass (the recorded output of the loading cell shows RobertaForMaskedLM).
    Confirm this is intended.
    """

In [17]:
# Class index -> display name, and the inverse lookup derived from it.
id2label = dict(enumerate(("Bình thường", "Bất thường")))
label2id = {name: index for index, name in id2label.items()}

In [31]:
from transformers import AutoConfig
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The configuration comes from the fine-tuned checkpoint, with the two-class
# label maps attached.
phobert_config = AutoConfig.from_pretrained(
    'gechim/phobert-base-v2-finetuned',
    num_labels= 2,
    id2label=id2label,
    label2id=label2id,
)
# NOTE(review): the weights are loaded from the base 'vinai/phobert-base-v2'
# checkpoint while the config comes from the fine-tuned one. Confirm that this
# mix is intended.
phobert_nn_cls_model = PhoBERT_NN_Classifier.from_pretrained('vinai/phobert-base-v2', config=phobert_config)
phobert_nn_cls_model = phobert_nn_cls_model.to(device)
phobert_nn_cls_model

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(258, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): 

In [20]:
# Display the configuration object of the PhoBERT model.
phobert_model.config

RobertaConfig {
  "_name_or_path": "gechim/phobert-base-v2-finetuned",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "B\u00ecnh th\u01b0\u1eddng",
    "1": "B\u1ea5t th\u01b0\u1eddng"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B\u00ecnh th\u01b0\u1eddng": 0,
    "B\u1ea5t th\u01b0\u1eddng": 1
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 258,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "tokenizer_class": "PhobertTokenizer",
  "torch_dtype": "float32",
  "transformers_version": "4.32.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 64001
}

"Hi there! My name is [Your Name], and I'm excited to be a part of this program today. Let me know if you have any questions or need help with anything."


