In [None]:
# Mount Google Drive inside the Colab runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Install the Hugging Face libraries used by this notebook.
# `-U` upgrades accelerate and transformers to the newest available release.
# NOTE(review): no versions are pinned, so a re-run may pick up different releases.
!pip install datasets
!pip install -U accelerate
!pip install -U transformers

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score
from transformers import Trainer, TrainingArguments

In [None]:
from transformers import AutoModel, AutoTokenizer

# Use the first CUDA device when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'
print(device)

# Checkpoint used for both the tokenizer and the models loaded from it
# (alternative considered: vinai/phobert-large).
model_ckpt = "vinai/phobert-base-v2"

# Load the bare encoder and its tokenizer, then move the encoder to `device`.
phobert = AutoModel.from_pretrained(model_ckpt)
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
phobert.to(device)

In [None]:
def tokenize(batch, max_length=256):
    """Tokenize the ``url`` field of a batch with the module-level ``tokenizer``.

    Args:
        batch: Mapping with a ``"url"`` entry holding the text(s) to encode.
        max_length: Upper bound on the encoded length. The default of 256
            matches the encoder printed in this notebook (258 position
            embeddings with ``padding_idx=1``), so longer inputs are truncated
            instead of overflowing the position-embedding table.

    Returns:
        The tokenizer output for ``batch["url"]``, padded to the longest
        sequence in the batch and truncated to ``max_length``.
    """
    return tokenizer(
        batch["url"],
        padding=True,
        truncation=True,
        max_length=max_length,
    )

In [None]:
# Split the labelled CSV into train / validation parts and load the test CSV.
# Columns are named "url" and "label" through the `names` option.
TRAIN_CSV = "/content/train.csv"
TEST_CSV = "/content/test.csv"
csv_kwargs = {"sep": ",", "names": ["url", "label"]}

train_data = load_dataset("csv", data_files=TRAIN_CSV, split="train[0:23000]", **csv_kwargs)
val_data = load_dataset("csv", data_files=TRAIN_CSV, split="train[23000:]", **csv_kwargs)
test_data = load_dataset("csv", data_files=TEST_CSV, split="train[0:]", **csv_kwargs)
for raw_split in (train_data, val_data, test_data):
    print(raw_split)

# Tokenize each split; batch_size=None hands the whole split to `tokenize` at once.
map_kwargs = {"batched": True, "batch_size": None}
val_data_tokenized = val_data.map(tokenize, **map_kwargs)
test_data_tokenized = test_data.map(tokenize, **map_kwargs)
train_data_tokenized = train_data.map(tokenize, **map_kwargs)

# Expose only these columns, returned as torch tensors.
TORCH_COLUMNS = ("input_ids", "attention_mask", "label", "token_type_ids")
for tokenized_split in (train_data_tokenized, test_data_tokenized, val_data_tokenized):
    tokenized_split.set_format("torch", columns=list(TORCH_COLUMNS))

print("------------------")
for tokenized_split in (train_data_tokenized, test_data_tokenized, val_data_tokenized):
    print(tokenized_split)

In [None]:
# Display the tokenized training split.
train_data_tokenized

In [None]:
# Sequence-classification model built from the same checkpoint, with
# `num_labels` output classes, placed on `device`.
num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
)
model = model.to(device)
model

In [None]:
from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(pred):
    """Compute accuracy and weighted F1 for a prediction object.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores); the predicted class is the argmax over the
            last axis of ``predictions``.

    Returns:
        dict with keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    weighted_f1 = f1_score(y_true, y_hat, average="weighted")
    accuracy = accuracy_score(y_true, y_hat)
    return {"accuracy": accuracy, "f1": weighted_f1}

In [None]:
batch_size = 64  # 16
# Log about once per epoch; clamp to 1 so a dataset smaller than one batch
# does not produce a logging interval of 0.
logging_steps = max(1, len(train_data_tokenized) // batch_size)
model_name = f"{model_ckpt}-finetuned"

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases. The notebook installs transformers with `-U`, so pick whichever
# keyword the installed version actually declares.
eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    learning_rate=2e-5,

    # Number of training epochs.
    num_train_epochs=10,

    # Maximum number of optimisation steps. A positive value overrides
    # `num_train_epochs`; -1 (the default) lets the epoch count decide.
    # The previous value of 3 stopped training before the first evaluation /
    # checkpoint at step 50 could ever happen.
    max_steps=-1,

    # Batch sizes.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,

    # Evaluate and checkpoint every 50 steps, keep at most two checkpoints,
    # and reload the one with the best eval accuracy when training ends.
    save_strategy="steps",
    eval_steps=50,
    save_steps=50,
    save_total_limit=2,
    metric_for_best_model="eval_accuracy",
    load_best_model_at_end=True,

    # Directory where checkpoints are written.
    output_dir=model_name,

    # Optimiser and schedule. The TrainingArguments field is `optim`
    # (not `optimizer`); other values include "adafactor", "sgd",
    # "paged_adamw_8bit".
    optim="adamw_torch",
    lr_scheduler_type='cosine_with_restarts',  # also: linear, cosine, polynomial
    warmup_steps=1,  # learning-rate warmup steps
    gradient_accumulation_steps=1,  # batches accumulated before each weight update
    gradient_checkpointing=False,
    weight_decay=0.01,

    # Half-precision training needs a CUDA device; stay in full precision on CPU.
    fp16=torch.cuda.is_available(),
    # bf16=False,

    disable_tqdm=False,
    report_to="wandb",  # e.g. "tensorboard", "mlflow", "comet_ml", "wandb"
    logging_steps=logging_steps,
    logging_dir='./logs',
    log_level="error",
    **{eval_strategy_arg: "steps"},
)


In [None]:
from transformers import Trainer
# Fine-tune the classifier.
# Periodic evaluation and best-checkpoint selection use the validation split;
# evaluating on the test split here would leak it into model selection and
# make the later test-set numbers optimistic.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data_tokenized,
    eval_dataset=val_data_tokenized,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train();


In [None]:
# Run the trained model over the validation split and show the resulting metrics.
preds_output = trainer.predict(val_data_tokenized)
preds_output.metrics

In [None]:
# Log in to the Hugging Face Hub from the shell, then upload the trainer's model.
# NOTE(review): the login command prompts for a token — never paste it into a cell.
!huggingface-cli login
trainer.push_to_hub()

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
def plot_confusion_matrix(y_preds, y_true, labels):
    """Plot a confusion matrix normalised over the true labels (rows).

    Args:
        y_preds: Predicted class labels.
        y_true: Ground-truth class labels.
        labels: Display names for the classes, in matrix order.
    """
    normalised_cm = confusion_matrix(y_true, y_preds, normalize="true")
    figure, axis = plt.subplots(figsize=(6, 6))
    display = ConfusionMatrixDisplay(
        confusion_matrix=normalised_cm,
        display_labels=labels,
    )
    display.plot(cmap="Blues", values_format=".2f", ax=axis, colorbar=False)
    axis.set_title("Normalized confusion matrix")
    plt.show()



In [None]:
import numpy as np
# Confusion matrix for the validation predictions held in `preds_output`.
y_true_val = val_data_tokenized["label"].numpy()
y_preds = np.argmax(preds_output.predictions, axis=1)
plot_confusion_matrix(y_preds, y_true_val, ["0", "1"])

In [None]:
import numpy as np
# Predict on the test split (overwrites `preds_output`) and plot its confusion matrix.
preds_output = trainer.predict(test_data_tokenized)
y_true_test = test_data_tokenized["label"].numpy()
y_preds = np.argmax(preds_output.predictions, axis=1)
plot_confusion_matrix(y_preds, y_true_test, ["0", "1"])

In [None]:
# Save the model.
# `torch.save(model, ...)` pickles the whole object, so loading it later only
# works when the model's class is defined under the same name in the loading
# process; otherwise `torch.load` fails with "Can't get attribute ...".
torch.save(model, 'model')
# Also store just the weights. These can be restored with `load_state_dict`
# on a freshly constructed model and do not depend on pickling the class.
torch.save(model.state_dict(), 'model_state_dict.pt')

AttributeError: Can't get attribute 'NN' on <module '__main__'>

In [None]:
# Classify a single URL.
# NOTE(review): `the_model` is not defined in any cell visible in this notebook;
# it has to exist in the kernel already (presumably the saved model loaded back) — confirm.
text = "sexnhanh.com"
input_ids = torch.tensor([tokenizer(text).input_ids]).to(device)
the_model.eval()  # inference mode: disables dropout so the output is deterministic
with torch.no_grad():  # no gradients are needed for a prediction
    logits = the_model(input_ids).logits
torch.argmax(logits, dim=1)



In [None]:
# Display the model used for inference.
# NOTE(review): `the_model` is not defined in any visible cell — confirm where it is loaded.
the_model

In [None]:
def predict(row, max_length=256):
    """Predict the class of one example with ``the_model``.

    Args:
        row: Mapping with a ``'url'`` entry holding the text to classify.
        max_length: Maximum number of tokens fed to the model; longer inputs
            are truncated. 256 matches the 258-entry position-embedding table
            (``padding_idx=1``) of the encoder printed in this notebook.

    Returns:
        dict with a single key ``'predict'`` holding the predicted class index
        as a Python int.
    """
    encoded = tokenizer(row['url'], truncation=True, max_length=max_length)
    input_ids = torch.tensor([encoded.input_ids]).to(device)
    # Inference only: skip building the autograd graph for every row.
    with torch.no_grad():
        logits = the_model(input_ids).logits
    return {
        'predict': torch.argmax(logits, dim=1).item()
    }
# Apply `predict` to every row of the raw test split; its output is added under the 'predict' key.
test_data_tokenized_predict = test_data.map(predict)

In [None]:
# Switch the dataset's output format to pandas (changes the dataset object in place).
test_data_tokenized_predict.set_format('pandas')

In [None]:
# Take all rows; with the pandas format set on the dataset this is expected to be a DataFrame.
df = test_data_tokenized_predict[:]

In [None]:
# Display the test rows together with their predictions.
df

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel
from transformers.modeling_outputs import SequenceClassifierOutput


# Load the encoder from the "gechim/phobert-base-v2-finetuned" checkpoint.
# The NN class below reads this module-level variable as its text branch.
phoBert = AutoModel.from_pretrained("gechim/phobert-base-v2-finetuned")
# Neural network combining a small dense branch with a PhoBERT text branch.
class NN(nn.Module):
    """Two-branch classifier: dense features + encoder embedding -> class logits.

    The dense branch maps ``features`` through two ReLU layers. The text branch
    takes the encoder's hidden state at sequence position 0. Both branch
    outputs go through dropout, are concatenated along dim 1 and fed to a
    linear output layer.

    Args:
        input_size: Size of the last dimension of ``features``.
        num_classes: Number of output classes.
        backbone: Optional encoder module to use as the text branch. When
            omitted, the module-level ``phoBert`` is used, so existing
            ``NN(input_size, num_classes)`` calls behave as before.
    """

    def __init__(self, input_size, num_classes, backbone=None):
        super().__init__()
        self.phoBert = phoBert if backbone is None else backbone
        self.num_classes = num_classes

        dense_out_dim = 768  # output width of the dense branch
        # Width of the encoder output, read from its config instead of being
        # hard-coded (768 for the encoder printed in this notebook, which
        # reproduces the previous 1536-wide concatenation).
        encoder_dim = self.phoBert.config.hidden_size

        self.fc1 = nn.Linear(input_size, 256)
        self.fc2 = nn.Linear(256, dense_out_dim)
        self.dropout_nn = nn.Dropout(0.1)
        self.dropout_lm = nn.Dropout(0.1)

        # The output layer consumes the concatenation of both branches.
        self.out = nn.Linear(dense_out_dim + encoder_dim, num_classes)

    def forward(self, features, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        """Compute logits and, when labels are given, the cross-entropy loss.

        Args:
            features: Input to the dense branch; presumably shaped
                (batch, input_size) — TODO confirm against the caller.
            input_ids: Token ids passed to the encoder.
            token_type_ids: Optional segment ids passed to the encoder.
            attention_mask: Optional attention mask passed to the encoder.
            labels: Optional class indices. When provided, the loss is
                computed; when omitted (inference), ``loss`` is ``None``.

        Returns:
            SequenceClassifierOutput with ``loss`` and ``logits``.
        """
        # Dense-feature branch.
        x_nn = F.relu(self.fc1(features))
        x_nn = F.relu(self.fc2(x_nn))

        # Language-model branch: hidden state of the first token of each sequence.
        encoder_out = self.phoBert(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
        )
        x_phoBert = encoder_out.last_hidden_state[:, 0, :]

        # Apply dropout to each branch before concatenating.
        x_nn = self.dropout_nn(x_nn)
        x_phoBert = self.dropout_lm(x_phoBert)

        logits = self.out(torch.cat((x_nn, x_phoBert), dim=1))

        # The loss is returned inside the output object so that a Hugging Face
        # Trainer can read it.
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_classes), labels.view(-1))
        return SequenceClassifierOutput(loss=loss, logits=logits)




# Hyperparameters
num_classes = 2
input_size = 10
num_epochs = 50
batch_size = 64

# Build the two-branch model with the sizes above.
model = NN(input_size, num_classes)

Some weights of RobertaModel were not initialized from the model checkpoint at gechim/phobert-base-v2-finetuned and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Display the module structure of the NN model.
model

NN(
  (phoBert): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(258, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,)

NameError: name 'torch' is not defined