In [1]:
import re
import torch
import numpy as np
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from vncorenlp import VnCoreNLP
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, recall_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
labels_task_1 = ["no-spam", "spam"]
labels_task_2 = ["no-spam", "spam-1", "spam-2", "spam-3"]

def show_predict_result(trainer, test_dataset, y_test, labels):
    y_pred_classify = trainer.predict(test_dataset)
    y_pred = np.argmax(y_pred_classify.predictions, axis=-1)
    cf = confusion_matrix(y_test, y_pred)
    df_cm = pd.DataFrame(cf, index = labels, columns = labels)
    sns.heatmap(df_cm, annot=True, cmap="Greys",fmt='g', cbar=True, annot_kws={"size": 30})
    print("F1 - micro:", f1_score(y_test, y_pred, average='micro'))
    print("F1 - macro:", f1_score(y_test, y_pred, average='macro'))
    print("Accuracy:", accuracy_score(y_test, y_pred))


class BuildDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [None]:
train_data = pd.read_csv("/home/duy/Documents/NTD_Data_Science_Spam/dataset/train.csv")
test_data = pd.read_csv("/home/duy/Documents/NTD_Data_Science_Spam/dataset/test.csv")
dev_data = pd.read_csv('/home/duy/Documents/NTD_Data_Science_Spam/dataset/dev.csv')
STOPWORDS_PATH = ('/home/duy/Documents/NTD_Data_Science_Spam/dataset/vietnamese-stopwords-dash.txt')

In [3]:
#Load dữ liệu
X_train = train_data.iloc[:, 0:2]
y_train = train_data.iloc[:, 2:4]

X_dev = dev_data.iloc[:, 0:2]
y_dev = dev_data.iloc[:, 2:4]

X_test = test_data.iloc[:, 0:2]
y_test = test_data.iloc[:, 2:4]

NameError: name 'PATH_TRAIN' is not defined

In [29]:
model_name = "vinai/phobert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['

In [30]:
vncorenlp = VnCoreNLP("vncorenlp/VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m')

In [31]:
with open(STOPWORDS_PATH, "r") as ins:
    stopwords = []
    for line in ins:
        dd = line.strip('\n')
        stopwords.append(dd)
    stopwords = set(stopwords)

In [32]:
# Hàm để loại bỏ stop words
def filter_stop_words(train_sentences, stop_words):
    new_sent = [word for word in train_sentences.split() if word not in stop_words]
    train_sentences = ' '.join(new_sent)
    return train_sentences

def remove_emojis(text):
    # Biểu thức chính quy để tìm các biểu tượng cảm xúc
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642" 
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    # Loại bỏ các biểu tượng cảm xúc
    text = emoji_pattern.sub(r'', text)
    return text
# xử lý tất cả các hàm trên
def preprocess(text, tokenized=True, lowercased=True):
    text = filter_stop_words(text, stopwords)
    text = remove_emojis(text)
    text = text.lower() if lowercased else text
    if tokenized:
        pre_text = ""
        sentences = vncorenlp.tokenize(text)
        for sentence in sentences:
            pre_text += " ".join(sentence)
        text = pre_text
    return text


In [33]:
#áp dụng cho file train
train_data['Comment'] = train_data['Comment'].apply(lambda x: preprocess(x, tokenized=True, lowercased=True))
train_data.to_csv("new_train_data.csv", index=False)
#áp dụng cho file test
test_data['Comment'] = test_data['Comment'].apply(lambda x: preprocess(x, tokenized=True, lowercased=True))
test_data.to_csv("new_test_data.csv", index=False)

In [34]:
new_train_data = pd.read_csv("/home/duy/Documents/NTD_Data_Science_Spam/new_train_data.csv")
new_test_data = pd.read_csv("/home/duy/Documents/NTD_Data_Science_Spam/new_test_data.csv")

In [41]:
# Load training data
  # Adjust the file path and format accordingly
train_comments = new_train_data["Comment"].tolist()
print(type(train_comments))
train_labels = new_train_data["Label"].tolist()

# Tokenize and encode the training comments
encoded_inputs = tokenizer.encode(text=train_comments, padding=True, truncation=True, max_length=512, return_tensors="pt", is_split_into_words=True)
input_ids_train = encoded_inputs["input_ids"]

# Load testing data
  # Adjust the file path and format accordingly
test_comments = new_test_data["Comment"]
test_labels = new_test_data["Label"].tolis

<class 'list'>


TypeError: PreTokenizedEncodeInput must be Union[PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence]]

In [None]:
# Convert lists to tensors
input_ids_train = torch.stack([input_ids_train[i] for i in range(len(input_ids_train))])
train_labels = torch.tensor(train_labels)

input_ids_test = torch.stack([input_ids_test[i] for i in range(len(input_ids_test))])
test_labels = torch.tensor(test_labels)

NameError: name 'input_ids_test' is not defined

In [None]:
# Tokenize and encode the testing comments
encoded_inputs = tokenizer(test_comments, padding=True, truncation=True, max_length=512, return_tensors="pt")
input_ids_test = encoded_inputs["input_ids"]

# Split training data into training and validation setsbatch
input_ids_train, input_ids_val, labels_train, labels_val = train_test_split(
    input_ids_train, train_labels, test_size=0.2, random_state=42
)

In [None]:
# Create data loaders for training, validation, and testing
train_data = torch.utils.data.TensorDataset(input_ids_train, labels_train)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=8, shuffle=True)

val_data = torch.utils.data.TensorDataset(input_ids_val, labels_val)
val_loader = torch.utils.data.DataLoader(val_data, batch_size=8, shuffle=False)

test_data = torch.utils.data.TensorDataset(input_ids_test, test_labels)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=8, shuffle=False)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

model.train()
for epoch in range(5):
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()


RuntimeError: The expanded size of the tensor (349) must match the existing size (258) at non-singleton dimension 1.  Target sizes: [8, 349].  Tensor sizes: [1, 258]

In [None]:
# Evaluation
model.eval()
predictions = []
true_labels = []

for inputs, labels in test_loader:
    inputs = inputs.to(device)
    labels = labels.to(device)

    with torch.no_grad():
        outputs = model(inputs)
        logits = outputs.logits
        predicted_labels = torch.argmax(logits, dim=1)
        
    predictions.extend(predicted_labels.cpu().numpy())
    true_labels.extend(labels.cpu().numpy())

# Convert predictions and true labels to numpy arrays
predictions = np.array(predictions)
true_labels = np.array(true_labels)

# Classification report
report = classification_report(true_labels, predictions)
print(report)