In [1]:
!pip install transformers torch scikit-learn pandas

import inspect

import pandas as pd
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments



In [2]:
# Read the pre-split train / test / validation CSV files, in that order.
train_data, test_data, val_data = map(
    pd.read_csv,
    (
        "../Data/Train_and_Test_dataset/train.csv",
        "../Data/Train_and_Test_dataset/test.csv",
        "../Data/Train_and_Test_dataset/val.csv",
    ),
)

# Sanity check: show the first rows of every split, one after another.
print(train_data.head(), test_data.head(), val_data.head(), sep="\n")

   Unnamed: 0               title  \
0        2379  Tipping the Velvet   
1        5778        The Thin Man   
2       36041       Saving Steele   
3        9588  The Winthrop Woman   
4       12657   Operation Redwood   

                                              author  \
0                                       Sarah Waters   
1                                   Dashiell Hammett   
2                      Anne Jolin (Goodreads Author)   
3  Anya Seton, Philippa Gregory (Goodreads Author...   
4                                  S. Terrell French   

                                         description  \
0  Nan King, an oyster girl, is captivated by the...   
1  Nick and Nora Charles are Hammett's most encha...   
2  Kennedy Cross had her heart broken early in li...   
3  First published in 1958 and set in the early 1...   
4  "Sibley Carter is a moron and a worldclass jer...   

                             genres  
0                   fiction,romance  
1                          

In [3]:
# Load the tokenizer that matches the pretrained checkpoint used for the model.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def _encode_descriptions(frame, max_length=128):
    """Tokenize the ``description`` column of one data split.

    Args:
        frame: DataFrame with a ``description`` column.
        max_length: Maximum number of tokens kept per description; longer
            texts are truncated.

    Returns:
        The tokenizer output for all rows of ``frame``, padded to the longest
        sequence in that split.
    """
    # Missing descriptions are read by pandas as NaN (a float), which the
    # tokenizer rejects; encode them as empty strings instead of crashing.
    texts = frame["description"].fillna("").tolist()
    return tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=max_length,
    )


# Tokenize the description of every split with identical settings.
train_encodings = _encode_descriptions(train_data)
test_encodings = _encode_descriptions(test_data)
val_encodings = _encode_descriptions(val_data)

In [4]:
import ast

def _parse_genres(value):
    """Turn one raw ``genres`` cell into a list of genre names.

    Args:
        value: A comma-separated string (e.g. ``"fiction,romance"``), an
            already-parsed list/tuple, or a missing value such as NaN.

    Returns:
        A list of genre names with surrounding whitespace removed and empty
        entries dropped; ``[]`` for missing values.
    """
    if isinstance(value, str):
        stripped = (part.strip() for part in value.split(","))
        return [name for name in stripped if name]
    # Already parsed (the cell was re-run): keep the labels instead of
    # replacing them with an empty list.
    if isinstance(value, (list, tuple)):
        return list(value)
    return []


# Convert the label column from a comma-separated string to a list of genres.
# Because _parse_genres accepts lists too, re-running this cell is harmless.
train_data["genres"] = train_data["genres"].apply(_parse_genres)
test_data["genres"] = test_data["genres"].apply(_parse_genres)
val_data["genres"] = val_data["genres"].apply(_parse_genres)

# Binarize the labels: fit the genre vocabulary on the training split only,
# then reuse it for the test and validation splits.
mlb = MultiLabelBinarizer()
train_labels_binary = mlb.fit_transform(train_data["genres"])
test_labels_binary = mlb.transform(test_data["genres"])
val_labels_binary = mlb.transform(val_data["genres"])

print(f"S·ªë l∆∞·ª£ng nh√£n: {len(mlb.classes_)}")

S·ªë l∆∞·ª£ng nh√£n: 5


In [5]:
class BookDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with multi-hot labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a sequence
            holding one entry per sample.
        labels: Sequence of per-sample label vectors; its length defines the
            dataset size.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of samples (one per label row)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, including ``labels``."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        # Labels are cast to float, unlike the encoding fields above.
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

# Build one Dataset per split by pairing its encodings with its binarized labels.
train_dataset, test_dataset, val_dataset = (
    BookDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels_binary),
        (test_encodings, test_labels_binary),
        (val_encodings, val_labels_binary),
    )
)

In [6]:
# Number of output labels = number of distinct genres learned by the binarizer.
num_labels = len(mlb.classes_)

# Store the genre names in the model config so the saved checkpoint can map
# output indices back to genres without needing the fitted binarizer.
# str() converts numpy string scalars so the config stays JSON-serializable.
id2label = {index: str(genre) for index, genre in enumerate(mlb.classes_)}
label2id = {genre: index for index, genre in id2label.items()}

# Load pretrained BERT with a fresh classification head configured for
# multi-label classification (each genre is an independent yes/no decision).
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    problem_type="multi_label_classification",
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Training configuration.
#
# Newer transformers releases name the evaluation-schedule argument
# `eval_strategy`; older ones only know `evaluation_strategy`. The install at
# the top of the notebook is unpinned, so use whichever name the installed
# version accepts.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",            # Must match the evaluation schedule for load_best_model_at_end
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,                # Regularization
    logging_dir="./logs",
    load_best_model_at_end=True,      # Reload the best checkpoint after training
    metric_for_best_model="f1",       # Criterion used to pick the best checkpoint
    save_total_limit=1,
    greater_is_better=True,           # Higher F1 is better
    **{_eval_strategy_kwarg: "epoch"},  # Evaluate at the end of every epoch
)



In [8]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

def compute_metrics(pred):
    """Compute micro-averaged multi-label metrics for one evaluation pass.

    Args:
        pred: A pair that unpacks into ``(logits, labels)`` arrays.

    Returns:
        dict with keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
        Note that ``accuracy_score`` on multi-label input is exact-match
        accuracy (every label of a sample must be right), so it is expected
        to be much lower than the micro-averaged scores.
    """
    logits, labels = pred

    # A logit above 0 corresponds to a sigmoid probability above 0.5.
    predictions = (logits > 0).astype(int)

    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, average="micro"),
        "precision": precision_score(labels, predictions, average="micro"),
        "recall": recall_score(labels, predictions, average="micro"),
    }


In [9]:
# Newer transformers releases deprecate Trainer's `tokenizer` argument in
# favour of `processing_class`. Use the new name when the installed version
# has it; otherwise fall back to the old one.
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

# Train on the training split; per-epoch evaluation runs on the validation
# split and is scored by compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

  trainer = Trainer(


In [10]:
# Train the model (fine-tunes `model` using the settings in `training_args`)
trainer.train()

  0%|          | 0/4113 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


{'loss': 0.4356, 'grad_norm': 1.643088936805725, 'learning_rate': 1.7568684658400197e-05, 'epoch': 0.36}
{'loss': 0.3693, 'grad_norm': 2.16227126121521, 'learning_rate': 1.513736931680039e-05, 'epoch': 0.73}


  0%|          | 0/457 [00:00<?, ?it/s]

{'eval_loss': 0.3438918888568878, 'eval_accuracy': 0.47489396634286496, 'eval_f1': 0.839430711610487, 'eval_precision': 0.8420799519086264, 'eval_recall': 0.8367980884109917, 'eval_runtime': 40.4859, 'eval_samples_per_second': 180.532, 'eval_steps_per_second': 11.288, 'epoch': 1.0}
{'loss': 0.339, 'grad_norm': 2.034111738204956, 'learning_rate': 1.2706053975200586e-05, 'epoch': 1.09}
{'loss': 0.2948, 'grad_norm': 2.755068302154541, 'learning_rate': 1.0274738633600778e-05, 'epoch': 1.46}
{'loss': 0.2914, 'grad_norm': 3.7234046459198, 'learning_rate': 7.843423292000973e-06, 'epoch': 1.82}


  0%|          | 0/457 [00:00<?, ?it/s]

{'eval_loss': 0.3337067663669586, 'eval_accuracy': 0.5003420440552743, 'eval_f1': 0.8499615589331125, 'eval_precision': 0.8415505328492797, 'eval_recall': 0.858542413381123, 'eval_runtime': 41.9401, 'eval_samples_per_second': 174.272, 'eval_steps_per_second': 10.896, 'epoch': 2.0}
{'loss': 0.2623, 'grad_norm': 2.644906520843506, 'learning_rate': 5.412107950401167e-06, 'epoch': 2.19}
{'loss': 0.2382, 'grad_norm': 2.9473319053649902, 'learning_rate': 2.9807926088013615e-06, 'epoch': 2.55}
{'loss': 0.226, 'grad_norm': 3.286533832550049, 'learning_rate': 5.49477267201556e-07, 'epoch': 2.92}


  0%|          | 0/457 [00:00<?, ?it/s]

{'eval_loss': 0.3404996395111084, 'eval_accuracy': 0.5015734026542619, 'eval_f1': 0.8509075834295697, 'eval_precision': 0.843638071751512, 'eval_recall': 0.8583034647550777, 'eval_runtime': 41.9124, 'eval_samples_per_second': 174.388, 'eval_steps_per_second': 10.904, 'epoch': 3.0}
{'train_runtime': 1487.4239, 'train_samples_per_second': 44.225, 'train_steps_per_second': 2.765, 'train_loss': 0.3046903343580948, 'epoch': 3.0}


TrainOutput(global_step=4113, training_loss=0.3046903343580948, metrics={'train_runtime': 1487.4239, 'train_samples_per_second': 44.225, 'train_steps_per_second': 2.765, 'total_flos': 4327043632356096.0, 'train_loss': 0.3046903343580948, 'epoch': 3.0})

In [11]:
# Evaluate the trained model on the held-out test split and print the metrics.
test_results = trainer.evaluate(eval_dataset=test_dataset)
print("Test Results:", test_results)

  0%|          | 0/457 [00:00<?, ?it/s]

Test Results: {'eval_loss': 0.33609864115715027, 'eval_accuracy': 0.504993843207005, 'eval_f1': 0.8520479909044669, 'eval_precision': 0.8478118487645132, 'eval_recall': 0.8563266778927111, 'eval_runtime': 41.8179, 'eval_samples_per_second': 174.782, 'eval_steps_per_second': 10.928, 'epoch': 3.0}


In [12]:
# Evaluate the trained model on the validation split and print the metrics.
val_results = trainer.evaluate(eval_dataset=val_dataset)
print("Validation Results:", val_results)

  0%|          | 0/457 [00:00<?, ?it/s]

Validation Results: {'eval_loss': 0.3404996395111084, 'eval_accuracy': 0.5015734026542619, 'eval_f1': 0.8509075834295697, 'eval_precision': 0.843638071751512, 'eval_recall': 0.8583034647550777, 'eval_runtime': 41.6488, 'eval_samples_per_second': 175.491, 'eval_steps_per_second': 10.973, 'epoch': 3.0}


In [13]:
# Save the model and the tokenizer into the same directory
trainer.save_model("./bert_finetuned")
tokenizer.save_pretrained("./bert_finetuned")

('./bert_finetuned\\tokenizer_config.json',
 './bert_finetuned\\special_tokens_map.json',
 './bert_finetuned\\vocab.txt',
 './bert_finetuned\\added_tokens.json',
 './bert_finetuned\\tokenizer.json')