In [1]:
import os

import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import numpy as np
import pandas as pd
import evaluate
import wandb

# Root directory for training artifacts; later cells place checkpoints under
# `results/` and logs under `logs/` beneath it. The original local path stays
# the default, and setting the SMSSPAM_DATA_DIR environment variable relocates
# it without editing the notebook.
DATA_DIR = os.environ.get("SMSSPAM_DATA_DIR", "/media/hdddisk/bert-classify-smsspam-data")

  from .autonotebook import tqdm as notebook_tqdm
2025-02-06 09:19:00.256803: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-06 09:19:00.276979: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Read the full SMS spam table from disk.
df = pd.read_csv("./data/smsspam.csv")

# 80/20 split: sample 80% of the rows with a fixed seed for training, then
# keep every row whose index label was not sampled as the test set.
train_df = df.sample(frac=0.8, random_state=42)
test_df = df.drop(index=train_df.index)

# Write both splits back out as CSV, omitting the index column.
for split_frame, split_path in (
    (train_df, "./data/smsspam-train.csv"),
    (test_df, "./data/smsspam-test.csv"),
):
    split_frame.to_csv(split_path, index=False)

In [3]:
# Load the two CSV splits written earlier, naming them 'train' and 'test'.
split_files = {
    'train': './data/smsspam-train.csv',
    'test': './data/smsspam-test.csv',
}
dataset = load_dataset('csv', data_files=split_files)

# Tokenizer belonging to the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the ``'text'`` entry of a batch with the module-level tokenizer.

    Args:
        examples: A batch handed over by ``dataset.map(..., batched=True)``;
            only ``examples['text']`` is read.

    Returns:
        Whatever the tokenizer returns for the batch when called with
        ``padding="max_length"`` and ``truncation=True``.

    Note:
        No ``max_length`` is passed, so the padded length is decided by the
        tokenizer itself.
        NOTE(review): presumably the model maximum, which would be far longer
        than a typical SMS -- confirm if training speed matters.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

# Run the tokenizer over both splits, one batch at a time.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Expose only the model inputs and the label, returned as torch tensors.
model_columns = ['input_ids', 'attention_mask', 'label']
tokenized_datasets.set_format(columns=model_columns, type='torch')

# Pre-trained BERT with a sequence-classification head sized for two labels.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

Generating train split: 4459 examples [00:00, 512731.70 examples/s]
Generating test split: 1115 examples [00:00, 408084.55 examples/s]
Map: 100%|██████████| 4459/4459 [00:01<00:00, 4030.81 examples/s]
Map: 100%|██████████| 1115/1115 [00:00<00:00, 4124.52 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Run configuration for fine-tuning, grouped by concern.
training_args = TrainingArguments(
    # --- Where artifacts are written ---
    output_dir=f'{DATA_DIR}/results',
    logging_dir=f'{DATA_DIR}/logs',
    logging_steps=10,
    # --- Optimisation ---
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # NOTE(review): 1e-6 is well below the 2e-5 to 5e-5 range commonly used
    # when fine-tuning BERT -- confirm this is intentional.
    learning_rate=1e-6,
    warmup_steps=500,
    weight_decay=0.01,
    # --- Evaluation and checkpointing (both once per epoch) ---
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # Reload the checkpoint with the best "accuracy" once training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Accuracy scorer loaded through the `evaluate` library; used by compute_metrics.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Turn an evaluation prediction into the accuracy metric.

    Args:
        eval_pred: An object that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max class predictions
        against the reference labels.
    """
    scores, gold_labels = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(references=gold_labels, predictions=predicted_classes)



In [19]:
# Wire the model, its run configuration, both tokenized splits and the metric
# callback into a single Trainer.
# NOTE(review): the 'test' split is the evaluation set here, and training_args
# selects the best checkpoint on evaluation accuracy, so the reported accuracy
# does not come from data untouched by model selection.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
)

In [20]:
# Train the model
# Starts a Weights & Biases run (no arguments are passed to wandb.init here),
# then runs the fine-tuning loop configured on `trainer`.
# NOTE(review): this cell's execution count (In [20]) is higher than that of
# the evaluation and save cells below it (In [10], In [11]), so the notebook
# was executed out of order; re-run top to bottom before trusting the outputs.
wandb.init()
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.0005,0.077491,0.988341
2,0.0005,0.077838,0.988341
3,0.0005,0.074375,0.988341
4,0.0004,0.072762,0.988341


In [10]:
# Evaluate the model
# Runs evaluation through the trainer (its eval_dataset is the 'test' split)
# and prints the returned metrics.
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

Evaluation results: {'eval_loss': 0.0827006921172142, 'eval_accuracy': 0.9856502242152466, 'eval_runtime': 19.1159, 'eval_samples_per_second': 58.328, 'eval_steps_per_second': 7.324, 'epoch': 10.0}


In [11]:
# Persist the model and the tokenizer side by side in one directory.
final_model_dir = './final_model'
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

# Confirmation message for the notebook log.
print("Model training and evaluation complete. Model saved to './final_model'.")

Model training and evaluation complete. Model saved to './final_model'.
