In [1]:
!pip install transformers accelerate evaluate datasets bitsandbytes peft

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py

In [59]:
import transformers, torch, datasets, peft
from datasets import load_dataset
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification, DataCollatorWithPadding
from peft import LoraConfig, get_peft_model

dataset = load_dataset("seanswyi/sms-spam-classification")

label2id = {"ham": 0, "spam": 1}
dataset = dataset.map(lambda x: {"label": label2id[x["label"]]})

model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def token_fun(row):
  return tokenizer(row["text"], truncation=True)

tokenized_dataset = dataset.map(token_fun, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

Map:   0%|          | 0/4457 [00:00<?, ? examples/s]

Map:   0%|          | 0/557 [00:00<?, ? examples/s]

Map:   0%|          | 0/558 [00:00<?, ? examples/s]

Map:   0%|          | 0/4457 [00:00<?, ? examples/s]

Map:   0%|          | 0/557 [00:00<?, ? examples/s]

Map:   0%|          | 0/558 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [76]:
print(dataset)
print(dataset["train"][1])
print(dataset["train"][2])

DatasetDict({
    train: Dataset({
        features: ['original_idx', 'label', 'text'],
        num_rows: 4457
    })
    validation: Dataset({
        features: ['original_idx', 'label', 'text'],
        num_rows: 557
    })
    test: Dataset({
        features: ['original_idx', 'label', 'text'],
        num_rows: 558
    })
})
{'original_idx': 3202, 'label': 0, 'text': 'Haha... Yup hopefully  we will lose a few kg by mon. after hip hop can go orchard and weigh again'}
{'original_idx': 5285, 'label': 1, 'text': 'URGENT! You have won a 1 week FREE membership in our å£100,000 Prize Jackpot! Txt the word: CLAIM to No: 81010 T&C www.dbuk.net LCCLTD POBOX 4403LDNW1A7RW18'}


In [69]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    report_to="none",
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./results/logs",
    logging_steps=10
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"].shuffle(seed=42),
    eval_dataset=tokenized_dataset["test"].shuffle(seed=42),
    # tokenizer=tokenizer,
    processing_class=tokenizer,
    data_collator=data_collator
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0218,0.055975
2,0.0004,0.055768
3,0.0003,0.079929


TrainOutput(global_step=1674, training_loss=0.027852012910647875, metrics={'train_runtime': 129.5453, 'train_samples_per_second': 103.215, 'train_steps_per_second': 12.922, 'total_flos': 187196873606580.0, 'train_loss': 0.027852012910647875, 'epoch': 3.0})

In [70]:
trainer.save_model("./results/fine_tuned_model")
tokenizer.save_pretrained("./results/fine_tuned_model")

('./results/fine_tuned_model/tokenizer_config.json',
 './results/fine_tuned_model/special_tokens_map.json',
 './results/fine_tuned_model/vocab.txt',
 './results/fine_tuned_model/added_tokens.json',
 './results/fine_tuned_model/tokenizer.json')

In [71]:
from transformers import pipeline

model_path = "./results/fine_tuned_model"

tokenizer = Tokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

In [79]:
classifier = pipeline("text-classification",
    model=model,
    tokenizer=tokenizer
)

text1 = "URGENT! You have won a 1 week FREE membership in our å£100,000 Prize Jackpot! Txt the word: CLAIM to No: 81010 T&C www.dbuk.net LCCLTD POBOX 4403LDNW1A7RW18"
text2 = "OMG You have WOM $1000000000000000, Kindly fill the form to clain your REWARD"
result1 = classifier(text1)
result2 = classifier(text2)
print(result1)
print(result2)

Device set to use cuda:0


[{'label': 'LABEL_1', 'score': 0.9997031092643738}]
[{'label': 'LABEL_1', 'score': 0.9996359348297119}]


In [80]:
label_map = {0: "ham", 1: "spam"}

def decode_label(result):
    label_num = int(result[0]['label'].split("_")[1])
    return f"{label_map[label_num]} (confidence: {result[0]['score']:.2f})"

print(f"Text 1: {decode_label(result1)}")
print(f"Text 2: {decode_label(result2)}")


Text 1: spam (confidence: 1.00)
Text 2: spam (confidence: 1.00)
