Fine-tune the model

In [None]:
!pip install torch transformers datasets scikit-learn

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

# The tokenizer and the classifier are both loaded from the same pretrained checkpoint.
checkpoint = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(checkpoint)
# num_labels=2: one output per class (the later cells use 0 = ham, 1 = spam).
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
# create folder
#!mkdir spam-bert

In [None]:
import pandas as pd
from datasets import Dataset

spam_csv = "https://github.com/githubmilind/colab-playground/blob/main/dataset/spam.txt?raw=true"

# Lines with an unexpected number of fields are skipped instead of aborting the load.
df = pd.read_csv(spam_csv, encoding="ISO-8859-1", sep=",", on_bad_lines="skip")
df = df.rename(columns={"v1": "label", "v2": "text"})

# Keep only the two columns the later cells read, and drop rows where either
# one is missing: a missing text would reach the tokenizer as None and a
# missing label cannot be mapped to a class. Resetting the index keeps the
# old row index from being carried along as an extra dataset column.
df = df[["label", "text"]].dropna().reset_index(drop=True)

In [None]:
def tokenize(batch):
    """Run the notebook's tokenizer over the ``"text"`` entries of ``batch``.

    Every sequence is truncated and padded to the tokenizer's maximum length,
    so all encoded rows come out the same size.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

In [None]:
# Build the dataset from the DataFrame, tokenize it batch-wise, and rename the
# target column from "label" to "labels" (the name the later cells use).
dataset = (
    Dataset.from_pandas(df)
    .map(tokenize, batched=True)
    .rename_column("label", "labels")
)

In [None]:
def map_label_to_int(example):
    """Translate the class name stored in ``example["labels"]`` to an integer id.

    Returns a one-key dict: ``{"labels": 0}`` for ``"ham"``, ``{"labels": 1}``
    for ``"spam"``, and ``{"labels": -1}`` for any other value.
    """
    label_ids = {"ham": 0, "spam": 1}
    # -1 marks a label that is neither "ham" nor "spam".
    return {"labels": label_ids.get(example["labels"], -1)}

# Replace the string class names with integer ids.
dataset = dataset.map(map_label_to_int)

# map_label_to_int marks unrecognised labels with -1. That is not a valid class
# index for the two-class model, so drop those rows before they reach training.
dataset = dataset.filter(lambda example: example["labels"] != -1)

In [None]:
# Return PyTorch tensors, exposing only the columns listed here.
tensor_columns = ["input_ids", "attention_mask", "labels"]
dataset.set_format(type="torch", columns=tensor_columns)

In [None]:
# Hold out 20% of the rows for validation. The fixed seed makes the split, and
# therefore the reported evaluation metrics, reproducible across runs.
train_test = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test["train"]
val_dataset = train_test["test"]

In [None]:
# fine tune the model
from transformers import Trainer, TrainingArguments

def compute_metrics(eval_pred):
    """Return classification accuracy for one evaluation pass.

    ``eval_pred`` unpacks into the model's logits and the reference labels;
    the predicted class is the index of the largest logit.
    """
    import numpy as np

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float((predictions == labels).mean())}


# Evaluate and checkpoint once per epoch, writing everything under ./spam-bert.
training_args = TrainingArguments(
    output_dir="./spam-bert",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Without this, evaluation reports only the loss, which says little about
    # how often the classifier is actually right.
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# Run evaluation on the eval_dataset given to the Trainer and keep the returned metrics.
metrics = trainer.evaluate()
# Bare expression so the notebook shows the value as the cell output.
metrics

In [None]:
# Also print the evaluation metrics as plain text.
print(metrics)

In [None]:
# Persist the fine-tuned model.
trainer.save_model("./spam-bert")
# The Trainer above was never given the tokenizer, so it is saved explicitly;
# this lets both model and tokenizer be reloaded from the same directory.
tokenizer.save_pretrained("./spam-bert")

In [None]:
import torch

text = "You won $1000, claim now!"
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

# Move inputs to the same device as the model
device = model.device
inputs = {k: v.to(device) for k, v in inputs.items()}

# Inference only: switch off dropout and skip gradient tracking so the
# prediction is deterministic and no autograd graph is built.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# The class with the largest logit wins; 1 = spam, 0 = ham.
pred = outputs.logits.argmax(dim=-1).item()
print("Spam" if pred == 1 else "Ham")

In [None]:
import torch

text = "message me when you reach home."
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

# Move inputs to the same device as the model
device = model.device
inputs = {k: v.to(device) for k, v in inputs.items()}

# Inference only: switch off dropout and skip gradient tracking so the
# prediction is deterministic and no autograd graph is built.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# The class with the largest logit wins; 1 = spam, 0 = ham.
pred = outputs.logits.argmax(dim=-1).item()
print("Spam" if pred == 1 else "Ham")