Fine-tune the model

In [None]:
# Install third-party packages for this notebook (shell escape: notebook-only syntax).
!pip install torch transformers datasets scikit-learn

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

# Load the tokenizer and the classifier from the same pretrained checkpoint so
# the two always agree; the classifier is configured with two output labels.
pretrained_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(pretrained_name)
model = BertForSequenceClassification.from_pretrained(
    pretrained_name,
    num_labels=2,
)

In [None]:
import pandas as pd
from datasets import Dataset

# Remote location of the spam data file (GitHub link with ?raw=true).
spam_csv = "https://github.com/githubmilind/colab-playground/blob/main/dataset/spam.txt?raw=true"

# Parse the file as comma-separated ISO-8859-1 text. Lines that pandas cannot
# parse are skipped rather than raising an error.
df = pd.read_csv(
    spam_csv,
    sep=",",
    encoding="ISO-8859-1",
    on_bad_lines="skip",
)

In [None]:
# Bare expression: shows the loaded DataFrame as the cell output for a quick look.
df

In [None]:
def tokenize(batch):
    """Tokenize the ``v2`` entry of a batch with the module-level tokenizer.

    Args:
        batch: Mapping with a ``"v2"`` key holding the text(s) to encode.

    Returns:
        The tokenizer's output for those texts, requested with
        ``padding="max_length"`` and ``truncation=True``.
    """
    texts = batch["v2"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

In [None]:
# Build the tokenized dataset used for training.
#
# Keep only the label column (v1) and the text column (v2), and drop rows where
# either is missing: the tokenizer cannot encode NaN, and a missing label cannot
# be trained on. Unused CSV columns are left out of the dataset.
frame = df[["v1", "v2"]].dropna().copy()
frame["v2"] = frame["v2"].astype(str)

# The classifier needs integer class ids as labels. When v1 is not already
# numeric, map its text values to ids.
# NOTE(review): assumes v1 holds the strings "ham"/"spam" -- confirm against
# the data file. Any other value raises instead of silently becoming NaN.
if not pd.api.types.is_numeric_dtype(frame["v1"]):
    label_to_id = {"ham": 0, "spam": 1}
    normalized = frame["v1"].astype(str).str.strip().str.lower()
    unknown = sorted(set(normalized) - set(label_to_id))
    if unknown:
        raise ValueError(f"Unexpected label values in column 'v1': {unknown}")
    frame["v1"] = normalized.map(label_to_id)
frame["v1"] = frame["v1"].astype("int64")

# preserve_index=False keeps the pandas index (non-contiguous after dropna)
# from being added as an extra dataset column.
dataset = Dataset.from_pandas(frame, preserve_index=False)
dataset = dataset.map(tokenize, batched=True)
dataset = dataset.rename_column("v1", "labels")
# Expose only these columns, as torch tensors.
dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

In [None]:
# Hold out 20% of the examples for validation. A fixed seed makes the shuffle,
# and therefore the split and the reported metrics, reproducible across runs.
train_test = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test["train"]
val_dataset = train_test["test"]

In [None]:
# Fine-tune the model, reporting classification metrics at every evaluation.
import numpy as np
from transformers import Trainer, TrainingArguments


def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 from an evaluation pass.

    Without this hook the Trainer reports only the evaluation loss.

    Args:
        eval_pred: Object with ``predictions`` (model logits, or a tuple whose
            first item is the logits) and ``label_ids`` (true class ids).

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall`` and ``f1`` as floats.
        Precision, recall and F1 treat class id 1 as the positive class
        (NOTE(review): confirm that id 1 is the spam class in the labels).
    """
    logits = eval_pred.predictions
    if isinstance(logits, tuple):
        logits = logits[0]
    predicted = np.argmax(logits, axis=-1)
    actual = np.asarray(eval_pred.label_ids)

    true_pos = int(np.sum((predicted == 1) & (actual == 1)))
    false_pos = int(np.sum((predicted == 1) & (actual != 1)))
    false_neg = int(np.sum((predicted != 1) & (actual == 1)))

    # Guard each ratio against a zero denominator (e.g. no positive predictions).
    precision = true_pos / (true_pos + false_pos) if (true_pos + false_pos) else 0.0
    recall = true_pos / (true_pos + false_neg) if (true_pos + false_neg) else 0.0
    f1 = (
        2 * precision * recall / (precision + recall)
        if (precision + recall)
        else 0.0
    )
    accuracy = float(np.mean(predicted == actual)) if actual.size else 0.0

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }


# Evaluate and save a checkpoint at the end of every epoch.
training_args = TrainingArguments(
    output_dir="./spam-bert",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# Run the trainer's evaluation and keep the returned metrics.
metrics = trainer.evaluate()
# Bare expression: shown as the cell output in a notebook.
metrics

In [None]:
# Print the evaluation metrics as plain text.
print(metrics)