Fine Tune the model

In [1]:
!pip install torch transformers datasets scikit-learn



In [2]:
from transformers import BertTokenizer, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
!mkdir spam-bert

mkdir: cannot create directory ‘spam-bert’: File exists


In [4]:
import pandas as pd
from datasets import Dataset

spam_csv = "https://github.com/githubmilind/colab-playground/blob/main/dataset/spam.txt?raw=true"

df = pd.read_csv(spam_csv, encoding = "ISO-8859-1", sep=",", on_bad_lines='skip')
df = df.rename(columns={"v1":"label", "v2":"text"})

In [5]:
df

Unnamed: 0,label,text,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [6]:
def tokenize(batch):
  return tokenizer(batch["text"], padding="max_length", truncation=True)

In [7]:
dataset = Dataset.from_pandas(df)
dataset = dataset.map(tokenize, batched=True)
dataset = dataset.rename_column("label", "labels")

Map:   0%|          | 0/5572 [00:00<?, ? examples/s]

In [8]:
def map_label_to_int(example):
    if example["labels"] == "ham":
        return {"labels": 0}
    elif example["labels"] == "spam":
        return {"labels": 1}
    else:
        return {"labels": -1} # Or handle other cases as needed

dataset = dataset.map(map_label_to_int)

Map:   0%|          | 0/5572 [00:00<?, ? examples/s]

In [9]:
dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

In [10]:
train_test = dataset.train_test_split(test_size=0.2)
train_dataset = train_test["train"]
val_dataset = train_test["test"]

In [11]:
train_dataset

Dataset({
    features: ['labels', 'text', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 4457
})

In [12]:
# fine tune the model
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./spam-bert",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mpansare[0m ([33mpansare-florida-atlantic-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,No log,0.031952
2,0.045800,0.029849
3,0.045800,0.033324


TrainOutput(global_step=837, training_loss=0.030123637995839548, metrics={'train_runtime': 1314.3699, 'train_samples_per_second': 10.173, 'train_steps_per_second': 0.637, 'total_flos': 3518057921218560.0, 'train_loss': 0.030123637995839548, 'epoch': 3.0})

In [14]:
metrics = trainer.evaluate()
metrics

{'eval_loss': 0.03332448750734329,
 'eval_runtime': 32.2397,
 'eval_samples_per_second': 34.585,
 'eval_steps_per_second': 2.171,
 'epoch': 3.0}

In [15]:
print(metrics)

{'eval_loss': 0.03332448750734329, 'eval_runtime': 32.2397, 'eval_samples_per_second': 34.585, 'eval_steps_per_second': 2.171, 'epoch': 3.0}


In [16]:
trainer.save_model("./spam-bert")

In [19]:
text = "You won $1000, claim now!"
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

# Move inputs to the same device as the model
device = model.device
inputs = {k: v.to(device) for k, v in inputs.items()}

ouptuts = model(**inputs)
pred = ouptuts.logits.argmax(dim=-1).item()
print("Spam" if pred == 1 else "Ham")

Spam
