In [4]:
# install the required packages Python environment
!pip install transformers==4.28.0 datasets evaluate



In [5]:
# import necessary packages
import pandas as pd

import numpy as np
import torch

from datetime import datetime

import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict
from evaluate import load as load_metric


In [6]:
# set seed so that the results are reproducible
transformers.set_seed(22)

# set the device to GPU if available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available for training")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Mac ARM GPU is available for training")
else:
    device = torch.device("cpu")
    print("No GPU available, resorting to CPU")

GPU is available for training


In [7]:
# make a pandas DataFrame from the features extracted with 1.Extraction.ipynb
extracted_text = pd.read_csv("https://anti-phish.s3.eu-west-1.amazonaws.com/dataset/extracted/extracted_text.csv", keep_default_na=False, index_col=0)

In [8]:
# drop the header column as this dataframe will be fed directly into huggingface's Dataset class
extracted_text = extracted_text.drop(['header'], axis=1)

In [9]:
# preview extracted text
extracted_text

Unnamed: 0,phishing,text
0,0,Re: i may have a meeting around 3pm i have to ...
1,0,RE: Mosko's Calls a/o 5:55pm Tues 3/4 - 2 new ...
2,0,[domain.com] 'Phillips' IS The Captain Now As ...
3,0,Trump: Leave It To Me To be automatically unsu...
4,0,EVENT: Trump and Clinton video remarks to Lati...
...,...,...
14383,1,Account suspension notice 12/09/2020 03:36:33 ...
14384,1,Undeliverable: Delivery Status Notification (F...
14385,1,Reminder: Notice for monkey.org \n\n\n\r\nDear...
14386,1,Notification jose@monkey.org \n\n\n \nDear jos...


In [10]:
# create huggingface Dataset object
emails = Dataset.from_pandas(extracted_text, preserve_index=False)

In [11]:
# use the Dataset's train_test_split method to split into training and testing subsets
train_emails, test_emails = emails.train_test_split(test_size=0.2).values()

In [12]:
# create a huggingface DatasetDict object to store the subsets
emails = DatasetDict()
emails['train'] = train_emails
emails['test'] = test_emails

In [13]:
# preview DatasetDict
emails

DatasetDict({
    train: Dataset({
        features: ['phishing', 'text'],
        num_rows: 11510
    })
    test: Dataset({
        features: ['phishing', 'text'],
        num_rows: 2878
    })
})

In [14]:
# rename the phishing column to label as this is expected by the DataLoader
emails = emails.rename_column('phishing', 'label')

In [15]:
# set labels and id used later in the HuggingFace Trainer for inference
id2label ={0: "LEGIT", 1: "PHISHING"}
label2id = {"LEGIT": 0, "PHISHING": 1}

**Transformer Model 1: BERT**

In [16]:
# instantiate a huggingface AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_function(email):
    return tokenizer(email['text'], truncation=True)

tokenized_emails_datasets = emails.map(preprocess_function, batched=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/11510 [00:00<?, ? examples/s]

Map:   0%|          | 0/2878 [00:00<?, ? examples/s]

In [17]:
# preview the layout of the tokenized dataset
tokenized_emails_datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 11510
    })
    test: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2878
    })
})

In [18]:
# show the format of attention masks and input_ids of a sample email
print(tokenized_emails_datasets['train'][47])

{'label': 1, 'text': 'Security Center Advisory Dear PayPal Customer,\r\n \r\n \r\nDuring our regularly scheduled account maintenance and verification procedure we have detected a\r\nslight error in your PayPal online account.\r\n \r\nThis might be due to the following reasons:\r\n \r\n \r\n1. A recent change in your personal information (ie. change of address, email address)\r\n \r\n2. An inability to accurately verify your selected option of payment due to an internal \r\nerror within our systems.\r\n \r\n \r\nPlease fill in all the details that are required to complete this verification process.\r\n \r\n \r\nTo do this we have attached a form to this email. Please download the form and follow the \r\ninstructions on your screen. NOTE: The form needs to be opened in a modern browser which has \r\njavascript enabled (ex: Internet Explorer 7, Firefox 3, Safari 3, Opera 9)\r\n \r\n \r\nPlease understand that this is a security measure intended to \r\nhelp protect you and your account. We

In [19]:
# download and instantiate a pre-trained BERT model
model_bert = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",
                                                           num_labels=2,
                                                           output_attentions = False,
                                                           output_hidden_states = False,
                                                           return_dict=True,
                                                           id2label=id2label,
                                                           label2id=label2id
                                                           )

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [24]:
# instantiate a data collator used to sort batches and pad as necessary
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

metric = load_metric("accuracy")
def compute_metrics(pred):
    logits, labels = pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# define model hyperparameters
training_args = TrainingArguments(
    output_dir='Models/',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    group_by_length=True,
    num_train_epochs=5,
    weight_decay=0.01,
    optim="adamw_torch",
)

# instantiate a huggingface Trainer with our model, TrainingArguments, datasets and collator
trainer = Trainer(
    model=model_bert,
    args=training_args,
    train_dataset=tokenized_emails_datasets['train'],
    eval_dataset=tokenized_emails_datasets['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [25]:
# Evaluate the transformer model before training (for comparison)
trainer.evaluate()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 0.7587684392929077,
 'eval_accuracy': 0.2623349548297429,
 'eval_runtime': 94.4437,
 'eval_samples_per_second': 30.473,
 'eval_steps_per_second': 1.906}

In [26]:
trainer.train()

Step,Training Loss
500,0.0832
1000,0.0153
1500,0.0068
2000,0.002
2500,0.0002
3000,0.0
3500,0.0


TrainOutput(global_step=3600, training_loss=0.014951517534726816, metrics={'train_runtime': 3263.5053, 'train_samples_per_second': 17.634, 'train_steps_per_second': 1.103, 'total_flos': 8730814150010880.0, 'train_loss': 0.014951517534726816, 'epoch': 5.0})

In [27]:
trainer.evaluate()

{'eval_loss': 0.023833435028791428,
 'eval_accuracy': 0.9972202918693537,
 'eval_runtime': 97.4784,
 'eval_samples_per_second': 29.524,
 'eval_steps_per_second': 1.847,
 'epoch': 5.0}

In [28]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [29]:
trainer.save_model("/content/drive/MyDrive/Models/" + 'BERT ' + str(datetime.now()))

**Transformer Model 2: DistilBERT**

In [30]:
# instantiate a huggingface AutoTokenizer for Distilbert
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(email):
    return tokenizer(email['text'], truncation=True)

tokenized_emails_datasets = emails.map(preprocess_function, batched=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/11510 [00:00<?, ? examples/s]

Map:   0%|          | 0/2878 [00:00<?, ? examples/s]

In [31]:
# download and instantiate a pre-trained DistilBERT model
model_distilbert = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased",
                                                                    output_attentions = False,
                                                                    output_hidden_states = False,
                                                                    return_dict=True,
                                                                    id2label=id2label,
                                                                    label2id=label2id
                                                                    )

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifi

In [33]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# define model hyperparameters
training_args = TrainingArguments(
    output_dir='Models/',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    group_by_length=True,
    num_train_epochs=5,
    weight_decay=0.01,
    optim="adamw_torch"
)

# instantiate a huggingface Trainer with our model, TrainingArguments, datasets and collator
trainer = Trainer(
    model=model_distilbert,
    args=training_args,
    train_dataset=tokenized_emails_datasets['train'],
    eval_dataset=tokenized_emails_datasets['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [34]:
# Evaluate the transformer model before fine-tuning
trainer.evaluate()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 0.6561293601989746,
 'eval_accuracy': 0.7202918693537178,
 'eval_runtime': 50.4628,
 'eval_samples_per_second': 57.032,
 'eval_steps_per_second': 3.567}

In [35]:
trainer.train()

Step,Training Loss
500,0.0809
1000,0.0163
1500,0.0059
2000,0.0023
2500,0.0012
3000,0.0002
3500,0.0


TrainOutput(global_step=3600, training_loss=0.014842228293677585, metrics={'train_runtime': 1643.3139, 'train_samples_per_second': 35.021, 'train_steps_per_second': 2.191, 'total_flos': 4395665689602048.0, 'train_loss': 0.014842228293677585, 'epoch': 5.0})

In [36]:
# Evaluate the transformer model after training
trainer.evaluate()

{'eval_loss': 0.019120201468467712,
 'eval_accuracy': 0.9979152189020153,
 'eval_runtime': 49.7888,
 'eval_samples_per_second': 57.804,
 'eval_steps_per_second': 3.615,
 'epoch': 5.0}

In [37]:
trainer.save_model("/content/drive/MyDrive/Models/" + 'DistilBERT ' + str(datetime.now()))

In [38]:
from transformers import pipeline
phishing_detector = pipeline("sentiment-analysis",model="/content/drive/MyDrive/Models/DistilBERT 2023-06-05 15:49:25.041930")

In [41]:
phishing_detector("top 20 netflix films")

[{'label': 'LEGIT', 'score': 0.9863020181655884}]

In [39]:
phishing_detector("top 20 amazon games")

[{'label': 'PHISHING', 'score': 0.9562236666679382}]

In [40]:
phishing_detector("please reset password")

[{'label': 'PHISHING', 'score': 0.9999674558639526}]

In [42]:
phishing_detector("can you reply soon?")

[{'label': 'PHISHING', 'score': 0.9999254941940308}]

In [43]:
phishing_detector("This email is about scheduling an appointment next week. Can you reply soon?")

[{'label': 'LEGIT', 'score': 0.999916672706604}]