In [1]:
import pandas as pd
import os

In [2]:
# Location of the combined phishing dataset. Defaults to the original local path;
# set the PHISHING_DATA_CSV environment variable to run on another machine.
DATA_PATH = os.environ.get(
    'PHISHING_DATA_CSV',
    '/Users/blabbyduck/Desktop/DSA4266/Phisiing data/combined_data.csv',
)
N_ROWS = 10000  # size of the subset used for this experiment

# nrows makes pandas stop parsing after the first N_ROWS rows instead of loading
# the whole CSV and then discarding everything past .head(N_ROWS).
df_all = pd.read_csv(DATA_PATH, lineterminator='\n', nrows=N_ROWS)

In [3]:
# Keep only the model inputs, then drop rows where either of them is missing.
# Selecting the columns first avoids discarding usable rows just because some
# unrelated column is null; resetting the index keeps it a clean 0..n-1 range.
df = (
    df_all[['body', 'label']]
    .dropna()
    .reset_index(drop=True)
)
df.columns


Index(['body', 'label'], dtype='object')

BERT

In [4]:
from transformers import (AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments,
                          DataCollatorWithPadding, Trainer, pipeline)
from sklearn.model_selection import train_test_split
from transformers.pipelines.pt_utils import KeyDataset
from datasets import load_dataset, Dataset
import torch, wandb, evaluate
from tqdm.auto import tqdm
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


BERT expects input data in a specific format, and the tokenizer is responsible for converting the text into that format. The tokenizer splits the text into tokens, which are the basic units of language that the model can understand. The tokenizer also adds special tokens, such as [CLS] and [SEP], to mark the beginning and the end of the text or the separation between two sentences.

In [5]:
# Load the tokenizer for the same checkpoint name that the model cell fine-tunes,
# so the token ids produced here match that model's vocabulary.
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased") 

In [6]:
# Wrap the DataFrame as a Hugging Face Dataset. preserve_index=False keeps the
# pandas index from being carried into the Dataset as an extra column when rows
# have been dropped, matching how the train/test splits are built later on.
dataset = Dataset.from_pandas(df, preserve_index=False)

In [7]:
# Sanity check: display the column names and dtypes the Dataset inferred.
dataset.features

{'body': Value(dtype='string', id=None),
 'label': Value(dtype='int64', id=None)}

In [8]:
def preprocess_function(examples):
    """Tokenize the ``body`` field of the given examples.

    Truncation is enabled, so over-long inputs are cut by the tokenizer.
    Returns whatever encoding the module-level ``tokenizer`` produces.
    """
    bodies = examples["body"]
    encoded = tokenizer(bodies, truncation=True)
    return encoded

In [9]:
# Tokenize the whole dataset; batched=True hands preprocess_function a batch of
# examples per call rather than one example at a time.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map: 100%|██████████| 10000/10000 [00:04<00:00, 2417.28 examples/s]


In [10]:
# Collator that pads the tokenized examples when batches are assembled, using
# this tokenizer's padding settings (tokenization above did not pad).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Evaluation Metrics

In [11]:
# Bundle the evaluation metrics so a single compute() call reports all of them.
# NOTE(review): "ealvaradob/false_positive_rate" is a namespaced identifier, presumably a
# community metric fetched from the Hugging Face Hub — confirm it is still available.
metrics = evaluate.combine(["accuracy", "precision", "recall", "ealvaradob/false_positive_rate"])

In [12]:
def compute_metrics(eval_pred):
    """Score one evaluation pass.

    ``eval_pred`` unpacks into (model outputs, reference labels). The outputs
    are reduced to a predicted class per row with an argmax over axis 1, and
    the combined ``metrics`` object computes every metric from them.
    """
    logits, references = eval_pred
    predicted_classes = np.argmax(logits, axis=1)
    return metrics.compute(predictions=predicted_classes, references=references)

Training

In [13]:
# Class-index -> name mapping stored in the model config; the inverse mapping is
# derived from it so the two can never drift apart.
id2label = {0: "benign", 1: "phishing"}
label2id = {name: idx for idx, name in id2label.items()}

In [14]:
# Convert the tokenized Dataset to pandas so scikit-learn's train_test_split can
# be used in the next cell. NOTE: this rebinds `df`, which previously held the
# raw body/label frame.
df = tokenized_dataset.to_pandas()

In [15]:
# 80/20 split, stratified on the label column, with a fixed seed for repeatability.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)

# Back to Hugging Face Datasets; the shuffled pandas index is not kept as a column.
train = Dataset.from_pandas(train_df, preserve_index=False)
test = Dataset.from_pandas(test_df, preserve_index=False)

In [16]:
# Pretrained BERT-large checkpoint set up for two-class sequence classification;
# the label maps are passed in so predictions can be reported by class name.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-large-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:

training_args = TrainingArguments(
    output_dir="bert-large-finetuned-phishing",
    # Optimisation
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=1,  # Start with fewer epochs
    # Batching: 4 samples per device x 4 accumulation steps = 16 samples per optimizer step per device
    per_device_train_batch_size=4,  # Adjust based on memory
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint with the best accuracy
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Logging
    logging_steps=100,
)




In [18]:
# Wire the model, data, and evaluation pieces defined above into one Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train,
    eval_dataset=test,  # the held-out 20% split doubles as the evaluation set
    compute_metrics=compute_metrics,
)

In [19]:
# Run fine-tuning. With evaluation_strategy="epoch" the held-out split is scored
# at the end of each epoch; the returned value is displayed as the cell output.
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Currently logged in as: [33mluciapanyc[0m ([33mluciapanyc-national-university-of-singapore[0m). Use [1m`wandb login --relogin`[0m to force relogin
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadl

 20%|██        | 100/500 [05:15<21:22,  3.21s/it]

{'loss': 0.3224, 'grad_norm': 7.9738078117370605, 'learning_rate': 4e-05, 'epoch': 0.2}


 40%|████      | 200/500 [10:45<17:01,  3.41s/it]

{'loss': 0.1306, 'grad_norm': 0.20052410662174225, 'learning_rate': 3e-05, 'epoch': 0.4}


 60%|██████    | 300/500 [16:14<11:13,  3.37s/it]

{'loss': 0.0827, 'grad_norm': 0.06538283079862595, 'learning_rate': 2e-05, 'epoch': 0.6}


 80%|████████  | 400/500 [21:51<04:58,  2.98s/it]

{'loss': 0.0478, 'grad_norm': 13.1830415725708, 'learning_rate': 1e-05, 'epoch': 0.8}


100%|██████████| 500/500 [27:49<00:00,  3.80s/it]

{'loss': 0.0752, 'grad_norm': 7.34385871887207, 'learning_rate': 0.0, 'epoch': 1.0}


                                                 
100%|██████████| 500/500 [30:36<00:00,  3.80s/it]

{'eval_loss': 0.061899781227111816, 'eval_accuracy': 0.9845, 'eval_precision': 0.9951456310679612, 'eval_recall': 0.9800796812749004, 'eval_false_positive_rate': 0.008053691275167786, 'eval_runtime': 161.6199, 'eval_samples_per_second': 12.375, 'eval_steps_per_second': 3.094, 'epoch': 1.0}


100%|██████████| 500/500 [30:42<00:00,  3.68s/it]

{'train_runtime': 1843.7166, 'train_samples_per_second': 4.339, 'train_steps_per_second': 0.271, 'train_loss': 0.13174635696411133, 'epoch': 1.0}





TrainOutput(global_step=500, training_loss=0.13174635696411133, metrics={'train_runtime': 1843.7166, 'train_samples_per_second': 4.339, 'train_steps_per_second': 0.271, 'total_flos': 6737951125426752.0, 'train_loss': 0.13174635696411133, 'epoch': 1.0})

In [21]:
# Persist the fine-tuned model to ./phishing_bert so the inference cell below,
# which loads 'phishing_bert', works after a fresh Restart & Run All.
trainer.save_model("phishing_bert")

Testing the model

In [20]:
# Sample phishing-style email used to smoke-test the classifier.
# The fragments are joined with no separator, exactly as written: note there is
# deliberately no space between "CLICK HERE." and "Thank you".
text = "".join([
    "Text: Dear hotmail user. We noticed a login to your Hotmail account ",
    "from an unrecognized device on Tuesday, August 15, 2023 (GMT-5) 7:32 A.M. ",
    "Lima, Peru. Was it you? If so, ignore the rest of this email. If it was not ",
    "you, follow the links below to keep your Hotmail account secure and ",
    "provide the necessary information to keep your account active. CLICK HERE.",
    "Thank you, Hotmail Team.",
    "\nURL: https://ec-ec.squarespace.com",
])

In [22]:
from transformers import BertConfig, BertModel

In [26]:

# Build a text-classification pipeline from 'phishing_bert' — presumably the local
# directory written by trainer.save_model("phishing_bert"), so that save must have
# run before this cell — and classify the sample email.
classifier = pipeline("text-classification", model = 'phishing_bert')
classifier(text)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': 'phishing', 'score': 0.9915384650230408}]