**Install libraries**

In [1]:
%%capture
!pip install transformers datasets

**Import libraries**

In [2]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import Dataset, DatasetDict
import torch

**Import dataset**

In [3]:
os.environ["WANDB_DISABLED"] = "true"

df = pd.read_csv("/content/Phishing_Email.csv")
df = df.dropna(subset=['Email Text', 'Email Type'])
df['Email Type'] = df['Email Type'].apply(lambda x: 1 if x == 'Phishing Email' else 0)

safe_count = df[df['Email Type'] == 0].shape[0]
phishing_count = df[df['Email Type'] == 1].shape[0]
print(f"Numero di email sicure: {safe_count}")
print(f"Numero di email phishing: {phishing_count}")

Numero di email sicure: 11322
Numero di email phishing: 7312


**Test & training set**

In [4]:
sample_size = 2000

df_safe = df[df['Email Type'] == 0].sample(n=sample_size // 2, random_state=42)
df_phishing = df[df['Email Type'] == 1].sample(n=sample_size // 2, random_state=42)
df_sampled = pd.concat([df_safe, df_phishing]).sample(frac=1, random_state=42)

train_texts, test_texts, train_labels, test_labels = train_test_split(df_sampled['Email Text'], df_sampled['Email Type'], test_size=0.2, random_state=42)

train_dataset = Dataset.from_dict({'text': train_texts.tolist(), 'label': train_labels.tolist()})
test_dataset = Dataset.from_dict({'text': test_texts.tolist(), 'label': test_labels.tolist()})
dataset = DatasetDict({'train': train_dataset, 'test': test_dataset})

**Tokenizer**

In [5]:
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

**Training model**

In [6]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir='./logs',
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.132347,0.955,0.955665,0.955665,0.955665
2,No log,0.240031,0.9375,0.968421,0.906404,0.936387
3,0.154600,0.096635,0.97,0.965854,0.975369,0.970588
4,0.154600,0.160529,0.9575,0.960396,0.955665,0.958025
5,0.028300,0.144768,0.9725,0.948598,1.0,0.973621
6,0.028300,0.150035,0.97,0.956938,0.985222,0.970874
7,0.028300,0.150543,0.97,0.956938,0.985222,0.970874
8,0.018800,0.15193,0.97,0.956938,0.985222,0.970874
9,0.018800,0.152171,0.97,0.956938,0.985222,0.970874
10,0.016300,0.1524,0.97,0.956938,0.985222,0.970874


TrainOutput(global_step=2000, training_loss=0.05450418090820312, metrics={'train_runtime': 2125.4084, 'train_samples_per_second': 7.528, 'train_steps_per_second': 0.941, 'total_flos': 4209776885760000.0, 'train_loss': 0.05450418090820312, 'epoch': 10.0})

**Evaluating model**

In [7]:
results = trainer.evaluate()
print("Valutazione del modello:", results)

Valutazione del modello: {'eval_loss': 0.1523996889591217, 'eval_accuracy': 0.97, 'eval_precision': 0.9569377990430622, 'eval_recall': 0.9852216748768473, 'eval_f1': 0.970873786407767, 'eval_runtime': 13.1918, 'eval_samples_per_second': 30.322, 'eval_steps_per_second': 3.79, 'epoch': 10.0}
