In [5]:
import sys
import os
import pandas as pd
import json

# Make the project's 'src' directory importable. The relative path is resolved
# against the kernel's current working directory by os.path.abspath, so this
# must run before the `noise` / `train_bert` imports below (presumably those
# modules live in that directory — confirm if the notebook is moved).
sys.path.append(os.path.abspath("../../src"))

from noise import add_combined_noise
from train_bert import train_and_evaluate


In [6]:
# Experiment: train and evaluate the BERT classifier on text perturbed with 5%
# combined noise, repeated N_RUNS times with a different noise seed per run.
# Side effects: reads the cleaned CSV, reads results/metrics_<tag>.json for each
# run, and writes results/metrics_<noise_tag>.csv with one row per repetition.
metrics_list = []
noise_tag = "noise_5"

# Tunable settings for this experiment (keep NOISE_LEVEL in sync with noise_tag).
NOISE_LEVEL = 0.05
N_RUNS = 3
BASE_SEED = 42

# Load and shape the base data once: it does not depend on the repetition, so
# re-reading the CSV and re-joining the columns inside the loop was wasted work.
df_base = pd.read_csv("../../data/cleaning/2_cleaned_data.csv")

# Merge the message fields into a single string; missing values become "".
df_base["text"] = (
    df_base[["sender", "receiver", "urls", "clean_text"]]
    .fillna("")
    .astype(str)
    .agg(" ".join, axis=1)
)

# BERT input format: one text column ("body") plus the label.
df_base = df_base[["text", "label"]].rename(columns={"text": "body"})

for run in range(N_RUNS):
    print(f"Running {noise_tag} – repetition {run + 1}")

    # Fresh copy per run so noise from one repetition never leaks into the next.
    df = df_base.copy()

    # Combined noise with a run-specific seed.
    # NOTE(review): every row in a run receives the same seed; whether that
    # produces correlated noise across rows depends on add_combined_noise — confirm.
    df["body"] = df["body"].apply(
        lambda x: add_combined_noise(x, noise_level=NOISE_LEVEL, seed=BASE_SEED + run)
    )

    # Train and evaluate under a tag that is unique to this repetition.
    tag = f"{noise_tag}_r{run+1}"
    train_and_evaluate(df, model_tag=tag, text_col="body")

    # Collect the metrics for this run; train_and_evaluate is expected to have
    # written this JSON file (verify against train_bert).
    with open(f"results/metrics_{tag}.json") as f:
        metrics = json.load(f)
    metrics["noise_level"] = noise_tag
    metrics["run"] = run + 1
    metrics_list.append(metrics)

# Persist the per-run metrics as one table and display it as the cell output.
df_metrics = pd.DataFrame(metrics_list)
df_metrics.to_csv(f"results/metrics_{noise_tag}.csv", index=False)
df_metrics


Running noise_5 – repetition 1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'train_runtime': 49.2898, 'train_samples_per_second': 16.231, 'train_steps_per_second': 2.029, 'train_loss': 0.29834957122802735, 'epoch': 1.0}
Running noise_5 – repetition 2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'train_runtime': 46.0957, 'train_samples_per_second': 17.355, 'train_steps_per_second': 2.169, 'train_loss': 0.363526496887207, 'epoch': 1.0}
Running noise_5 – repetition 3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'train_runtime': 46.3217, 'train_samples_per_second': 17.271, 'train_steps_per_second': 2.159, 'train_loss': 0.36721805572509764, 'epoch': 1.0}


Unnamed: 0,accuracy,f1_phishing,precision,recall,noise_level,run
0,0.96,0.963,0.963,0.963,noise_5,1
1,0.97,0.9717,0.9904,0.9537,noise_5,2
2,0.97,0.972,0.9811,0.963,noise_5,3
