In [1]:
import sys
import os
import pandas as pd
import json

# Make the project's src directory importable (the path is resolved relative to the current working directory)
sys.path.append(os.path.abspath("../../src"))

from noise import add_combined_noise
from train_bert import train_and_evaluate

# Noise-robustness experiment: repeat "add combined noise -> train/evaluate"
# several times with a different seed per repetition, then gather the
# per-run metrics into one table and persist it as CSV.

# --- Experiment configuration (everything a reader might want to tune) ---
N_RUNS = 3            # number of repetitions
NOISE_LEVEL = 0.35    # value forwarded to add_combined_noise(noise_level=...)
BASE_SEED = 42        # repetition r (0-based) uses seed BASE_SEED + r
DATA_PATH = "../../data/cleaning/2_cleaned_data.csv"
TEXT_FIELDS = ["sender", "receiver", "urls", "clean_text"]

metrics_list = []
# Derive the tag from NOISE_LEVEL so the label can never drift from the value
# actually applied (0.35 -> "noise_35").
noise_tag = f"noise_{round(NOISE_LEVEL * 100)}"

# Loading the cleaned dataset and building the model input text do not depend
# on the repetition, so do both once instead of once per run.
raw_df = pd.read_csv(DATA_PATH)
base_df = pd.DataFrame(
    {
        # Space-joined concatenation of the relevant fields; missing values
        # become empty strings.
        "body": raw_df[TEXT_FIELDS].fillna("").astype(str).agg(" ".join, axis=1),
        "label": raw_df["label"],
    }
)

for run in range(N_RUNS):
    print(f"Running {noise_tag} – repetition {run + 1}")

    run_seed = BASE_SEED + run

    # Fresh frame per repetition: `assign` returns a new DataFrame, so the
    # un-noised base_df is never modified.
    # NOTE(review): every row in a run receives the same seed. If
    # add_combined_noise seeds a fresh RNG on each call, the noise pattern is
    # correlated across rows -- confirm this is intended.
    df = base_df.assign(
        body=base_df["body"].apply(
            lambda text, seed=run_seed: add_combined_noise(
                text, noise_level=NOISE_LEVEL, seed=seed
            )
        )
    )

    # Train and evaluate on the noised data.
    tag = f"{noise_tag}_r{run + 1}"
    train_and_evaluate(df, model_tag=tag, text_col="body")

    # Read back the metrics file for this tag (presumably written by
    # train_and_evaluate -- verify) and annotate it with run metadata.
    with open(f"results/metrics_{tag}.json", encoding="utf-8") as f:
        metrics = json.load(f)
    metrics["noise_level"] = noise_tag
    metrics["run"] = run + 1
    metrics_list.append(metrics)

# Combine all repetitions into one table, save it, and display it.
df_metrics = pd.DataFrame(metrics_list)
df_metrics.to_csv(f"results/metrics_{noise_tag}.csv", index=False)
df_metrics


Running noise_35 – repetition 1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'train_runtime': 83.1099, 'train_samples_per_second': 9.626, 'train_steps_per_second': 1.203, 'train_loss': 0.38226139068603515, 'epoch': 1.0}
Running noise_35 – repetition 2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'train_runtime': 95.2489, 'train_samples_per_second': 8.399, 'train_steps_per_second': 1.05, 'train_loss': 0.436374626159668, 'epoch': 1.0}
Running noise_35 – repetition 3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'train_runtime': 98.1182, 'train_samples_per_second': 8.153, 'train_steps_per_second': 1.019, 'train_loss': 0.39586246490478516, 'epoch': 1.0}


Unnamed: 0,accuracy,f1_phishing,precision,recall,noise_level,run
0,0.95,0.9558,0.9153,1.0,noise_35,1
1,0.955,0.9585,0.9541,0.963,noise_35,2
2,0.975,0.9767,0.9813,0.9722,noise_35,3
