In [1]:
import sys
import os
import pandas as pd
import json

# Add ../../src to the import path so the modules imported below can be found
sys.path.append(os.path.abspath("../../src"))

from noise import add_combined_noise
from train_bert import train_and_evaluate

# --- Experiment configuration -------------------------------------------------
# NOISE_LEVEL drives both the corruption strength passed to add_combined_noise
# and the tag used in output file names, so the two cannot drift apart.
N_RUNS = 3          # number of repetitions of the experiment
NOISE_LEVEL = 0.45  # value forwarded as `noise_level` to add_combined_noise
BASE_SEED = 42      # run i (0-based) uses seed BASE_SEED + i

metrics_list = []
noise_tag = f"noise_{int(round(NOISE_LEVEL * 100))}"  # -> "noise_45"

# --- Run-independent preprocessing (done once, not once per repetition) -------
# Load the cleaned dataset.
clean_df = pd.read_csv("../../data/cleaning/2_cleaned_data.csv")

# Concatenate the relevant fields into a single space-separated string,
# treating missing values as empty strings.
clean_df["text"] = (
    clean_df[["sender", "receiver", "urls", "clean_text"]]
    .fillna("")
    .astype(str)
    .agg(" ".join, axis=1)
)

# Keep only the columns handed to train_and_evaluate; the text column is
# renamed to "body" to match the `text_col` argument used below.
base_df = clean_df[["text", "label"]].rename(columns={"text": "body"})

# --- Repeated noisy training runs ---------------------------------------------
for run in range(N_RUNS):
    print(f"Running {noise_tag} – repetition {run + 1}")

    # Start every repetition from an untouched copy so that neither the noise
    # step nor train_and_evaluate can leak changes into later runs.
    df = base_df.copy()

    # Apply combined noise to every text.
    # NOTE(review): every row within a run receives the same seed value; if
    # add_combined_noise re-seeds its RNG on each call, the noise pattern will
    # be correlated across rows — confirm this is intended.
    df["body"] = df["body"].apply(
        lambda x: add_combined_noise(x, noise_level=NOISE_LEVEL, seed=BASE_SEED + run)
    )

    # Train and evaluate on the noisy data.
    tag = f"{noise_tag}_r{run + 1}"
    train_and_evaluate(df, model_tag=tag, text_col="body")

    # Load the metrics file for this run (presumably written by
    # train_and_evaluate — verify) and annotate it with the run identity.
    # The file is closed as soon as it has been parsed.
    with open(f"results/metrics_{tag}.json") as f:
        metrics = json.load(f)
    metrics["noise_level"] = noise_tag
    metrics["run"] = run + 1
    metrics_list.append(metrics)

# --- Aggregate and persist the per-run metrics --------------------------------
df_metrics = pd.DataFrame(metrics_list)
df_metrics.to_csv(f"results/metrics_{noise_tag}.csv", index=False)
df_metrics


Running noise_45 – repetition 1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'train_runtime': 55.1292, 'train_samples_per_second': 14.511, 'train_steps_per_second': 1.814, 'train_loss': 0.49468097686767576, 'epoch': 1.0}
Running noise_45 – repetition 2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'train_runtime': 47.7017, 'train_samples_per_second': 16.771, 'train_steps_per_second': 2.096, 'train_loss': 0.47894790649414065, 'epoch': 1.0}
Running noise_45 – repetition 3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'train_runtime': 46.7516, 'train_samples_per_second': 17.112, 'train_steps_per_second': 2.139, 'train_loss': 0.46426097869873045, 'epoch': 1.0}


Unnamed: 0,accuracy,f1_phishing,precision,recall,noise_level,run
0,0.92,0.9298,0.8833,0.9815,noise_45,1
1,0.915,0.9238,0.8957,0.9537,noise_45,2
2,0.935,0.939,0.9524,0.9259,noise_45,3
