In [1]:
import sys
import os
import pandas as pd
import json

# Make the project's src/ directory importable (path is relative to this notebook)
sys.path.append(os.path.abspath("../../src"))

from noise import add_combined_noise
from train_bert import train_and_evaluate

# Experiment: train and evaluate BERT on text corrupted with combined noise,
# repeated over several seeds, and collect the per-run metrics in one table.
metrics_list = []
noise_tag = "noise_50"  # label used in model tags / file names; keep in sync with NOISE_LEVEL
NOISE_LEVEL = 0.50      # noise level handed to add_combined_noise
N_RUNS = 3              # number of repetitions
BASE_SEED = 42          # run k (0-based) uses seed BASE_SEED + k

# Load the cleaned dataset once; it is identical for every repetition, so
# re-reading it inside the loop would only repeat the same work.
clean_df = pd.read_csv("../../data/cleaning/2_cleaned_data.csv")

# Join the relevant fields into one space-separated string (missing values -> "").
clean_df["text"] = (
    clean_df[["sender", "receiver", "urls", "clean_text"]]
    .fillna("")
    .astype(str)
    .agg(" ".join, axis=1)
)

# BERT input: a single text column named "body" plus the label.
base_df = clean_df[["text", "label"]].rename(columns={"text": "body"})

for run in range(N_RUNS):
    print(f"Running {noise_tag} – repetition {run + 1}")

    # Start every repetition from an untouched copy of the clean input.
    df = base_df.copy()

    # Apply combined noise to each text.
    # NOTE(review): every row in a run receives the same seed; if
    # add_combined_noise re-seeds its RNG from `seed` on each call, all rows
    # share one random sequence -- confirm this is intended.
    run_seed = BASE_SEED + run
    df["body"] = df["body"].apply(
        lambda x: add_combined_noise(x, noise_level=NOISE_LEVEL, seed=run_seed)
    )

    # Train and evaluate on the noisy data.
    tag = f"{noise_tag}_r{run + 1}"
    train_and_evaluate(df, model_tag=tag, text_col="body")

    # Load the metrics file for this run (presumably written by
    # train_and_evaluate -- verify) and annotate it with the run identity.
    with open(f"results/metrics_{tag}.json") as f:
        metrics = json.load(f)
    metrics["noise_level"] = noise_tag
    metrics["run"] = run + 1
    metrics_list.append(metrics)

# Save the combined per-run metrics and display them as the cell output.
df_metrics = pd.DataFrame(metrics_list)
df_metrics.to_csv(f"results/metrics_{noise_tag}.csv", index=False)
df_metrics


Running noise_50 – repetition 1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'train_runtime': 47.6849, 'train_samples_per_second': 16.777, 'train_steps_per_second': 2.097, 'train_loss': 0.42958019256591795, 'epoch': 1.0}
Running noise_50 – repetition 2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'train_runtime': 46.58, 'train_samples_per_second': 17.175, 'train_steps_per_second': 2.147, 'train_loss': 0.4656621170043945, 'epoch': 1.0}
Running noise_50 – repetition 3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'train_runtime': 46.5612, 'train_samples_per_second': 17.182, 'train_steps_per_second': 2.148, 'train_loss': 0.5100610733032227, 'epoch': 1.0}


Unnamed: 0,accuracy,f1_phishing,precision,recall,noise_level,run
0,0.92,0.9286,0.8966,0.963,noise_50,1
1,0.925,0.9309,0.9266,0.9352,noise_50,2
2,0.92,0.931,0.871,1.0,noise_50,3
