In [13]:
# Imports + environment

from __future__ import annotations

import os
import asyncio
import json
import re
from enum import StrEnum
from pathlib import Path
from typing import Any, Dict, List

import pandas as pd
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from tqdm.auto import tqdm

# Load environment variables from a local .env file (python-dotenv).
# This is the last expression in the cell, so its return value is shown as the cell output.
load_dotenv()


True

In [14]:
import kagglehub

# Fetch the phishing-email dataset through kagglehub and record where it landed
# on disk together with the files it contains.
path = kagglehub.dataset_download("naserabdullahalam/phishing-email-dataset")
files = os.listdir(path)

# Report the download location and its contents, one line each.
for caption, value in (
    ("Path to dataset files:", path),
    ("Files in dataset:", files),
):
    print(caption, value)

Path to dataset files: /Users/ext-elias.melas/.cache/kagglehub/datasets/naserabdullahalam/phishing-email-dataset/versions/1
Files in dataset: ['Nigerian_Fraud.csv', 'Ling.csv', 'Nazario.csv', 'SpamAssasin.csv', 'CEAS_08.csv', 'phishing_email.csv', 'Enron.csv']


In [4]:
# Read the Enron subset of the downloaded dataset into a DataFrame.
enron_csv = f'{path}/Enron.csv'
lora_data = pd.read_csv(enron_csv)

In [5]:
# Class distribution of the raw data; dropna=False also counts rows with a missing label.
label_counts = lora_data["label"].value_counts(dropna=False)
label_counts

label
0    15791
1    13976
Name: count, dtype: int64

In [6]:
# Balance positives/negatives, keep a random 20% subsample, and split it into
# train/test (for evaluation). Both splits are persisted as parquet files.

from sklearn.model_selection import train_test_split

# Labels are stored as integers (0 / 1), so compare against them explicitly
# instead of relying on `1 == True`.
true_df = lora_data[lora_data["label"] == 1]
false_df = lora_data[lora_data["label"] == 0]

n_per_class = min(len(true_df), len(false_df))
if n_per_class == 0:
    raise ValueError("Both label classes need at least one row to build a balanced sample.")

# Downsample whichever class is larger to the size of the smaller one.
# `DataFrame.sample(n=...)` raises when n exceeds the number of rows, so always
# drawing len(true_df) negatives would fail as soon as positives are the majority.
# Negatives are always drawn through `sample` so the row order (and therefore the
# subsample below) stays the same whenever negatives are the larger class.
false_down = false_df.sample(n=n_per_class, random_state=42)
true_down = (
    true_df
    if len(true_df) == n_per_class
    else true_df.sample(n=n_per_class, random_state=42)
)

# Shuffle and keep 20% of the balanced frame. This draw is not stratified, so the
# class counts afterwards are only approximately equal.
lora_data_balanced = (
    pd.concat([true_down, false_down])
    .sample(frac=0.2, random_state=42)
    .reset_index(drop=True)
)

# Stratify on the label so train and test share the same class ratio.
lora_train, lora_test = train_test_split(
    lora_data_balanced,
    test_size=0.2,
    random_state=42,
    stratify=lora_data_balanced["label"],
)

display(lora_train["label"].value_counts(dropna=False))
display(lora_test["label"].value_counts(dropna=False))

# Make sure the output directory exists; `to_parquet` does not create it.
Path("../data/finetuning").mkdir(parents=True, exist_ok=True)
lora_train.to_parquet("../data/finetuning/lora_train_emails.parquet")
lora_test.to_parquet("../data/finetuning/lora_test_emails.parquet")


label
1    2241
0    2231
Name: count, dtype: int64

label
1    560
0    558
Name: count, dtype: int64

In [19]:
from __future__ import annotations

from datetime import datetime
from pathlib import Path

# SAFE-BY-DEFAULT: this cell does NOT write any files unless you explicitly opt in.
WRITE_TRAINING_DATA = False
OVERWRITE_OUTPUTS = False
RUN_TAG = 'spam'

# Read with an explicit encoding so the prompt is decoded the same way on every
# platform; the training file below is written as utf-8 as well.
system_prompt = Path("../data/verification/system_prompt_spam.md").read_text(encoding="utf-8")


def _as_prompt_text(value: object) -> str:
    """Return ``value`` as text, mapping a missing value (NaN / None / NA) to ""."""
    return "" if pd.isna(value) else str(value)


def _build_user_prompt(subject: object, body: object) -> str:
    """Render one email as the user prompt.

    Missing subject/body values are rendered as empty text rather than the
    literal string "nan" that formatting a float NaN would produce.
    """
    subject = _as_prompt_text(subject)
    body = _as_prompt_text(body)
    return f"""
Email subject:
---------------------------------------------------
\n{subject}\n\n
---------------------------------------------------
Email body:
---------------------------------------------------
\n{body}\n\n
---------------------------------------------------

"""


# One instruction/input/output record per training email.
ft_training_data_list: list[dict[str, str]] = []
for subject, body, is_spam in zip(
    lora_train["subject"], lora_train["body"], lora_train["label"]
):
    user_prompt = _build_user_prompt(subject, body)

    ft_training_data_list.append(
        {
            "instruction": system_prompt,
            "input": user_prompt,
            # Keep the same output format your Tinker model expects.
            # NOTE(review): the raw label value is interpolated as-is (e.g. "is_spam: 1"
            # for integer labels) - confirm this matches the expected format.
            "output": f"is_spam: {is_spam}",
        }
    )

print("Prepared examples:", len(ft_training_data_list))

# The output path is keyed by RUN_TAG only (it is not timestamped/versioned);
# change RUN_TAG to keep the result of an earlier run.
out_path = Path(f"../data/finetuning/training_data_{RUN_TAG}.json")

if WRITE_TRAINING_DATA:
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # Never clobber an existing file unless the user opted in explicitly.
    if out_path.exists() and not OVERWRITE_OUTPUTS:
        raise FileExistsError(
            f"Refusing to overwrite existing file: {out_path}. Set OVERWRITE_OUTPUTS=True to overwrite."
        )

    import json

    out_path.write_text(json.dumps(ft_training_data_list, indent=2, ensure_ascii=False), encoding="utf-8")
    print("Wrote:", out_path)
else:
    print("WRITE_TRAINING_DATA is False; not writing any files.")

Prepared examples: 4472
WRITE_TRAINING_DATA is False; not writing any files.
