In [1]:
from datasets import load_dataset
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
from sklearn.model_selection import train_test_split

In [2]:
# --- Notebook-wide configuration -------------------------------------------
# Seed reused by every stochastic step below (NumPy global RNG and the
# train/test splitting calls).
RANDOM_STATE = 42
# Directory (relative to the working directory) that receives the CSV splits.
OUTPUT_DIR = Path("data")

# Apply the configuration: seed NumPy's legacy global RNG and make sure the
# output directory exists (no error if it is already there).
np.random.seed(RANDOM_STATE)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Load data
# Fetch the "train" split of the compiled phishing dataset through
# `datasets.load_dataset` and convert it to a pandas DataFrame, which is what
# every later step operates on.
ds = load_dataset(
    "renemel/compiled-phishing-dataset",
    split="train",
)
df = ds.to_pandas()

# Quick overview of what was loaded: row count, schema, first rows, and the
# raw class balance of the original string `type` column.
print("Total rows:", df.shape[0])
print("Columns: ", list(df.columns))
print(df.head(5))

type_counts = df["type"].value_counts()
print("Original type distribution:")
print(type_counts)


# Create numeric label:phishing-1,legit-0
# Rows whose `type` is exactly the string "phishing" get 1; every other value
# (including any unexpected or missing one) gets 0.
is_phishing = df["type"].eq("phishing")
df["label"] = is_phishing.astype(int)

label_counts = df["label"].value_counts()
print("Numeric label distribution:")
print(label_counts)


# Split data 30%-training, 35%-testing,35%-validation set
# Stage 1 takes 30% of the rows for training; stage 2 cuts the remaining 70%
# in half, giving test and validation sets of 35% each. Both stages stratify
# on the label and share RANDOM_STATE, so the partition is reproducible and
# keeps the class balance.
# NOTE(review): X still contains the string `type` column the label was
# derived from -- presumably it is dropped before modelling; confirm downstream.

y = df["label"]
X = df.drop(columns=["label"])

# Options shared by both splitting stages.
split_options = {"shuffle": True, "random_state": RANDOM_STATE}

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, train_size=0.30, stratify=y, **split_options
)

# Of the held-out remainder, the first half returned becomes the test set and
# the second half (test_size=0.5) becomes the validation set.
X_test, X_val, y_test, y_val = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, **split_options
)

# Re-attach the labels so each split is one self-contained DataFrame.
# `.values` assigns positionally; X_* and y_* come out of the same
# train_test_split call, so their rows are already in matching order.
train_df = X_train.copy()
train_df["label"] = y_train.values

val_df = X_val.copy()
val_df["label"] = y_val.values

test_df = X_test.copy()
test_df["label"] = y_test.values

# The three splits must partition the original frame: no row lost or duplicated.
assert len(train_df) + len(val_df) + len(test_df) == len(df), (
    "train/val/test sizes do not add up to the full dataset"
)

print("Split sizes:")
print("Total:", len(df))
print("Train:", len(train_df))
print("Test: ", len(test_df))
print("Val: ", len(val_df))

# Class balance per split; with stratified splitting the three distributions
# should be nearly identical. The validation split is reported too, so that
# all three can be compared.
print("Train label distribution:")
print(train_df["label"].value_counts(normalize=True))

print("Test label distribution:")
print(test_df["label"].value_counts(normalize=True))

print("Val label distribution:")
print(val_df["label"].value_counts(normalize=True))


# Save splits to CSV
# Paths are derived from OUTPUT_DIR so the directory created here and the
# directory written to can never diverge if OUTPUT_DIR is changed in the
# configuration cell. They are kept as plain strings, as before.

# Idempotent: harmless if the directory already exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

train_path = str(OUTPUT_DIR / "train.csv")
val_path = str(OUTPUT_DIR / "val.csv")
test_path = str(OUTPUT_DIR / "test.csv")

# index=False: the row index carries no information that needs to survive the
# round trip, so it is not written as an extra column.
train_df.to_csv(train_path, index=False)
val_df.to_csv(val_path, index=False)
test_df.to_csv(test_path, index=False)

print(f"Saved training split to: {train_path}")
print(f"Saved test split to: {test_path}")
print(f"Saved validation split to: {val_path}")

Total rows: 119148
Columns:  ['text', 'type', 'source', 'word_count', 'sentence_count', 'words_per_sentence', 'domain']
                                                text      type  \
0  \nHi, Ms Nadia Jules is my name, An Orphan fro...  phishing   
1  I am Edward Thomas  , There is  this  client o...  phishing   
2  My name is Gerald L. Marcus, I know a single b...  phishing   
3  \nMy Name is Rose Modepe an orphan, I gave bir...  phishing   
4  \nHi\n\nMy name is Mrs Yetunde Owolabi from Re...  phishing   

                     source  word_count  sentence_count  words_per_sentence  \
0  deduplicated_dataset.csv          47               3           15.666667   
1  deduplicated_dataset.csv         101               5           20.200000   
2  deduplicated_dataset.csv          95               8           11.875000   
3  deduplicated_dataset.csv          56               3           18.666667   
4  deduplicated_dataset.csv          90               4           22.500000   

      do