## 정상 이메일과 피싱 이메일의 데이터를 결합하여 하나의 데이터셋 구성

In [None]:
import json

# Input and output locations.
parsed_emails_path = "parsed_emails.json"  # legitimate (normal) email dataset
generated_emails_path = "generated_emails.json"  # phishing email dataset
combined_dataset_path = "combined_emails_dataset.json"  # merged, labelled output

# Read both source datasets from disk.
with open(parsed_emails_path, "r", encoding="utf-8") as normal_file:
    normal_emails = json.load(normal_file)

with open(generated_emails_path, "r", encoding="utf-8") as phishing_file:
    phishing_emails = json.load(phishing_file)

# Tag every record in place: 0 = legitimate email, 1 = phishing email.
for label_value, records in ((0, normal_emails), (1, phishing_emails)):
    for email in records:
        email["label"] = label_value

# Merge the two sets: legitimate emails first, phishing emails after.
combined_emails = [*normal_emails, *phishing_emails]

# Write the merged dataset as human-readable UTF-8 JSON (non-ASCII kept as-is).
with open(combined_dataset_path, "w", encoding="utf-8") as output_file:
    json.dump(combined_emails, output_file, ensure_ascii=False, indent=4)

print(f"Combined dataset saved to {combined_dataset_path}")

## 데이터셋 로드 및 전처리 

In [1]:
import json
from sklearn.model_selection import train_test_split

combined_dataset_path = "combined_emails_dataset.json"

# Read the merged, labelled dataset back from disk.
with open(combined_dataset_path, "r", encoding="utf-8") as dataset_file:
    combined_emails = json.load(dataset_file)

# Build two parallel lists: the "content" field of each record as model input
# and its "label" field as the target.
texts = [record["content"] for record in combined_emails]
labels = [record["label"] for record in combined_emails]

# Hold out 20% of the records for testing; the fixed seed keeps the split
# reproducible across runs.
split_options = {"test_size": 0.2, "random_state": 42}
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, **split_options
)

## NLP 모델 준비 및 학습

In [None]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import tensorflow as tf
from tqdm import tqdm

# Load the pretrained tokenizer and a sequence-classification model with a
# two-class head (labels 0 and 1).
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenize the text: truncate to at most 512 tokens, pad, and return TF tensors.
train_encodings = tokenizer(
    train_texts, truncation=True, padding=True, max_length=512, return_tensors="tf"
)
test_encodings = tokenizer(
    test_texts, truncation=True, padding=True, max_length=512, return_tensors="tf"
)

# Build batched TensorFlow datasets of (features, label) pairs, 16 per batch.
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
)).batch(16)

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_labels
)).batch(16)

# Compile the model. The loss is configured with from_logits=True, i.e. it is
# applied to the model's raw (unnormalised) outputs.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"]
)


def _format_metrics(values):
    """Return a {metric name: "0.1234"} mapping suitable for a tqdm postfix."""
    return {name: f"{float(value):.4f}" for name, value in values.items()}


# Manual training loop with tqdm progress bars. The values returned by
# train_on_batch / test_on_batch are surfaced on the bars (and summarised per
# epoch) instead of being thrown away, so the run actually shows whether the
# model is learning.
epochs = 3
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")

    with tqdm(total=len(train_dataset), desc="Training", unit="batch") as pbar:
        for features, targets in train_dataset:
            batch_logs = model.train_on_batch(features, targets, return_dict=True)
            pbar.set_postfix(_format_metrics(batch_logs))
            pbar.update(1)

    # Example-weighted running mean of the validation metrics: the last batch
    # may be smaller than 16, so a plain mean over batches would be skewed.
    # NOTE(review): assumes test_on_batch reports per-batch values (the Keras 2
    # default, reset_metrics=True) -- confirm for the installed Keras version.
    val_totals = {}
    val_examples = 0
    with tqdm(total=len(test_dataset), desc="Validation", unit="batch") as pbar:
        for features, targets in test_dataset:
            batch_logs = model.test_on_batch(features, targets, return_dict=True)
            batch_size = int(targets.shape[0])
            val_examples += batch_size
            for name, value in batch_logs.items():
                val_totals[name] = val_totals.get(name, 0.0) + float(value) * batch_size
            pbar.set_postfix(_format_metrics(
                {name: total / val_examples for name, total in val_totals.items()}
            ))
            pbar.update(1)

    if val_examples:
        summary = ", ".join(
            f"val_{name}={total / val_examples:.4f}" for name, total in val_totals.items()
        )
        print(f"Validation: {summary}")

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', '

Epoch 1/3


Training:   0%|          | 0/120 [00:00<?, ?batch/s]

Cause: for/else statement not yet supported


Cause: for/else statement not yet supported


Cause: for/else statement not yet supported


Training:  50%|█████     | 60/120 [05:38<05:23,  5.40s/batch]

In [None]:
import os
import tarfile
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

# Directory that receives the model and tokenizer files.
save_directory = "./saved_model"
os.makedirs(save_directory, exist_ok=True)

# Save the `model` and `tokenizer` objects that are already in memory from the
# training cell. Do NOT call from_pretrained("distilbert-base-uncased") again
# here: that rebinds `model` to freshly initialised weights, so the archive
# (and the evaluation cell that follows) would describe an untrained model
# rather than the one trained above.
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

# Destination of the compressed archive.
compressed_file_path = "./saved_model.tar.gz"

# Pack the saved directory into a tar.gz; arcname keeps only the directory's
# own name ("saved_model") as the top-level entry inside the archive.
with tarfile.open(compressed_file_path, "w:gz") as tar:
    tar.add(save_directory, arcname=os.path.basename(save_directory))

print(f"Model and tokenizer saved and compressed at {compressed_file_path}")

In [None]:
from sklearn.metrics import classification_report

# Run the model over the test batches and take, for each example, the index of
# the highest logit as the predicted class.
predictions = model.predict(test_dataset)
class_scores = predictions.logits
predicted_labels = tf.argmax(class_scores, axis=1).numpy()

# Report per-class metrics; label 0 is shown as "Legitimate", label 1 as "Phishing".
class_names = ["Legitimate", "Phishing"]
report = classification_report(test_labels, predicted_labels, target_names=class_names)
print(report)