베이즈 스팸 필터 실습 코드

In [None]:
import numpy as np
import pandas as pd
import re
from collections import defaultdict, Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
class NaiveBayesSpamFilter:
    """Multinomial naive Bayes spam classifier with Laplace smoothing.

    ``fit`` collects per-class word counts from labelled emails,
    ``predict_proba`` combines them in log space into P(spam | email), and
    ``predict`` thresholds that probability. Label ``1`` means spam; every
    other label is treated as ham.
    """

    # Simple hand-picked stop-word list; defined once instead of on every
    # preprocess_text() call.
    STOP_WORDS = frozenset({
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at',
        'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were',
        'i', 'you', 'he', 'she', 'it', 'we', 'they', 'this', 'that',
    })

    # Matches everything except lowercase ASCII letters and whitespace.
    _NON_LETTER = re.compile(r'[^a-z\s]')

    def __init__(self, alpha=1.0):
        """Create an untrained filter.

        Args:
            alpha: Laplace (additive) smoothing parameter.
        """
        self.alpha = alpha
        self._reset_state()

    def _reset_state(self):
        """Clear everything learned by a previous ``fit`` call."""
        self.spam_word_counts = defaultdict(int)
        self.ham_word_counts = defaultdict(int)
        self.spam_total_words = 0
        self.ham_total_words = 0
        self.vocab = set()
        self.spam_prior = 0
        self.ham_prior = 0

    def preprocess_text(self, text):
        """Turn raw text into a list of tokens.

        The text is lowercased, every character that is not ``a``-``z`` or
        whitespace is deleted, the result is split on whitespace, and stop
        words as well as tokens of one or two characters are dropped.

        Args:
            text: Raw email text.

        Returns:
            List of remaining tokens, in their original order.
        """
        cleaned = self._NON_LETTER.sub('', text.lower())
        return [
            token for token in cleaned.split()
            if token not in self.STOP_WORDS and len(token) > 2
        ]

    def fit(self, X, y):
        """Train the model from scratch on labelled emails.

        Any state left over from an earlier ``fit`` call is discarded first,
        so re-fitting does not accumulate counts.

        Args:
            X: Iterable of email texts.
            y: Sized collection of labels aligned with ``X`` (1 = spam,
                0 = ham).

        Raises:
            ValueError: If ``y`` is empty (class priors would be undefined).
        """
        total_emails = len(y)
        if total_emails == 0:
            raise ValueError("cannot fit on an empty training set")

        print("모델 훈련 시작...")
        self._reset_state()

        # Class priors. Spam is counted with the same `label == 1` test the
        # word-count loop below uses, so the two can never disagree.
        spam_count = sum(1 for label in y if label == 1)
        ham_count = total_emails - spam_count

        self.spam_prior = spam_count / total_emails
        self.ham_prior = ham_count / total_emails

        print(f"총 이메일: {total_emails}개")
        print(f"스팸 이메일: {spam_count}개 ({self.spam_prior:.3f})")
        print(f"정상 이메일: {ham_count}개 ({self.ham_prior:.3f})")

        # Per-class word frequencies.
        for email, label in zip(X, y):
            words = self.preprocess_text(email)
            self.vocab.update(words)

            if label == 1:
                counts = self.spam_word_counts
                self.spam_total_words += len(words)
            else:
                counts = self.ham_word_counts
                self.ham_total_words += len(words)

            for word in words:
                counts[word] += 1

        print(f"전체 어휘 크기: {len(self.vocab)}개")
        print(f"스팸에서 총 단어 수: {self.spam_total_words}개")
        print(f"정상에서 총 단어 수: {self.ham_total_words}개")

    def calculate_word_probability(self, word, is_spam):
        """Return the Laplace-smoothed P(word | class).

        Args:
            word: Token to look up.
            is_spam: True for the spam class, False for the ham class.

        Returns:
            ``(count + alpha) / (class_total + alpha * |vocab|)``.
        """
        if is_spam:
            counts = self.spam_word_counts
            total_words = self.spam_total_words
        else:
            counts = self.ham_word_counts
            total_words = self.ham_total_words

        # .get() instead of [] so that looking up an unseen word does not
        # insert it into the defaultdict and silently grow the model.
        word_count = counts.get(word, 0)

        return (word_count + self.alpha) / (total_words + self.alpha * len(self.vocab))

    def predict_proba(self, email):
        """Return the probability that ``email`` is spam.

        Args:
            email: Raw email text.

        Returns:
            P(spam | email) as a float.
        """
        words = self.preprocess_text(email)

        # Work in log space so that long emails do not underflow.
        log_spam_prob = np.log(self.spam_prior)
        log_ham_prob = np.log(self.ham_prior)

        for word in words:
            # P(word | spam)
            log_spam_prob += np.log(self.calculate_word_probability(word, is_spam=True))
            # P(word | ham)
            log_ham_prob += np.log(self.calculate_word_probability(word, is_spam=False))

        # Shift both scores by the larger one so the bigger exponent is 0.
        max_log_prob = max(log_spam_prob, log_ham_prob)
        log_spam_prob -= max_log_prob
        log_ham_prob -= max_log_prob

        # Back to probability space.
        spam_prob = np.exp(log_spam_prob)
        ham_prob = np.exp(log_ham_prob)

        # Normalise so the two class probabilities sum to 1.
        return spam_prob / (spam_prob + ham_prob)

    def predict(self, X, threshold=0.5):
        """Classify a collection of emails.

        Args:
            X: Iterable of email texts.
            threshold: An email is labelled spam (1) when its spam
                probability is strictly greater than this value.

        Returns:
            Tuple ``(predictions, probabilities)`` of NumPy arrays, aligned
            with ``X``.
        """
        probabilities = [self.predict_proba(email) for email in X]
        predictions = [1 if prob > threshold else 0 for prob in probabilities]

        return np.array(predictions), np.array(probabilities)

    def get_most_informative_words(self, n=20):
        """Rank vocabulary words by P(word | spam) / P(word | ham).

        Args:
            n: Number of words to return for each class.

        Returns:
            Tuple ``(spam_words, ham_words)``; each is a list of
            ``(word, ratio)`` pairs. ``spam_words`` has the highest ratios
            first, ``ham_words`` the lowest ratios first. Ties are broken
            alphabetically so the ranking is reproducible between runs.
        """
        word_scores = {}

        for word in self.vocab:
            spam_prob = self.calculate_word_probability(word, is_spam=True)
            ham_prob = self.calculate_word_probability(word, is_spam=False)

            # The ratio is undefined when the ham probability is zero.
            if ham_prob > 0:
                word_scores[word] = spam_prob / ham_prob

        scored = list(word_scores.items())

        # Words that indicate spam (high ratio).
        spam_words = sorted(scored, key=lambda item: (-item[1], item[0]))[:n]

        # Words that indicate ham (low ratio).
        ham_words = sorted(scored, key=lambda item: (item[1], item[0]))[:n]

        return spam_words, ham_words

def create_sample_data():
    """Return a small hand-written corpus for the demo.

    A real email dataset should replace this for actual use.

    NOTE(review): a function with the same name is defined again later in
    this file; when both definitions run, the later one replaces this one.

    Returns:
        Tuple ``(emails, labels)``. ``emails`` lists the spam messages
        followed by the ham messages; ``labels`` holds 1 for every spam entry
        and 0 for every ham entry, in the same order.
    """
    spam_label, ham_label = 1, 0

    spam_corpus = (
        "congratulations you have won a million dollars click here now",
        "free money urgent action required send bank details immediately",
        "make money fast work from home guaranteed income",
        "urgent business proposal transfer millions inheritance fund",
        "congratulations lottery winner claim prize money now click",
        "free cash advance no credit check apply today",
        "get rich quick scheme investment opportunity limited time",
        "inheritance fund requires urgent attention bank transfer",
        "claim your prize money lottery winner congratulations",
        "make thousands working from home guaranteed success",
        "urgent transfer required millions available inheritance",
        "free money cash advance no questions asked",
        "investment opportunity get rich quick guaranteed returns",
        "lottery winner congratulations claim prize immediately",
        "work from home make money fast guaranteed income",
    )

    ham_corpus = (
        "hello how are you doing today meeting tomorrow",
        "please review the attached document for our project",
        "thank you for your help with the presentation",
        "meeting scheduled for next week tuesday afternoon",
        "looking forward to working with you on this project",
        "please send the report by end of day",
        "great job on the presentation yesterday",
        "conference call scheduled for tomorrow morning",
        "please review and provide feedback on the proposal",
        "thank you for attending the meeting today",
        "project deadline has been extended until friday",
        "please confirm your attendance for the workshop",
        "excellent work on the quarterly report",
        "reminder about the team meeting next monday",
        "looking forward to our discussion tomorrow",
    )

    # Spam first, then ham; the labels are laid out in the same order.
    emails = [*spam_corpus, *ham_corpus]
    labels = [spam_label for _ in spam_corpus] + [ham_label for _ in ham_corpus]

    return emails, labels

def evaluate_model(y_true, y_pred, y_proba=None):
    """Print and return binary classification metrics.

    NOTE(review): a function with the same name is defined again later in
    this file; when both definitions run, the later one replaces this one.

    Args:
        y_true: True labels (1 = spam, 0 = ham).
        y_pred: Predicted labels, aligned with ``y_true``.
        y_proba: Unused; kept so existing callers that pass it keep working.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``.
    """
    # zero_division=0 reports 0 for an undefined metric (e.g. nothing was
    # predicted as spam) without emitting a warning.
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    print("=== 모델 성능 평가 ===")
    print(f"정확도 (Accuracy): {accuracy:.3f}")
    print(f"정밀도 (Precision): {precision:.3f}")
    print(f"재현율 (Recall): {recall:.3f}")
    print(f"F1 점수: {f1:.3f}")

    # Confusion matrix. labels=[0, 1] forces a 2x2 result even when only one
    # class occurs in y_true/y_pred; without it the cm[0,1] / cm[1,*] lookups
    # below raise IndexError on a 1x1 matrix.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    print("\n혼동 행렬:")
    print("실제\\예측  정상  스팸")
    print(f"정상      {cm[0,0]:4d}  {cm[0,1]:4d}")
    print(f"스팸      {cm[1,0]:4d}  {cm[1,1]:4d}")

    return accuracy, precision, recall, f1

def plot_probability_distribution(y_true, y_proba, threshold=0.5):
    """Plot overlaid histograms of predicted spam probability per true class.

    NOTE(review): a function with the same name is defined again later in
    this file; when both definitions run, the later one replaces this one.

    Args:
        y_true: True labels (1 = spam, 0 = ham); list or array.
        y_proba: Predicted spam probabilities, aligned with ``y_true``.
        threshold: Position of the vertical decision-threshold line.
    """
    # Convert to arrays first: with a plain list, `y_true == 0` is the single
    # value False rather than an element-wise mask, and indexing with it
    # selects nothing.
    y_true = np.asarray(y_true)
    y_proba = np.asarray(y_proba)

    plt.figure(figsize=(10, 6))

    # Predicted probabilities split by the true class.
    ham_probs = y_proba[y_true == 0]
    spam_probs = y_proba[y_true == 1]

    # A shared range gives both histograms identical bin edges, so the bars
    # of the two classes are directly comparable.
    prob_range = (0.0, 1.0)
    plt.hist(ham_probs, bins=20, range=prob_range, alpha=0.7, label='정상 메일', color='blue')
    plt.hist(spam_probs, bins=20, range=prob_range, alpha=0.7, label='스팸 메일', color='red')

    plt.axvline(x=threshold, color='black', linestyle='--', label=f'분류 임계값 ({threshold})')
    plt.xlabel('스팸 확률')
    plt.ylabel('빈도')
    plt.title('베이즈 스팸 필터 - 예측 확률 분포')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

In [None]:
def create_sample_data():
    """Build the toy spam/ham corpus used by the demo.

    Intended as a stand-in for a real email dataset.

    NOTE(review): this re-defines the function of the same name that appears
    earlier in this file; the later definition is the one in effect when the
    cells run top to bottom.

    Returns:
        Tuple ``(emails, labels)``: all spam texts followed by all ham texts,
        and a parallel list with 1 for spam and 0 for ham.
    """
    spam_texts = [
        "congratulations you have won a million dollars click here now",
        "free money urgent action required send bank details immediately",
        "make money fast work from home guaranteed income",
        "urgent business proposal transfer millions inheritance fund",
        "congratulations lottery winner claim prize money now click",
        "free cash advance no credit check apply today",
        "get rich quick scheme investment opportunity limited time",
        "inheritance fund requires urgent attention bank transfer",
        "claim your prize money lottery winner congratulations",
        "make thousands working from home guaranteed success",
        "urgent transfer required millions available inheritance",
        "free money cash advance no questions asked",
        "investment opportunity get rich quick guaranteed returns",
        "lottery winner congratulations claim prize immediately",
        "work from home make money fast guaranteed income",
    ]

    ham_texts = [
        "hello how are you doing today meeting tomorrow",
        "please review the attached document for our project",
        "thank you for your help with the presentation",
        "meeting scheduled for next week tuesday afternoon",
        "looking forward to working with you on this project",
        "please send the report by end of day",
        "great job on the presentation yesterday",
        "conference call scheduled for tomorrow morning",
        "please review and provide feedback on the proposal",
        "thank you for attending the meeting today",
        "project deadline has been extended until friday",
        "please confirm your attendance for the workshop",
        "excellent work on the quarterly report",
        "reminder about the team meeting next monday",
        "looking forward to our discussion tomorrow",
    ]

    # Pair every text with its label (1: spam, 0: ham), then split the pairs
    # back into two parallel lists.
    tagged = [(text, 1) for text in spam_texts]
    tagged.extend((text, 0) for text in ham_texts)

    emails = [text for text, _ in tagged]
    labels = [tag for _, tag in tagged]

    return emails, labels

In [None]:
def evaluate_model(y_true, y_pred, y_proba=None):
    """Print and return binary classification metrics.

    NOTE(review): this re-defines the function of the same name that appears
    earlier in this file; the later definition is the one in effect when the
    cells run top to bottom.

    Args:
        y_true: True labels (1 = spam, 0 = ham).
        y_pred: Predicted labels, aligned with ``y_true``.
        y_proba: Unused; kept so existing callers that pass it keep working.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``.
    """
    # zero_division=0 reports 0 for an undefined metric (e.g. nothing was
    # predicted as spam) without emitting a warning.
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    print("=== 모델 성능 평가 ===")
    print(f"정확도 (Accuracy): {accuracy:.3f}")
    print(f"정밀도 (Precision): {precision:.3f}")
    print(f"재현율 (Recall): {recall:.3f}")
    print(f"F1 점수: {f1:.3f}")

    # Confusion matrix. labels=[0, 1] forces a 2x2 result even when only one
    # class occurs in y_true/y_pred; without it the cm[0,1] / cm[1,*] lookups
    # below raise IndexError on a 1x1 matrix.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    print("\n혼동 행렬:")
    print("실제\\예측  정상  스팸")
    print(f"정상      {cm[0,0]:4d}  {cm[0,1]:4d}")
    print(f"스팸      {cm[1,0]:4d}  {cm[1,1]:4d}")

    return accuracy, precision, recall, f1

def plot_probability_distribution(y_true, y_proba, threshold=0.5):
    """Plot overlaid histograms of predicted spam probability per true class.

    NOTE(review): this re-defines the function of the same name that appears
    earlier in this file; the later definition is the one in effect when the
    cells run top to bottom.

    Args:
        y_true: True labels (1 = spam, 0 = ham); list or array.
        y_proba: Predicted spam probabilities, aligned with ``y_true``.
        threshold: Position of the vertical decision-threshold line.
    """
    # Convert to arrays first: with a plain list, `y_true == 0` is the single
    # value False rather than an element-wise mask, and indexing with it
    # selects nothing.
    y_true = np.asarray(y_true)
    y_proba = np.asarray(y_proba)

    plt.figure(figsize=(10, 6))

    # Predicted probabilities split by the true class.
    ham_probs = y_proba[y_true == 0]
    spam_probs = y_proba[y_true == 1]

    # A shared range gives both histograms identical bin edges, so the bars
    # of the two classes are directly comparable.
    prob_range = (0.0, 1.0)
    plt.hist(ham_probs, bins=20, range=prob_range, alpha=0.7, label='정상 메일', color='blue')
    plt.hist(spam_probs, bins=20, range=prob_range, alpha=0.7, label='스팸 메일', color='red')

    plt.axvline(x=threshold, color='black', linestyle='--', label=f'분류 임계값 ({threshold})')
    plt.xlabel('스팸 확률')
    plt.ylabel('빈도')
    plt.title('베이즈 스팸 필터 - 예측 확률 분포')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

In [None]:
def main():
    """Run the end-to-end demo.

    Builds the sample corpus, splits it into train and test sets, trains a
    ``NaiveBayesSpamFilter``, prints evaluation metrics and the most
    informative words, classifies a few new emails, plots the predicted
    probability distribution and finally compares metrics across several
    decision thresholds.
    """
    print("=== 베이즈 스팸 필터 실습 ===\n")

    # Prepare the data.
    print("1. 샘플 데이터 생성...")
    emails, labels = create_sample_data()

    # Stratified train/test split.
    X_train, X_test, y_train, y_test = train_test_split(
        emails, labels, test_size=0.3, random_state=42, stratify=labels
    )
    # `labels` is a plain list, so the split hands back lists as well. The
    # plotting helper masks with `y_true == 0`, which is only element-wise on
    # an array, so convert the test labels once here.
    y_test = np.asarray(y_test)

    print(f"훈련 데이터: {len(X_train)}개")
    print(f"테스트 데이터: {len(X_test)}개\n")

    # Train the model.
    print("2. 모델 훈련...")
    model = NaiveBayesSpamFilter(alpha=1.0)
    model.fit(X_train, y_train)
    print()

    # Predict on the held-out set and evaluate.
    print("3. 모델 예측 및 평가...")
    y_pred, y_proba = model.predict(X_test)

    evaluate_model(y_test, y_pred, y_proba)
    print()

    # Most informative words for each class.
    print("4. 가장 정보가 많은 단어들...")
    spam_words, ham_words = model.get_most_informative_words(n=10)

    print("스팸을 나타내는 단어들:")
    for word, score in spam_words:
        print(f"  {word}: {score:.2f}")

    print("\n정상을 나타내는 단어들:")
    for word, score in ham_words:
        print(f"  {word}: {score:.2f}")
    print()

    # Classify a few previously unseen emails.
    print("5. 새로운 이메일 테스트...")
    test_emails = [
        "congratulations you won free money click now",
        "please send the meeting minutes from yesterday",
        "urgent transfer required millions inheritance fund",
        "thank you for your collaboration on the project"
    ]

    for email in test_emails:
        prob = model.predict_proba(email)
        prediction = "스팸" if prob > 0.5 else "정상"
        print(f"이메일: '{email[:50]}{'...' if len(email) > 50 else ''}'")
        print(f"스팸 확률: {prob:.3f} → 분류: {prediction}\n")

    # Visualise the predicted probability distribution.
    print("6. 예측 확률 분포 시각화...")
    plot_probability_distribution(y_test, y_proba)

    # Compare metrics across several decision thresholds.
    print("7. 다양한 임계값에서의 성능 비교...")
    thresholds = [0.3, 0.4, 0.5, 0.6, 0.7]

    print("임계값  정확도  정밀도  재현율  F1점수")
    print("-" * 40)

    for threshold in thresholds:
        y_pred_thresh = (y_proba > threshold).astype(int)
        acc = accuracy_score(y_test, y_pred_thresh)
        # zero_division=0: a threshold at which nothing is predicted as spam
        # reports 0 without emitting a warning for every such row.
        prec = precision_score(y_test, y_pred_thresh, zero_division=0)
        rec = recall_score(y_test, y_pred_thresh, zero_division=0)
        f1 = f1_score(y_test, y_pred_thresh, zero_division=0)

        print(f"{threshold:6.1f}  {acc:6.3f}  {prec:6.3f}  {rec:6.3f}  {f1:6.3f}")



In [None]:
# Run the demo only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()
