In [3]:
import tkinter as tk
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

# Download the NLTK resources used below (tokenizer, stopword list, WordNet,
# POS tagger). Recent NLTK releases (3.9 and later) look up the tokenizer and
# tagger under the new ids 'punkt_tab' and 'averaged_perceptron_tagger_eng',
# while older releases use 'punkt' and 'averaged_perceptron_tagger'.
# Requesting both generations keeps the script working on either version;
# with its default arguments nltk.download() reports a failed/unknown package
# and returns False instead of raising.
for resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

# Load the labelled e-mail data set from CSV.
data_path = 'email.csv'
data = pd.read_csv(data_path)

# Convert the labels to numeric classes (0 = ham / legitimate, 1 = spam).
# Series.map() yields NaN for every value missing from the mapping, so fail
# early with a clear message instead of passing NaN labels on to training.
label_map = {'ham': 0, 'spam': 1}
mapped_labels = data['label'].map(label_map)
if mapped_labels.isna().any():
    unexpected = sorted(
        data.loc[mapped_labels.isna(), 'label'].astype(str).unique()
    )
    raise ValueError(
        f"Unexpected label value(s) in {data_path!r}: {unexpected}; "
        f"expected one of {sorted(label_map)}"
    )
data['label'] = mapped_labels

# Text preprocessing helpers.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')
_NLP_RESOURCES = {}  # lazily filled cache: lemmatizer, stopword set, POS map


def _get_nlp_resources():
    """Build the lemmatizer, stopword set and POS map once, on first use."""
    if not _NLP_RESOURCES:
        # Build into a local dict first so a failure (e.g. missing NLTK data)
        # never leaves the cache half-populated.
        built = {
            'lemmatizer': WordNetLemmatizer(),
            'stopwords': frozenset(stopwords.words('english')),
            # First letter of a Penn Treebank tag -> WordNet POS constant.
            'pos_map': {
                "J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV,
            },
        }
        _NLP_RESOURCES.update(built)
    return _NLP_RESOURCES


def preprocess_text(text):
    """Normalize raw e-mail text into a space-separated string of lemmas.

    Steps: drop every character that is not an ASCII letter or whitespace,
    lowercase, tokenize, POS-tag, remove English stopwords and lemmatize each
    remaining token with its WordNet part of speech (noun when the tag has no
    WordNet equivalent).

    Args:
        text: The raw text. ``None`` and float NaN (what pandas uses for a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str()``.

    Returns:
        The lemmatized tokens joined by single spaces, or ``''`` when the
        input contains no letters.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    cleaned = _NON_LETTER_RE.sub('', str(text)).lower()
    if not cleaned.strip():
        # Nothing to tokenize; skip the NLTK machinery entirely.
        return ''

    resources = _get_nlp_resources()
    lemmatizer = resources['lemmatizer']
    stop_words = resources['stopwords']
    pos_map = resources['pos_map']

    # Tag the whole token sequence in ONE call: the tagger uses neighbouring
    # words as context, and a single call avoids the per-call tagger setup
    # cost of tagging each token separately. Stopwords are removed only after
    # tagging so they can still serve as context.
    tagged_tokens = nltk.pos_tag(word_tokenize(cleaned))

    return ' '.join(
        lemmatizer.lemmatize(token, pos_map.get(tag[:1].upper(), wordnet.NOUN))
        for token, tag in tagged_tokens
        if token not in stop_words
    )

# Clean every e-mail body; a missing body is treated as empty text.
data['processed_content'] = data['email'].fillna('').apply(preprocess_text)

# Separate the features (cleaned text) from the target labels.
X = data['processed_content']
y = data['label']

# Split BEFORE vectorizing so the TF-IDF vocabulary and IDF weights are
# learned from the training rows only. Fitting the vectorizer on the full
# data set would leak information from the test rows into the features and
# make the evaluation below optimistic.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Vectorize: fit on the training text, then apply the same mapping to the
# held-out text.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Train a multinomial Naive Bayes classifier.
model = MultinomialNB()
model.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print('Model Evaluation Metrics:')
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'Confusion Matrix:\n{conf_matrix}')

# GUI front end
class SpamClassifierApp:
    """Minimal Tkinter window that classifies typed e-mail text.

    The window holds a multi-line text box, a "Classify" button and a label
    that shows the outcome. Classification uses the module-level
    ``preprocess_text``, ``vectorizer`` and ``model`` objects, which must
    exist before the button is pressed.
    """

    def __init__(self, master):
        """Build the widgets inside ``master`` (the Tk root or a container)."""
        self.master = master
        master.title("Spam Classifier")

        self.label = tk.Label(master, text="Enter Email Text:")
        self.label.pack()

        self.text_entry = tk.Text(master, height=10, width=50)
        self.text_entry.pack()

        self.classify_button = tk.Button(master, text="Classify", command=self.classify_email)
        self.classify_button.pack()

        self.result_label = tk.Label(master, text="", font=('Helvetica', 14))
        self.result_label.pack()

    def classify_email(self):
        """Classify the text currently in the text box and show the result.

        Blank input is not sent to the model: with no words at all the
        prediction would not depend on anything the user typed, so the user
        is asked for text instead.
        """
        # "end-1c" stops one character before the end, which leaves out the
        # trailing newline Tk always keeps at the end of a Text widget.
        email_text = self.text_entry.get("1.0", "end-1c")
        if not email_text.strip():
            self.result_label.config(text="Please enter some email text.")
            return

        email_text_processed = preprocess_text(email_text)
        email_text_transformed = vectorizer.transform([email_text_processed])
        prediction = model.predict(email_text_transformed)
        result = "It's a spam!!!" if prediction[0] == 1 else "Not Spam"
        self.result_label.config(text=f"Result: {result}")

# Create and run the GUI application. The guard keeps the blocking Tk event
# loop from starting when this file is imported as a module; it still runs
# when the file is executed directly (script or notebook cell).
if __name__ == "__main__":
    root = tk.Tk()
    app = SpamClassifierApp(root)
    root.mainloop()


[nltk_data] Downloading package punkt to C:\Users\Gigi
[nltk_data]     Yang\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Gigi
[nltk_data]     Yang\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Gigi
[nltk_data]     Yang\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Gigi Yang\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Model Evaluation Metrics:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
Confusion Matrix:
[[14  0]
 [ 0 22]]
