# In [None]:
# 01_data_preprocessing.ipynb

# ===========================
# 1. Import packages
# ===========================
import pandas as pd
import numpy as np
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import matplotlib.pyplot as plt
import seaborn as sns
import nltk

# On the first run, uncomment the lines below to download the required NLTK resources:
# nltk.download('stopwords')
# nltk.download('wordnet')

# ===========================
# 2. Load the data
# ===========================
# The sample data can be taken from Chapter 3 of the Packt repository.
# The file is assumed to live at ./data/spam.csv.
#
# The CSV is decoded as latin-1. Columns 'v1' and 'v2' are renamed to
# 'label' and 'message'; every other column in the file is dropped.
df = pd.read_csv("data/spam.csv", encoding='latin-1').rename(
    columns={'v1': 'label', 'v2': 'message'}
)[['label', 'message']]

# Prints the "number of records" label followed by the (rows, columns) tuple.
print("資料筆數：", df.shape)
# NOTE(review): a bare expression is only rendered when it is the last
# statement of a notebook cell -- confirm this preview is still displayed.
df.head()

# ===========================
# 3. Initial data inspection
# ===========================
# Number of rows per label value, followed by the same counts as a bar chart.
print(df.label.value_counts())
sns.countplot(data=df, x='label')
plt.title('Class Distribution (Ham vs Spam)')
plt.show()

# Check for missing values: prints the count of null cells in each column.
print(df.isnull().sum(axis=0))

# ===========================
# 4. Text preprocessing function
# ===========================
# Module-level helpers read by clean_text: the English stop-word list is
# wrapped in a set for membership tests, and a single lemmatizer instance is
# created once and shared by every call.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Compiled once at import time so the pattern lookup is hoisted out of the
# per-message call.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z]')


def clean_text(text):
    """Normalise one raw message into a space-separated string of lemmas.

    Processing steps, in order: replace every character outside a-z / A-Z
    with a space, lower-case the result, split on whitespace, drop tokens
    found in the module-level ``stop_words`` set, and lemmatize the remaining
    tokens with the module-level ``lemmatizer``.

    Args:
        text: The raw message. Any value that is not a ``str`` (for example
            ``None`` or the float NaN pandas uses for a missing cell) is
            treated as an empty message instead of raising ``TypeError``.

    Returns:
        The surviving lemmas joined by single spaces, or ``''`` when the
        input is not a string or no token survives the filtering.
    """
    # Missing cells reach this function as non-string objects; the regex
    # substitution below would raise on them, so map them to "no content".
    if not isinstance(text, str):
        return ''

    tokens = _NON_LETTER_RE.sub(' ', text).lower().split()
    # Stop words are filtered before lemmatization, on the raw lower-cased
    # token.
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )

# Add a 'clean_text' column holding the cleaned form of every message.
df['clean_text'] = df['message'].map(clean_text)
# NOTE(review): a bare expression is only rendered when it is the last
# statement of a notebook cell -- confirm this preview is still displayed.
df.head()

# ===========================
# 5. Train / test split
# ===========================
# Binary target: 1 where the label is 'spam', 0 for every other label.
y = np.where(df['label'] == 'spam', 1, 0)

# Split the cleaned text BEFORE vectorizing. Fitting TF-IDF on the full
# corpus would let the vocabulary and IDF weights be learned from the test
# rows (data leakage), which makes later evaluation optimistic.
text_train, text_test, y_train, y_test = train_test_split(
    df['clean_text'], y, test_size=0.2, random_state=42, stratify=y
)

# ===========================
# 6. Feature vectorization
# ===========================
# Fit on the training text only; the test text is transformed with the
# already-fitted vocabulary and weights.
vectorizer = TfidfVectorizer(max_features=3000)
X_train = vectorizer.fit_transform(text_train).toarray()
X_test = vectorizer.transform(text_test).toarray()

# Full-corpus matrix kept so code that still refers to `X` keeps working.
# It is produced by transform() only, so it does not refit the vectorizer.
X = vectorizer.transform(df['clean_text']).toarray()

print("TF-IDF matrix shape:", X.shape)
print("Train:", X_train.shape, "Test:", X_test.shape)

# ===========================
# 7. Save the processed data
# ===========================
import os

import joblib

# joblib.dump does not create missing parent directories, so make sure both
# output folders exist before writing (no-op when they are already there).
os.makedirs("data", exist_ok=True)
os.makedirs("models", exist_ok=True)

# The split is stored as a single (X_train, X_test, y_train, y_test) tuple;
# the fitted vectorizer is stored separately so it can be reused later.
joblib.dump((X_train, X_test, y_train, y_test), "data/processed_spam_data.pkl")
joblib.dump(vectorizer, "models/tfidf_vectorizer.pkl")

# Message reads: "Preprocessing finished; output written to
# data/processed_spam_data.pkl".
print("✅ 資料預處理完成，已輸出至 data/processed_spam_data.pkl")
