# 人工知能期末レポート (問題5)
### 学籍番号：2358123
### 名前：Smittiviroj Thanakorn


## import

In [4]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

1. Load dataset

In [5]:
# Read the labelled e-mail corpus; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer="spam_ham_dataset.csv")

2. Define English stop words

In [6]:
# scikit-learn's built-in English stop-word collection; preprocess_email
# filters tokens against this name.
stop_words = text.ENGLISH_STOP_WORDS

3. Preprocessing function


In [7]:
def preprocess_email(text_input, stopwords=None):
    """Normalise a raw e-mail body into a space-separated string of tokens.

    Processing order:
      1. replace ``<...>`` spans that sit on a single line with a space,
      2. replace every character that is not an ASCII letter or whitespace
         with a space,
      3. lowercase,
      4. collapse whitespace runs and trim,
      5. split on whitespace and drop stop words.

    Parameters
    ----------
    text_input : str
        Raw message text. Any non-string value (``None``, or the float NaN
        pandas produces for an empty CSV cell) is treated as an empty message
        instead of raising ``TypeError`` inside ``re.sub``.
    stopwords : collection of str, optional
        Words to discard. When omitted, the notebook-level ``stop_words`` is
        used, which preserves the original behaviour.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; ``''`` when nothing
        survives.
    """
    # Missing values reach this function when applied over a DataFrame column.
    if not isinstance(text_input, str):
        return ''
    if stopwords is None:
        stopwords = stop_words

    # Remove HTML tags ('.' does not match newlines, so only single-line tags)
    text_clean = re.sub(r'<.*?>', ' ', text_input)
    # Remove punctuation, digits, and special characters
    text_clean = re.sub(r'[^a-zA-Z\s]', ' ', text_clean)
    # Lowercase
    text_clean = text_clean.lower()
    # Remove extra whitespace
    text_clean = re.sub(r'\s+', ' ', text_clean).strip()
    # Tokenization
    tokens = text_clean.split()
    # Remove stop words
    tokens = [t for t in tokens if t not in stopwords]
    return ' '.join(tokens)

4. Apply preprocessing

In [8]:
# Clean every raw message element-wise and store the result in a new column.
df['clean_text'] = df['text'].map(preprocess_email)

5. Prepare features and labels


In [9]:
# Features are the cleaned text; targets are the numeric labels
# (0 = ham, 1 = spam).
X, y = df['clean_text'], df['label_num']

6. Split train/test (80% train, 20% test)


In [10]:
# Hold out 20% of the messages for testing. Stratifying on y keeps the
# ham/spam ratio the same in both parts; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

7. TF-IDF vectorization

In [11]:
# Vocabulary filter: drop terms seen in fewer than 5 documents (min_df) or in
# more than 90% of documents (max_df).
vectorizer = TfidfVectorizer(
    min_df=5,
    max_df=0.9,
)
# Learn vocabulary and IDF weights from the training messages only ...
X_train_vec = vectorizer.fit_transform(X_train)
# ... and reuse them unchanged for the held-out messages.
X_test_vec = vectorizer.transform(X_test)

8. Train SVM with RBF kernel


In [12]:
# RBF-kernel support vector classifier. probability=True is required for the
# later predict_proba call.
model = SVC(
    C=1.0,
    kernel='rbf',
    gamma='scale',
    probability=True,
    random_state=42,
)
model.fit(X_train_vec, y_train)

9. Evaluate on test set


In [13]:
# Score the held-out split: overall accuracy first, then per-class metrics.
y_pred = model.predict(X_test_vec)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_accuracy)
class_report = classification_report(y_test, y_pred, target_names=['ham', 'spam'])
print(class_report)

Test Accuracy: 0.9893719806763285
              precision    recall  f1-score   support

         ham       1.00      0.99      0.99       735
        spam       0.97      0.99      0.98       300

    accuracy                           0.99      1035
   macro avg       0.98      0.99      0.99      1035
weighted avg       0.99      0.99      0.99      1035



10. Test

In [None]:
# 10. test model
def predict_spam(text_input):
    """Classify one raw message with the fitted vectorizer and SVM.

    Parameters
    ----------
    text_input : str
        Raw message text; it is cleaned with ``preprocess_email`` first.

    Returns
    -------
    tuple
        ``(label, prob)`` where ``label`` is ``'spam'`` when the predicted
        class is 1 and ``'ham'`` otherwise, and ``prob`` is the
        ``predict_proba`` estimate for that predicted class.
    """
    clean = preprocess_email(text_input)
    vec = vectorizer.transform([clean])

    # predict
    pred = model.predict(vec)[0]
    # predict_proba columns are ordered like model.classes_, so find the
    # column by class value rather than assuming the label equals its index.
    class_column = list(model.classes_).index(pred)
    # NOTE: SVC probabilities come from a separate calibration step and can
    # occasionally disagree with predict(), so prob may fall below 0.5.
    prob = model.predict_proba(vec)[0][class_column]
    label = 'spam' if pred == 1 else 'ham'
    return label, prob

# use model
test_text = "Congratulations! You've won a free ticket. Click here to claim."
label, prob = predict_spam(test_text)
print(f"text: {test_text}")
# prob is the estimated class probability for this single message, not an
# accuracy, so the printed label says so.
print(f"predict: {label} (probability: {prob:.2f})")

text: Congratulations! You've won a free ticket. Click here to claim.
predict: spam (accuracy: 0.99)


> **注意:**  
> このコードではテストセットで約99%の精度が得られていますが、使用しているデータセットは限られたサンプル数のため、実際の汎化性能が高いとは限りません。  
> あくまでも動作確認用の簡易シミュレーションである点にご留意ください。