In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset, keeping only the raw label column (v1) and the message
# column (v2) under descriptive names. rename() returns a new frame, so the
# later column replacement never writes into a slice of the frame read from disk.
data = (
    pd.read_csv('spam.csv', encoding='latin-1')[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
)

# Label encoding: ham -> 0, spam -> 1.
# Series.map() yields NaN for any value missing from the mapping, so check
# explicitly and fail here with a clear message instead of letting NaN labels
# flow into the train/test split and the classifiers.
label_codes = data['label'].map({'ham': 0, 'spam': 1})
unmapped_labels = data.loc[label_codes.isna(), 'label'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unexpected label values in spam.csv: {list(unmapped_labels)!r}; "
        "expected only 'ham' or 'spam'"
    )
# With every label mapped, the codes are whole numbers; pin the dtype so it is
# int64 regardless of platform.
data = data.assign(label=label_codes.astype('int64'))

# Split the dataset: 20% of the rows are held out for testing, and the fixed
# random_state makes the shuffle repeatable.
# NOTE(review): the split is not stratified; the recorded test support is
# 965 ham vs 150 spam, so consider stratify=y -- confirm before changing,
# since it would alter the reported metrics.
X, y = data['text'], data['label']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Feature extraction.
# Each vectorizer is fitted on the training messages only and then applied
# unchanged to the test messages.

# Raw token counts
count_vectorizer = CountVectorizer()
X_train_count = count_vectorizer.fit_transform(X_train)
X_test_count = count_vectorizer.transform(X_test)

# TF-IDF weighted features
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Model training and prediction
# Seed for estimators that use randomness, matching the random_state already
# used for the train/test split so a re-run reproduces the same numbers.
MODEL_SEED = 42


def fit_and_predict(model, X_fit, y_fit, X_eval):
    """Fit ``model`` on the training features and predict the evaluation set.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
    X_fit : feature matrix used for fitting
    y_fit : target labels aligned with ``X_fit``
    X_eval : feature matrix to predict on

    Returns
    -------
    tuple
        ``(model, predictions)``: the same estimator instance, now fitted,
        and the output of ``model.predict(X_eval)``.
    """
    model.fit(X_fit, y_fit)
    return model, model.predict(X_eval)


# LinearSVC
# random_state is set explicitly: without it the estimator is created with
# random_state=None and its results are not guaranteed to repeat between runs.
svc_count, y_pred_svc_count = fit_and_predict(
    LinearSVC(random_state=MODEL_SEED), X_train_count, y_train, X_test_count
)
svc_tfidf, y_pred_svc_tfidf = fit_and_predict(
    LinearSVC(random_state=MODEL_SEED), X_train_tfidf, y_train, X_test_tfidf
)

# LogisticRegression (left at its default settings)
lr_count, y_pred_lr_count = fit_and_predict(
    LogisticRegression(), X_train_count, y_train, X_test_count
)
lr_tfidf, y_pred_lr_tfidf = fit_and_predict(
    LogisticRegression(), X_train_tfidf, y_train, X_test_tfidf
)

# Evaluation results: print a heading followed by the classification report
# for each model / feature-representation pair, in a fixed order.
for report_title, predictions in (
    ("LinearSVC with CountVectorizer", y_pred_svc_count),
    ("LinearSVC with TfidfVectorizer", y_pred_svc_tfidf),
    ("LogisticRegression with CountVectorizer", y_pred_lr_count),
    ("LogisticRegression with TfidfVectorizer", y_pred_lr_tfidf),
):
    print(report_title)
    print(classification_report(y_test, predictions))


LinearSVC with CountVectorizer
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.88      0.93       150

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115

LinearSVC with TfidfVectorizer
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.89      0.93       150

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115

LogisticRegression with CountVectorizer
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.84      0.91       150

    accuracy                           0.98      1115
   macro avg       0.98   