In [1]:
import os
import email_read_util

## Download 2007 TREC Public Spam Corpus
1. Read the "Agreement for use"
   https://plg.uwaterloo.ca/~gvcormac/treccorpus07/

2. Download the 255 MB corpus (trec07p.tgz) and untar it into the 'chapter1/datasets' directory

3. Check that the paths assigned to 'DATA_DIR' and 'LABELS_FILE' below exist

In [2]:
# Directory holding the individual e-mail files; each file name is joined
# onto this path when the messages are read.
DATA_DIR = 'datasets/trec07p/data/'
# Index file with one whitespace-separated "<label> <path>" pair per line.
LABELS_FILE = 'datasets/trec07p/full/index'
# Fraction of the samples handed to train_test_split as train_size.
TRAINING_SET_RATIO = 0.7

In [3]:
# Map each e-mail file name (last component of the indexed path) to its
# class: 1 when the label is "ham" (case-insensitive), 0 for anything else.
labels = {}
with open(LABELS_FILE) as index_file:
    for record in index_file:
        # Every record is expected to hold exactly two whitespace-separated
        # fields: the label and the path of the message.
        label, key = record.strip().split()
        file_name = key.split('/')[-1]
        labels[file_name] = int(label.lower() == 'ham')


In [4]:
def read_email_files():
    """Load the text and the label of every indexed e-mail.

    Files are visited in numeric order as 'inmail.1' .. 'inmail.N', where N
    is the number of entries in the module-level ``labels`` mapping.

    Returns:
        A ``(X, y)`` pair of equally long lists: ``X`` holds whatever
        ``email_read_util.extract_email_text`` returns for each file
        (presumably the extracted message text -- confirm against that
        helper), and ``y`` holds the matching values from ``labels``.

    Raises:
        KeyError: if an expected 'inmail.<n>' name is missing from ``labels``.
    """
    texts = []
    targets = []
    # File numbering is 1-based, hence the shifted range.
    for number in range(1, len(labels) + 1):
        name = 'inmail.' + str(number)
        path = os.path.join(DATA_DIR, name)
        texts.append(email_read_util.extract_email_text(path))
        targets.append(labels[name])
    return texts, targets

X, y = read_email_files()

In [5]:
from sklearn.model_selection import train_test_split

# Split the texts, the labels and the original sample positions in one call
# so that idx_train / idx_test map every split row back to its index in X.
# The fixed random_state keeps the partition identical across runs.
(X_train, X_test,
 y_train, y_test,
 idx_train, idx_test) = train_test_split(
    X, y, range(len(y)),
    train_size=TRAINING_SET_RATIO,
    random_state=2,
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# TF-IDF features over word n-grams. The vocabulary and IDF weights are
# learned from the training split only and then reused for the test split.
vectorizer = TfidfVectorizer(
    # Tokenisation
    analyzer='word',
    token_pattern=r'\w{1,}',    # keep single-character tokens as well
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 3),         # include everything up to 3-grams
    # Vocabulary pruning
    min_df=3,                   # lowered occurrence-frequency threshold
    max_df=0.8,                 # exclude words that are too common
    # Weighting
    use_idf=True,
)
X_train_vector = vectorizer.fit_transform(X_train)
X_test_vector = vectorizer.transform(X_test)

In [8]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Baseline: multinomial Naive Bayes trained on the TF-IDF training matrix,
# then used to label the held-out test matrix. fit() returns the estimator
# itself, so construction and training can be chained.
mnb = MultinomialNB().fit(X_train_vector, y_train)
y_pred = mnb.predict(X_test_vector)

# Report per-class metrics and overall accuracy. Labels are 0 for spam and
# 1 for ham, which is the order target_names is given in.
print(classification_report(y_test, y_pred, target_names=['Spam', 'Ham']))
print('Classification accuracy {:.3%}'.format(accuracy_score(y_test, y_pred)))

              precision    recall  f1-score   support

        Spam       0.98      0.95      0.97     15035
         Ham       0.91      0.97      0.94      7591

    accuracy                           0.96     22626
   macro avg       0.95      0.96      0.95     22626
weighted avg       0.96      0.96      0.96     22626

Classification accuracy 95.903%


In [10]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
# LinearSVC is part of the public sklearn.svm API; sklearn.calibration only
# exposes it as a side effect of its own internal import.
from sklearn.svm import LinearSVC

# Hard-voting (majority-vote) ensemble of four classifiers trained on the
# same TF-IDF features. MultinomialNB, classification_report and
# accuracy_score come from the imports of the Naive Bayes cell.
# NOTE(review): with an even number of voters a 2-2 tie is possible; per the
# scikit-learn docs ties are resolved by ascending class label -- confirm
# that this is the intended behaviour.
ensemble = VotingClassifier(
    estimators=[
        ('nb', MultinomialNB(alpha=1.0)),
        ('lr', LogisticRegression(C=10, class_weight='balanced')),
        ('svm', LinearSVC(C=1, class_weight='balanced')),
        # Seeded so the forest is identical from run to run; the value
        # matches the seed used for the train/test split.
        ('rf', RandomForestClassifier(n_estimators=100,
                                      class_weight='balanced',
                                      random_state=2))
    ],
    voting='hard'
)
ensemble.fit(X_train_vector, y_train)
y_pred = ensemble.predict(X_test_vector)

# Print results (labels: 0 = spam, 1 = ham, matching target_names order)
print(classification_report(y_test, y_pred, target_names=['Spam', 'Ham']))
print('Classification accuracy {:.3%}'.format(accuracy_score(y_test, y_pred)))

              precision    recall  f1-score   support

        Spam       0.99      0.99      0.99     15035
         Ham       0.99      0.97      0.98      7591

    accuracy                           0.99     22626
   macro avg       0.99      0.98      0.99     22626
weighted avg       0.99      0.99      0.99     22626

Classification accuracy 98.762%
