In [1]:
import os
from email import parser
from email import policy
from collections import Counter
import re
from html import unescape
import nltk
from nltk.corpus import stopwords
import pandas
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn import svm
from sklearn import ensemble
from sklearn import naive_bayes
from sklearn import neighbors
import urlextract
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\max\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\max\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
INDEX_PATH = os.path.join('trec07p', 'delay', 'index')
DATA_PATH = os.path.join('trec07p', 'data')
labels = []
filenames = []


def create_dataset(index_path):
    with open(index_path) as f:
        while True:
            line = f.readline()
            if not line:
                break
            line = line.split(' ')
            labels.append(line[0])
            filenames.append(line[1].strip('\n').split('/')[-1])


create_dataset(INDEX_PATH)

In [3]:
def load_email(filename, file_path):
    with open(os.path.join(file_path, filename), 'rb') as f:
        return parser.BytesParser(policy=policy.default).parse(f)


raw_emails = [load_email(name, DATA_PATH) for name in filenames]

print(raw_emails[1].get_content().strip())

Hi, i've just updated from the gulus and I check on other mirrors.
It seems there is a little typo in /debian/README file

Example:
http://gulus.usherbrooke.ca/debian/README
ftp://ftp.fr.debian.org/debian/README

"Testing, or lenny.  Access this release through dists/testing.  The
current tested development snapshot is named etch.  Packages which
have been tested in unstable and passed automated tests propogate to
this release."

etch should be replace by lenny like in the README.html



-- 
Yan Morin
Consultant en logiciel libre
yan.morin@savoirfairelinux.com
514-994-1556


-- 
To UNSUBSCRIBE, email to debian-mirrors-REQUEST@lists.debian.org
with a subject of "unsubscribe". Trouble? Contact listmaster@lists.debian.org


In [4]:
def get_email_structure(email):
    if isinstance(email, str):
        return email
    payload = email.get_payload()
    if isinstance(payload, list):
        return 'multipart({})'.format(', '.join([get_email_structure(sub_email) for sub_email in payload]))
    else:
        return email.get_content_type()


def structures_counter(emails):
    structures = Counter()
    for email in emails:
        structure = get_email_structure(email)
        structures[structure] += 1
    return structures

In [5]:
def html_to_plain_text(html):
    text = re.sub('<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)
    text = re.sub(r'<[aA]\s.*?>', 'HYPERLINK', text, flags=re.M | re.S | re.I)
    text = re.sub(r'<img\s.*?>', 'IMAGE', text, flags=re.M | re.S | re.I)
    text = re.sub('<.*?>', '', text, flags=re.M | re.S)
    text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)
    return unescape(text)

In [6]:
def email_to_text(email):
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if ctype not in ('text/plain', 'text/html'):
            continue
        try:
            content = part.get_content()
        except LookupError:
            content = str(part.get_payload())
        if ctype == 'text/plain':
            return content
        else:
            html = content
    if html:
        return html_to_plain_text(html)
print(structures_counter(raw_emails).most_common())

[('text/html', 3612), ('text/plain', 2414), ('multipart(text/plain, text/html)', 2080), ('multipart(multipart(text/plain, text/html), image/gif)', 860), ('multipart(multipart(text/plain, text/html), image/jpeg)', 371), ('multipart(text/html)', 205), ('multipart(text/html, image/gif)', 128), ('multipart(text/plain)', 87), ('multipart(text/plain, application/pgp-signature)', 47), ('multipart(text/plain, text/plain)', 24), ('multipart(multipart(text/plain, text/html), image/png)', 21), ('multipart(text/plain, application/x-msdownload)', 15), ('multipart(multipart(text/plain, text/html), image/gif, image/gif, image/jpeg, application/octet-stream)', 13), ('multipart(multipart(text/plain, text/html))', 13), ('multipart(multipart(text/plain, application/pgp-signature), text/plain)', 11), ('multipart(text/html, text/plain, image/png)', 11), ('multipart(text/plain charset=us-ascii, text/html)', 9), ('multipart(text/plain, text/x-diff)', 7), ('multipart(multipart(text/html, image/gif), applicati

In [7]:
stopwords_list = stopwords.words('english')
token = nltk.stem.SnowballStemmer('english')
for single in range(97, 123):
    stopwords_list.append(chr(single))
extractor = urlextract.URLExtract()


def word_split(email):
    text = email_to_text(email) or ' '
    text = text.lower()
    text = re.sub(r'\W+', ' ', text, flags=re.M)
    urls = list(set(extractor.find_urls(text)))
    urls.sort(key=lambda item: len(item), reverse=True)
    for url in urls:
        text = text.replace(url, "URL")
    text = re.sub(r'\d+(?:\.\d*[eE]\d+)?', 'NUMBER', text)
    content = list(nltk.word_tokenize(text))
    all_words = []
    for word in content:
        if word not in stopwords_list:
            word = token.stem(word)
            all_words.append(word)
    return all_words
all_emails = [word_split(data) for data in raw_emails]
print(all_emails[0])

['feel', 'pressur', 'perform', 'rise', 'occas', 'hyperlinktri', 'viagra', 'anxieti', 'thing', 'past', 'back', 'old', 'self']


In [8]:
trainDF = pandas.DataFrame()
trainDF['text'] = all_emails
trainDF['label'] = labels

# 将数据集分为测试集和训练集
train_data, test_data, train_label, test_label = train_test_split(trainDF['text'], trainDF['label'], random_state=0)

# label编码为目标变量,即从字符串转为一个数字
encoder = preprocessing.LabelEncoder()
train_label = encoder.fit_transform(train_label)
test_label = encoder.fit_transform(test_label)


In [9]:
# 将分词后的列表重新拼接成字符串
trainDF['text'] = trainDF['text'].apply(lambda x: ' '.join(x))

# 确保 train_data 和 test_data 也是字符串形式
train_data = [' '.join(doc) for doc in train_data]
test_data = [' '.join(doc) for doc in test_data]

# 然后再进行向量化
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
count_vect.fit(trainDF['text'])
xtrain_count = count_vect.transform(train_data)  # 训练集特征向量
xtest_count = count_vect.transform(test_data)    # 测试集特征向量

# 4.2 TF-IDF特征向量
# 4.2.1 词语级
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)

# 4.2.2 多词语级
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', 
							ngram_range=(2, 3), max_features=5000)

# 4.2.3 词性级
tfidf_vect_char = TfidfVectorizer(analyzer='char', ngram_range=(2, 3), max_features=5000)



In [10]:
# 创建分类器
def train_model(classifier, train_feature, test_feature):
    classifier.fit(train_feature, train_label)
    prediction = classifier.predict(test_feature)
    acc = metrics.accuracy_score(prediction, test_label)
    prec = metrics.precision_score(prediction, test_label)
    rec = metrics.recall_score(prediction, test_label)
    f1 = metrics.f1_score(prediction, test_label)
    return acc, prec, rec, f1


In [11]:
# 5.1 朴素贝叶斯多项式模型
accuracy, precision, recall, f1_score = train_model(naive_bayes.MultinomialNB(),
                            xtrain_count,
                            xtest_count)
print("NB (Multinomial), Count Vectors: ")
print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1_score:.4f}")

# 5.2 朴素贝叶斯伯努利模型
accuracy, precision, recall, f1_score = train_model(naive_bayes.BernoulliNB(), xtrain_count, xtest_count)
print("NB (Bernoulli), Count Vectors: ")
print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1_score:.4f}")

# 5.3 SVM
accuracy, precision, recall, f1_score = train_model(svm.SVC(), xtrain_count, xtest_count)
print("SVM, Count Vectors: ")
print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1_score:.4f}")

# 5.4 随机森林
accuracy, precision, recall, f1_score = train_model(ensemble.RandomForestClassifier(), xtrain_count, xtest_count)
print("Random Forest, Count Vectors: ")
print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1_score:.4f}")

# 5.5 KNN
accuracy, precision, recall, f1_score = train_model(neighbors.KNeighborsClassifier(), xtrain_count, xtest_count)
print("KNN, Count Vectors: ")
print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1_score:.4f}")


NB (Multinomial), Count Vectors: 
Accuracy: 0.8460, Precision: 0.8104, Recall: 1.0000, F1 Score: 0.8953
NB (Bernoulli), Count Vectors: 
Accuracy: 0.8324, Precision: 0.7937, Recall: 1.0000, F1 Score: 0.8850
SVM, Count Vectors: 
Accuracy: 0.9748, Precision: 0.9916, Recall: 0.9777, F1 Score: 0.9846
Random Forest, Count Vectors: 
Accuracy: 0.9936, Precision: 0.9985, Recall: 0.9936, F1 Score: 0.9961
KNN, Count Vectors: 
Accuracy: 0.9652, Precision: 0.9921, Recall: 0.9660, F1 Score: 0.9789
