In [1]:
import numpy as np
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request

In [2]:
import urllib.request
import tarfile
from pathlib import Path

def load_dataset(name_url):
    root_url = "https://spamassassin.apache.org/old/publiccorpus/"
    url = root_url + name_url
    tarball_path = Path(f"email/{name_url.split('.')[0]}")  # Directory to extract files
    # Create root email directory if it doesn't exist
    Path("email").mkdir(parents=True, exist_ok=True)
    # Download the tar file if it doesn't exist
    if not tarball_path.is_file():
        urllib.request.urlretrieve(url, tarball_path)
        with tarfile.open(tarball_path) as data_tarball:
            data_tarball.extractall(path="email")
    # Return the path to the extracted directory containing emails

# Datasets
easy_ham = ["20030228_easy_ham.tar.bz2", "20030228_easy_ham_2.tar.bz2"]
hard_ham = ["20030228_hard_ham.tar.bz2"]
spam = ["20030228_spam.tar.bz2", "20050311_spam_2.tar.bz2"]

for name_url in easy_ham+hard_ham+spam:
    load_dataset(name_url)

easy_ham_path = [f for f in sorted(Path("email/easy_ham").iterdir()) if len(f.name) > 20] + [f for f in sorted(Path("email/easy_ham_2").iterdir()) if len(f.name) > 20]
hard_ham_path = [f for f in sorted(Path("email/hard_ham").iterdir()) if len(f.name) > 20] 
spam_path = [f for f in sorted(Path("email/spam").iterdir()) if len(f.name) > 20] + [f for f in sorted(Path("email/spam_2").iterdir()) if len(f.name) > 20]


In [3]:
import email
from email import policy

def load_email(file_path):
    with open(file_path, "rb") as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

easy_ham_emails = [load_email(f) for f in easy_ham_path]
hard_ham_emails = [load_email(f) for f in hard_ham_path]
spam_emails = [load_email(f) for f in spam_path]

In [4]:
print(len(easy_ham_emails))
print(len(hard_ham_emails))
print(len(spam_emails))

3900
250
1896


In [5]:
easy_ham_emails[1].get_content().strip()

"Martin A posted:\nTassos Papadopoulos, the Greek sculptor behind the plan, judged that the\n limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the\n Mount Athos monastic community, was ideal for the patriotic sculpture. \n \n As well as Alexander's granite features, 240 ft high and 170 ft wide, a\n museum, a restored amphitheatre and car park for admiring crowds are\nplanned\n---------------------\nSo is this mountain limestone or granite?\nIf it's limestone, it'll weather pretty fast.\n\n------------------------ Yahoo! Groups Sponsor ---------------------~-->\n4 DVDs Free +s&p Join Now\nhttp://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM\n---------------------------------------------------------------------~->\n\nTo unsubscribe from this group, send an email to:\nforteana-unsubscribe@egroups.com\n\n \n\nYour use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/"

In [6]:
def get_email_structure(email):
    if isinstance(email, str):
        return email
    payload = email.get_payload()
    if isinstance(payload, list):
        multipart = ", ".join([get_email_structure(sub_email) for sub_email in payload])
        return f"multipart({multipart})"
    return email.get_content_type()

In [7]:
from collections import Counter

def structures_counter(emails, len):
    structures = Counter()
    for email in emails:
        structure = get_email_structure(email)
        structures[structure] += 100/len
    return structures

structures_counter(easy_ham_emails, 3900)

Counter({'text/plain': 96.17948717948681,
         'multipart(text/plain, application/pgp-signature)': 2.5897435897435845,
         'multipart(text/plain, text/html)': 0.512820512820513,
         'multipart(text/plain, text/plain)': 0.10256410256410256,
         'multipart(text/plain)': 0.07692307692307693,
         'multipart(text/plain, application/ms-tnef, text/plain)': 0.05128205128205128,
         'multipart(text/plain, application/octet-stream)': 0.05128205128205128,
         'multipart(text/plain, multipart(text/plain))': 0.05128205128205128,
         'multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)': 0.05128205128205128,
         'text/html': 0.05128205128205128,
         'multipart(text/plain, text/enriched)': 0.02564102564102564,
         'multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)': 0.02564102564102564,
         'multipart(text/plain, video/mng)': 0.02564102564102564,
         'multipart(text/plain, appli

In [8]:
structures_counter(hard_ham_emails, 250)

Counter({'text/html': 47.199999999999896,
         'text/plain': 32.39999999999995,
         'multipart(text/plain, text/html)': 17.200000000000003,
         'multipart(text/html)': 0.8,
         'multipart(text/plain, image/bmp)': 0.4,
         'multipart(multipart(text/plain, text/html))': 0.4,
         'multipart(text/plain, application/x-pkcs7-signature)': 0.4,
         'multipart(text/plain, image/png, image/png)': 0.4,
         'multipart(multipart(text/plain, text/html), image/gif, image/gif, image/gif, image/gif, image/gif, image/gif, image/gif, image/gif, image/gif, image/gif, image/gif, image/jpeg, image/gif, image/gif, image/gif, image/gif, image/gif, image/gif)': 0.4,
         'multipart(text/plain, text/plain)': 0.4})

In [9]:
structures_counter(spam_emails, 1896)

Counter({'text/plain': 42.985232067510005,
         'text/html': 40.71729957805858,
         'multipart(text/plain, text/html)': 8.386075949367074,
         'multipart(text/html)': 2.5843881856540087,
         'multipart(text/plain)': 2.3206751054852317,
         'multipart(multipart(text/html))': 1.2130801687763713,
         'multipart(multipart(text/plain, text/html))': 0.26371308016877637,
         'multipart(text/plain, application/octet-stream)': 0.15822784810126583,
         'multipart(text/html, text/plain)': 0.15822784810126583,
         'multipart(text/plain, image/jpeg)': 0.15822784810126583,
         'multipart(text/plain, application/octet-stream, text/plain)': 0.15822784810126583,
         'multipart(text/html, application/octet-stream)': 0.10548523206751055,
         'multipart/alternative': 0.10548523206751055,
         'multipart(text/html, image/jpeg)': 0.10548523206751055,
         'multipart(multipart(text/plain), application/octet-stream)': 0.10548523206751055,
    

spam have more html ones 53% while easy ham have just 0.51% but hard ham has 66% html. so of a email is a multipart or has a html content then it more probability of being a SPAM but if a email is php sihnatured it is more likely a ham email

In [10]:
for header, value in spam_emails[100].items():
    print(header, ":", value)

Return-Path : <FreeSoftware-5265v80@yahoo.com>
Delivered-To : zzzz@localhost.spamassassin.taint.org
Received : from localhost (localhost [127.0.0.1])	by phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id BE46143F9B	for <zzzz@localhost>; Mon, 26 Aug 2002 16:37:20 -0400 (EDT)
Received : from mail.webnote.net [193.120.211.219]	by localhost with POP3 (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Mon, 26 Aug 2002 21:37:20 +0100 (IST)
Received : from yahoo.com ([211.185.47.189])	by webnote.net (8.9.3/8.9.3) with SMTP id VAA27898	for <zzzz@spamassassin.taint.org>; Mon, 26 Aug 2002 21:39:06 +0100
Reply-To : Free Publishing Software <FreeSoftware-5265v80@yahoo.com>
Message-ID : <004b12e28d1a$4347d2b7$3ce68ab0@sgcrua>
From : Free Publishing Software <FreeSoftware-5265v80@yahoo.com>
To : zzzz@spamassassin.taint.org
Subject : Take your Marketing to the Next Level
Date : Mon, 26 Aug 2002 19:24:06 +0100
MiME-Version : 1.0
X-Priority : 3 (Normal)
X-MSMail-Priority : Normal
X-Mailer : Mi

In [11]:
print(spam_emails[100]["From"])
print(spam_emails[100]["Subject"])

Free Publishing Software <FreeSoftware-5265v80@yahoo.com>
Take your Marketing to the Next Level


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import numpy as np

X = np.array(easy_ham_emails + spam_emails, dtype="object")
Y = np.array([0]*len(easy_ham_emails) + [1]*len(spam_emails))

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [13]:
from bs4 import BeautifulSoup

def html_to_text(html_content):
    soup = BeautifulSoup(html_content, "html.parser")
    text = soup.get_text(separator="\n")
    return text.strip()

In [14]:
def email_to_text(email):
    total_content = ""
    for part in email.walk():
        ctype = part.get_content_type()
        if not ctype in ("text/plain", "text/html"):
            continue
        else:
            try:
                content = part.get_content()
            except:
                content = str(part.get_payload())
            if ctype == "text/plain":
                total_content += content
            else:
                total_content += html_to_text(content)
    return total_content

In [15]:
print(email_to_text(X_train[195]))

Details



Want to refinance?

Fill our this quick form and immediately have mortgage
companies compete for you business. 

You will be offered the, absolute, BEST refinance rates 
availible!

Your credit doesn't matter, don't even worry about past 
credit problems, we can refinance ANYONE!

Let Us Put Our Expertise to Work for You!

http://210.51.251.244/al/cgi-bin/redir.cgi?goto=ID74210

Or Site 2
http://61.129.81.99/al/cgi-bin/redir.cgi?goto=ID74215
















Erase
http://210.51.251.244/al/uns/list.htm



In [16]:
import nltk

porter = nltk.PorterStemmer()
for word in ["fell", "fall", "fallen", "felt", "feel", "taken", "took", "taking"]:
    print(porter.stem(word))

fell
fall
fallen
felt
feel
taken
took
take


In [17]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\huzai\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
words = ["running", "ran", "runs", "run", "happiness", "was"]
lemmas = [lemmatizer.lemmatize(word, pos="v") for word in words]  # pos="v" for verbs

print(lemmas)  # Output: ['run', 'run', 'run', 'run', 'happiness', 'be']


['run', 'run', 'run', 'run', 'happiness', 'be']


In [19]:
from urlextract import URLExtract
def url_extractor(email):
    url_extract = URLExtract()
    urls = url_extract.find_urls(email)
    return urls

print(url_extractor(email_to_text(X_train[195])))

['http://210.51.251.244/al/cgi-bin/redir.cgi?goto=ID74210', 'http://61.129.81.99/al/cgi-bin/redir.cgi?goto=ID74215', 'http://210.51.251.244/al/uns/list.htm']


In [20]:
from sklearn.base import TransformerMixin, BaseEstimator
import re

class EmailtoWordCounterTransformer(TransformerMixin, BaseEstimator):
    def __init__(self, strip_headers=True, to_lowercase=True, replace_numbers=True,
                remove_punctuations=True, replace_urls=True, stemming=True):
        self.strip_headers=strip_headers
        self.to_lowercase=to_lowercase
        self.replace_numbers=replace_numbers
        self.replace_urls = replace_urls
        self.remove_punctuations = remove_punctuations
        self.stemming=stemming

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_transformed = []
        for email in X:
            text=email_to_text(email) or ""
            if self.to_lowercase:
                text=text.lower()
            if self.replace_urls:
                if url_extractor is None:
                    raise ValueError("URL extractor is not initialized!")
                urls = url_extractor(text)
                urls.sort(key=lambda url: len(url), reverse=True)
                for url in urls:
                    text = text.replace(url, " URL ")
            if self.replace_numbers:
                text=re.sub("\d+(?:\.\d*)?(?:[eE][+-]?\d+(?:\.\d*))?", "NUMBER", text)
            if self.remove_punctuations:
                text=re.sub("\W+", ' ', text)
            word_counts = Counter(text.split())
            if self.stemming:
                if porter is None:
                    raise ValueError("Porter Stemmer is not initialized!")
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word_counts[porter.stem(word)] += count
                word_counts = stemmed_word_counts
            X_transformed.append(word_counts)
        return np.array(X_transformed)        

In [21]:
from pprint import pprint
pprint(EmailtoWordCounterTransformer().fit_transform(X_train[:5]))

array([Counter({'i': 9, 'of': 9, 'the': 9, 'your': 8, 'for': 7, 'thi': 6, 'in': 5, 'a': 5, 'to': 5, 'you': 5, 'money': 5, 'my': 4, 'with': 4, 'on': 4, 'contact': 3, 'assist': 3, 'invest': 3, 'mr': 3, 'jallow': 3, 'and': 3, 'mail': 3, 'url': 3, 'search': 2, 'which': 2, 'transfer': 2, 'm': 2, 'ibrahim': 2, 'late': 2, 'unit': 2, 'u': 2, 'wa': 2, 'number': 2, 's': 2, 'realis': 2, 'can': 2, 'through': 2, 'e': 2, 'hello': 1, 'dear': 1, 'sir': 1, 'got': 1, 'caus': 1, 'serious': 1, 'reliabl': 1, 'foreign': 1, 'partner': 1, 'realli': 1, 'made': 1, 'me': 1, 'purpos': 1, 'son': 1, 'sierraleonian': 1, 'busi': 1, 'man': 1, 'kulu': 1, 'who': 1, 'die': 1, 'two': 1, 'yaer': 1, 'ago': 1, 'when': 1, 'revolutionari': 1, 'front': 1, 'rebel': 1, 'r': 1, 'f': 1, 'attack': 1, 'our': 1, 'resid': 1, 'makeni': 1, 'sierra': 1, 'leon': 1, 'follow': 1, 'ceas': 1, 'fire': 1, 'agreement': 1, 'reach': 1, 'last': 1, 'year': 1, 'help': 1, 'nation': 1, 'peac': 1, 'keep': 1, 'troop': 1, 'use': 1, 'oppoturn': 1, 'leav': 1

In [22]:
from scipy.sparse import csr_matrix

class WordCountertoVectorsTransformer(TransformerMixin, BaseEstimator):
    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size

    def fit(self, X, y=None):
        total_words = Counter()
        for words_counter in X:
            for word, count in words_counter.items():
                total_words[word] += min(count, 10)
        most_common = total_words.most_common()[:self.vocabulary_size]
        self.vocabulary_ = {word:index+1 for index, (word, count) in enumerate(most_common)}
        return self
    
    def transform(self, X, y=None):
        vector = []
        col_index = []
        row_index = []
        for row, email_word_counter in enumerate(X):
            for word, count in email_word_counter.items():
                row_index.append(row)
                col_index.append(self.vocabulary_.get(word, 0))
                vector.append(count)
        return csr_matrix((vector, (row_index, col_index)), shape=(len(X), self.vocabulary_size+1))

In [23]:
TovectorTransformer = WordCountertoVectorsTransformer(vocabulary_size=10)
sample_vector = TovectorTransformer.fit_transform(
    EmailtoWordCounterTransformer().fit_transform(X_train[:5]))
sample_vector

<5x11 sparse matrix of type '<class 'numpy.int32'>'
	with 47 stored elements in Compressed Sparse Row format>

In [24]:
sample_vector.toarray()

array([[217,   9,   9,   5,   2,   9,   7,   3,   5,   5,   8],
       [236,  10,  14,   3,  46,   1,   3,   3,   5,   2,   1],
       [ 98,   6,   4,   6,   0,   2,   0,   3,   1,   1,   0],
       [101,   3,   2,   2,  14,   1,   0,   3,   0,   1,   0],
       [145,   2,   0,   7,   1,   5,   4,   1,   0,   2,   1]])

In [25]:
TovectorTransformer.vocabulary_

{'the': 1,
 'i': 2,
 'to': 3,
 'number': 4,
 'of': 5,
 'for': 6,
 'and': 7,
 'in': 8,
 'a': 9,
 'your': 10}

In [26]:
from sklearn.pipeline import Pipeline

preprocessing = Pipeline([
    ("Words_to_Counter", EmailtoWordCounterTransformer()),
    ("WordCounter_To_Vector", WordCountertoVectorsTransformer())
])
X_train_processed = preprocessing.fit_transform(X_train, Y_train)

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_processed)

In [27]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

log_clf = LogisticRegression(max_iter=1000, random_state=42)
score = cross_val_score(log_clf, X_train_processed, Y_train, cv=10)
score.mean()

0.9866281559544202

In [29]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(random_state=42)
score = cross_val_score(rnd_clf, X_train_tfidf, Y_train, cv=10)
score.mean()

0.9859778803902586

In [30]:
from sklearn.svm import SVC

svm_clf = SVC(random_state=42)
score = cross_val_score(svm_clf, X_train_tfidf, Y_train, cv=10)
score.mean()

0.9889974491695837

In [31]:
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier()
score = cross_val_score(knn_clf, X_train_tfidf, Y_train, cv=10)
score.mean()


0.9167409883071425

In [32]:
import xgboost as xgb

xgb_clf = xgb.XGBClassifier()
score = cross_val_score(xgb_clf, X_train_tfidf, Y_train, cv=10)
score.mean()


0.9857614321888732

In [33]:
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import confusion_matrix

X_test_processed = preprocessing.transform(X_test)
X_test_tfidf = tfidf_transformer.fit_transform(X_test_processed)
log_clf.fit(X_train_processed, Y_train)
Y_pred = log_clf.predict(X_test_processed)
print(f"Precision {precision_score(Y_test, Y_pred):.2%}")
print(f"Recall {recall_score(Y_test, Y_pred):.2%}")
conf = confusion_matrix(Y_test, Y_pred)
conf

Precision 97.38%
Recall 93.82%


array([[795,   9],
       [ 22, 334]], dtype=int64)

In [None]:
hard_ham_predict = log_clf.predict(preprocessing.transform(np.array(hard_ham_emails, dtype="object")))
Y_orig = np.array([0]*len(hard_ham_emails))
conf1 = confusion_matrix(Y_orig, hard_ham_predict)
conf1

array([[150, 100],
       [  0,   0]], dtype=int64)

In [35]:
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import confusion_matrix

X_test_processed = preprocessing.transform(X_test)
X_test_tfidf = tfidf_transformer.fit_transform(X_test_processed)
xgb_clf.fit(X_train_tfidf, Y_train)
Y_pred = xgb_clf.predict(X_test_tfidf)
print(f"Precision {precision_score(Y_test, Y_pred):.2%}")
print(f"Recall {recall_score(Y_test, Y_pred):.2%}")
conf = confusion_matrix(Y_test, Y_pred)
conf

Precision 98.87%
Recall 98.60%


array([[800,   4],
       [  5, 351]], dtype=int64)

In [None]:
hard_ham_predict = xgb_clf.predict(tfidf_transformer.fit_transform(preprocessing.transform(np.array(hard_ham_emails, dtype="object"))))
Y_orig = np.array([0]*len(hard_ham_emails))
conf1 = confusion_matrix(Y_orig, hard_ham_predict)
conf1

array([[ 86, 164],
       [  0,   0]], dtype=int64)

In [None]:
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import confusion_matrix

X_test_processed = preprocessing.transform(X_test)
X_test_tfidf = tfidf_transformer.fit_transform(X_test_processed)
rnd_clf.fit(X_train_tfidf, Y_train)
Y_pred = rnd_clf.predict(X_test_tfidf)
print(f"Precision {precision_score(Y_test, Y_pred):.2%}")
print(f"Recall {recall_score(Y_test, Y_pred):.2%}")
conf = confusion_matrix(Y_test, Y_pred)
conf

Precision 99.43%
Recall 98.03%


array([[802,   2],
       [  7, 349]], dtype=int64)

In [None]:
hard_ham_predict = rnd_clf.predict(tfidf_transformer.fit_transform(preprocessing.transform(np.array(hard_ham_emails, dtype="object"))))
Y_orig = np.array([0]*len(hard_ham_emails))
conf1 = confusion_matrix(Y_orig, hard_ham_predict)
conf1

array([[134, 116],
       [  0,   0]], dtype=int64)

In [None]:
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import confusion_matrix

X_test_processed = preprocessing.transform(X_test)
X_test_tfidf = tfidf_transformer.fit_transform(X_test_processed)
svm_clf.fit(X_train_tfidf, Y_train)
Y_pred = svm_clf.predict(X_test_tfidf)
print(f"Precision {precision_score(Y_test, Y_pred):.2%}")
print(f"Recall {recall_score(Y_test, Y_pred):.2%}")
conf = confusion_matrix(Y_test, Y_pred)
conf

Precision 98.60%
Recall 98.88%


array([[799,   5],
       [  4, 352]], dtype=int64)

In [None]:
hard_ham_predict = svm_clf.predict(tfidf_transformer.fit_transform(preprocessing.transform(np.array(hard_ham_emails, dtype="object"))))
Y_orig = np.array([0]*len(hard_ham_emails))
conf1 = confusion_matrix(Y_orig, hard_ham_predict)
conf1

array([[150, 100],
       [  0,   0]], dtype=int64)

After this only three models were selected logistic regression, random forest, and SVC