In [1]:
!pip install chardet beautifulsoup4 scikit-learn



In [2]:
from pathlib import Path
from chardet import detect

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from collections import Counter
from sklearn import (
    feature_extraction,
    tree,
    linear_model,
    naive_bayes,
    metrics,
    svm,
)
from sklearn.feature_extraction import text
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score

In [37]:
import random

random.seed(42)

# Data

In [3]:
!wget -qO- https://spamassassin.apache.org/old/publiccorpus/20030228_spam.tar.bz2 | tar -xjf -
!wget -qO- https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2 | tar -xjf -

!cat spam/00001.7848dde101aa985090474a91ec93fcf0 | tail
!cat easy_ham/00001.7c53336b37003a9286aba55d2945844c | tail

REMOVE. If you 
      reside in any state which prohibits e-mail solicitations for insuran=
ce, 
      please disregard this 
      email.<BR></FONT><BR><BR><BR><BR><BR><BR><BR><BR><BR><BR><BR><BR><BR=
><BR><BR><BR></FONT></P></CENTER></CENTER></TR></TBODY></TABLE></CENTER></=
CENTER></CENTER></CENTER></CENTER></BODY></HTML>



ps: this is still using the version of the code form a day ago, I haven't
been able to reach the cvs repository today (local routing issue I think).



_______________________________________________
Exmh-workers mailing list
Exmh-workers@redhat.com
https://listman.redhat.com/mailman/listinfo/exmh-workers



In [4]:
def file_bodies(file_path):
    result = []
    path_list = Path(file_path).glob('**/00*')
    for path in path_list:
        with open(str(path), 'rb') as enc_file:
            char_det = detect(enc_file.read())['encoding']
            enc_file.close()
            with open(str(path), 'r', encoding=char_det) as spam_file:
                file_body = None
                try:
                    file_body = spam_file.read()
                    result.append(file_body)
                except UnicodeDecodeError as err:
                    pass
                spam_file.close()
    return result

In [30]:
import re
from bs4 import BeautifulSoup

def file_bodies_cleaned(file_bodies: list):
    result = []
    for file_body in file_bodies:
        file_body_wo_header = re.sub(r'.*\:\s.*|\w+\s.*@', ' ', file_body)
        soup = BeautifulSoup(file_body_wo_header)
        result.append(soup.get_text())
    return result

# EDA

In [31]:
spam_file_bodies_cleaned = file_bodies_cleaned(file_bodies('./spam/'))
ham_file_bodies_cleaned = file_bodies_cleaned(file_bodies('./easy_ham/'))

In [32]:
rows = len(spam_file_bodies_cleaned) + len(ham_file_bodies_cleaned)
cols = 0

spam_email_df = pd.DataFrame()
spam_email_df['text'] = spam_file_bodies_cleaned
spam_email_df['label'] = 'spam'

ham_email_df = pd.DataFrame()
ham_email_df['text'] = ham_file_bodies_cleaned
ham_email_df['label'] = 'ham'

df = pd.DataFrame()
df = df.append(spam_email_df)
df = df.append(ham_email_df)

  df = df.append(spam_email_df)
  df = df.append(ham_email_df)


In [33]:
df.tail()


Unnamed: 0,text,label
994,redhat.com Tue Sep 10 11:22:40 spamassassin....,ham
995,xent.com Mon Sep 9 19:28:13 xent.com>\n \n ...,ham
996,xent.com Mon Sep 23 22:47:41 xent.com>\n \n ...,ham
997,svanstrom.com Wed Aug 28 11:02:33 svanstrom....,ham
998,xent.com Thu Sep 26 11:04:57 xent.com>\n \n ...,ham


In [34]:
df.describe()

Unnamed: 0,text,label
count,1493,1493
unique,1493,2
top,insiq.us Thu Aug 29 11:09:03 insiq.us>\n \n ...,ham
freq,1,999


In [35]:
df['length'] = df['text'].apply(len)
df.head(5)

Unnamed: 0,text,label,length
0,insiq.us Thu Aug 29 11:09:03 insiq.us>\n \n ...,spam,3498
1,bounce.tilw.net Thu Sep 19 11:14:52 bounce.t...,spam,3002
2,Flashmail.com Mon Aug 26 15:49:58 Flashmail....,spam,743
3,gmx.net Fri Sep 6 15:40:52 gmx.net>\n \n \n...,spam,2777
4,GetResponse.com Mon Aug 26 15:14:07 GetRespo...,spam,3584


# Train / Test prep

In [11]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], random_state=100)
X_train.shape

(1119,)

In [12]:
count_vect = text.CountVectorizer(stop_words="english")
count_vect.fit(X_train)

X_train_count_vect_trans = count_vect.transform(X_train)
X_test_count_vect_trans = count_vect.transform(X_test)

In [13]:
tfidf_vect = text.TfidfVectorizer(stop_words="english")
tfidf_vect.fit(X_train)

X_train_tfidf_vect_trans = tfidf_vect.transform(X_train)
X_test_tfidf_vect_trans = tfidf_vect.transform(X_test)

# Train / Inference / Validation

In [14]:
def plot_learning_curves(model, X_train, X_val, y_train, y_val):
    train_errors, val_errors = [], []
    for m in range(20, len(X_train), 10):
        model.fit(X_train[:m], y_train[:m])
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)
        train_errors.append(f1_score(y_train[:m], y_train_predict) * 100)
        val_errors.append(f1_score(y_val, y_val_predict) * 100)

    plt.plot(train_errors, "r-+", linewidth=2, label="train")
    plt.plot(val_errors, "b-", linewidth=3, label="val")
    plt.legend(loc="upper right", fontsize=14)
    plt.xlabel("", fontsize=14)
    plt.ylabel("F1", fontsize=14)

Logistic Regression + Count Vectorizer

> Indented block



In [16]:
log_clf = linear_model.LogisticRegression(penalty='l2', max_iter=1000, fit_intercept=True)

log_scores = cross_val_score(log_clf, X_train_count_vect_trans, y_train, cv=10)
log_scores.mean()

0.9857142857142858

Logistic Regression + Tfidf Vectorizer

In [17]:
log_clf = linear_model.LogisticRegression(penalty='l2', max_iter=1000, fit_intercept=True)

log_scores = cross_val_score(log_clf, X_train_tfidf_vect_trans, y_train, cv=10)
log_scores.mean()

0.9562258687258689

DecisionTree + Count Vectorizer

In [18]:
dt_clf = tree.DecisionTreeClassifier()
dt_scores = cross_val_score(dt_clf, X_train_count_vect_trans, y_train, cv=10)
dt_scores.mean()

0.9785714285714286

DecisionTree + Tfidf Vectorizer

In [19]:
dt_clf = tree.DecisionTreeClassifier()
dt_scores = cross_val_score(dt_clf, X_train_tfidf_vect_trans, y_train, cv=10)
dt_scores.mean()

0.9919642857142857

Bnb + Count Vectorizer

In [27]:
bnb = naive_bayes.BernoulliNB()
bnb.fit(X_train_count_vect_trans, y_train)

bnb_scores = cross_val_score(bnb, X_train_count_vect_trans, y_train, cv=10)
bnb_scores.mean()

0.7667149292149292

Bnb + Tfidf Vectorizer

In [26]:
bnb = naive_bayes.BernoulliNB()
bnb.fit(X_train_tfidf_vect_trans, y_train)

bnb_scores = cross_val_score(bnb, X_train_tfidf_vect_trans, y_train, cv=10)
bnb_scores.mean()

0.7667149292149292