In [1]:
!pip install chardet beautifulsoup4 scikit-learn gensim



In [2]:
from pathlib import Path
from chardet import detect

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from collections import Counter
from sklearn import (
    feature_extraction,
    tree,
    linear_model,
    naive_bayes,
    metrics,
    svm,
)
from sklearn.feature_extraction import text
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score

from gensim.models import Word2Vec, FastText

In [3]:
import random

random.seed(42)

# Data

In [4]:
!wget -qO- https://spamassassin.apache.org/old/publiccorpus/20030228_spam.tar.bz2 | tar -xjf -
!wget -qO- https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2 | tar -xjf -

!cat spam/00001.7848dde101aa985090474a91ec93fcf0 | tail
!cat easy_ham/00001.7c53336b37003a9286aba55d2945844c | tail

REMOVE. If you 
      reside in any state which prohibits e-mail solicitations for insuran=
ce, 
      please disregard this 
      email.<BR></FONT><BR><BR><BR><BR><BR><BR><BR><BR><BR><BR><BR><BR><BR=
><BR><BR><BR></FONT></P></CENTER></CENTER></TR></TBODY></TABLE></CENTER></=
CENTER></CENTER></CENTER></CENTER></BODY></HTML>



ps: this is still using the version of the code form a day ago, I haven't
been able to reach the cvs repository today (local routing issue I think).



_______________________________________________
Exmh-workers mailing list
Exmh-workers@redhat.com
https://listman.redhat.com/mailman/listinfo/exmh-workers



In [5]:
def file_bodies(file_path):
    result = []
    path_list = Path(file_path).glob('**/00*')
    for path in path_list:
        with open(str(path), 'rb') as enc_file:
            char_det = detect(enc_file.read())['encoding']
            enc_file.close()
            with open(str(path), 'r', encoding=char_det) as spam_file:
                file_body = None
                try:
                    file_body = spam_file.read()
                    result.append(file_body)
                except UnicodeDecodeError as err:
                    pass
                spam_file.close()
    return result

In [6]:
import re
from bs4 import BeautifulSoup

def file_bodies_cleaned(file_bodies: list):
    result = []
    for file_body in file_bodies:
        file_body_wo_header = re.sub(r'.*\:\s.*|\w+\s.*@', ' ', file_body)
        soup = BeautifulSoup(file_body_wo_header)
        result.append(soup.get_text())
    return result

# EDA

In [7]:
spam_file_bodies_cleaned = file_bodies_cleaned(file_bodies('./spam/'))
ham_file_bodies_cleaned = file_bodies_cleaned(file_bodies('./easy_ham/'))

In [8]:
rows = len(spam_file_bodies_cleaned) + len(ham_file_bodies_cleaned)
cols = 0

spam_email_df = pd.DataFrame()
spam_email_df['text'] = spam_file_bodies_cleaned
spam_email_df['label'] = 'spam'

ham_email_df = pd.DataFrame()
ham_email_df['text'] = ham_file_bodies_cleaned
ham_email_df['label'] = 'ham'

df = pd.DataFrame()
df = df.append(spam_email_df)
df = df.append(ham_email_df)

  df = df.append(spam_email_df)
  df = df.append(ham_email_df)


In [9]:
df.tail()


Unnamed: 0,text,label
994,xent.com Wed Oct 2 18:18:54 xent.com>\n \n ...,ham
995,xent.com Tue Sep 3 14:24:08 xent.com>\n \n ...,ham
996,linux.ie Thu Aug 22 16:27:21 linux.ie>\n \n ...,ham
997,xent.com Fri Sep 20 21:47:36 xent.com>\n \n ...,ham
998,xent.com Wed Oct 2 17:51:51 xent.com>\n \n ...,ham


In [10]:
df.describe()

Unnamed: 0,text,label
count,1493,1493
unique,1493,2
top,yahoo.com Mon Sep 2 12:19:14 yahoo.com>\n \...,ham
freq,1,999


In [11]:
df['length'] = df['text'].apply(len)
df.head(5)

Unnamed: 0,text,label,length
0,yahoo.com Mon Sep 2 12:19:14 yahoo.com>\n \...,spam,1065
1,yahoo.com Mon Aug 26 15:50:00 yahoo.com>\n \...,spam,5824
2,linux.ie Thu Sep 19 17:52:00 linux.ie>\n \n ...,spam,3350
3,att.net Mon Sep 23 12:11:50 att.net>\n \n \n...,spam,8879
4,maktoob.com Sat Sep 21 10:48:50 maktoob.com>...,spam,3673


# Train / Test prep

In [12]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], random_state=100)
X_train.shape

(1119,)

In [13]:
word2vec_model = Word2Vec(
    df['text'],
    vector_size=100,
    window=5,
    min_count=1,
    sg=1)

print('CBOW') if word2vec_model.sg == 0 else print('Skip-gram')



Skip-gram


In [14]:
fasttext_model = FastText(
    df['text'],
    vector_size=100,
    window=5,
    min_count=1,
    sg=1)



In [15]:
def vectorize_text(model, text):
    vector = np.zeros(model.vector_size)
    count = 0
    for word in text.split():
        if word in model.wv:
            vector += model.wv[word]
            count += 1
    if count > 0:
        vector /= count
    return vector

# Train / Inference / Validation

In [16]:
X_train_word2vec = np.array([vectorize_text(word2vec_model, text) for text in X_train])
X_test_word2vec = np.array([vectorize_text(word2vec_model, text) for text in X_test])

X_train_fasttext = np.array([vectorize_text(fasttext_model, text) for text in X_train])
X_test_fasttext = np.array([vectorize_text(fasttext_model, text) for text in X_test])

Logistic Regression + Word2Vec

> Indented block



In [17]:
lr_word2vec = linear_model.LogisticRegression(
    penalty='l2',
    max_iter=1000,
    fit_intercept=True,
    random_state=42)
lr_word2vec.fit(X_train_word2vec, y_train)

log_scores = cross_val_score(lr_word2vec, X_train_word2vec, y_train, cv=10)
log_scores.mean()

0.8498471685971687

Logistic Regression + FastText

In [18]:
lr_fasttext = linear_model.LogisticRegression(
    penalty='l2',
    max_iter=1000,
    fit_intercept=True,
    random_state=42)
lr_fasttext.fit(X_train_fasttext, y_train)

log_scores = cross_val_score(lr_fasttext, X_train_fasttext, y_train, cv=10)
log_scores.mean()

0.6657738095238095

Inference Accuracy

In [19]:
y_pred_word2vec = lr_word2vec.predict(X_test_word2vec)
y_pred_fasttext = lr_fasttext.predict(X_test_fasttext)

accuracy_word2vec = metrics.accuracy_score(y_test, y_pred_word2vec)
accuracy_fasttext = metrics.accuracy_score(y_test, y_pred_fasttext)

print(f"Accuracy for Word2Vec: {accuracy_word2vec}")
print(f"Accuracy for fastText: {accuracy_fasttext}")

Accuracy for Word2Vec: 0.8342245989304813
Accuracy for fastText: 0.679144385026738
