# Spam mail detection with machine learning

_Wesley Boosko & Romain Fontaine_

The dataset was taken from https://www.kaggle.com/c/adcg-ss14-challenge-02-spam-mails-detection

The goal was to classify mails as spam or non-spam, first with supervised classifiers and then with an anomaly detection algorithm.

## Imports

In [1]:
import os
import re
import glob
import  email
import numpy as np

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import IsolationForest
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

## Parse mails

In [2]:
# Captures the run of digits that sits between an underscore and the ".eml"
# extension of a mail path; get_mails() uses it as the numeric mail id.
EXTRACT_ID = re.compile(r".*_(\d+)\.eml")
# Matches a "<digits>,<0 or 1>" label row and captures only the 0/1 flag;
# lines that do not start this way (e.g. a header row) do not match.
EXTRACT_LABEL = re.compile(r"\d+,(0|1)")

In [3]:
def get_mails(folder):
    """Load every ``*.eml`` file found directly inside *folder*.

    Returns a list of ``(mail_id, message)`` tuples, in the order in which
    ``glob`` yields the files (callers sort it themselves). ``mail_id`` is
    the integer captured from the file path by ``EXTRACT_ID`` and
    ``message`` is whatever ``read_mail`` returns for that path.
    """
    pattern = folder + "/*.eml"
    return [
        (int(EXTRACT_ID.match(path).groups()[0]), read_mail(path))
        for path in glob.glob(pattern)
    ]

def read_mail(filename):
    """Parse the file at *filename* into an ``email`` message object.

    The file is opened in binary mode so that the ``email`` package, not the
    platform default encoding, decides how the bytes are decoded.
    """
    with open(filename, "rb") as handle:
        return email.message_from_binary_file(handle)

def get_labels(filename):
    """Read the 0/1 labels from a ``<id>,<label>`` file, ordered by mail id.

    Each line that starts with ``<digits>,<0 or 1>`` contributes one label;
    any other line (for instance a header row) is skipped.

    The labels are returned sorted by their numeric id rather than in file
    order. The rest of the notebook pairs labels with mails purely by
    position after sorting the mails by id, so sorting here keeps the two
    aligned even when the label file is not already in id order. For a file
    that is already sorted the result is unchanged.

    Returns:
        A list of ``int`` labels (0 or 1), one per matching line.
    """
    row_pattern = re.compile(r"(\d+),(0|1)")
    rows = []
    with open(filename) as f:
        for line in f:
            m = row_pattern.match(line)
            if m:
                rows.append((int(m.group(1)), int(m.group(2))))
    # sorted() is stable, so rows sharing an id keep their file order.
    rows.sort(key=lambda row: row[0])
    return [label for _, label in rows]

In [4]:
def save(filename, predictions, ids=None):
    """Write *predictions* to *filename* as an ``Id,Prediction`` CSV file.

    Args:
        filename: Path of the file to create (overwritten if it exists).
        predictions: Iterable of predicted labels, one per mail.
        ids: Optional iterable of row ids, one per prediction. When omitted
            the rows are numbered 1, 2, 3, ... which is only correct if the
            mails have contiguous ids starting at 1.

    Raises:
        ValueError: If *ids* is given and its length differs from the number
            of predictions.
    """
    if ids is None:
        rows = enumerate(predictions, start=1)
    else:
        ids = list(ids)
        predictions = list(predictions)
        # Validate before opening the file so a bad call does not truncate
        # an existing submission.
        if len(ids) != len(predictions):
            raise ValueError(
                "got {} ids for {} predictions".format(len(ids), len(predictions))
            )
        rows = zip(ids, predictions)

    with open(filename, "w") as f:
        f.write("Id,Prediction\n")
        f.writelines("{},{}\n".format(i, l) for i, l in rows)

In [5]:
# Load the labelled mails (folder "TR") and the mails to predict (folder
# "TT"), each as a list of (id, message) tuples sorted by numeric id.
# Labels are later paired with TR_mails purely by position, so the label
# list and the sorted mail list must be in the same id order.
TR_mails = get_mails("TR")
TR_mails = sorted(TR_mails, key=lambda x:x[0])
TR_labels = get_labels("spam-mail.tr.label")
TT_mails = get_mails("TT")
TT_mails = sorted(TT_mails, key=lambda x:x[0])

In [6]:
def get_payload(message):
    """Return the text of *message* used for feature extraction.

    The preferred representation is the whole message (headers included) as
    produced by ``message.as_string()``. If serialising fails, fall back to
    the payload only:

    * for a multipart payload (a list) take its first part, or ``""`` when
      the list is empty;
    * if that part is itself a message object, take its payload;
    * anything that is still not a ``str`` is converted with ``str()``.

    Args:
        message: An ``email`` message object (or anything exposing
            ``as_string()`` and ``get_payload()``).

    Returns:
        A ``str``.
    """
    try:
        return message.as_string()
    except Exception:
        # Only ordinary errors trigger the fallback; KeyboardInterrupt and
        # SystemExit must still propagate so a long run can be stopped.
        payload = message.get_payload()
        if isinstance(payload, list):
            payload = payload[0] if payload else ""
        if isinstance(payload, type(message)):
            payload = payload.get_payload()
        if not isinstance(payload, str):
            payload = str(payload)

        return payload

In [7]:
# Split each (id, message) list into two parallel lists: the text returned
# by get_payload() and the numeric ids.
# NOTE(review): TR_ids / TT_ids are not used anywhere else in the visible
# notebook; save() numbers its rows itself.
TR_mails_payload = [get_payload(m) for (_,m) in TR_mails]
TR_ids = [i for (i,_) in TR_mails]
TT_mails_payload = [get_payload(m) for (_,m) in TT_mails]
TT_ids = [i for (i,_) in TT_mails]

In [8]:
def score(prediction, labels):
    """Compute accuracy and confusion counts for binary predictions.

    Args:
        prediction: Sequence or array of predicted labels.
        labels: Sequence or array of true labels, same shape as *prediction*.

    Both inputs are converted with ``np.asarray`` so that plain Python lists
    work as well as arrays (comparing two lists with ``==`` would otherwise
    yield a single bool instead of an element-wise result).

    Accuracy compares the raw values; the confusion counts treat every
    non-zero value as the positive class.

    Returns:
        ``(accuracy, (TP, TN, FP, FN))``.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    prediction = np.asarray(prediction)
    labels = np.asarray(labels)
    if prediction.shape != labels.shape:
        raise ValueError(
            "prediction has shape {} but labels has shape {}".format(
                prediction.shape, labels.shape
            )
        )

    accuracy = (prediction == labels).sum()/len(labels)
    TP = np.logical_and(prediction, labels).sum()
    TN = (np.logical_not(prediction) & np.logical_not(labels)).sum()
    FP = np.logical_and(prediction, np.logical_not(labels)).sum()
    FN = np.logical_and(np.logical_not(prediction), labels).sum()
    return accuracy, (TP, TN, FP, FN)

def pprint(accuracy, score):
    """Print the accuracy followed by the four confusion counts.

    *accuracy* is shown with four decimals. *score* is indexed as
    ``(TP, TN, FP, FN)``, i.e. the second element returned by ``score()``,
    and each count is printed on its own line.
    """
    print("Accuracy : {:.4f}".format(accuracy))
    templates = (
        "\n True positif : {}",
        " True negatif : {}",
        "False positif : {}",
        "False negatif : {}",
    )
    # Index lazily, one line at a time, in the same order as the tuple.
    for position, template in enumerate(templates):
        print(template.format(score[position]))

## Feature extraction

In [9]:
# Bag-of-words vectorizer: English stop words removed, accents stripped.
# The vocabulary is built from the labelled AND the unlabelled mails.
# NOTE(review): because fitting happens before any train/test split, the
# held-out scores computed later are measured on text the vectorizer has
# already seen.
vect = CountVectorizer(stop_words='english', strip_accents="unicode")
vect = vect.fit(TR_mails_payload + TT_mails_payload)

In [10]:
# Term-count feature matrices for the labelled (TR) and unlabelled (TT) mails.
TR_count = vect.transform(TR_mails_payload)
TT_count = vect.transform(TT_mails_payload)

In [11]:
# TF-IDF vectorizer with the same preprocessing options as the count
# vectorizer, also fit on the labelled and unlabelled mails together.
# NOTE(review): the fit therefore includes the TT text and every TR mail
# that later ends up in a held-out split.
vect_tfidf = TfidfVectorizer(stop_words='english', strip_accents="unicode")
vect_tfidf = vect_tfidf.fit(TR_mails_payload + TT_mails_payload)

In [12]:
# TF-IDF feature matrices for the labelled (TR) and unlabelled (TT) mails.
TR_tfidf = vect_tfidf.transform(TR_mails_payload)
TT_tfidf = vect_tfidf.transform(TT_mails_payload)

## Classification
### Naive Bayes

In [13]:
# Hold-out evaluation of multinomial Naive Bayes on the term-count features.
# NOTE(review): no random_state is passed to train_test_split, so the split
# (and the printed figures) presumably change from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(TR_count, TR_labels)
model = MultinomialNB()
model.fit(X_train, Y_train)
predictions = model.predict(X_test)
pprint(*score(predictions, Y_test))

Accuracy : 0.9536

 True positif : 421
 True negatif : 175
False positif : 21
False negatif : 8


In [14]:
# Same hold-out evaluation of multinomial Naive Bayes, this time on the
# TF-IDF features. A fresh split is drawn, so the test mails are not the
# same ones as in the term-count experiment.
X_train, X_test, Y_train, Y_test = train_test_split(TR_tfidf, TR_labels)
model = MultinomialNB()
model.fit(X_train, Y_train)
predictions = model.predict(X_test)
pprint(*score(predictions, Y_test))

Accuracy : 0.8688

 True positif : 416
 True negatif : 127
False positif : 74
False negatif : 8


In [15]:
# Retrain Naive Bayes on all labelled mails (term counts) and write the
# predictions for the TT mails to "NaiveBayes.res".
model = MultinomialNB()
model.fit(TR_count, TR_labels)

predictions = model.predict(TT_count)
save("NaiveBayes.res", predictions)
# Bare expression: shows the prediction array as the notebook cell output.
predictions

array([1, 1, 1, ..., 0, 0, 0])

### Linear SVC

In [16]:
# Hold-out evaluation of a linear SVM (default parameters) on the
# term-count features, using a freshly drawn split.
X_train, X_test, Y_train, Y_test = train_test_split(TR_count, TR_labels)
model = LinearSVC()
model.fit(X_train, Y_train)
predictions = model.predict(X_test)
pprint(*score(predictions, Y_test))

Accuracy : 0.9808

 True positif : 431
 True negatif : 182
False positif : 5
False negatif : 7


In [17]:
# Hold-out evaluation of a linear SVM (default parameters) on the TF-IDF
# features, using a freshly drawn split.
X_train, X_test, Y_train, Y_test = train_test_split(TR_tfidf, TR_labels)
model = LinearSVC()
model.fit(X_train, Y_train)
predictions = model.predict(X_test)
pprint(*score(predictions, Y_test))

Accuracy : 0.9888

 True positif : 398
 True negatif : 220
False positif : 3
False negatif : 4


Kaggle score : 0.95785

In [18]:
# Retrain the linear SVM on all labelled mails (TF-IDF features) and write
# the predictions for the TT mails to "LinearSVC.res".
model = LinearSVC()
model.fit(TR_tfidf, TR_labels)

predictions = model.predict(TT_tfidf)
save("LinearSVC.res", predictions)
# Bare expression: shows the prediction array as the notebook cell output.
predictions

array([1, 1, 1, ..., 0, 0, 0])

Kaggle score : 0.98522

## Anomaly detection with IsolationForest

In [19]:
# Hold-out evaluation of IsolationForest on the TF-IDF features.
# NOTE(review): IsolationForest is an unsupervised detector, so the labels
# passed to fit() are presumably ignored — confirm against the
# scikit-learn documentation.
X_train, X_test, Y_train, Y_test = train_test_split(TR_tfidf, TR_labels)
model = IsolationForest()
model.fit(X_train, Y_train)
# clip(min=0) turns every negative prediction into 0 so the output can be
# compared with the 0/1 labels (presumably -1 = outlier, +1 = inlier, which
# would make "inlier" stand for label 1 — TODO confirm this is intended).
predictions = model.predict(X_test).clip(min=0)
pprint(*score(predictions, Y_test))

Accuracy : 0.6560

 True positif : 394
 True negatif : 16
False positif : 195
False negatif : 20


In [20]:
# Hold-out evaluation of IsolationForest on the term-count features.
# NOTE(review): the labels passed to fit() are presumably ignored by this
# unsupervised model — confirm against the scikit-learn documentation.
X_train, X_test, Y_train, Y_test = train_test_split(TR_count, TR_labels)
model = IsolationForest()
model.fit(X_train, Y_train)
# Negative predictions are clipped to 0 so they line up with the 0/1 labels.
predictions = model.predict(X_test).clip(min=0)
pprint(*score(predictions, Y_test))

Accuracy : 0.7040

 True positif : 409
 True negatif : 31
False positif : 167
False negatif : 18


In [21]:
# Fit IsolationForest on all labelled mails (TF-IDF features) and write the
# predictions for the TT mails to "IsolationForest.res".
# NOTE(review): the labels passed to fit() are presumably ignored by this
# unsupervised model — confirm against the scikit-learn documentation.
model = IsolationForest()
model.fit(TR_tfidf, TR_labels)

predictions = model.predict(TT_tfidf)
# Negative predictions are clipped to 0 to get the 0/1 submission format.
predictions = predictions.clip(min=0)
save("IsolationForest.res", predictions)
# Bare expression: shows the prediction array as the notebook cell output.
predictions

array([1, 1, 1, ..., 0, 1, 1])

Kaggle score : 0.65900