## Imports

In [1]:
import  email
import numpy as np
from collections import Counter
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import os
import re
import glob

## Parse mails

In [2]:
# Captures the numeric id of a mail file: the digits between the last "_" and ".eml".
EXTRACT_ID = re.compile(r".*_(\d+)\.eml")
# Captures the 0/1 label from a "<digits>,<label>" line (applied with .match(), so from line start).
EXTRACT_LABEL = re.compile(r"\d+,(0|1)")

In [3]:
def get_mails(folder):
    """Parse every ``*.eml`` file directly inside ``folder``.

    Returns a list of ``(mail_id, message)`` tuples, where ``mail_id`` is the
    integer captured from the file path by ``EXTRACT_ID`` and ``message`` is
    the object returned by ``read_mail``. The list follows glob order, which
    is not sorted.
    """
    return [
        (int(EXTRACT_ID.match(path).group(1)), read_mail(path))
        for path in glob.glob(folder + "/*.eml")
    ]

def read_mail(filename):
    """Open ``filename`` in binary mode and return it parsed as an email message."""
    with open(filename, "rb") as handle:
        return email.message_from_binary_file(handle)

def get_labels(filename):
    """Read the integer labels from a label file.

    Each line is matched against ``EXTRACT_LABEL``; lines that do not match
    are skipped. Labels are returned as a list of ints in file order.
    """
    with open(filename) as handle:
        # The generator is fully consumed before the file is closed.
        hits = (EXTRACT_LABEL.match(line) for line in handle)
        return [int(hit.group(1)) for hit in hits if hit]

In [4]:
# Parse a single training mail and keep it in `m` for inspection in a later cell.
m = read_mail("TR/TRAIN_2125.eml")

In [5]:
# Load both mail folders as (id, message) pairs ordered by numeric id.
# NOTE(review): the labels are taken in file order and presumably line up with
# the id-sorted training mails — confirm against the label file.
TR_mails = sorted(get_mails("TR"), key=lambda pair: pair[0])
TR_labels = get_labels("spam-mail.tr.label")
TT_mails = sorted(get_mails("TT"), key=lambda pair: pair[0])

In [6]:
def get_payload(message):
    """Return the text of ``message`` as a single string.

    The full serialized message (headers and body) from ``as_string()`` is
    preferred. If serialization fails, fall back to the payload:

    * for a multipart payload (a list), only the first part is used; an
      empty list yields ``""``;
    * if that part is itself a message object, its own payload is taken;
    * anything that is still not a ``str`` is converted with ``str()``.

    Only ``Exception`` subclasses trigger the fallback, so
    ``KeyboardInterrupt`` and ``SystemExit`` still propagate.
    """
    try:
        return message.as_string()
    except Exception:
        payload = message.get_payload()
        if isinstance(payload, list):
            # Guard against a multipart message with no parts.
            payload = payload[0] if payload else ""
        if isinstance(payload, type(message)):
            payload = payload.get_payload()
        if not isinstance(payload, str):
            payload = str(payload)

        return payload

In [None]:
# Show the sample mail as raw text (headers and body).
print(m.as_string())

In [7]:
# Split each (id, message) list into a list of ids and a parallel list of mail texts.
TR_ids = [mail_id for mail_id, _ in TR_mails]
TR_mails_payload = [get_payload(msg) for _, msg in TR_mails]
TT_ids = [mail_id for mail_id, _ in TT_mails]
TT_mails_payload = [get_payload(msg) for _, msg in TT_mails]

## TFIDF

In [8]:
# Learn the vocabulary from the training and test texts together, so terms that
# only occur in the test mails are part of the feature space as well.
vect = CountVectorizer(stop_words='english', strip_accents="unicode").fit(
    TR_mails_payload + TT_mails_payload
)

In [9]:
# Encode both sets with the fitted vectorizer. Despite the "tfidf" names these
# hold raw term counts, because `vect` is a CountVectorizer.
TR_tfidf = vect.transform(TR_mails_payload)
TT_tfidf = vect.transform(TT_mails_payload)

## Create models

In [10]:
# Train a multinomial naive Bayes classifier on the training feature matrix and its labels.
model = MultinomialNB()
model.fit(TR_tfidf, TR_labels)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [11]:
# Predict a label for every test mail; the bare name on the last line displays the array.
predictions = model.predict(TT_tfidf)
predictions

array([1, 1, 1, ..., 0, 0, 0])

In [12]:
def save(filename, labels=None, ids=None):
    """Write predictions to ``filename`` as a two-column CSV.

    The file starts with the header ``Id,Prediction`` followed by one
    ``<id>,<label>`` line per prediction.

    Parameters
    ----------
    filename : str
        Path of the file to (over)write.
    labels : sequence, optional
        Predicted labels to write. Defaults to the notebook-level
        ``predictions`` variable, which keeps ``save(filename)`` working
        as before.
    ids : sequence, optional
        Ids to write next to each label. Defaults to ``1..len(labels)``,
        the numbering used previously.

    Raises
    ------
    ValueError
        If ``ids`` is given and its length differs from ``labels``.
    """
    if labels is None:
        # Backward-compatible default: fall back to the global set by an earlier cell.
        labels = predictions
    if ids is None:
        rows = enumerate(labels, start=1)
    else:
        ids = list(ids)
        if len(ids) != len(labels):
            raise ValueError(
                "ids and labels differ in length: {} != {}".format(len(ids), len(labels))
            )
        rows = zip(ids, labels)

    with open(filename, "w") as f:
        f.write("Id,Prediction\n")
        for mail_id, label in rows:
            f.write("{},{}\n".format(mail_id, label))

In [13]:
# Write the predictions to disk.
# NOTE(review): "res.labes" may be a typo for "res.labels" — confirm before renaming.
save("res.labes")