In [23]:
import glob
import email
import re
import pandas as pd
import numpy as np
import math
from email import policy
from email.parser import BytesParser
from string import punctuation

In [71]:
def flat_text(text):
    new_text = ""
    html_tag = False
    for c in text:
        if(c == "<"):
            html_tag = True
            continue
        elif(c == ">"):
            html_tag = False
            continue
        if(html_tag == False):
            if(c not in punctuation):
                new_text += c
            else:
                new_text += " "
    return new_text.lower()

def email_parser():
    files = glob.glob("DATA\*.eml")
    emails_set = dict()
    for file in files:
        with open(file, 'rb') as fp:
            msg = BytesParser(policy=policy.default).parse(fp)
            try:
                text = msg.get_body(preferencelist=('plain')).get_content()
            except:
                text.encode('utf-32', 'surrogateescape').decode('utf-32') 
            new_text = flat_text(text)    
            words_set = dict()
            email_words = re.split(" |\n|\t", new_text)
            for word in email_words:
                try:
                    words_set[word] += 1
                except:
                    words_set[word] = 1
            emails_set[file.split("\\")[1]] = words_set
    return emails_set        

def label_parser():
    file = open("SPAMTrainn.label", "r")
    labels_set = dict()
    if file.mode == 'r':
        for line in file.readlines():
            vec = line.split(" ")
            labels_set[vec[1].replace('\n', '')] = vec[0]
    return labels_set
    

In [72]:
emails = email_parser()
labels = label_parser()

In [73]:
class NaiveBayes:
    
    p_spams = dict() #P(word|spam)
    p_hams = dict() #P(word|ham)
    p_spam = 0 #P(spam)
    p_ham = 0  #P(ham)
    
    def __init__(self, k):
        self.k = k
        
    def divide(self, emails, labels):
        spams = 0
        hams = 0
        words_spam = dict()
        words_ham = dict()
        words = dict()
        n_spam = 0
        n_ham = 0
        for item in emails:
            if(int(labels[item]) == 0):
                spams += 1
                for word in emails[item]:
                    try:
                        words_spam[word] += emails[item][word]
                    except:
                        words_spam[word] = emails[item][word]
                    try:
                        words[word] += emails[item][word]
                    except:
                        words[word] = emails[item][word]
                    n_spam  += emails[item][word]
            else:
                hams += 1
                for word in emails[item]:
                    try:
                        words_ham[word] += emails[item][word]
                    except:
                        words_ham[word] = emails[item][word]
                    try:
                        words[word] += emails[item][word]
                    except:
                        words[word] = emails[item][word]
                    n_ham += emails[item][word]
                    
        return words, words_spam, words_ham, n_spam, n_ham, spams, hams
    
    def fit(self, emails, labels):
        words, words_spam, words_ham, n_spam, n_ham, spams, hams = self.divide(emails, labels)
        self.p_spam = (spams + self.k)/(spams + hams + 2*self.k) #P(spam)
        self.p_ham = (hams + self.k)/(spams + hams + 2*self.k) #P(ham)
        dict_size = len(words)
        print(dict_size)
        for item in words:
            try:
                self.p_spams[item] = (words_spam[item] + self.k)/(n_spam + (self.k*dict_size))
            except:
                self.p_spams[item] = (self.k)/(n_spam + (self.k*dict_size))
            try:
                self.p_hams[item] = (words_ham[item] + self.k)/(n_spam + (self.k*dict_size))
            except:
                self.p_hams[item] = (self.k)/(n_spam + (self.k*dict_size))

    def predict(self, emails):
        labels = list()
        for item in emails:
            print(emails[item])
            p_bespam = 0
            p_beham = 0
            for word in emails[item]:
                p_bespam += math.log(self.p_spams[word])
                p_beham += math.log(self.p_hams[word])
            print(p_bespam, " ", p_beham)
            print(self.p_spam, " ", self.p_ham)            
            p_bespam = p_bespam + self.p_spam
            p_beham = p_beham + self.p_ham
            labels.append(1 if p_bespam<p_beham else 0)
        return labels

In [74]:
nb = NaiveBayes(1)

In [75]:
nb.fit(emails, labels)

836


In [78]:
def email2_parser():
    emails_set = dict()
    with open("DATA\TRAIN_00002.eml", 'rb') as fp:
        msg = BytesParser(policy=policy.default).parse(fp)
        try:
            text = msg.get_body(preferencelist=('plain')).get_content()
        except:
            text.encode('utf-32', 'surrogateescape').decode('utf-32') 
        new_text = flat_text(text)    
        words_set = dict()
        email_words = re.split(" |\n|\t", new_text)
        for word in email_words:
            try:
                words_set[word] += 1
            except:
                words_set[word] = 1
        emails_set["TRAIN_00001.eml"] = words_set
    return emails_set        

In [79]:
nb.predict(email2_parser())

{'on': 2, 'sat': 1, '15': 1, 'may': 1, '10': 1, '': 147, '16': 1, '47': 1, '07': 1, 'merciadri': 1, 'luca': 1, 'wrote': 1, 'but': 5, 'will': 1, 'probably': 2, 'not': 6, 'work': 1, 'in': 3, 'you': 12, 'case': 1, 'as': 3, 'it': 8, 'was': 1, 'meant': 1, 'to': 12, 'combine': 1, 'two': 3, 'or': 4, 'more': 2, 'network': 3, 'ports': 1, 'from': 2, 'the': 11, 'same': 3, 'computer': 1, 'connected': 3, 'switch': 2, 'description': 1, 'says': 1, 'linux': 2, 'bonding': 2, 'driver': 1, 'provides': 1, 'a': 8, 'method': 1, 'for': 4, 'aggregating': 1, 'multiple': 1, 'interfaces': 1, 'into': 2, 'single': 1, 'logical': 1, 'bonded': 2, 'interface': 1, 'strictly': 1, 'speaking': 1, 'this': 6, 'is': 15, 'what': 2, 'i': 5, 'want': 2, 'now': 1, 'your': 4, 'interpretation': 1, 'seems': 1, 'be': 5, 'based': 1, 'definition': 1, 'of': 3, 'link': 1, 'aggregation': 1, 'which': 2, 'am': 2, 'really': 2, 'familiar': 1, 'with': 3, 'basically': 1, 'merge': 1, 'connections': 3, 'one': 4, 'at': 2, 'least': 1, 'divide': 1, 

[1]