In [155]:
import glob
import email
import re
import pandas as pd
import numpy as np
import math
from email import policy
from email.parser import BytesParser
from string import punctuation

In [156]:
def flat_text(text):
    """Strip HTML-style tags and punctuation from ``text`` and lowercase it.

    Everything from a ``<`` up to and including the next ``>`` is dropped.
    Outside of tags, each character found in ``string.punctuation`` is
    replaced by a single space; every other character is kept unchanged.

    Returns the cleaned text converted to lowercase.
    """
    pieces = []
    inside_tag = False
    for ch in text:
        if ch == "<":
            inside_tag = True
        elif ch == ">":
            inside_tag = False
        elif not inside_tag:
            # Punctuation becomes a separator so neighbouring words stay apart.
            pieces.append(" " if ch in punctuation else ch)
    return "".join(pieces).lower()

def _plain_text_body(msg):
    """Return the decoded ``text/plain`` body of ``msg``, or "" if it has none."""
    # preferencelist must be a sequence of subtypes; a bare string would be
    # matched by substring instead of by element.
    body = msg.get_body(preferencelist=("plain",))
    if body is None:
        return ""
    try:
        return body.get_content()
    except (LookupError, UnicodeError):
        # Unknown or broken charset declaration: fall back to a lossy UTF-8
        # decode of the transfer-decoded payload instead of failing.
        raw = body.get_payload(decode=True)
        return raw.decode("utf-8", errors="replace") if raw else ""


def email_parser():
    """Parse every ``*.eml`` file in the ``DATA`` directory into word counts.

    For each message, the plain-text body is cleaned with ``flat_text`` and
    split on spaces, newlines and tabs.  Consecutive separators produce
    empty-string tokens, which are counted like any other token.

    A message without a usable plain-text body is treated as empty text, so
    it never inherits the text of a previously parsed message.

    Returns:
        dict: ``{file_name: {word: count}}`` keyed by the file's base name.
    """
    import os  # local import: only needed here for portable path handling

    emails_set = dict()
    for path in glob.glob(os.path.join("DATA", "*.eml")):
        with open(path, "rb") as fp:
            msg = BytesParser(policy=policy.default).parse(fp)
        words_set = dict()
        for word in re.split(" |\n|\t", flat_text(_plain_text_body(msg))):
            words_set[word] = words_set.get(word, 0) + 1
        emails_set[os.path.basename(path)] = words_set
    return emails_set

def label_parser():
    """Read ``SPAMTrain.label`` and map each e-mail file name to its label.

    Every line is expected to hold a label followed by a file name,
    separated by whitespace.  Blank lines and lines with fewer than two
    fields are skipped.  Labels are kept as the strings found in the file.

    Returns:
        dict: ``{file_name: label}`` for every well-formed line.

    Raises:
        OSError: if ``SPAMTrain.label`` cannot be opened.
    """
    labels_set = dict()
    # The context manager guarantees the file is closed even on error.
    with open("SPAMTrain.label", "r") as label_file:
        for line in label_file:
            fields = line.split()
            if len(fields) < 2:
                continue
            labels_set[fields[1]] = fields[0]
    return labels_set
    

In [157]:
# Build the per-file word counts from the DATA directory and load the
# file-name -> label mapping from SPAMTrain.label.
emails = email_parser()
labels = label_parser()

In [164]:
def train_test_split(x,y, train_size = 0.8):
    """Randomly split two dictionaries that share keys into train and test parts.

    The keys of ``x`` are shuffled with ``np.random.permutation``; keys whose
    position in the shuffled order is below ``len(x) * train_size`` go to the
    training dictionaries and the remaining keys go to the test dictionaries.
    ``y`` is indexed with the same keys, so it must contain every key of ``x``.

    Returns:
        tuple: ``(x_train, y_train, x_test, y_test)``, four dictionaries.
    """
    shuffled_keys = np.random.permutation(list(x.keys()))
    cutoff = len(x)*train_size
    x_train, y_train = dict(), dict()
    x_test, y_test = dict(), dict()
    for position, key in enumerate(shuffled_keys):
        if position < cutoff:
            target_x, target_y = x_train, y_train
        else:
            target_x, target_y = x_test, y_test
        target_x[key] = x[key]
        target_y[key] = y[key]
    return x_train, y_train, x_test, y_test

In [165]:
# Shuffle and split the parsed e-mails and their labels using the default
# train_size of 0.8.
email_train, label_train, email_test, label_test = train_test_split(emails, labels)

In [173]:
class NaiveBayes:
    """Naive Bayes spam/ham classifier over word-count dictionaries.

    E-mails are given as ``{email_id: {word: count}}`` and labels as
    ``{email_id: label}``.  A label whose ``int()`` value is ``0`` is treated
    as spam and any other value as ham; ``predict`` uses the same encoding
    (``0`` for spam, ``1`` for ham).  Probabilities use additive smoothing
    with constant ``k``.

    All learned state lives on the instance and is rebuilt by every call to
    ``fit``, so instances never share probabilities and refitting does not
    accumulate counts from earlier fits.
    """

    def __init__(self, k = 1):
        self.k = k
        self.p_spams = dict()  # P(word|spam)
        self.p_hams = dict()   # P(word|ham)
        self.p_spam = 0        # P(spam)
        self.p_ham = 0         # P(ham)
        self.n_spam = 0        # total number of word occurrences in spam
        self.n_ham = 0         # total number of word occurrences in ham
        self.dict_size = 0     # number of distinct words over all e-mails

    def set_k(self, k):
        """Set the smoothing constant used by later ``fit``/``predict`` calls."""
        self.k = k

    def divide(self, emails, labels):
        """Aggregate word counts per class.

        Also stores the total word occurrences per class in ``self.n_spam``
        and ``self.n_ham``; both are recomputed from scratch on every call.

        Returns:
            tuple: ``(words, words_spam, words_ham, spams, hams)`` where the
            first three are ``{word: count}`` dictionaries (overall, spam
            only, ham only) and the last two are the e-mail counts per class.
        """
        spams = 0
        hams = 0
        words_spam = dict()
        words_ham = dict()
        words = dict()
        n_spam = 0
        n_ham = 0

        for item, counts in emails.items():
            is_spam = int(labels[item]) == 0
            class_words = words_spam if is_spam else words_ham
            email_total = 0
            for word, count in counts.items():
                class_words[word] = class_words.get(word, 0) + count
                words[word] = words.get(word, 0) + count
                email_total += count
            if is_spam:
                spams += 1
                n_spam += email_total
            else:
                hams += 1
                n_ham += email_total

        # Assign instead of accumulating so repeated fits start from zero.
        self.n_spam = n_spam
        self.n_ham = n_ham
        return words, words_spam, words_ham, spams, hams

    def fit(self, emails, labels):
        """Estimate the class priors and the smoothed word likelihoods."""
        words, words_spam, words_ham, spams, hams = self.divide(emails, labels)
        total = spams + hams + 2*self.k
        self.p_spam = (spams + self.k)/total  # P(spam)
        self.p_ham = (hams + self.k)/total    # P(ham)
        self.dict_size = len(words)
        spam_den = self.n_spam + (self.k*self.dict_size)
        ham_den = self.n_ham + (self.k*self.dict_size)
        # Fresh dictionaries: probabilities from an earlier fit must not survive.
        self.p_spams = {
            word: (words_spam.get(word, 0) + self.k)/spam_den for word in words
        }
        self.p_hams = {
            word: (words_ham.get(word, 0) + self.k)/ham_den for word in words
        }

    def _unseen_log_prob(self, class_total):
        """Return the smoothed log-likelihood of a word absent from training."""
        return math.log((self.k)/(class_total + (self.k*self.dict_size)))

    def predict(self, emails):
        """Classify each e-mail, in iteration order of ``emails``.

        Every distinct word of an e-mail contributes its log-likelihood once;
        the per-e-mail word counts are not used as weights.

        Returns:
            list: ``1`` (ham) when the ham log-score is strictly greater than
            the spam log-score, otherwise ``0`` (spam).
        """
        labels = list()
        for item in emails:
            p_bespam = math.log(self.p_spam)
            p_beham = math.log(self.p_ham)
            for word in emails[item]:
                spam_likelihood = self.p_spams.get(word)
                if spam_likelihood is None:
                    p_bespam += self._unseen_log_prob(self.n_spam)
                else:
                    p_bespam += math.log(spam_likelihood)
                ham_likelihood = self.p_hams.get(word)
                if ham_likelihood is None:
                    p_beham += self._unseen_log_prob(self.n_ham)
                else:
                    p_beham += math.log(ham_likelihood)
            labels.append(1 if p_bespam<p_beham else 0)
        return labels

In [174]:
# Classifier with the default smoothing constant (k = 1).
nb = NaiveBayes()

In [175]:
# Train on the training portion returned by train_test_split above.
nb.fit(email_train, label_train)

In [176]:
# Classify the held-out portion returned by train_test_split above.
prediction = nb.predict(email_test)

In [177]:
def accuracy(prediction, labels_val):
    """Return the fraction of predictions that match the true labels.

    ``prediction`` is a sequence of integer labels and ``labels_val`` a
    dictionary ``{email_id: label}``.  The i-th prediction is compared with
    the i-th label in the dictionary's iteration order, so ``prediction``
    must have been produced by iterating the same e-mails in the same order.

    Returns:
        float: matches divided by the number of labels, or ``0.0`` when
        ``labels_val`` is empty.

    Raises:
        IndexError: if ``prediction`` is shorter than ``labels_val``.
    """
    if not labels_val:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    hits = sum(
        1
        for position, key in enumerate(labels_val)
        if prediction[position] == int(labels_val[key])
    )
    return hits/len(labels_val)

In [None]:
class grid_search_cv:
    """Unfinished grid search with k-fold cross-validation.

    Stores a classifier, the candidate parameters and the number of folds.
    The cross-validation loop itself is not implemented yet; see the TODO
    notes in ``kfold`` and ``fit``.
    """

    def __init__(self, clf, params, cv = 10):
        self.clf = clf
        self.params = params
        self.cv = cv
        # Data set by fit(); kept per instance so instances do not share it.
        self.x = dict()
        self.y = dict()

    def kfold(self, k = 10):
        """Prepare a shuffled key order and partition size for ``k`` folds.

        Currently only computes these values; it returns ``None``.
        """
        size_part = len(self.x)//k
        perm = np.random.permutation(list(self.x.keys()))

        # TODO: every partition has to be used as the test set exactly once.
        # TODO: for each partition, slice the shuffled list to pick the
        #       train and test keys.
        # TODO: call the classifier.

    def fit(self, x, y):
        """Store the data to search over; the search itself is still TODO."""
        self.x = x
        self.y = y
        # TODO: call kfold for each parameter value.
        # TODO: retrain the classifier on the whole data set.
            