In [1]:
import glob
import email
import re
import pandas as pd
import numpy as np
import math
from email import policy
from email.parser import BytesParser
from string import punctuation

In [141]:
def flat_text(text):
    new_text = ""
    html_tag = False
    for c in text:
        if(c == "<"):
            html_tag = True
            continue
        elif(c == ">"):
            html_tag = False
            continue
        if(html_tag == False):
            if(c not in punctuation):
                new_text += c
            else:
                new_text += " "
    return new_text.lower()

def email_parser():
    files = glob.glob("DATA\*.eml")
    emails_set = dict()
    for file in files:
        with open(file, 'rb') as fp:
            msg = BytesParser(policy=policy.default).parse(fp)
            try:
                text = msg.get_body(preferencelist=('plain')).get_content()
            except:
                text.encode('utf-32', 'surrogateescape').decode('utf-32') 
            new_text = flat_text(text)    
            words_set = dict()
            email_words = re.split(" |\n|\t", new_text)
            for word in email_words:
                try:
                    words_set[word] += 1
                except:
                    words_set[word] = 1
            emails_set[file.split("\\")[1]] = words_set
    return emails_set        

def label_parser():
    file = open("SPAMTrain.label", "r")
    labels_set = dict()
    if file.mode == 'r':
        for line in file.readlines():
            vec = line.split(" ")
            labels_set[vec[1].replace('\n', '')] = vec[0]
    return labels_set
    

In [142]:
emails = email_parser()
labels = label_parser()

In [149]:
emails_train = dict()
labels_train = dict()
emails_val = dict()
labels_val = dict()
i = 0
indice = 4000
for k in emails.keys():
    if (i > indice):
        emails_train[k] = emails[k] 
        labels_train[k] = labels[k]
    else:
        emails_val[k] = emails[k] 
        labels_val[k] = labels[k]
    i += 1

In [144]:
class NaiveBayes:
    
    p_spams = dict() #P(word|spam)
    p_hams = dict() #P(word|ham)
    p_spam = 0 #P(spam)
    p_ham = 0  #P(ham)
    n_spam = 0 #Qtd total de palavras em spam
    n_ham = 0 #Qtd total de palavras em ham
    dict_size = 0 #Qtd de palavras distintas nos emails
    
    def __init__(self, k):
        self.k = k
        
    def divide(self, emails, labels):
        spams = 0
        hams = 0
        words_spam = dict()
        words_ham = dict()
        words = dict()
      
        for item in emails:
            if(int(labels[item]) == 0):
                spams += 1
                for word in emails[item]:
                    try:
                        words_spam[word] += emails[item][word]
                    except:
                        words_spam[word] = emails[item][word]
                    try:
                        words[word] += emails[item][word]
                    except:
                        words[word] = emails[item][word]
                    self.n_spam  += emails[item][word]
            else:
                hams += 1
                for word in emails[item]:
                    try:
                        words_ham[word] += emails[item][word]
                    except:
                        words_ham[word] = emails[item][word]
                    try:
                        words[word] += emails[item][word]
                    except:
                        words[word] = emails[item][word]
                    self.n_ham += emails[item][word]
                    
        return words, words_spam, words_ham, spams, hams
    
    def fit(self, emails, labels):
        words, words_spam, words_ham, spams, hams = self.divide(emails, labels)
        self.p_spam = (spams + self.k)/(spams + hams + 2*self.k) #P(spam)
        self.p_ham = (hams + self.k)/(spams + hams + 2*self.k) #P(ham)
        self.dict_size = len(words)
        print(self.p_spam, " ", self.p_ham)
        for item in words:
            try:
                self.p_spams[item] = (words_spam[item] + self.k)/(self.n_spam + (self.k*self.dict_size))
            except:
                self.p_spams[item] = (self.k)/(self.n_spam + (self.k*self.dict_size))
            try:
                self.p_hams[item] = (words_ham[item] + self.k)/(self.n_ham + (self.k*self.dict_size))
            except:
                self.p_hams[item] = (self.k)/(self.n_ham + (self.k*self.dict_size))

    def predict(self, emails):
        labels = list()
        for item in emails:
            print(item)
            p_bespam = math.log(self.p_spam)
            p_beham = math.log(self.p_ham)
            for word in emails[item]:
                try:
                    p_bespam += math.log(self.p_spams[word])
                except:
                    p_bespam += math.log((self.k)/(self.n_spam + (self.k*self.dict_size)))
                try:
                    p_beham += math.log(self.p_hams[word]) 
                except:
                    p_beham += math.log((self.k)/(self.n_ham + (self.k*self.dict_size)))
            print(p_bespam, " ", p_beham)
            labels.append(1 if p_bespam<p_beham else 0)
        return labels

In [150]:
nb = NaiveBayes(1)

In [151]:
nb.fit(emails_train, labels_train)

0.3475609756097561   0.6524390243902439


In [152]:
prediction = nb.predict(emails_val)

TRAIN_00000.eml
-3124.2917367625064   -3197.6608349980256
TRAIN_00001.eml
-176.0941087483279   -177.55450402623399
TRAIN_00002.eml
-2090.8155071469073   -1974.2957525832412
TRAIN_00003.eml
-201.6670621080776   -208.2331456406732
TRAIN_00004.eml
-571.1671536290683   -585.2059157796995
TRAIN_00005.eml
-831.3237260913031   -802.3455392580993
TRAIN_00006.eml
-924.6224547713398   -887.7265101680446
TRAIN_00007.eml
-534.3684729614603   -497.72401322588706
TRAIN_00008.eml
-855.3735714176272   -792.9818229351796
TRAIN_00009.eml
-1572.6748877932293   -1484.7394782615395
TRAIN_00010.eml
-1705.5100126619072   -1616.1215132492216
TRAIN_00011.eml
-4776.560700897436   -4978.031458969587
TRAIN_00012.eml
-955.248424354504   -919.9855585117381
TRAIN_00013.eml
-1399.7607252417963   -1532.385160930786
TRAIN_00014.eml
-789.8706398359483   -823.8918613030867
TRAIN_00015.eml
-2610.768125173738   -2670.651172206421
TRAIN_00016.eml
-683.0297280613293   -639.907171961333
TRAIN_00017.eml
-325.46921653228003   -

-1092.3621311324457   -1039.4967053829657
TRAIN_00352.eml
-237.079283624288   -227.95084620588193
TRAIN_00353.eml
-2485.540881288492   -2555.2538147743717
TRAIN_00354.eml
-606.556202996448   -631.4919292440946
TRAIN_00355.eml
-4334.436523303191   -4472.208599335229
TRAIN_00356.eml
-575.7960788999152   -639.5538822849144
TRAIN_00357.eml
-1314.3178266770838   -1220.4650946122563
TRAIN_00358.eml
-10756.29715511877   -10432.979203291101
TRAIN_00359.eml
-920.1852533771587   -851.7030479478808
TRAIN_00360.eml
-512.1269095269053   -540.2690876224289
TRAIN_00361.eml
-1024.4451555616458   -1060.7291801441056
TRAIN_00362.eml
-12633.32062103034   -12514.80943837065
TRAIN_00363.eml
-743.4658908446767   -684.4219592887732
TRAIN_00364.eml
-648.7285100815291   -586.2907695396324
TRAIN_00365.eml
-648.4605952585944   -671.3323339884373
TRAIN_00366.eml
-490.87509773311507   -451.3673941355776
TRAIN_00367.eml
-1823.5897787371566   -1713.5505905736022
TRAIN_00368.eml
-2434.8905437426233   -2375.8899245961

-1045.045044693027   -975.9598753188983
TRAIN_00624.eml
-1605.866999566326   -1497.6842148632397
TRAIN_00625.eml
-25311.049868360682   -27405.82533877479
TRAIN_00626.eml
-1737.2843033419015   -1751.4321662867706
TRAIN_00627.eml
-783.1406782231929   -745.1085772819132
TRAIN_00628.eml
-631.3155626174782   -572.5367828139689
TRAIN_00629.eml
-7915.277073730037   -7589.3040864451905
TRAIN_00630.eml
-326.6925997725793   -334.20935743592173
TRAIN_00631.eml
-1574.9969737468625   -1546.054974557936
TRAIN_00632.eml
-558.7385106181749   -616.1493619276483
TRAIN_00633.eml
-1769.0449840308854   -1764.956005314918
TRAIN_00634.eml
-1368.3956218938004   -1393.7501595938245
TRAIN_00635.eml
-1167.5295067122865   -1080.3067652610573
TRAIN_00636.eml
-1452.5755729618406   -1375.4440489572817
TRAIN_00637.eml
-3802.838105596649   -3998.3653633325384
TRAIN_00638.eml
-1754.5918100071215   -1640.0094115869651
TRAIN_00639.eml
-947.0853966851411   -854.135366555166
TRAIN_00640.eml
-1226.3361400523513   -1154.8102

-10934.201886704723   -10617.667398411384
TRAIN_00888.eml
-1265.6259605951898   -1176.0221925907676
TRAIN_00889.eml
-1222.4771695397544   -1170.1660130087605
TRAIN_00890.eml
-418.7367107212358   -367.48926935461634
TRAIN_00891.eml
-1063.5206366095967   -1034.3248729192028
TRAIN_00892.eml
-612.3428858565784   -564.9831946159087
TRAIN_00893.eml
-1515.5577731041194   -1509.3270770504016
TRAIN_00894.eml
-826.4682782632801   -787.2841614728192
TRAIN_00895.eml
-1561.6989684710566   -1604.931935076087
TRAIN_00896.eml
-481.9471833872937   -444.76800057791604
TRAIN_00897.eml
-10317.60198887969   -10778.237194023715
TRAIN_00898.eml
-2052.2322583417645   -2198.660483032528
TRAIN_00899.eml
-509.4821006252975   -454.82634999056535
TRAIN_00900.eml
-597.6313954873189   -542.9842883012875
TRAIN_00901.eml
-537.7782044062583   -581.8905665964027
TRAIN_00902.eml
-569.2229419678582   -547.9119718440423
TRAIN_00903.eml
-1800.9045260154296   -1812.895899586409
TRAIN_00904.eml
-395.88269766777313   -436.2832

TRAIN_01229.eml
-1338.6156165017135   -1438.9325722648314
TRAIN_01230.eml
-2026.9212102199901   -1891.3778214106576
TRAIN_01231.eml
-1808.1076977285832   -1678.530254486868
TRAIN_01232.eml
-5126.009122570088   -5064.323012410721
TRAIN_01233.eml
-284.9759111663679   -270.90046288161676
TRAIN_01234.eml
-492.1630668804446   -538.3289660160409
TRAIN_01235.eml
-1142.3186310907895   -1019.8726341368917
TRAIN_01236.eml
-381.6829441112699   -417.20723406923526
TRAIN_01237.eml
-533.4402658398129   -485.15913794966525
TRAIN_01238.eml
-535.7359643364861   -589.7429511033072
TRAIN_01239.eml
-355.55402583412723   -348.7017896501516
TRAIN_01240.eml
-2110.5810351955643   -1942.6405790195724
TRAIN_01241.eml
-1781.8112801945854   -1645.0421290990653
TRAIN_01242.eml
-88202.63650442347   -85662.43592864064
TRAIN_01243.eml
-1323.5487212864823   -1268.518651815914
TRAIN_01244.eml
-962.2839404653269   -1013.5762539737154
TRAIN_01245.eml
-822.5597836956711   -754.7098213554525
TRAIN_01246.eml
-1237.501295247

-402.2509998853344   -384.44518977120487
TRAIN_01499.eml
-936.6510339536388   -911.1667802985571
TRAIN_01500.eml
-934.3401167673397   -951.8149141156314
TRAIN_01501.eml
-1140.5556937189435   -1234.560630381751
TRAIN_01502.eml
-667.012742424184   -602.5065624028445
TRAIN_01503.eml
-1267.944904921618   -1201.9554183435855
TRAIN_01504.eml
-1934.0204523670336   -2055.5678827223237
TRAIN_01505.eml
-2850.310897350062   -2710.394757937934
TRAIN_01506.eml
-261.55829870209027   -274.8385244301394
TRAIN_01507.eml
-709.3420668532426   -665.4427935899338
TRAIN_01508.eml
-308.3753286754247   -292.7854067583705
TRAIN_01509.eml
-659.2525995880575   -686.0403052791135
TRAIN_01510.eml
-359.43291590893534   -344.8003289229492
TRAIN_01511.eml
-320.40681567472205   -311.115879581098
TRAIN_01512.eml
-1099.0177584138507   -1008.2361037489367
TRAIN_01513.eml
-10796.89631771462   -10973.944437245436
TRAIN_01514.eml
-3305.6094233175804   -3160.236599718355
TRAIN_01515.eml
-841.6559326001711   -899.70941076617


-494.1492959023207   -472.7222083724841
TRAIN_01829.eml
-875.2836020419257   -815.8474146837902
TRAIN_01830.eml
-847.2912219497549   -876.135780402746
TRAIN_01831.eml
-317.9104465909745   -339.74013571863236
TRAIN_01832.eml
-1020.8112292721066   -980.9239588947785
TRAIN_01833.eml
-2673.898364817133   -2523.6660788881204
TRAIN_01834.eml
-1399.7581787926752   -1406.3940235085104
TRAIN_01835.eml
-435.4439603935403   -384.29512339241535
TRAIN_01836.eml
-1024.0309122966132   -968.4066003996417
TRAIN_01837.eml
-1220.7696444861444   -1187.0411906307424
TRAIN_01838.eml
-1559.797936159414   -1627.352803023755
TRAIN_01839.eml
-747.8264571552402   -720.1649480055293
TRAIN_01840.eml
-1893.6571506148405   -1826.8730695625586
TRAIN_01841.eml
-513.5860013264527   -470.2294057012337
TRAIN_01842.eml
-1302.9749831711058   -1203.8948236067683
TRAIN_01843.eml
-590.2261423774087   -564.1395725084471
TRAIN_01844.eml
-1443.5081174308032   -1353.380462425376
TRAIN_01845.eml
-1505.9539440943436   -1397.8119356

TRAIN_02123.eml
-1216.8202677502977   -1142.353188187914
TRAIN_02124.eml
-385.16024571119925   -353.0868659862709
TRAIN_02125.eml
-4323.53354739322   -4516.960376174847
TRAIN_02126.eml
-319.7907624571142   -296.6856753562618
TRAIN_02127.eml
-519.9894894856214   -478.9555269431867
TRAIN_02128.eml
-194.03685704770783   -219.21306176343302
TRAIN_02129.eml
-857.7807338552199   -803.1590243179373
TRAIN_02130.eml
-845.818359939056   -801.4298247138489
TRAIN_02131.eml
-307.52225376971256   -291.17575938382066
TRAIN_02132.eml
-615.0453129107938   -631.651578429488
TRAIN_02133.eml
-1068.8774936035732   -1017.9286507044559
TRAIN_02134.eml
-988.5175490376429   -916.7216594421283
TRAIN_02135.eml
-368.12483021208783   -344.4530581803968
TRAIN_02136.eml
-807.267822874147   -777.5710476243635
TRAIN_02137.eml
-987.771886191392   -922.6819241237575
TRAIN_02138.eml
-594.2362747260889   -530.6768314812686
TRAIN_02139.eml
-1633.043618694443   -1539.8569901395679
TRAIN_02140.eml
-360.2640005136495   -323.5

TRAIN_02480.eml
-2289.5120211271555   -2135.079034468333
TRAIN_02481.eml
-1521.9142520309445   -1654.6374580106578
TRAIN_02482.eml
-1063.078908560525   -1105.0617506640003
TRAIN_02483.eml
-1417.0107531262681   -1295.5982657499114
TRAIN_02484.eml
-1130.9305588159016   -1076.972977387736
TRAIN_02485.eml
-307.38936298199775   -290.1112396697493
TRAIN_02486.eml
-2077.720386387052   -2070.930847017183
TRAIN_02487.eml
-724.2707429168278   -668.8612375996917
TRAIN_02488.eml
-1836.975906032843   -1762.0631104072866
TRAIN_02489.eml
-483.1918045976237   -493.2042660630018
TRAIN_02490.eml
-1926.2917561781828   -1749.351127179353
TRAIN_02491.eml
-917.0865778788288   -981.3168317319908
TRAIN_02492.eml
-725.812199735283   -659.346870117304
TRAIN_02493.eml
-1051.8252555452145   -1006.0040935412928
TRAIN_02494.eml
-955.8503898262958   -947.1656112025944
TRAIN_02495.eml
-589.5521334120314   -525.08712966402
TRAIN_02496.eml
-120.61023301039853   -130.7050505186427
TRAIN_02497.eml
-803.1850073908676   -7

-2029.8388224288985   -1959.1083384974486
TRAIN_02748.eml
-416.26528554351813   -387.9858213502839
TRAIN_02749.eml
-1621.8882429881469   -1571.0427401478178
TRAIN_02750.eml
-253.1465994363662   -238.2475982757603
TRAIN_02751.eml
-1510.923861195136   -1421.6100730464036
TRAIN_02752.eml
-683.4274895549778   -716.0719974645665
TRAIN_02753.eml
-1412.8371549311096   -1325.5522808583382
TRAIN_02754.eml
-3072.6088310954665   -2969.104635959093
TRAIN_02755.eml
-250.0113271058087   -253.3163575671049
TRAIN_02756.eml
-427.2226400852642   -455.8313230128247
TRAIN_02757.eml
-386.76800340156905   -364.6064573247956
TRAIN_02758.eml
-575.8731064790513   -534.9632979307485
TRAIN_02759.eml
-878.23515554541   -839.6710135431696
TRAIN_02760.eml
-1894.8961192867323   -1793.864759008858
TRAIN_02761.eml
-1457.895548457708   -1329.74731170139
TRAIN_02762.eml
-701.7831067303752   -665.2911000342239
TRAIN_02763.eml
-1377.0891373739444   -1392.6436461454823
TRAIN_02764.eml
-671.6623367850596   -618.479660774471

-3086.7829686800756   -3065.6852888292224
TRAIN_03038.eml
-749.2984543313743   -690.4135655797347
TRAIN_03039.eml
-244.67443821055394   -212.18012365780692
TRAIN_03040.eml
-1584.5637889899672   -1490.5446251863068
TRAIN_03041.eml
-870.0704002637267   -890.4957269585709
TRAIN_03042.eml
-379.2450636136153   -362.20804609426636
TRAIN_03043.eml
-416.8337608008605   -404.37966316780717
TRAIN_03044.eml
-673.6865933428486   -716.0161952158215
TRAIN_03045.eml
-1031.4510086411585   -955.9948791015461
TRAIN_03046.eml
-517.7877405050392   -549.0947573932638
TRAIN_03047.eml
-472.2616563222741   -433.37444823701844
TRAIN_03048.eml
-443.4865206968688   -460.5617516556798
TRAIN_03049.eml
-1806.527510910243   -1740.824835800981
TRAIN_03050.eml
-1487.9980621814457   -1441.437612200054
TRAIN_03051.eml
-1869.1614286501633   -1740.1847788414827
TRAIN_03052.eml
-3154.5899306887877   -3071.0103325852215
TRAIN_03053.eml
-2201.539179409632   -2067.6051266329855
TRAIN_03054.eml
-992.730737138676   -926.0254622

-2798.6382968615553   -2608.154042308789
TRAIN_03373.eml
-539.2212031783456   -485.8524174195691
TRAIN_03374.eml
-1169.384734301377   -1119.2567049400093
TRAIN_03375.eml
-1026.5219253928278   -973.5063919104869
TRAIN_03376.eml
-1113.73458697335   -1013.4081342058801
TRAIN_03377.eml
-422.0967022214437   -449.7103634439881
TRAIN_03378.eml
-2079.8005450599644   -1929.8294735508355
TRAIN_03379.eml
-877.4613955081351   -927.8681188364708
TRAIN_03380.eml
-826.3899665898158   -800.4741614276367
TRAIN_03381.eml
-156.44288813010076   -159.76332949359704
TRAIN_03382.eml
-502.0781820970347   -475.7333184667656
TRAIN_03383.eml
-943.3828340499638   -918.9253042222487
TRAIN_03384.eml
-1252.291761614354   -1164.6788711627341
TRAIN_03385.eml
-4478.936200230451   -4437.6366530949645
TRAIN_03386.eml
-653.0375175931888   -618.9722854059305
TRAIN_03387.eml
-1038.4680915557512   -1083.4194484076606
TRAIN_03388.eml
-2948.9042769987554   -2841.8771252589563
TRAIN_03389.eml
-3626.184333710852   -3768.46186443

-673.2555252448633   -622.3846446512032
TRAIN_03672.eml
-338.84214200075655   -328.51325738756157
TRAIN_03673.eml
-771.9287150142667   -733.9019652974721
TRAIN_03674.eml
-1678.5732952982878   -1566.599795446149
TRAIN_03675.eml
-1006.1561958490543   -1069.3342261724779
TRAIN_03676.eml
-384.11747578631196   -418.95457023394613
TRAIN_03677.eml
-2061.8848756097764   -2107.032972457721
TRAIN_03678.eml
-4676.732917094742   -4909.0975851813955
TRAIN_03679.eml
-438.18276642475104   -409.5060386560486
TRAIN_03680.eml
-1891.6540018838334   -1737.5096137796033
TRAIN_03681.eml
-1275.2536137913794   -1171.8253371541523
TRAIN_03682.eml
-1303.1171830375517   -1364.1097499893126
TRAIN_03683.eml
-249.59598836291718   -234.7263202487253
TRAIN_03684.eml
-1252.7695410418523   -1157.3007767307834
TRAIN_03685.eml
-2944.4248400231095   -2898.008833304677
TRAIN_03686.eml
-1064.7977494157524   -995.0749653523322
TRAIN_03687.eml
-2068.3877297988356   -2119.9767639789143
TRAIN_03688.eml
-4451.627008898029   -462

TRAIN_03970.eml
-1094.6535275781805   -1021.949789515152
TRAIN_03971.eml
-977.5948547144753   -1001.7442852318123
TRAIN_03972.eml
-1071.2781058451735   -1044.7126064884153
TRAIN_03973.eml
-456.4409506554447   -421.31325000778565
TRAIN_03974.eml
-1290.3338731727945   -1185.8649983697314
TRAIN_03975.eml
-898.7442507730772   -813.5148897824206
TRAIN_03976.eml
-537.723763988474   -573.5173921255428
TRAIN_03977.eml
-1667.1315083465822   -1550.7273595090937
TRAIN_03978.eml
-1392.1500416772776   -1287.502727892146
TRAIN_03979.eml
-501.1845087859442   -459.1847123212165
TRAIN_03980.eml
-2000.5181157766576   -1893.7988714384487
TRAIN_03981.eml
-1646.8841563806661   -1569.0987489616823
TRAIN_03982.eml
-579.8993773771288   -552.6272918988374
TRAIN_03983.eml
-932.3308100578411   -887.2002206661107
TRAIN_03984.eml
-664.5855700589581   -632.4635072454381
TRAIN_03985.eml
-820.8953789808794   -791.7698876809486
TRAIN_03986.eml
-101.26653867345438   -109.9667021903031
TRAIN_03987.eml
-1983.459790905663

In [154]:
num = 0
total = 0
for i in labels_val:
    if(prediction[total] == int(labels_val[i])):
        num += 1
    total += 1
num/total

0.9835041239690078