In [10]:
from collections import Counter
from vectors import distance
import random 
import matplotlib.pyplot as plt
from collections import defaultdict
from statistic import mean
import math
import glob,re

## Naive Bayes

In [5]:
def tokenize(message):
    message = message.lower()
    all_words = re.findall('[a-z0-9]+',message)
    return set(all_words)


def count_words(training_set):
    counts = defaultdict(lambda: [0,0])

    for message, is_spam in training_set:
        for word in tokenize(message):
            counts[word][0 if is_spam else 1] += 1

    return counts

def words_probabilites(counts, total_spams, total_non_spams, k=0.5):
    return [(w,((spam+k)/(total_spams + 2*k)),((non_spam + k)/(total_non_spams + 2*k))) 
            for w,(spam,non_spam) in counts.items()]

def spam_probability(word_props,message):
    message_words = tokenize(message)
    log_prop_if_spam = log_prop_if_not_spam = 0

    for word, prob_if_spam, prob_if_not_spam in word_props:
        
        if word in message_words:
            log_prop_if_spam += math.log(prob_if_spam) 
            log_prop_if_not_spam += math.log(prob_if_not_spam)
        else:
            log_prop_if_spam += math.log(1-prob_if_spam) 
            log_prop_if_not_spam += math.log(1-prob_if_not_spam)


    prob_if_spam = math.exp(log_prop_if_spam)
    prob_if_not_spam = math.exp(log_prop_if_not_spam)

    return prob_if_spam/(prob_if_spam + prob_if_not_spam)


In [34]:
class NaiveBayesClassifier: 
    def __init__(self, k=0.5):
        self.k = k
        self.word_props = []

    def train(self, training_set):

        num_spams = len([message for message,is_spam in training_set if is_spam])
        num_non_spams = len(training_set) - num_spams 

        word_counts = count_words(training_set)

        self.word_props = words_probabilites(word_counts,num_spams,num_non_spams,self.k)

    def classify(self,message):
        return spam_probability(self.word_props, message)

In [30]:
path = r"spam\*"

data = []

for fn in glob.glob('spam/*/*'):
    is_spam = 'ham' not in fn
    try:
        with open(fn,'r') as file:
            for line in file:
                if line.startswith("Subject:"):
                    subject = re.sub(r"^Subject: ", "", line).strip()
                    data.append((subject, is_spam))
                    break
    except:
        pass 
        #print(f'erro ao abrir arquivo: {fn}')

print(len(data))

3074


In [32]:
def split_data(data, prop):
    results = [[],[]]

    for num in data:
        results[0 if random.random() < prop else 1].append(num)

    return results


In [36]:
random.seed(0)

train_data, test_data = split_data(data, 0.75)

classifier = NaiveBayesClassifier()

classifier.train(train_data)


classified = [(subject, is_spam, classifier.classify(subject)) for subject, is_spam in test_data]

counts = Counter((is_spam,spam_probability > 0.5) for _,is_spam,spam_probability in classified)

counts.most_common()

[((False, False), 648),
 ((True, True), 69),
 ((True, False), 35),
 ((False, True), 22)]

In [40]:
classified.sort(key=lambda row:row[2], reverse=True)
classified

[('MSNBC: Rates Hit 18 year Low 4.75% ...28940', True, 0.9999999941001101),
 ('Last time you spent $25 did u make $100,000,s? Well this time you will. wju',
  True,
  0.999999988485202),
 ('HOME REPS WANTED-FORTUNE 500 COMP HIRING', True, 0.9999999761928421),
 ('FORTUNE 500 WORK AT HOME REPS NEEDED!', True, 0.9999998981122058),
 ('Order your Viagra and weight-loss here 6117kFvc5--9',
  True,
  0.9999998900992519),
 ('Fw: PROTECT YOUR COMPUTER,YOU NEED SYSTEMWORKS!DGYWAOG',
  True,
  0.9999998655671016),
 ('ADV: Lowest life insurance rates available!                                                   moode',
  True,
  0.9999997977148453),
 ('[ILUG] Earn 100.000$ in one year working at home',
  True,
  0.9999996140424159),
 ('ADV: Extended Auto Warranties Here hvgxs', True, 0.9999990779082213),
 ('International calls for only 33 cents per minute with no subscription',
  True,
  0.9999974955038126),
 ('[ILUG] Guaranteed to lose 10-12 lbs in 30 days 10.206',
  True,
  0.9999968704592087),
 

In [43]:
spammiest_hams = list(filter(lambda row: not row[1], classified))
hammiest_spams = list(filter(lambda row: row[1], classified))

print("spammiest_hams",spammiest_hams[-5:])
print("hammiest_spams",hammiest_spams[:5])

spammiest_hams [('Re: [SAtalk] checking out Razor2 (and SA 2.41) install - Net::DNS:Resolver problem?', False, 1.0287954956778856e-09), ('Re: [Razor-users] Problem with Razor 2.14 and Spamassassin 2.41', False, 3.5645248760271325e-10), ('Re[2]: Selling Wedded Bliss (was Re: Ouch...)', False, 2.1889677222997326e-10), ('RE: Re[2]: Selling Wedded Bliss (was Re: Ouch...)', False, 2.1889677222997326e-10), ('Re[2]: Selling Wedded Bliss (was Re: Ouch...)', False, 2.1889677222997326e-10)]
hammiest_spams [('MSNBC: Rates Hit 18 year Low 4.75% ...28940', True, 0.9999999941001101), ('Last time you spent $25 did u make $100,000,s? Well this time you will. wju', True, 0.999999988485202), ('HOME REPS WANTED-FORTUNE 500 COMP HIRING', True, 0.9999999761928421), ('FORTUNE 500 WORK AT HOME REPS NEEDED!', True, 0.9999998981122058), ('Order your Viagra and weight-loss here 6117kFvc5--9', True, 0.9999998900992519)]
