In [1]:
from machine_learning import split_data
from sklearn.naive_bayes import GaussianNB
from sklearn import preprocessing
from collections import defaultdict, Counter
import random, math
import re
import glob

In [3]:
def tokenize(message):
    message = message.lower() #Małe litery
    all_words = re.findall("[a-z0-9]+", message) #Wyciagnij słowa
    return set(all_words) #Usun duplikaty

In [4]:
#Tworzy słownik, którego kluczami są słowa, 
#a wartościami lista zawierająca liczbę wystąpień tego słowa 
#w wiadomościach będących i niebędących spamem.
def count_words(training_set):
    counts = defaultdict(lambda: [0,0])
    for message, is_spam in training_set:
        for word in tokenize(message):
            counts[word][0 if is_spam else 1] += 1
    return counts

In [5]:
#Zamiana sum z metody wyżej na szacunki prawdopodobieństwa
def word_probabilities(counts, total_spams, total_non_spams, k=0.5):
    return [(w,
             (spam + k) / (total_spams + 2 * k),
             (non_spam + k) / (total_non_spams + 2 * k))
             for w, (spam, non_spam) in counts.items()]

In [6]:
#wykorzystanie naiwnego klasyfikatora do 
#przypisania prawdopodobieństw do wiadomości
def spam_probability(word_probs, message):
    message_words = tokenize(message)
    log_prob_if_spam = log_prob_if_not_spam = 0.0
    #iteracja po wszystkich słowach w słowniku
    for word, prob_if_spam, prob_if_not_spam in word_probs:
        
        if word in message_words:
            log_prob_if_spam += math.log(prob_if_spam)
            log_prob_if_not_spam += math.log(prob_if_not_spam)
        else:
            log_prob_if_spam += math.log(1.0 - prob_if_spam)
            log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam)

    prob_if_spam = math.exp(log_prob_if_spam)
    prob_if_not_spam = math.exp(log_prob_if_not_spam)
    return prob_if_spam / (prob_if_spam + prob_if_not_spam)

In [7]:
#Połączenie elementów w celu uzyskania klasyfikatora

class NaiveBayesClassifier:
    
    def __init__(self, k=0.5):
        self.k = k
        self.word_probs = []

    def train(self, training_set):
        #policzenie wiadomości będących/niebędących spamem
        num_spams = len([is_spam
                         for message, is_spam in training_set
                         if is_spam])
        num_non_spams = len(training_set) - num_spams
        
        #zbiór treningowy
        word_counts = count_words(training_set)
        self.word_probs = word_probabilities(word_counts,
                                             num_spams,
                                             num_non_spams,
                                             self.k)

    def classify(self, message):
        return spam_probability(self.word_probs, message)

In [25]:
#Testowanie modelu
path = r"F:\\_Semestr7\\Laborki_Zacniewski\\Spam\\spam\\*"
data = []

for fn in glob.glob(path):
    is_spam = "ham" not in fn

    with open(fn,'r',encoding='ISO-8859-1') as file:
        for line in file:
            if line.startswith("Subject:"):
                subject = re.sub(r"^Subject: ", "", line).strip()
                data.append((subject, is_spam))
                
random.seed(0)
train_data, test_data = split_data(data, 0.70)

classifier = NaiveBayesClassifier()
classifier.train(train_data)

classified = [(subject, is_spam, classifier.classify(subject))
              for subject, is_spam in test_data]

counts = Counter((is_spam, spam_probability > 0.5)
                 for _, is_spam, spam_probability in classified)

print(counts)
print('\n\n')

#Sortuj według wartości spam_probability
classified.sort(key=lambda row: row[2])

#Największe prawdopodob. spamu uzyskane wśród wiadomości niebędących spamem:
spammiest_hams = list(filter(lambda row: row[1], classified))[-5:]
#Najmniejsze prawdopodob. spamu uzyskane wśród wiadomości będących spamem:
hammiest_spams = list(filter(lambda row: row[1], classified))[:5]

print("spammiest_hams", spammiest_hams)
print('\n\n')

print("hammiest_spams", hammiest_spams)
print('\n\n')
    

Counter({(True, True): 157})



spammiest_hams [('Hit the Road with CNA', True, 1.0), ('$10 a hour for watching e-mmercials! No joke!', True, 1.0), ("Today's Special: Amazing Penetrations No. 17 29264", True, 1.0), ("GOV'T GUARANTEED HOME BUSINESS", True, 1.0), ('[ILUG] MANUEL OKO', True, 1.0)]



hammiest_spams [('Life Insurance - Why Pay More?', True, 1.0), ('[ILUG] Guaranteed to lose 10-12 lbs in 30 days 10.206', True, 1.0), ('FORTUNE 500 COMPANY HIRING, AT HOME REPS.', True, 1.0), ('^^^^^Cell Phone Belt Clips $1.95^^^^^^                           18070', True, 1.0), ('FREE Cell Phone + $50 Cash Back!', True, 1.0)]





In [26]:
#Najczęściej używane słowa w spamie
def p_spam_given_word(word_prob):
    word, prob_if_spam, prob_if_not_spam = word_prob
    return prob_if_spam / (prob_if_spam + prob_if_not_spam)

words = sorted(classifier.word_probs, key=p_spam_given_word)

spammiest_words = words[-5:]
hammiest_words = words[:5]

print("spammiest_hams", spammiest_hams)
print('\n\n')

print("hammiest_spams", hammiest_spams)
print('\n\n')

spammiest_hams [('Hit the Road with CNA', True, 1.0), ('$10 a hour for watching e-mmercials! No joke!', True, 1.0), ("Today's Special: Amazing Penetrations No. 17 29264", True, 1.0), ("GOV'T GUARANTEED HOME BUSINESS", True, 1.0), ('[ILUG] MANUEL OKO', True, 1.0)]



hammiest_spams [('Life Insurance - Why Pay More?', True, 1.0), ('[ILUG] Guaranteed to lose 10-12 lbs in 30 days 10.206', True, 1.0), ('FORTUNE 500 COMPANY HIRING, AT HOME REPS.', True, 1.0), ('^^^^^Cell Phone Belt Clips $1.95^^^^^^                           18070', True, 1.0), ('FREE Cell Phone + $50 Cash Back!', True, 1.0)]



