In [29]:
import re,glob
from collections import defaultdict, Counter
import random, math

In [40]:
def tokenize(message):
    message = message.lower()
    all_words = re.findall("[a-z0-9]+",message)
    return set(all_words)

def count_words(training_set):
    """zbior treningowy to para (message,is_spam)"""
    counts = defaultdict(lambda: [0,0])
    for message, is_spam in training_set:
        for word in tokenize(message):
            counts[word][0 if is_spam else 1] += 1
    return counts

def word_probabilities(counts,total_spams,total_non_spams,k=0.5):
    """zwrocenie 3-elementowej listy zawierajace slowo, prawdopodobienstwo wystapienia w spamie i prawdopodobienstwa nie bycia spamem"""
 
    return [(w,(spam +k)/(total_spams + 2 *k),
            (non_spam + k)/(total_non_spams +2 * k))
            for w,(spam,non_spam) in counts.items()]

def spam_probability(word_probs, message):
    """prawdopodbienstwo wystapienia slow w celu przypisania prawdopodobienstw do wiadomosci"""
    message_words = tokenize(message)
    log_prob_if_spam = log_prob_if_not_spam = 0.0

    for word, prob_if_spam, prob_if_not_spam in word_probs:
        if word in message_words:
            log_prob_if_spam += math.log(prob_if_spam)
            log_prob_if_not_spam += math.log(prob_if_not_spam)
        else:
            log_prob_if_spam += math.log(1.0 - prob_if_spam)
            log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam)

    prob_if_spam = math.exp(log_prob_if_spam)
    prob_if_not_spam = math.exp(log_prob_if_not_spam)
    return prob_if_spam / (prob_if_spam + prob_if_not_spam)

In [35]:
class NaiveBayesClassifier:
    def __init__(self,k=0.5):
        self.k = k
        self.word_probs = []
        
    def train(self,training_set):
        num_spams = len([is_spam for message, is_spam in training_set if is_spam])
        num_non_spams = len(training_set) - num_spams
        
        #przetworzenie zbioru danych
        word_counts = count_words(training_set)
        self.word_probs = word_probabilities(word_counts,num_spams,num_non_spams,self.k)
        
    def classify(self,message):
        return spam_probability(self.word_probs,message)
        
        

In [59]:
path = r"D:\\studia\\spam\\*\\*\\**"

data = []

for fn in glob.glob(path):
    is_spam = "ham" not in fn

    with open(fn,'r',encoding='ISO-8859-1') as file:
        for line in file:
            if line.startswith("Subject:"):
                subject = re.sub(r"^Subject: ","", line).strip()
                data.append((subject, is_spam))

#metoda z pliku machine_learning
def split_data(data, prob):
    """split data into fractions [prob, 1 - prob]"""
    results = [], []
    for row in data:
        results[0 if random.random() < prob else 1].append(row)
    return results

#dodatkowa metoda, ktora na podstawie twierdzenia bayesa oblicza prawdopodobienstwa spamu
def p_spam_given_word(word_prob):
    word, prob_if_spam, prob_if_not_spam = word_prob
    return prob_if_spam / (prob_if_spam + prob_if_not_spam)

In [61]:
#podzielenie zbioru na treningowy i testowy oraz zbudowanie klasyfikatora
random.seed(0)
train_data,test_data = split_data(data,0.6)

classifier = NaiveBayesClassifier()
classifier.train(train_data)

classified = [(subject, is_spam,classifier.classify(subject)) for subject,is_spam in test_data]
counts = Counter((is_spam,spam_probability > 0.5) for _,is_spam,spam_probability in classified)

classified.sort(key=lambda row: row[2])
#najwieksze prawdopodbienstwo spamu wsrod wiadomosci niebedacych spamem

spammiest_hams = list(filter(lambda row: not row[1], classified))[-5:]

#najmniejsze prawdopodobienstwo spamu wsrod wiadomosci bedacych spamem

hammiest_spams = list(filter(lambda row: row[1], classified))[:5]

print("spammiest_hams", spammiest_hams,'\n')
print("hammiest_spams", hammiest_spams,'\n')

words = sorted(classifier.word_probs, key=p_spam_given_word)

#najwieksze prawdopodbienstwo spamu
spammiest_words = words[-5:]
#najwieksze prawdopodobienstwo ze nie jest spamem
hammiest_words = words[:5]

print("spammiest_words", spammiest_words,'\n')
print("hammiest_words", hammiest_words)

spammiest_hams [('FREE SHIPPING! No Minimum Purchase* at Buy.com', False, 0.9814624012504162), ('=?iso-8859-1?Q?Matrox_Parhelia=99_now_available?=', False, 0.9871924463035405), ('Cost price Guinness, Budweiser and selected spirits at tesco.ie', False, 0.9898231096678337), ('=?iso-2022-jp?B?GyRCRnxLXDhsJE43b0w+IUolNSVWJTglJyUvJUghSyEhJTkbKEI=?=', False, 0.9910012705992947), ('Four free e-mailers reviewed, Get the gear you need to burn DVDs', False, 0.9999182370913647)] 

hammiest_spams [('I was so scared... my very first DP', True, 2.063843818875714e-05), ('Not too old to put out!                   26792', True, 0.00041945396601744613), ('The Flight to Safety is Upon Us', True, 0.000987253170881586), ("UK's Leading PC Specialist", True, 0.0015306190853744126), ('Looking for property in SPAIN?', True, 0.002282369371051323)] 

spammiest_words [('sale', 0.026501766784452298, 0.00029291154071470416), ('need', 0.026501766784452298, 0.00029291154071470416), ('zzzz', 0.030035335689045935, 0.00