# 手刻基本Naive Bayes模型

#### 學習重點：理解單純貝氏模型原理

---

In [1]:
!unzip -q spam_data.zip

In [2]:
import os
import re
import math

def tokenize(message):
    """Return the set of distinct lowercase alphanumeric tokens in ``message``.

    The text is lowercased, then every maximal run of the characters
    ``a-z`` / ``0-9`` is taken as one token. A set is returned, so repeated
    words are counted once and word order is discarded.
    """
    lowered = message.lower()
    return {token for token in re.findall("[a-z0-9]+", lowered)}

### 讀入資料並分割為 train / test set

In [3]:
# Walk spam_data/ and collect every line that starts with "Subject:" as one
# sample (X) together with the label of the file it came from (Y).
X, Y = [], []
for folder, _subfolders, file_names in os.walk("spam_data"):
    for file_name in file_names:
        path = os.path.join(folder, file_name)
        # Any path containing "ham" is labelled non-spam; everything else is spam.
        is_spam = 'ham' not in path
        # errors='ignore' drops bytes that are not valid UTF-8 instead of raising.
        with open(path, encoding='utf-8', errors='ignore') as handle:
            for line in handle:
                if not line.startswith('Subject:'):
                    continue
                # Keep only the text after the "Subject:" prefix.
                X.append(re.sub(r'^Subject:', '', line).strip())
                Y.append(is_spam)

In [4]:
from sklearn.model_selection import train_test_split
# random_state is fixed so that every student gets the same split; it can be removed in normal use.
# 20% of the samples are held out for testing, and stratify=Y keeps the
# spam / non-spam proportion the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0, stratify=Y)

In [5]:
# Pair every subject with its label as (subject, is_spam) tuples.
train_data = list(zip(X_train, y_train))
test_data = list(zip(X_test, y_test))

---

### defaultdict用法示範

In [6]:
from collections import defaultdict

# A key that is missing on first access is created with a fresh [0, 0] pair,
# so counters can be incremented without checking whether the key exists.
counts = defaultdict(lambda: [0, 0])
# (word, slot, amount) updates applied in order; slot is the index inside the pair.
for word, slot, amount in [
    ('you', 0, 1),
    ('hi', 0, 1),
    ('hi', 1, 2),
    ('no', 1, 1),
    ('no', 0, 8),
]:
    counts[word][slot] += amount
print(f"dic : {counts}")
print(f"you : {counts['you']}")

dic : defaultdict(<function <lambda> at 0x7f423f4075f0>, {'you': [1, 0], 'hi': [1, 2], 'no': [8, 1]})
you : [1, 0]


### 創造一個字典，格式如 {'hi': [1, 0]}：第一個數字是該字出現在垃圾郵件中的次數，第二個數字是該字出現在非垃圾郵件中的次數

In [7]:
def count_words(training_set):
    """Count, per word, how many spam and non-spam messages contain it.

    Args:
        training_set: iterable of ``(message, is_spam)`` pairs.

    Returns:
        A ``defaultdict`` mapping each word to ``[spam_count, non_spam_count]``.
        A word is counted at most once per message because ``tokenize``
        returns a set.
    """
    tally = defaultdict(lambda: [0, 0])
    for text, spam_flag in training_set:
        # Index 0 collects spam messages, index 1 collects non-spam messages.
        column = 0 if spam_flag else 1
        for token in tokenize(text):
            tally[token][column] += 1
    return tally

---

## 計算 p(w|spam) / p(w|non_spam)
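* 其中 k 為平滑（smoothing）用的超參數，用來確保分子與分母皆不為 0：$p(w \mid \text{spam}) = \dfrac{\text{count}(w, \text{spam}) + k}{N_{\text{spam}} + 2k}$，$p(w \mid \text{non\_spam})$ 的算法相同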

In [8]:
def word_probabilities(counts, total_spams, total_non_spams, k=0.5):
    """Turn per-word message counts into smoothed conditional probabilities.

    Args:
        counts: mapping ``word -> [spam_count, non_spam_count]``.
        total_spams: number of spam messages in the training data.
        total_non_spams: number of non-spam messages in the training data.
        k: smoothing constant added to every count (and twice to each total).

    Returns:
        A list of ``(word, p(word | spam), p(word | non_spam))`` triples, one
        per key of ``counts``, in the mapping's iteration order.
    """
    spam_denominator = total_spams + 2 * k
    non_spam_denominator = total_non_spams + 2 * k
    triples = []
    for word, tally in counts.items():
        # tally[0] = spam messages containing the word, tally[1] = non-spam ones.
        p_given_spam = (tally[0] + k) / spam_denominator
        p_given_non_spam = (tally[1] + k) / non_spam_denominator
        triples.append((word, p_given_spam, p_given_non_spam))
    return triples

---

## 計算貝氏結果

In [9]:
def spam_probability(word_probs, message, is_spam_probability, is_not_spam_probability):
    """Return P(spam | message) under a Bernoulli Naive Bayes model.

    Every vocabulary word contributes to both class likelihoods: with
    ``p(w | class)`` if the word occurs in the message and with
    ``1 - p(w | class)`` if it does not. The sums are accumulated in log
    space and combined with the class priors.

    Args:
        word_probs: iterable of ``(word, p(word | spam), p(word | non_spam))``.
        message: raw text to classify; it is split into words by ``tokenize``.
        is_spam_probability: prior probability of spam.
        is_not_spam_probability: prior probability of non-spam.

    Returns:
        The posterior probability, in [0, 1], that ``message`` is spam.

    Raises:
        ValueError: if a prior is 0, or a word probability is exactly 0 or 1,
            because the logarithm of 0 is undefined.
    """
    # Distinct words of the message being classified.
    message_words = tokenize(message)
    log_prob_if_spam = log_prob_if_not_spam = 0.
    for word, prob_if_spam, prob_if_not_spam in word_probs:
        if word in message_words:
            # Word is present: add log p(w | class) for each class.
            log_prob_if_spam += math.log(prob_if_spam)
            log_prob_if_not_spam += math.log(prob_if_not_spam)
        else:
            # Word is absent: add log(1 - p(w | class)), the probability that
            # a message of that class does not contain the word.
            log_prob_if_spam += math.log(1 - prob_if_spam)
            log_prob_if_not_spam += math.log(1 - prob_if_not_spam)
    log_prob_if_spam += math.log(is_spam_probability)
    log_prob_if_not_spam += math.log(is_not_spam_probability)

    # Shift both log scores by the larger one before exponentiating. The ratio
    # is unchanged, but the larger term becomes exp(0) == 1, so the denominator
    # can never underflow to 0 (which previously raised ZeroDivisionError for
    # messages with many words).
    largest = max(log_prob_if_spam, log_prob_if_not_spam)
    prob_if_spam = math.exp(log_prob_if_spam - largest)
    prob_if_not_spam = math.exp(log_prob_if_not_spam - largest)
    # Bayes' rule: normalise the spam score by the sum of both class scores.
    return prob_if_spam / (prob_if_spam + prob_if_not_spam)

---

### 打包整個模型

In [10]:
class NaiveBayesClassifier:
    """Naive Bayes spam classifier built from the notebook's helper functions.

    ``train`` estimates the class priors and the per-word conditional
    probabilities; ``classify`` returns the spam probability of a message.
    """

    def __init__(self, k=0.5):
        """Create an untrained classifier.

        Args:
            k: smoothing constant forwarded to ``word_probabilities``.
        """
        self.k = k
        self.word_probs = []
        # Class priors; they stay None until train() has been called.
        self.is_spam_probability = None
        self.is_not_spam_probability = None

    def train(self, training_set):
        """Fit the classifier.

        Args:
            training_set: iterable of ``(message, is_spam)`` pairs. It is
                materialised into a list first, so one-shot iterables such as
                generators are accepted.

        Raises:
            ValueError: if ``training_set`` is empty.
        """
        training_set = list(training_set)
        if not training_set:
            raise ValueError("training_set must contain at least one (message, is_spam) pair")

        # Number of spam messages and of non-spam messages.
        num_spams = sum(1 for _, is_spam in training_set if is_spam)
        num_non_spams = len(training_set) - num_spams

        # Class priors estimated from the label frequencies.
        self.is_spam_probability = num_spams / (num_spams + num_non_spams)
        self.is_not_spam_probability = num_non_spams / (num_spams + num_non_spams)
        # Per-word [spam_count, non_spam_count], then smoothed into
        # (word, p(word | spam), p(word | non_spam)) triples.
        word_counts = count_words(training_set)
        self.word_probs = word_probabilities(word_counts, num_spams, num_non_spams, self.k)

    def classify(self, message):
        """Return the probability that ``message`` is spam.

        Raises:
            RuntimeError: if the classifier has not been trained yet.
        """
        if self.is_spam_probability is None or self.is_not_spam_probability is None:
            raise RuntimeError("classify() called before train()")
        return spam_probability(self.word_probs, message, self.is_spam_probability, self.is_not_spam_probability)

---

### Fit 訓練集

In [11]:
classifier=NaiveBayesClassifier()

In [12]:
classifier.train(train_data)

### 預測

In [13]:
from sklearn.metrics import precision_score, recall_score, accuracy_score

# Ground-truth labels of the held-out test set.
y_true = [is_spam for _, is_spam in test_data]
# A subject is predicted as spam when its spam probability exceeds 0.5.
y_pred = [classifier.classify(subject) > 0.5 for subject, _ in test_data]

# Labels are booleans, so with scikit-learn's default pos_label=1 the
# positive class for precision and recall is True (spam).
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
accuracy = accuracy_score(y_true, y_pred)
# Report each metric as a percentage with two decimals.
print(f"precision : {precision * 100:.2f}%")
print(f"recall : {recall * 100:.2f}%")
print(f"accuracy : {accuracy * 100:.2f}%")

precision : 90.00%
recall : 53.47%
accuracy : 92.26%


In [14]:
import pandas as pd

# Map the boolean labels to readable names. This cell rebinds y_true / y_pred
# to Series of strings, so the identity entries for 'spam' / 'ham' keep the
# cell safe to run more than once: with a bool-only mapping, a second run
# would turn every label into NaN and produce an empty table.
label_names = {True: 'spam', False: 'ham', 'spam': 'spam', 'ham': 'ham'}
y_true = pd.Series(y_true).map(label_names)
y_pred = pd.Series(y_pred).map(label_names)
# Rows are the actual labels, columns the predicted ones; margins=True adds the "All" totals.
confusion_matrix = pd.crosstab(y_true, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

In [15]:
# Display the table: rows are actual labels, columns are predicted labels, "All" holds the totals.
confusion_matrix

Predicted,ham,spam,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ham,578,6,584
spam,47,54,101
All,625,60,685
