# Naive Bayes Classifier

In [1]:
# Toy corpus for the hand-written classifier below.
# Each entry is [whitespace-separated message text, label].
# label = 1 : SPAM
# label = 0 : NORMAL
training_set = [
    ['me free lottery', 1],
    ['free get free you', 1],
    ['you free scholarship', 0],
    ['free to contact me', 0],
    ['you won award', 0],
    ['you ticket lottery', 1]
]

In [2]:
from collections import defaultdict

# Per-label totals (index 0 = normal, index 1 = spam); accumulated in a later cell.
doc_cnt0, doc_cnt1 = 0, 0

# wordfreq[word] -> [occurrences in normal mail, occurrences in spam mail]
wordfreq = defaultdict(lambda: [0, 0])

for text, cls in training_set:
    for token in text.split():
        # The label (0 or 1) doubles as the index into the count pair.
        wordfreq[token][cls] += 1

wordfreq

defaultdict(<function __main__.<lambda>()>,
            {'me': [1, 1],
             'free': [2, 3],
             'lottery': [0, 2],
             'get': [0, 1],
             'you': [2, 2],
             'scholarship': [1, 0],
             'to': [1, 0],
             'contact': [1, 0],
             'won': [1, 0],
             'award': [1, 0],
             'ticket': [0, 1]})

In [3]:
# Total number of word occurrences recorded under each label. Despite the
# "doc_cnt" names these are token totals: they are the sums of the per-word
# counts in wordfreq, and serve as the denominators of P(word | label).
#
# The totals are recomputed from scratch rather than accumulated with `+=`
# onto the previous values, so re-running this cell always yields the same
# result instead of doubling the counts.
doc_cnt0 = sum(cnt0 for cnt0, _ in wordfreq.values())  # normal count
doc_cnt1 = sum(cnt1 for _, cnt1 in wordfreq.values())  # spam count

doc_cnt0, doc_cnt1

(10, 10)

In [4]:
# Smoothed conditional probabilities: cond_prob_dict[word] -> [P(word | normal), P(word | spam)].
k = 0.5  # pseudo-count added to every word count (additive smoothing)

# Start from a copy so the container type and key order match wordfreq;
# every value is replaced below, the raw counts in wordfreq stay untouched.
cond_prob_dict = wordfreq.copy()

for key, counts in wordfreq.items():
    cnt0, cnt1 = counts

    # Probability of the word appearing given a normal mail.
    p_word_norm = (k + cnt0) / (2 * k + doc_cnt0)

    # Probability of the word appearing given a spam mail.
    p_word_spam = (k + cnt1) / (2 * k + doc_cnt1)

    cond_prob_dict[key] = [p_word_norm, p_word_spam]

cond_prob_dict

defaultdict(<function __main__.<lambda>()>,
            {'me': [0.13636363636363635, 0.13636363636363635],
             'free': [0.22727272727272727, 0.3181818181818182],
             'lottery': [0.045454545454545456, 0.22727272727272727],
             'get': [0.045454545454545456, 0.13636363636363635],
             'you': [0.22727272727272727, 0.22727272727272727],
             'scholarship': [0.13636363636363635, 0.045454545454545456],
             'to': [0.13636363636363635, 0.045454545454545456],
             'contact': [0.13636363636363635, 0.045454545454545456],
             'won': [0.13636363636363635, 0.045454545454545456],
             'award': [0.13636363636363635, 0.045454545454545456],
             'ticket': [0.045454545454545456, 0.13636363636363635]})

In [6]:
import numpy as np
# Same table as cond_prob_dict, but holding natural logs:
# log_cond_prob_dict[word] -> [log P(word | normal), log P(word | spam)].
log_cond_prob_dict = {
    word: [np.log(prob) for prob in probs]
    for word, probs in cond_prob_dict.items()
}
log_cond_prob_dict

{'me': [-1.9924301646902063, -1.9924301646902063],
 'free': [-1.4816045409242156, -1.1451323043030026],
 'lottery': [-3.0910424533583156, -1.4816045409242156],
 'get': [-3.0910424533583156, -1.9924301646902063],
 'you': [-1.4816045409242156, -1.4816045409242156],
 'scholarship': [-1.9924301646902063, -3.0910424533583156],
 'to': [-1.9924301646902063, -3.0910424533583156],
 'contact': [-1.9924301646902063, -3.0910424533583156],
 'won': [-1.9924301646902063, -3.0910424533583156],
 'award': [-1.9924301646902063, -3.0910424533583156],
 'ticket': [-3.0910424533583156, -1.9924301646902063]}

In [9]:
# Class priors P(normal) and P(spam).
# A prior is the share of training *documents* carrying each label, so it is
# counted from training_set directly. doc_cnt0 / doc_cnt1 hold word-token
# totals; using them would weight long mails more heavily (the two ways of
# counting only happen to agree on this toy corpus).
n_docs_norm = sum(1 for _, label in training_set if label == 0)
prob_norm = n_docs_norm / len(training_set)
prob_spam = 1 - prob_norm

prob_norm, prob_spam

(0.5, 0.5)

In [12]:
# Unnormalised posteriors for the query words "free lottery":
# exp(log P(free | c) + log P(lottery | c) + log P(c)) for c in {normal, spam}.
log_joint_norm = (
    log_cond_prob_dict['free'][0]
    + log_cond_prob_dict['lottery'][0]
    + np.log(prob_norm)
)
prob_norm_given_free_lottery_numerator = np.exp(log_joint_norm)

log_joint_spam = (
    log_cond_prob_dict['free'][1]
    + log_cond_prob_dict['lottery'][1]
    + np.log(prob_spam)
)
prob_spam_given_free_lottery_numerator = np.exp(log_joint_spam)

# Normalising constant: the sum of both class numerators.
prob_normspam_given_free_lottery_numerator = (
    prob_norm_given_free_lottery_numerator
    + prob_spam_given_free_lottery_numerator
)

In [13]:
# Divide each class numerator by their sum to obtain the posteriors
# P(normal | free, lottery) and P(spam | free, lottery).
evidence = prob_normspam_given_free_lottery_numerator
prob_norm_given_free_lottery = prob_norm_given_free_lottery_numerator / evidence
prob_spam_given_free_lottery = prob_spam_given_free_lottery_numerator / evidence

prob_norm_given_free_lottery, prob_spam_given_free_lottery

(0.12500000000000008, 0.8749999999999999)

## Naive Bayesian 스팸 탐지 함수 작성

In [104]:
def detect_spam_NB(training_set, email_words, unseen_word_accept=True):
    """Return P(spam | email_words) under a two-class naive Bayes model.

    Parameters
    ----------
    training_set : iterable of (str, int)
        Pairs of whitespace-separated message text and label, where label 1
        means spam and label 0 means normal mail.
    email_words : iterable of str
        Words of the message to classify.
    unseen_word_accept : bool, default True
        How to treat words that never occur in ``training_set``.
        True: the word is ignored, i.e. it is equally likely under both
        classes and contributes no evidence.
        False: the call is rejected with ``AssertionError``.

    Returns
    -------
    float
        Posterior probability of spam, rounded to 3 decimals. With no
        (known) words the result is the spam prior.

    Raises
    ------
    AssertionError
        If ``unseen_word_accept`` is False and ``email_words`` contains a
        word absent from the training vocabulary.

    Notes
    -----
    Word likelihoods are smoothed as ``(k + count) / (2 * k + class_tokens)``
    with ``k = 0.5``. Class priors are the share of training documents per
    label; if the training set is empty a uniform prior is used.
    """
    import math
    from collections import defaultdict

    k = 0.5  # additive-smoothing pseudo-count
    email_words = list(email_words)  # iterated more than once below

    # Per-label document counts (for the priors) and per-word token counts.
    doc_totals = [0, 0]
    wordfreq = defaultdict(lambda: [0, 0])
    for doc, label in training_set:
        doc_totals[label] += 1
        for word in doc.split():
            wordfreq[word][label] += 1

    # Vocabulary check. Raised explicitly instead of through an `assert`
    # statement so the validation is not stripped when Python runs with -O.
    if not unseen_word_accept and any(word not in wordfreq for word in email_words):
        raise AssertionError("Invalid word list.")

    # Total tokens observed per label: denominators of P(word | label).
    token_totals = [
        sum(counts[0] for counts in wordfreq.values()),
        sum(counts[1] for counts in wordfreq.values()),
    ]

    # Log priors. A label without any document has prior 0 (log = -inf).
    n_docs = sum(doc_totals)
    if n_docs == 0:
        log_joint = [math.log(0.5), math.log(0.5)]
    else:
        log_joint = [
            math.log(n / n_docs) if n else float("-inf")
            for n in doc_totals
        ]

    # Accumulate log-likelihoods. Unseen words are skipped: they carry no
    # evidence, and must not alter the counts the model was trained on.
    for word in email_words:
        if word not in wordfreq:
            continue
        counts = wordfreq[word]
        for label in (0, 1):
            log_joint[label] += math.log(
                (k + counts[label]) / (2 * k + token_totals[label])
            )

    # Normalise in log space so long messages cannot underflow to 0/0:
    # P(spam | words) = 1 / (1 + exp(log_joint_norm - log_joint_spam)).
    # The branch keeps the exponent non-positive, so exp() never overflows.
    log_odds_norm = log_joint[0] - log_joint[1]
    if log_odds_norm >= 0:
        odds_spam = math.exp(-log_odds_norm)
        prob_spam_given_words = odds_spam / (1.0 + odds_spam)
    else:
        prob_spam_given_words = 1.0 / (1.0 + math.exp(log_odds_norm))

    return round(prob_spam_given_words, 3)

## Test

In [105]:
# 'award' was only seen in normal mail, so the message leans towards normal.
detect_spam_NB(
    training_set,
    email_words=['me', 'award'],
    unseen_word_accept=False,
)

0.25

In [106]:
# Same query as the step-by-step calculation above.
detect_spam_NB(
    training_set,
    email_words=['free', 'lottery'],
    unseen_word_accept=False,
)

0.875

In [107]:
# Adding 'ticket' (seen only in spam) raises the spam probability.
detect_spam_NB(
    training_set,
    email_words=['free', 'lottery', 'ticket'],
    unseen_word_accept=False,
)

0.955

In [108]:
# 'me' occurs equally often in normal and spam mail, so it adds no information.
detect_spam_NB(
    training_set,
    email_words=['free', 'lottery', 'ticket', 'me'],
    unseen_word_accept=False,
)

0.955

In [111]:
# 'award' (seen only in normal mail) pulls the spam probability back down.
detect_spam_NB(
    training_set,
    email_words=['free', 'lottery', 'ticket', 'me', 'award'],
    unseen_word_accept=False,
)

0.875

In [109]:
# 'aha' and 'gogo' are not in the training data; they are weighted equally
# for normal and spam mail, so they add no information.
detect_spam_NB(
    training_set,
    email_words=['free', 'lottery', 'ticket', 'aha', 'gogo'],
    unseen_word_accept=True,
)

0.955

In [110]:
# With unseen_word_accept=False, words missing from the training data are
# rejected. The error is the point of this cell, so it is caught and shown
# instead of being left uncaught, which would stop "Run All" before the
# scikit-learn section below.
try:
    detect_spam_NB(training_set,
                   ['free', 'lottery', 'ticket', 'aha', 'gogo'],
                   False)
except AssertionError as err:
    print("{}: {}".format(type(err).__name__, err))

AssertionError: Invalid word list.

# Scikit-learn Multinomial Naive Bayesian

In [112]:
from sklearn.datasets import fetch_20newsgroups
# Load the training split of the 20 Newsgroups corpus.
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)

In [118]:
# First training document: raw post text, header lines included.
twenty_train.data[0]

"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"

In [128]:
# Newsgroup names; the integer targets are used below as indices into this list.
twenty_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [119]:
# Integer category label of each training document.
twenty_train.target

array([7, 4, 4, ..., 3, 1, 8])

In [121]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

In [122]:
# Three-stage text classifier: raw text -> counts -> tf-idf -> naive Bayes.
pipeline_steps = [
    ('vect', CountVectorizer()),     # DTM - Document-Term Matrix
    ('tfidf', TfidfTransformer()),   # compute tf-idf weights
    ('clf', MultinomialNB()),        # NB classification on the tf-idf values
]
text_clf = Pipeline(pipeline_steps)

# Train on the raw training posts and their integer category labels.
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)

In [125]:
import numpy as np
# Load the held-out test split and predict a category index for every test document.
twenty_test = fetch_20newsgroups(subset='test', shuffle=True)
predicted = text_clf.predict(twenty_test.data)

In [126]:
# First test document: raw post text, header lines included.
twenty_test.data[0]

'From: v064mb9k@ubvmsd.cc.buffalo.edu (NEIL B. GANDLER)\nSubject: Need info on 88-89 Bonneville\nOrganization: University at Buffalo\nLines: 10\nNews-Software: VAX/VMS VNEWS 1.41\nNntp-Posting-Host: ubvmsd.cc.buffalo.edu\n\n\n I am a little confused on all of the models of the 88-89 bonnevilles.\nI have heard of the LE SE LSE SSE SSEI. Could someone tell me the\ndifferences are far as features or performance. I am also curious to\nknow what the book value is for prefereably the 89 model. And how much\nless than book value can you usually get them for. In other words how\nmuch are they in demand this time of year. I have heard that the mid-spring\nearly summer is the best time to buy.\n\n\t\t\tNeil Gandler\n'

In [129]:
# Category the model predicted for the first test document.
twenty_train.target_names[predicted[0]]

'rec.autos'

In [131]:
# Actual category of the first test document.
# NOTE(review): the test label is looked up in the training split's
# target_names; this assumes both splits use the same label order - confirm.
twenty_train.target_names[twenty_test.target[0]]

'rec.autos'

In [132]:
# Accuracy on the test split: fraction of predictions that equal the true label.
is_correct = predicted == twenty_test.target
is_correct.mean()

0.7738980350504514

## Tuning

In [139]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid; keys are '<pipeline step name>__<parameter name>'.
parameters = {'vect__ngram_range' : [(1, 1), (1, 2)], # (1,1) = unigrams only, (1,2) = unigrams + bigrams
              'tfidf__use_idf' : (True, False),
              'clf__alpha' : (0.1, 0.25, 0.5, 0.75, 1.0)
             }

# Grid search over the whole pipeline; n_jobs=-1 runs the fits in parallel,
# verbose=3 prints progress while fitting.
gscv_clf = GridSearchCV(text_clf, parameters,
                       n_jobs=-1, verbose=3)

In [141]:
# Run the grid search on the training split (this is the slow cell: one fit per fold per candidate).
gscv_clf_fit = gscv_clf.fit(twenty_train.data, twenty_train.target)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  5.0min finished


In [144]:
# Best cross-validated score found by the grid search (measured on the training split, not the test split).
gscv_clf.best_score_

0.9017146241794632

In [142]:
# Parameter combination that achieved the best cross-validated score.
gscv_clf.best_params_

{'clf__alpha': 0.1, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}

In [145]:
# Test-split accuracy of the best pipeline found by the grid search.
best_model = gscv_clf.best_estimator_
predicted = best_model.predict(twenty_test.data)
(predicted == twenty_test.target).mean()

0.8263409453000531