# 문서 분류 (Document Classification)

# 1 나이브 베이즈 분류(Naive Bayes Classifier)

## 1.1 직접구현

### Naive Bayes Classifier

In [1]:
# 1: 스팸, 0: 정상
training_set = [
    ['me free lottery', 1],
    ['free get free you', 1],
    ['you free scholarship', 0],
    ['free to contact me', 0],
    ['you won award', 0],
    ['you ticket lottery', 1]
]

### 토큰 빈도수 및 문서별 토큰수 계산 (확률 계산을 위한 준비)

![대체 텍스트](https://wikimedia.org/api/rest_v1/media/math/render/svg/98f086c560aa2f66650060277dda4f90e54e30c0)

In [None]:
[
    # 스팸, #정상
    [0,     0 ], # 토큰 1

]

In [2]:
training_set

[['me free lottery', 1],
 ['free get free you', 1],
 ['you free scholarship', 0],
 ['free to contact me', 0],
 ['you won award', 0],
 ['you ticket lottery', 1]]

In [7]:
# 사전확률 계산을 위한 준비
# 1. token2idx
# 2. 각 라벨별 토큰들
unique_tokens = []

token_dict = {
    0: [], # 정상, 토큰들
    1: []  # 스팸, 토큰들
}

for sent, label in training_set:
    tokens = sent.split()
    unique_tokens.extend(tokens)
    token_dict[label].extend(tokens)

print(token_dict, set(unique_tokens), sep='\n')
token2idx = {token:i for i, token in enumerate(set(unique_tokens))}

{0: ['you', 'free', 'scholarship', 'free', 'to', 'contact', 'me', 'you', 'won', 'award'], 1: ['me', 'free', 'lottery', 'free', 'get', 'free', 'you', 'you', 'ticket', 'lottery']}
{'won', 'me', 'free', 'ticket', 'lottery', 'get', 'you', 'contact', 'to', 'award', 'scholarship'}


In [8]:
token2idx

{'won': 0,
 'me': 1,
 'free': 2,
 'ticket': 3,
 'lottery': 4,
 'get': 5,
 'you': 6,
 'contact': 7,
 'to': 8,
 'award': 9,
 'scholarship': 10}

### Training : 토큰별 조건부 확률 계산

In [9]:
# 토큰별 스팸, 정상 카운팅
prior_list = []

for token in token2idx:
    token_0 = token_dict[0].count(token) # 정상 토큰들 (list)
    token_1 = token_dict[1].count(token) # 스팸 토큰들 (list)

    prior_list.append( (token_0, token_1) )

prior_list

[(1, 0),
 (1, 1),
 (2, 3),
 (0, 1),
 (0, 2),
 (0, 1),
 (2, 2),
 (1, 0),
 (1, 0),
 (1, 0),
 (1, 0)]

In [10]:
import pandas as pd

In [16]:
nb_df = pd.DataFrame(prior_list, index=token2idx)

In [18]:
label_cols = nb_df.columns

In [19]:
# laplace smoothing 적용 조건부확률 계산
k = 0.5

for label in label_cols:
    nb_df[f'p_{label}'] = (nb_df[label] + k) / (nb_df[label].sum() + 2 * k)
nb_df

Unnamed: 0,0,1,p_0,p_1
won,1,0,0.136364,0.045455
me,1,1,0.136364,0.136364
free,2,3,0.227273,0.318182
ticket,0,1,0.045455,0.136364
lottery,0,2,0.045455,0.227273
get,0,1,0.045455,0.136364
you,2,2,0.227273,0.227273
contact,1,0,0.136364,0.045455
to,1,0,0.136364,0.045455
award,1,0,0.136364,0.045455


In [20]:
import numpy as np
# 로그 씌우기
for label in label_cols:
    nb_df[f'log_p_{label}'] = np.log(nb_df[f'p_{label}'])

nb_df

Unnamed: 0,0,1,p_0,p_1,log_p_0,log_p_1
won,1,0,0.136364,0.045455,-1.99243,-3.091042
me,1,1,0.136364,0.136364,-1.99243,-1.99243
free,2,3,0.227273,0.318182,-1.481605,-1.145132
ticket,0,1,0.045455,0.136364,-3.091042,-1.99243
lottery,0,2,0.045455,0.227273,-3.091042,-1.481605
get,0,1,0.045455,0.136364,-3.091042,-1.99243
you,2,2,0.227273,0.227273,-1.481605,-1.481605
contact,1,0,0.136364,0.045455,-1.99243,-3.091042
to,1,0,0.136364,0.045455,-1.99243,-3.091042
award,1,0,0.136364,0.045455,-1.99243,-3.091042


### Classify : 신규 텍스트가 주어졌을 때 확률 계산

In [25]:
nb_df[label].sum() / nb_df[label_cols].sum().sum()

0.5

In [29]:
target_text = 'free lottery'
target_tokens = target_text.split()

# target_text가 스팸일 확률?
# P(스팸) * P(free | 스팸) * P(lottery | 스팸)
# 단어들 공간

log_event_prob = [] # [-0.6931, -0.6931]

for label in label_cols:
    log_event_prob.append(np.log(nb_df[label].sum() / nb_df[label_cols].sum().sum()))

log_event_prob

# log(P(스팸) * P(free | 스팸) * P(lottery | 스팸))
# log(P(스팸)) + log(P(free | 스팸)) + log(P(lottery | 스팸)))

log_sum_0 = log_event_prob[0]
log_sum_1 = log_event_prob[1]
for token in target_tokens:
    log_sum_0 += nb_df['log_p_0'][token]
    log_sum_1 += nb_df['log_p_1'][token]

spam_prob = np.exp(log_sum_1) / (np.exp(log_sum_1) + np.exp(log_sum_0))
spam_prob
# target_text가 정상일 확률?
norm_prob = np.exp(log_sum_0) / (np.exp(log_sum_0) + np.exp(log_sum_1))
spam_prob, norm_prob

(0.8749999999999999, 0.12500000000000008)

## 1.2 sklearn 활용 (영문 뉴스 분류)

- naive_bayes.MultinomialNB() : 빈도수 기반 Naive Bayse Classifier

### 뉴스 데이터 다운로드



In [None]:
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)
print(len(twenty_train.target_names)) #뉴스 카테고리 출력
print(twenty_train.data[0]) #뉴스 데이터 출력

### 문서 분류(파이프 라인 사용)

In [32]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

text_clf = text_clf.fit(twenty_train.data, twenty_train.target)

In [34]:
CountVectorizer?

In [35]:
import numpy as np
twenty_test = fetch_20newsgroups(subset='test', shuffle=True)
predicted = text_clf.predict(twenty_test.data)
np.mean(predicted == twenty_test.target)

0.7738980350504514

In [38]:
accuracy = np.mean(predicted == twenty_test.target)

In [39]:
accuracy

0.7738980350504514

### Grid Search

In [40]:
from sklearn.model_selection import GridSearchCV
parameters_clf = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
}
gs_clf = GridSearchCV(text_clf, parameters_clf, n_jobs=-1, verbose=2)
gs_clf = gs_clf.fit(twenty_train.data, twenty_train.target)
print("Best score: {0}".format(gs_clf.best_score_))
print("Best parameters set:")
best_parameters = gs_clf.best_estimator_.get_params()
for param_name in sorted(list(best_parameters.keys())):
    print("\t{0}: {1}".format(param_name, best_parameters[param_name]))

Fitting 5 folds for each of 4 candidates, totalling 20 fits


KeyboardInterrupt: ignored

### Parameter 적용

In [None]:
import numpy as np
predicted = gs_clf.best_estimator_.predict(twenty_test.data)
np.mean(predicted == twenty_test.target)

# 2 서포트 벡터 머신(SVM, Support Vector Machine)

- linear_model.SGDClassifier() : 선형 경사하강법 분류 모델

### 뉴스 데이터 다운로드

In [None]:
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)
print(twenty_train.target_names) #뉴스 카테고리 출력
print(twenty_train.data[0]) #뉴데 이터 출력

### 문서 분류 (파이프 라인 사용)

In [42]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
text_clf_svm = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter_no_change=5, random_state=42)),
])
text_clf_svm = text_clf_svm.fit(twenty_train.data, twenty_train.target)
predicted_svm = text_clf_svm.predict(twenty_test.data)
np.mean(predicted_svm == twenty_test.target)

0.8240839086563994

### Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV
parameters_svm = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf-svm__alpha': (1e-2, 1e-3),
}
gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1, verbose=2)
gs_clf_svm = gs_clf_svm.fit(twenty_train.data, twenty_train.target)
gs_clf_svm.best_score_, gs_clf_svm.best_params_

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [None]:
print("Best score: {0}".format(gs_clf_svm.best_score_))
print("Best parameters set:")
best_parameters = gs_clf_svm.best_estimator_.get_params()
for param_name in sorted(list(best_parameters.keys())):
    print("\t{0}: {1}".format(param_name, best_parameters[param_name]))