---
# Dataset
---

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB, GaussianNB # 다항분포 나이브 베이즈 모델
from sklearn.metrics import accuracy_score #정확도 계산
from pprint import pprint
import pandas as pd

import nltk
from nltk.corpus import stopwords
import re

In [2]:
# Load the training split of the 20 Newsgroups dataset.
data = fetch_20newsgroups(subset = 'train')
# List the fields exposed by the returned object.
data.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [3]:
# Pretty-print the first raw post (headers and body) to see what the text looks like.
pprint(data.data[0])

("From: lerxst@wam.umd.edu (where's my thing)\n"
 'Subject: WHAT car is this!?\n'
 'Nntp-Posting-Host: rac3.wam.umd.edu\n'
 'Organization: University of Maryland, College Park\n'
 'Lines: 15\n'
 '\n'
 ' I was wondering if anyone out there could enlighten me on this car I saw\n'
 'the other day. It was a 2-door sports car, looked to be from the late 60s/\n'
 'early 70s. It was called a Bricklin. The doors were really small. In '
 'addition,\n'
 'the front bumper was separate from the rest of the body. This is \n'
 'all I know. If anyone can tellme a model name, engine specs, years\n'
 'of production, where this car is made, history, or whatever info you\n'
 'have on this funky looking car, please e-mail.\n'
 '\n'
 'Thanks,\n'
 '- IL\n'
 '   ---- brought to you by your neighborhood Lerxst ----\n'
 '\n'
 '\n'
 '\n'
 '\n')


In [4]:
# File path of each post; the parent directory name is the newsgroup it belongs to.
data.filenames

array(['/root/scikit_learn_data/20news_home/20news-bydate-train/rec.autos/102994',
       '/root/scikit_learn_data/20news_home/20news-bydate-train/comp.sys.mac.hardware/51861',
       '/root/scikit_learn_data/20news_home/20news-bydate-train/comp.sys.mac.hardware/51879',
       ...,
       '/root/scikit_learn_data/20news_home/20news-bydate-train/comp.sys.ibm.pc.hardware/60695',
       '/root/scikit_learn_data/20news_home/20news-bydate-train/comp.graphics/38319',
       '/root/scikit_learn_data/20news_home/20news-bydate-train/rec.motorcycles/104440'],
      dtype='<U86')

In [5]:
# Names of the newsgroup classes; the integer labels in data.target index into this list.
data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [6]:
# Integer class label for each post.
data.target

array([7, 4, 4, ..., 3, 1, 8])

In [7]:
# The dataset's DESCR field (presumably its bundled description text).
data.DESCR



In [8]:
# Sanity check: number of entries in each field of the dataset.
print(*map(len, (data.data, data.filenames, data.target_names, data.target)))

11314 11314 20 11314


---
# 전처리
---

- 특수문자 제거

In [9]:

# Compiled once instead of on every call. Raw strings keep "\d" from being
# parsed as an (invalid) string escape sequence.
# The first alternative only fires when the whole text is digits; otherwise
# every character outside letters, digits, ".", "'" and "@" is matched.
_DISALLOWED = re.compile(r"^\d*\d$|[^a-zA-Z0-9.'@]")
_SPACE_RUN = re.compile(r" +")


def extract_word(text):
    """Lowercase ``text`` and reduce it to a whitespace-normalised token string.

    Every character other than ASCII letters, digits, ``.``, ``'`` and ``@``
    is replaced by a space (so e-mail addresses, host names and contractions
    survive), runs of spaces are collapsed to one, and leading/trailing
    spaces are removed. A text consisting only of digits becomes ``""``.

    Args:
        text: The raw document as a ``str``.

    Returns:
        The cleaned, lowercased string.
    """
    lowered = text.lower()
    # Replace each disallowed character with a space rather than deleting it,
    # so words on either side of punctuation or newlines stay separated.
    spaced = _DISALLOWED.sub(' ', lowered)
    collapsed = _SPACE_RUN.sub(' ', spaced)
    return collapsed.strip()

In [10]:
# Wrap the raw posts in a single-column DataFrame so they can be cleaned with .apply().
df = pd.DataFrame({'data' :data.data})
df

Unnamed: 0,data
0,From: lerxst@wam.umd.edu (where's my thing)\nS...
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...
...,...
11309,From: jim.zisfein@factory.com (Jim Zisfein) \n...
11310,From: ebodin@pearl.tufts.edu\nSubject: Screen ...
11311,From: westes@netcom.com (Will Estes)\nSubject:...
11312,From: steve@hcrlgw (Steven Collins)\nSubject: ...


In [11]:
# Clean every post; the function is passed directly, so no lambda wrapper is needed.
df['data'] = df['data'].apply(extract_word)

In [12]:
# Raw first post, for comparison with its cleaned version in df['data'].
data.data[0]

"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"

In [13]:
# The same post after extract_word: lowercased, with punctuation replaced by spaces.
df['data'][0]

"from lerxst@wam.umd.edu where's my thing subject what car is this nntp posting host rac3.wam.umd.edu organization university of maryland college park lines 15 i was wondering if anyone out there could enlighten me on this car i saw the other day. it was a 2 door sports car looked to be from the late 60s early 70s. it was called a bricklin. the doors were really small. in addition the front bumper was separate from the rest of the body. this is all i know. if anyone can tellme a model name engine specs years of production where this car is made history or whatever info you have on this funky looking car please e mail. thanks il brought to you by your neighborhood lerxst"

- 불용어 제거


In [14]:
# Fetch the NLTK stop-word corpus needed by stopwords.words() in the next step.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
# A set gives constant-time membership tests; the plain list returned by NLTK
# would be scanned linearly for every token of every document.
stop_words = set(stopwords.words('english'))
token_doc = df['data'].apply(str.split)  # tokenize on whitespace
token_doc = token_doc.apply(lambda tokens: [tok for tok in tokens if tok not in stop_words])  # drop stop words

- 역토큰화

In [16]:
# Join each token list back into one space-separated string per post.
detokenized_doc = [' '.join(tokens) for tokens in token_doc]

df['clean_doc'] = detokenized_doc

In [17]:
# First post after stop-word removal.
df['clean_doc'][0]

"lerxst@wam.umd.edu where's thing subject car nntp posting host rac3.wam.umd.edu organization university maryland college park lines 15 wondering anyone could enlighten car saw day. 2 door sports car looked late 60s early 70s. called bricklin. doors really small. addition front bumper separate rest body. know. anyone tellme model name engine specs years production car made history whatever info funky looking car please e mail. thanks il brought neighborhood lerxst"

- 입력한 텍스트를 자동으로 BoW로 만들어주는 CountVectorizer 사용 

In [18]:
# Build bag-of-words count features from the cleaned posts.
# Note: `dtm` holds the fitted CountVectorizer itself; the resulting
# document-term matrix is `x_train`.
dtm = CountVectorizer()
x_train = dtm.fit_transform(df['clean_doc'])

In [19]:
# (number of posts, number of distinct terms found by the vectorizer)
x_train.shape

(11314, 126477)

- TF-IDF 행렬 변환

In [20]:
# Re-weight the raw term counts with TF-IDF; the fitted transformer is kept
# so the same weighting can be applied to the test data later.
tfidf_transformer = TfidfTransformer()
tfidf = tfidf_transformer.fit_transform(x_train)

In [21]:
# Same shape as the count matrix: only the values are re-weighted.
tfidf.shape

(11314, 126477)

---
# NB
----

### 라플라스 스무딩(Laplace smoothing)

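- $$P(X|Y=k) = \frac{T_{N,k}}{T_k} \Rightarrow \frac{T_{N,k} + a}{T_k + a \cdot n}$$
  ($n$: 특성이 가질 수 있는 값의 개수. 다항분포 나이브 베이즈에서는 단어(특성)의 수이며, 특성이 두 가지 값만 갖는 경우에는 분모가 $T_k + 2a$가 된다.)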

- 훈련에 주로 사용된 데이터가 들어왔을 땐 분류기가 잘 작동하지만, 훈련 데이터에 없던 값이 들어오거나 이상값이 들어올 경우 그에 대한 우도가 0이 되어, 정상적인 분류가 되지 않을 때 사용되는 방법

- 가중치 $a$를 이용해 스무딩의 정도를 선정 (주로 0.5 또는 1 사용)

- 분자와 분모에 스무딩 값을 더해 우도가 0이 되지 않도록 막아주는 역할

In [22]:
# Multinomial naive Bayes with smoothing strength alpha=1.0, trained on the
# TF-IDF features of the preprocessed posts.
model = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
model.fit(tfidf,data.target)

MultinomialNB()

---
# test
----

In [23]:
# Load the test split and run it through the same cleaning steps as the training data.
test = fetch_20newsgroups(subset='test', shuffle=True)

df_test = pd.DataFrame({'data': test.data})
df_test['data'] = df_test['data'].apply(extract_word)
test_token = df_test['data'].apply(str.split)  # tokenize on whitespace
test_token = test_token.apply(
    lambda tokens: [tok for tok in tokens if tok not in stop_words]  # drop stop words
)

In [24]:
# Join each test token list back into one space-separated string per post.
test_doc = [' '.join(tokens) for tokens in test_token]

df_test['clean_doc'] = test_doc

In [25]:
# Reuse the vectorizer and TF-IDF transformer fitted on the training data
# (transform only, no refit).
x_test = dtm.transform(df_test['clean_doc'] ) # convert the test data to a DTM
tfidf_test = tfidf_transformer.transform(x_test) # convert the DTM to TF-IDF


In [26]:
# 예측
pred = model.predict(tfidf_test)
print("정확도:", accuracy_score(test.target, pred)) #예측값과 실제값 비교

정확도: 0.8108072225172597


---
# 전처리 없이 학습 (비교)
---

In [27]:
# Baseline without preprocessing: fit the same feature pipeline on the raw posts.
# NOTE(review): this refits the existing `dtm` object and rebinds `x_train`,
# `tfidf_transformer` and `tfidf`, so the objects fitted on the preprocessed
# text are no longer available after this cell runs. Re-running earlier
# cells afterwards would silently mix the two pipelines.
x_train = dtm.fit_transform(data.data)
tfidf_transformer = TfidfTransformer()
tfidf = tfidf_transformer.fit_transform(x_train)

In [28]:
# Train a fresh classifier on the raw-text features; `model` now refers to this baseline.
model = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
model.fit(tfidf,data.target)

MultinomialNB()

In [29]:
# Vectorize the raw (uncleaned) test posts with the refitted vectorizer and transformer.
x_test = dtm.transform(test.data) # convert the test data to a DTM
tfidf_test = tfidf_transformer.transform(x_test) # convert the DTM to TF-IDF


In [30]:
# 예측
pred = model.predict(tfidf_test)
print("정확도:", accuracy_score(test.target, pred)) #예측값과 실제값 비교

정확도: 0.7738980350504514
