In [2]:
import pandas as pd

def read_data(filename):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    filename : path or file-like object
        Passed straight through to ``pd.read_csv`` with default options.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV.
    """
    return pd.read_csv(filename)


# Load one DataFrame per topic; the files are read in the order listed.
crypto, cooking, biology, diy, robotics, travel = map(
    read_data,
    (
        'crypto.csv',
        'cooking.csv',
        'biology.csv',
        'diy.csv',
        'robotics.csv',
        'travel.csv',
    ),
)



# Tokenize with NLTK: the r'\w+' pattern keeps only runs of word characters,
# so punctuation and whitespace are dropped.
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

# English stop-word list, used to filter out stop words during tokenization.
from stop_words import get_stop_words
en_stop = get_stop_words('en')

# Porter stemmer, used to reduce each remaining token to its stem.
from nltk.stem.porter import PorterStemmer
p_stemmer = PorterStemmer()



def tokenize(doc):
    """Turn one document string into a list of stemmed, stop-word-free tokens.

    The text is lower-cased and split with the module-level ``tokenizer``;
    tokens found in ``en_stop`` are discarded, and every remaining token is
    stemmed with ``p_stemmer``.

    Parameters
    ----------
    doc : str
        Raw text of a single document (here: a question title).

    Returns
    -------
    list
        Stemmed tokens, in their original order of appearance.
    """
    stemmed = []
    for word in tokenizer.tokenize(doc.lower()):
        # Stop words are dropped before stemming, so the comparison is made
        # against the unstemmed, lower-cased token.
        if word in en_stop:
            continue
        stemmed.append(p_stemmer.stem(word))
    return stemmed


# Each tokenized title is paired with an integer tag identifying its topic:
# crypto -> 0, cooking -> 1, biology -> 2, diy -> 3, robotics -> 4, travel -> 5

docs_crypto = [(words, 0) for words in map(tokenize, crypto['title'])]
docs_cooking = [(words, 1) for words in map(tokenize, cooking['title'])]
docs_biology = [(words, 2) for words in map(tokenize, biology['title'])]
docs_diy = [(words, 3) for words in map(tokenize, diy['title'])]
docs_robotics = [(words, 4) for words in map(tokenize, robotics['title'])]
docs_travel = [(words, 5) for words in map(tokenize, travel['title'])]


# Train/test split. The ratio is conventionally set to 80:20.
# For most topics: items 0-3999 are training data, items 4000-4999 are test data;
# anything past index 4999 is not used.
train_crypto, test_crypto = docs_crypto[:4000], docs_crypto[4000:5000]

train_cooking, test_cooking = docs_cooking[:4000], docs_cooking[4000:5000]

train_biology, test_biology = docs_biology[:4000], docs_biology[4000:5000]

train_diy, test_diy = docs_diy[:4000], docs_diy[4000:5000]

# NOTE(review): robotics is cut at 2000 and takes *everything* after that as
# test data, so its ratio is not necessarily 80:20 — presumably this file has
# fewer rows than the others; confirm.
train_robotics, test_robotics = docs_robotics[:2000], docs_robotics[2000:]

train_travel, test_travel = docs_travel[:4000], docs_travel[4000:5000]



# Concatenate the per-topic splits into single train / test lists.
# Topic order is crypto, cooking, biology, diy, robotics, travel in both lists.
train_data = (
    train_crypto
    + train_cooking
    + train_biology
    + train_diy
    + train_robotics
    + train_travel
)

test_data = (
    test_crypto
    + test_cooking
    + test_biology
    + test_diy
    + test_robotics
    + test_travel
)



# Flatten every training document's token list into one long token stream;
# the tags are ignored here.
tokens = [word for doc_words, _tag in train_data for word in doc_words]

# Total number of tokens across the training set.
print(len(tokens))


import nltk
# Wrap the flat training token stream in an nltk.Text so that its vocabulary
# (token frequencies) can be queried below.
# NOTE(review): the name "NMSC" does not obviously relate to this dataset —
# presumably carried over from another notebook; confirm before renaming.
text = nltk.Text(tokens, name="NMSC")
print(text)



from pprint import pprint
# Show the ten most frequent training tokens as a quick sanity check.
pprint(text.vocab().most_common(10))

# The 3000 most frequent training tokens form the feature vocabulary;
# most_common() yields (token, count) pairs and only the token is kept.
selected_words = [word for word, _count in text.vocab().most_common(3000)]

def term_exists(doc, words=None):
    """Build the boolean "word is present" feature dict for one document.

    Parameters
    ----------
    doc : iterable of str
        Tokens of a single document.
    words : iterable of str, optional
        Vocabulary to test against. When omitted (the default), the
        module-level ``selected_words`` list is used, so existing
        ``term_exists(doc)`` calls behave as before.

    Returns
    -------
    dict
        Maps ``'exists(<word>)'`` to ``True`` if ``<word>`` occurs in ``doc``,
        otherwise ``False``; one entry per vocabulary word.
    """
    if words is None:
        words = selected_words
    # Build the membership set once per document. The previous version called
    # set(doc) inside the comprehension, i.e. once per vocabulary word.
    present = set(doc)
    return {'exists({})'.format(word): (word in present) for word in words}

125524
<Text: NMSC>
[('can', 2657),
 ('use', 1638),
 ('s', 827),
 ('key', 773),
 ('make', 653),
 ('way', 647),
 ('encrypt', 630),
 ('get', 622),
 ('differ', 611),
 ('travel', 548)]


In [3]:
# Convert every (tokens, tag) pair into a (feature dict, tag) pair,
# the input format passed to the NLTK classifier below.
train_xy = [
    (term_exists(doc_words), tag)
    for doc_words, tag in train_data
]
test_xy = [
    (term_exists(doc_words), tag)
    for doc_words, tag in test_data
]

In [4]:
# Fit NLTK's Naive Bayes classifier on the (feature dict, tag) training pairs.
classifier = nltk.NaiveBayesClassifier.train(train_xy)

In [5]:
# Accuracy of the trained classifier on the held-out (feature dict, tag) test pairs.
print(nltk.classify.accuracy(classifier, test_xy))

0.9123202217986485


In [6]:
# Print the 10 most informative features; the integers in the output's
# ratio column are the topic tags (0 = crypto ... 5 = travel).
classifier.show_most_informative_features(10)

Most Informative Features
           exists(robot) = True                4 : 2      =    520.5 : 1.0
         exists(encrypt) = True                0 : 2      =    397.0 : 1.0
          exists(travel) = True                5 : 1      =    349.0 : 1.0
             exists(key) = True                0 : 1      =    271.0 : 1.0
           exists(motor) = True                4 : 5      =    261.9 : 1.0
            exists(hash) = True                0 : 1      =    243.0 : 1.0
             exists(rsa) = True                0 : 5      =    227.0 : 1.0
           exists(human) = True                2 : 1      =    200.3 : 1.0
          exists(sensor) = True                4 : 2      =    142.0 : 1.0
           exists(floor) = True                3 : 2      =    137.0 : 1.0
