In [0]:
## run_nltk

from konlpy.tag import Twitter
import nltk

# Module-level tagger instance; tokenize() below calls its pos() method.
twitter = Twitter()

# Quick smoke test of the tagger on one sample sentence.
print(twitter.morphs(u'한글형태소분석기 테스트 중 입니다')) # morphs() output (original note was "??"; presumably the morpheme split — TODO confirm)
print(twitter.nouns(u'한글형태소분석기 테스트 중 입니다!')) # nouns
print(twitter.pos(u'한글형태소분석기 테스트 중 입니다.')) # morphemes (pos() output)

def read_data(filename, encoding='utf-8'):
    """Read a tab-separated text file into a list of rows.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding
            (the corpus contains Korean text).

    Returns:
        A list with one entry per line; each entry is the list of fields
        obtained by splitting that line on tab characters. No row is
        skipped, so a header line (if any) is returned as the first row.
    """
    with open(filename, 'r', encoding=encoding) as f:
        return [line.split('\t') for line in f.read().splitlines()]

def tokenize(doc):
    """Tag ``doc`` with the module-level ``twitter`` tagger and flatten the result.

    ``twitter.pos`` is called with ``norm=True`` and ``stem=True`` (both flags
    are optional). Each item it returns is joined with '/' into one string.

    Returns:
        A list of the '/'-joined strings, in the order ``twitter.pos``
        produced them.
    """
    tagged = twitter.pos(doc, norm=True, stem=True)
    return list(map('/'.join, tagged))

def term_exists(doc, words=None):
    """Build a presence-feature dict for one tokenized document.

    Args:
        doc: Iterable of tokens making up the document.
        words: Optional iterable of feature words. When omitted, the
            module-level ``selected_words`` is used, which must be defined
            before the call.

    Returns:
        A dict mapping ``'exists(<word>)'`` to True/False depending on whether
        the word occurs in ``doc``.
    """
    vocabulary = selected_words if words is None else words
    # Build the lookup set once instead of once per feature word.
    present = set(doc)
    return {'exists({})'.format(word): (word in present) for word in vocabulary}

# Read the training data and the test data
train_data = read_data('data/ratings_train.txt')
test_data = read_data('data/ratings_test.txt')

# Check that the expected number of rows and columns was read
print(len(train_data))      # nrows: 150000
print(len(train_data[0]))   # ncols: 3
print(len(test_data))       # nrows: 50000
print(len(test_data[0]))     # ncols: 3

# Morphological tokenization: tokenize column 1 and keep column 2 as the label.
# The [1:] slice skips the first row (presumably a header line — TODO confirm).
train_docs = [(tokenize(row[1]), row[2]) for row in train_data[1:]]
test_docs = [(tokenize(row[1]), row[2]) for row in test_data[1:]]

# Gather every token of the training data into one flat list
tokens = [t for d in train_docs for t in d[0]]
print(len(tokens))

# Load tokens with nltk.Text()
text = nltk.Text(tokens, name='NMSC')
print(text.vocab().most_common(10))

# Find collocations (words that frequently appear together) in the text
text.collocations()


# Classify documents according to whether each term is present
selected_words = [f[0] for f in text.vocab().most_common(2000)] # here the 2000 most frequent tokens are used as features
train_docs = train_docs[:10000] # shortcut to save time: only part of the training corpus is used
train_xy = [(term_exists(d), c) for d, c in train_docs]
test_xy = [(term_exists(d), c) for d, c in test_docs]

# Train nltk's NaiveBayesClassifier on the data and check it against the test data
classifier = nltk.NaiveBayesClassifier.train(train_xy) # apply the Naive Bayes classifier
print(nltk.classify.accuracy(classifier, test_xy))
# => 0.80418

classifier.show_most_informative_features(10)







#nltk.polarity_scores("i love you")

In [0]:
## doc2vec_run

from collections import namedtuple
from gensim.models import doc2vec
from konlpy.tag import Twitter
import multiprocessing
from pprint import pprint
from gensim.models import Doc2Vec
from sklearn.linear_model import LogisticRegression
import numpy
import pickle

# Module-level tagger instance; tokenize() below calls its pos() method.
twitter = Twitter()

def read_data(filename, encoding='utf-8'):
    """Read a tab-separated text file into a list of rows.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding
            (the corpus contains Korean text).

    Returns:
        A list with one entry per line; each entry is the list of fields
        obtained by splitting that line on tab characters. No row is
        skipped, so a header line (if any) is returned as the first row.
    """
    with open(filename, 'r', encoding=encoding) as f:
        return [line.split('\t') for line in f.read().splitlines()]

def tokenize(doc):
    """Tag ``doc`` with the module-level ``twitter`` tagger and flatten the result.

    ``twitter.pos`` is called with ``norm=True`` and ``stem=True`` (both flags
    are optional). Each item it returns is joined with '/' into one string.

    Returns:
        A list of the '/'-joined strings, in the order ``twitter.pos``
        produced them.
    """
    tagged = twitter.pos(doc, norm=True, stem=True)
    return list(map('/'.join, tagged))


# Read the data to run the classifier on
run_data = read_data('data/ratings_run.txt')

# Morphological tokenization: tokenize column 1 and keep column 2 as the tag.
# The [1:] slice skips the first row (presumably a header line — TODO confirm).
run_docs = [(tokenize(row[1]), row[2]) for row in run_data[1:]]

# Convert to the data format doc2vec expects
TaggedDocument = namedtuple('TaggedDocument', 'words tags')
tagged_run_docs = [TaggedDocument(d, [c]) for d, c in run_docs]

# Load the trained Doc2Vec model
doc_vectorizer = Doc2Vec.load('model/doc2vec.model')

# Build the features used for classification
run_x = [doc_vectorizer.infer_vector(doc.words) for doc in tagged_run_docs]
run_y = [doc.tags[0] for doc in tagged_run_docs]

# load the model from disk
filename = 'model/finalized_model.sav'

# Check the actual classification of the first two documents.
# NOTE(security): unpickling can execute arbitrary code; only load model
# files that come from a trusted source.
# The with-block closes the file handle even if unpickling fails.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
print(loaded_model.predict(run_x[0].reshape(1, -1)))
print(loaded_model.predict(run_x[1].reshape(1, -1)))

In [0]:
## doc2vec_train
from collections import namedtuple
from gensim.models import doc2vec
from konlpy.tag import Twitter
import multiprocessing
from pprint import pprint

# Module-level tagger instance; tokenize() below calls its pos() method.
twitter = Twitter()

def read_data(filename, encoding='utf-8'):
    """Read a tab-separated text file into a list of rows.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding
            (the corpus contains Korean text).

    Returns:
        A list with one entry per line; each entry is the list of fields
        obtained by splitting that line on tab characters. No row is
        skipped, so a header line (if any) is returned as the first row.
    """
    with open(filename, 'r', encoding=encoding) as f:
        return [line.split('\t') for line in f.read().splitlines()]

def tokenize(doc):
    """Tag ``doc`` with the module-level ``twitter`` tagger and flatten the result.

    ``twitter.pos`` is called with ``norm=True`` and ``stem=True`` (both flags
    are optional). Each item it returns is joined with '/' into one string,
    giving tokens of the form used later in this cell (e.g. '공포/Noun').

    Returns:
        A list of the '/'-joined strings, in the order ``twitter.pos``
        produced them.
    """
    tagged = twitter.pos(doc, norm=True, stem=True)
    return list(map('/'.join, tagged))



# doc2vec parameters
cores = multiprocessing.cpu_count()

vector_size = 300
window_size = 15
word_min_count = 2
sampling_threshold = 1e-5
negative_size = 5
train_epoch = 100
dm = 1
worker_count = cores
# NOTE(review): only vector_size is passed to Doc2Vec below. window_size,
# word_min_count, sampling_threshold, negative_size, train_epoch, dm and
# worker_count are defined but never used, so the library defaults apply.
# Wiring them in would change the trained model — confirm the intent first.


# Read the training data
train_data = read_data('/content/drive/My Drive/Colab Notebooks/Final_project/data/output/contents_text.txt')

# Morphological tokenization: tokenize column 1 and keep column 2 as the tag.
# The [1:] slice skips the first row (presumably a header line — TODO confirm).
train_docs = [(tokenize(row[1]), row[2]) for row in train_data[1:]]

# Convert to the data format doc2vec expects
TaggedDocument = namedtuple('TaggedDocument', 'words tags')
tagged_train_docs = [TaggedDocument(d, [c]) for d, c in train_docs]

# Build the vocabulary. The vector size comes from the constant above
# instead of a duplicated literal 300.
doc_vectorizer = doc2vec.Doc2Vec(size=vector_size, alpha=0.025, min_alpha=0.025, seed=1234)
doc_vectorizer.build_vocab(tagged_train_docs)

# Train document vectors!
# Ten passes with a manually stepped learning rate; min_alpha is pinned to
# alpha so there is no decay inside a single train() call.
for epoch in range(10):
    doc_vectorizer.train(tagged_train_docs, total_examples=doc_vectorizer.corpus_count, epochs=doc_vectorizer.iter)
    doc_vectorizer.alpha -= 0.002  # decrease the learning rate
    doc_vectorizer.min_alpha = doc_vectorizer.alpha  # fix the learning rate, no decay

# To save
doc_vectorizer.save('model/doc2vec.model')

# Query the word vectors through .wv; the model-level most_similar() and
# similarity() are deprecated shims that forward to the same methods.
pprint(doc_vectorizer.wv.most_similar('공포/Noun'))
pprint(doc_vectorizer.wv.similarity('공포/Noun', 'ㅋㅋ/KoreanParticle'))


In [0]:
## doc2vec_test

from collections import namedtuple
from gensim.models import doc2vec
from konlpy.tag import Twitter
import multiprocessing
from pprint import pprint
from gensim.models import Doc2Vec
from sklearn.linear_model import LogisticRegression
import numpy
import pickle


# Module-level tagger instance; tokenize() below calls its pos() method.
twitter = Twitter()

def read_data(filename, encoding='utf-8'):
    """Read a tab-separated text file into a list of rows.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding
            (the corpus contains Korean text).

    Returns:
        A list with one entry per line; each entry is the list of fields
        obtained by splitting that line on tab characters. No row is
        skipped, so a header line (if any) is returned as the first row.
    """
    with open(filename, 'r', encoding=encoding) as f:
        return [line.split('\t') for line in f.read().splitlines()]

def tokenize(doc):
    """Tag ``doc`` with the module-level ``twitter`` tagger and flatten the result.

    ``twitter.pos`` is called with ``norm=True`` and ``stem=True`` (both flags
    are optional). Each item it returns is joined with '/' into one string.

    Returns:
        A list of the '/'-joined strings, in the order ``twitter.pos``
        produced them.
    """
    tagged = twitter.pos(doc, norm=True, stem=True)
    return list(map('/'.join, tagged))


# Read the training data and the test data
train_data = read_data('data/ratings_train.txt')
test_data = read_data('data/ratings_test.txt')

# Morphological tokenization: tokenize column 1 and keep column 2 as the label.
# The [1:] slice skips the first row (presumably a header line — TODO confirm).
train_docs = [(tokenize(row[1]), row[2]) for row in train_data[1:]]
test_docs = [(tokenize(row[1]), row[2]) for row in test_data[1:]]

# Convert to the data format doc2vec expects
TaggedDocument = namedtuple('TaggedDocument', 'words tags')
tagged_train_docs = [TaggedDocument(d, [c]) for d, c in train_docs]
tagged_test_docs = [TaggedDocument(d, [c]) for d, c in test_docs]

# Load the trained Doc2Vec model
doc_vectorizer = Doc2Vec.load('model/doc2vec.model')

# Build the features used for classification
train_x = [doc_vectorizer.infer_vector(doc.words) for doc in tagged_train_docs]
train_y = [doc.tags[0] for doc in tagged_train_docs]
test_x = [doc_vectorizer.infer_vector(doc.words) for doc in tagged_test_docs]
test_y = [doc.tags[0] for doc in tagged_test_docs]


#classifier = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)
classifier = LogisticRegression(random_state=1234)
classifier.fit(train_x, train_y)

# Check the test score
print( classifier.score(test_x, test_y) )
# 0.63904

# save the model to disk
filename = 'model/finalized_model.sav'
# The with-block flushes and closes the file even if pickling raises.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)