In [1]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words #쓸모없는 단어(is, a) 리스트
from nltk.stem.porter import PorterStemmer # 전처리. 어간추출
from gensim import corpora, models
import gensim #Topic model을 위해 설치 필요
import pandas as pd
import string

In [2]:
data = pd.read_csv('./data/topicdata.csv')
data

Unnamed: 0,id,contents
0,doc1,Sugar is bad to consume. My sister likes to ha...
1,doc2,My father spends a lot of time driving my sist...
2,doc3,Doctors suggest that driving may cause increas...
3,doc4,Sometimes I feel pressure to perform well at s...
4,doc5,Health experts say that Sugar is not good for ...


In [3]:
docs = list(data['contents'])

In [4]:
docs

['Sugar is bad to consume. My sister likes to have sugar, but not my father.',
 'My father spends a lot of time driving my sister around to dance practice.',
 'Doctors suggest that driving may cause increased stress and blood pressure.',
 'Sometimes I feel pressure to perform well at school, but my father never seems to drive my sister to do better.',
 'Health experts say that Sugar is not good for your lifestyle.']

In [5]:
# docs 안의 문서를 읽어서 전처리 후 다시 저장
tokenizer = RegexpTokenizer(r'\w+') # 문장을 단어로 분리하는 모듈
stop = get_stop_words('en')         # 불용어 리스트
stemmer = PorterStemmer()           # 어간 추출기
texts = []                          # 전처리 후 문서 저장하는 리스트
for d in docs:
    if(d) :
        raw = d.lower()
        tokens = tokenizer.tokenize(raw)
        stopped_tokens = [i for i in tokens if not i in stop]
        stemmed_tokens = [stemmer.stem(i) for i in stopped_tokens]
        texts.append(stemmed_tokens)

In [None]:
''' ## list comprehension
stopped_tokens = []
for i in tokens:
    if not i in stop:
        stopped_tokens.append(i)

stemmed_tokens = []
for i in stopped_tokens:
    stemmed_tokens.append(stemmer.stem(i))
'''


In [6]:
texts

[['sugar', 'bad', 'consum', 'sister', 'like', 'sugar', 'father'],
 ['father',
  'spend',
  'lot',
  'time',
  'drive',
  'sister',
  'around',
  'danc',
  'practic'],
 ['doctor',
  'suggest',
  'drive',
  'may',
  'caus',
  'increas',
  'stress',
  'blood',
  'pressur'],
 ['sometim',
  'feel',
  'pressur',
  'perform',
  'well',
  'school',
  'father',
  'never',
  'seem',
  'drive',
  'sister',
  'better'],
 ['health', 'expert', 'say', 'sugar', 'good', 'lifestyl']]

In [7]:
### LDA 모델을 적용하기 위해 document-term matrix 형태를 만들어줌

# 전체 문서가 포함하고 있는 모든 단어로 이루어진 사전
dictionary = corpora.Dictionary(texts) 
# texts 문서를 이용하여 doc-term matrix를 만들어줌
corpus = [dictionary.doc2bow(text) for text in texts]  
# 단어 빈도수가 아닌 tfidf 가중치를 적용한 모델
tfidf = models.TfidfModel(corpus, id2word = dictionary)
# tfidf 모델로 corpus를 적용한 결과
corpus_tfidf = tfidf[corpus]
# 위의 결과를 다시 LDA를 적용하기 위한 list 형태로 변형
corpus_tfidf_list = [doc for doc in corpus_tfidf]         

In [8]:
# LDA 모델 학습
ldamodel = gensim.models.ldamodel.LdaModel(corpus_tfidf_list, num_topics=3, id2word = dictionary, passes=50)

In [9]:
# 학습 결과 프린팅. 상위 단어 3개까지
print(ldamodel.print_topics(num_topics=3, num_words=3))

[(0, '0.044*"sugar" + 0.041*"like" + 0.041*"bad"'), (1, '0.049*"caus" + 0.049*"increas" + 0.049*"doctor"'), (2, '0.056*"say" + 0.056*"lifestyl" + 0.056*"expert"')]


In [10]:
# 토픽별 결과 
ldamodel.show_topic(2,3)

[('say', 0.055517994), ('lifestyl', 0.055517994), ('expert', 0.055517994)]

## Clustering

In [11]:
from sklearn.cluster import KMeans
import numpy as np

X = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [4, 4], [4, 0]])
kmeans = KMeans(n_clusters=2, random_state=0).fit(X)

In [12]:
kmeans.labels_

array([1, 0, 1, 0, 0, 1])

In [13]:
kmeans.predict([[0, 0], [4, 4]])

array([1, 0], dtype=int32)

In [14]:
kmeans.cluster_centers_

array([[3.        , 3.33333333],
       [2.        , 0.66666667]])