In [1]:
import pandas as pd
df = pd.read_csv("data/suggestion.csv")

df.shape

(118, 8)

In [2]:
# https://lovit.github.io/nlp/2018/04/16/krwordrank/
from krwordrank.hangle import normalize

texts = df['content'].fillna('')
texts = [normalize(text, english=True, number=True) for text in texts]

In [3]:
from krwordrank.word import KRWordRank

wordrank_extractor = KRWordRank(
    min_count = 10, # 단어의 최소 출현 빈도수 (그래프 생성 시)
    max_length = 10, # 단어의 최대 길이
    verbose = True
    )

beta = 0.85    # PageRank의 decaying factor beta
max_iter = 10

keywords, rank, graph = wordrank_extractor.extract(texts, beta, max_iter)

scan vocabs ... 
num vocabs = 1420
done = 10


In [4]:
for word, r in sorted(keywords.items(), key=lambda x:x[1], reverse=True)[:100]:
    print('%8s:\t%.4f' % (word, r))

     합니다:	5.8869
      있는:	5.7212
      한다:	5.6284
     유치원:	5.2886
    있습니다:	5.2644
      교육:	4.5119
      경우:	3.7315
      하는:	3.6856
      대한:	3.1027
      설립:	2.8839
      하고:	2.7539
      생각:	2.7424
      있다:	2.7169
      사립:	2.5572
      필요:	2.4635
      또는:	2.4528
      현재:	2.2076
     정하는:	2.1972
      또한:	2.1752
      개선:	2.1477
      없는:	2.1471
      문제:	2.1347
      위한:	2.1022
     외국인:	2.1018
      지원:	2.0843
      따라:	2.0823
      국민:	2.0777
      위해:	2.0227
      많은:	2.0147
      따른:	1.9835
      개인:	1.9013
      제도:	1.8711
      정부:	1.8623
      이상:	1.8316
      의무:	1.8296
      있음:	1.8093
    적격심사:	1.7538
      제안:	1.7387
      사람:	1.7329
      운영:	1.6110
      관련:	1.6034
      이런:	1.5941
      하지:	1.5764
      해당:	1.5645
      학교:	1.5510
      모든:	1.5509
      무상:	1.5508
      사회:	1.5422
     그리고:	1.4919
      국가:	1.4894
      해야:	1.4836
      등의:	1.4813
      많이:	1.4811
     위하여:	1.4540
      이에:	1.4483
      20:	1.4476
      저는:	1.4447
     일자리:	1.4196
      자는:	1.41

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    min_df=0,
    max_df=1,
    ngram_range=(1,1),
    lowercase=True,
    tokenizer=lambda x:x.split())

X = vectorizer.fit_transform(df['content'].fillna(''))

In [6]:
vocabulary = vectorizer.vocabulary_

In [7]:
from collections import OrderedDict

ordered = OrderedDict(sorted(vocabulary.items(), key=lambda t:t[1], reverse=True))

In [8]:
import itertools
dict(itertools.islice(ordered.items(), 10))

{'힘듭니다.직계가족이거나': 10245,
 '힘을': 10246,
 '힘을믿어보자.제발': 10247,
 '힘이': 10248,
 '\ufeff\u200b새': 10249,
 '\ufeff★강동구도시관리공단': 10250,
 '\ufeff★\ufeff국가유공자☆\u200b나\ufeff라를': 10251,
 '\ufeff각지자체나': 10252,
 '\ufeff삼일혁명과': 10253,
 '\ufeff신사임당의': 10254}

In [9]:
from collections import Counter
counter = Counter(vocabulary)

print(counter.most_common(100))
print(len(vocabulary))

[('\ufeff신사임당의', 10254), ('\ufeff삼일혁명과', 10253), ('\ufeff각지자체나', 10252), ('\ufeff★\ufeff국가유공자☆\u200b나\ufeff라를', 10251), ('\ufeff★강동구도시관리공단', 10250), ('\ufeff\u200b새', 10249), ('힘이', 10248), ('힘을믿어보자.제발', 10247), ('힘을', 10246), ('힘듭니다.직계가족이거나', 10245), ('힘듭니다.여러가지', 10244), ('힘듭니다.보훈대상자본인은', 10243), ('힘듭니다.', 10242), ('힘들지만', 10241), ('힘들어지고', 10240), ('힘들어서라고', 10239), ('힘들겠더군요오히려', 10238), ('힘들겠구나', 10237), ('힘들게', 10236), ('힘들거라', 10235), ('힘겨워', 10234), ('힘겨운', 10233), ('희한한', 10232), ('희망합니다.국민여가시간', 10231), ('희망하며', 10230), ('희망이라', 10229), ('희망이', 10228), ('희망센터를', 10227), ('희망기업과', 10226), ('희귀병으로', 10225), ('흡수할', 10224), ('흙이', 10223), ('흙을', 10222), ('흙냄새를', 10221), ('흘러버려', 10220), ('흘러내렸다.하늘에서의', 10219), ('흘러가도록', 10218), ('흔히', 10217), ('흔적을', 10216), ('흐름을', 10215), ('휴직중인', 10214), ('휴전국입니다.', 10213), ('휴원을', 10212), ('휴원도', 10211), ('휴업ㆍ폐업)규제공역무(공공서비스)계속의', 10210), ('휴업', 10209), ('휴대폰으로', 10208), ('휴대전화로', 10207), ('휭령하거나', 10206), ('휭령', 10205), ('휩니다.(한과목당', 10204), 