In [2]:
from konlpy.tag import Kkma
from konlpy.utils import pprint
from konlpy.tag import Okt
from sklearn.feature_extraction.text import TfidfVectorizer
import json
import math
import operator
# TF(term frequency)는 특정한 단어가 문서 내에 얼마나 자주 등장하는지를 나타내는 값. 이 값이 높을수록 문서에서 중요하다고 생각할 수 있다.
# DF(Document frequency)는 단어 자체가 문서군 내에서 자주 사용되는 경우, 이것은 그 단어가 흔하게 등장한다는 것을 의미함. IDF는 그 반대
# TF-IDF는 TF와 IDF를 곱한 값으로 점수가 높은 단어일수록 다른 문서에는 많지 않고 해당 문서에서 자주 등장하는 단어를 의미한다.

class DoublespaceLineCorpus:    
    def __init__(self, corpus_fname, iter_sent = False):
        self.corpus_fname = corpus_fname
        self.iter_sent = iter_sent
            
    def __iter__(self):
        with open(self.corpus_fname, encoding='utf-8') as f:
            if self.iter_sent:
                json_data = json.load(f)
                for doc in json_data:
                    json_content = doc["doc_content"]
                    yield [json_content]
            else:
                json_data = json.load(f)
                for doc in json_data:
                    json_content = doc["doc_content"]
                    yield [json_content.strip()]
                    
# def json_open(corpus_fname,iter_sent = False):
#     with open(corpus_fname, encoding='utf-8') as f:
#             if iter_sent:
#                 json_data = json.load(f)
#                 for doc in json_data:
#                     json_content = doc["doc_content"]
#                     yield [json_content]
#             else:
#                 json_data = json.load(f)
#                 for doc in json_data:
#                     json_content = doc["doc_content"]
#                     yield [json_content.strip()]

# =======================================
# -- TF-IDF function
# =======================================
def f(t, d):
    # d is document == tokens
    return d.count(t)

def tf(t, d):
    # d is document == tokens
    return 0.5 + 0.5*f(t,d)/max([f(w,d) for w in d])

def idf(t, D):
    # D is documents == document list
    numerator = len(D)
    denominator = 1 + len([ True for d in D if t in d])
    return math.log10(numerator/denominator)

def tfidf(t, d, D):
    return tf(t,d)*idf(t, D)
    
    
def tokenizer(d):
    # return [ t for t in d.split() if len(t) > 1 ]
    # return d.split()
    return okt.nouns(d)

def tfidfScorer(D):
    tokenized_D = [tokenizer(d) for d in D]
    result = []
    for d in tokenized_D:
        result.append([(t, tfidf(t, d, tokenized_D)) for t in d])
    return result


if __name__ == '__main__':
    corpus_path = 'C:\ex/와이즈넛/주제어 추출/cafe_data/D-00-201908060736-07977.json'
                    
    corpus = DoublespaceLineCorpus(corpus_path, iter_sent=False)
    #corpus = json_open(corpus_path)
    okt = Okt()
    T=[]
    for cont in corpus:
        T.append(cont)
    t =[]
    for i in T:
        t.append(str(i))
        
    sort = {}
    #for cont in corpus:
    result = tfidfScorer(t[0:10])
    for i in range(0,len(result)):
        sort[str(i)] = sorted(set(result[i]), key = operator.itemgetter(1),reverse=True)
   

-------------------------------------------------------------------------------
Deprecated: convertStrings was not specified when starting the JVM. The default
behavior in JPype will be False starting in JPype 0.8. The recommended setting
for new code is convertStrings=False.  The legacy value of True was assumed for
please file a ticket with the developer.
-------------------------------------------------------------------------------

  """)


In [3]:
sort

{'0': [('자유', 0.4659800028906792),
  ('주변', 0.4659800028906792),
  ('상권', 0.4659800028906792),
  ('데크', 0.4659800028906792),
  ('골목', 0.4659800028906792),
  ('초등학교', 0.4659800028906792),
  ('타일', 0.4659800028906792),
  ('물', 0.4659800028906792),
  ('최적화', 0.4659800028906792),
  ('공원', 0.4659800028906792),
  ('외부', 0.4659800028906792),
  ('천정', 0.4659800028906792),
  ('야외', 0.4659800028906792),
  ('영업', 0.4659800028906792),
  ('그대로', 0.4659800028906792),
  ('수도', 0.3485858301868917),
  ('바로', 0.3485858301868917),
  ('시설', 0.3485858301868917),
  ('조명', 0.3485858301868917),
  ('시공', 0.3485858301868917),
  ('앞', 0.3485858301868917),
  ('퇴계동', 0.3485858301868917),
  ('인테리어', 0.2652933391146917),
  ('방', 0.2652933391146917),
  ('바닥', 0.2652933391146917),
  ('화장실', 0.2652933391146917),
  ('몸', 0.2652933391146917),
  ('상가', 0.2218487496163564),
  ('매매', 0.2218487496163564),
  ('주차', 0.20068666377598746),
  ('가능', 0.18487395801363032),
  ('매물', 0.14789916641090425),
  ('주소', 0.14789916641090425

In [191]:
result = tfidfScorer(t[0:10])