https://blog.theeluwin.kr/post/146188165713/summariz3

In [1]:
import re 
from konlpy.tag import Kkma
from konlpy.tag import Twitter
from collections import Counter

kkma = Kkma()
twitter = Twitter()

  warn('"Twitter" has changed to "Okt" since KoNLPy v0.4.5.')


In [2]:
def xplit(*delimiters):
    return lambda value: re.split('|'.join([re.escape(delimiter) for delimiter in delimiters]), value)

In [3]:
xplit('. ', '? ', '! ', '\n', '.\n')("This is a sentence. Here is another sentence.\nHello, world!") 

['This is a sentence', 'Here is another sentence', 'Hello, world!']

In [4]:
kkma = Kkma()
kkma.nouns("이세돌은 알파고를 이겼다. 이세돌은 강하다. 알파고도 짱쎔.")

['이세', '이세돌', '돌', '파고']

In [5]:
bow1 = Counter(twitter.nouns("고양이 강아지 좋다"))
bow2 = Counter(twitter.nouns("고양이는 생선을 좋아한다"))

print(bow1)
print(bow2)

Counter({'고양이': 1, '강아지': 1})
Counter({'고양이': 1, '생선': 1})


In [6]:
j_index = sum((bow1 & bow2).values()) / sum((bow1 | bow2).values())

In [7]:
j_index

0.3333333333333333

In [8]:
class Sentence:
    @staticmethod
    def co_occurence(sentence1, sentence2):
        p = sum((sentence1.bow & sentence2.bow).values())
        q = sum((sentence1.bow | sentence2.bow).values())
        return p / q if q else 0
    
    def __init__(self, text, index=0):
        self.index = index
        self.text = text
        self.nouns = twitter.nouns(self.text)
        self.bow = Counter(self.nouns)
        
    def __eq__(self, another):
        return hashattr(another, 'index') and self.index == another.index
    
    def __hash__(self):
        return self.index 

In [9]:
def get_sentence(text):
    candidates = xplit('. ', '? ', '! ', '\n', '.\n')(text.strip())
    sentences = []
    index = 0
    
    for candidate in candidates:
        candidate = candidate.strip()
        
        if len(candidate):
            sentences.append(Sentence(candidate, index))
            index += 1
            
    return sentences

In [10]:
def build_graph(sentences):
    graph = networkx.Graph()
    graph.add_nodes_from(sentences)
    pairs = list(itertools.combinations(sentences, 2))
    
    for eins, zwei in pairs:
        graph.add_edge(eins, zwei, weight=Sentence.co_occurence(eins, zwei))
        
    return graph

In [21]:
def all_process(text):
    sentences = get_sentence(text)
    graph = build_graph(sentences)
    pagerank = networkx.pagerank(graph, weight='weight')
    reordered = sorted(pagerank, key=pagerank.get, reverse=True)
    
    return reordered

In [15]:
text = '''계절이 지나가는 하늘에는
가을로 가득 차 있습니다.

나는 아무 걱정도 없이
가을 속의 별들을 다 헤일 듯합니다.

가슴 속에 하나 둘 새겨지는 별을
이제 다 못 헤는 것은
쉬이 아침이 오는 까닭이요,
내일 밤이 남은 까닭이요,
아직 나의 청춘이 다하지 않은 까닭입니다.

별 하나에 추억과
별 하나에 사랑과
별 하나에 쓸쓸함과
별 하나에 동경과
별 하나에 시와
별 하나에 어머니, 어머니,

어머님, 나는 별 하나에 아름다운 말 한마디씩 불러봅니다.'''

In [22]:
all_process(text)


{<__main__.Sentence object at 0x14de06430>: 0.011152416356877325, <__main__.Sentence object at 0x14de061f0>: 0.011152416356877325, <__main__.Sentence object at 0x1053be4f0>: 0.032295656788849166, <__main__.Sentence object at 0x1053beb20>: 0.04101352697380999, <__main__.Sentence object at 0x107f56ee0>: 0.07321742547607142, <__main__.Sentence object at 0x107f56f40>: 0.011152416356877325, <__main__.Sentence object at 0x14de15c10>: 0.049564545351278536, <__main__.Sentence object at 0x14de15160>: 0.049564545351278536, <__main__.Sentence object at 0x14de153d0>: 0.0739743752793817, <__main__.Sentence object at 0x14de15af0>: 0.09339547939550787, <__main__.Sentence object at 0x14de15f70>: 0.09339547939550787, <__main__.Sentence object at 0x14de15310>: 0.11187018452265063, <__main__.Sentence object at 0x14de158b0>: 0.09339547939550785, <__main__.Sentence object at 0x14de15a30>: 0.09339547939550785, <__main__.Sentence object at 0x14de15b20>: 0.07932636809298467, <__main__.Sentence object at 0x14

[<__main__.Sentence at 0x14de15310>,
 <__main__.Sentence at 0x14de15af0>,
 <__main__.Sentence at 0x14de15f70>,
 <__main__.Sentence at 0x14de158b0>,
 <__main__.Sentence at 0x14de15a30>,
 <__main__.Sentence at 0x14de15a60>,
 <__main__.Sentence at 0x14de15b20>,
 <__main__.Sentence at 0x14de153d0>,
 <__main__.Sentence at 0x107f56ee0>,
 <__main__.Sentence at 0x14de15c10>,
 <__main__.Sentence at 0x14de15160>,
 <__main__.Sentence at 0x1053beb20>,
 <__main__.Sentence at 0x1053be4f0>,
 <__main__.Sentence at 0x14de06430>,
 <__main__.Sentence at 0x14de061f0>,
 <__main__.Sentence at 0x107f56f40>]

In [14]:
import networkx
import itertools

In [20]:
for i in all_process(text):
    print(i)

<__main__.Sentence object at 0x14de15be0>
<__main__.Sentence object at 0x14de15d60>
<__main__.Sentence object at 0x14de15f70>
<__main__.Sentence object at 0x14de15400>
<__main__.Sentence object at 0x14de15c70>
<__main__.Sentence object at 0x12768bbb0>
<__main__.Sentence object at 0x14de15b20>
<__main__.Sentence object at 0x14de06430>
<__main__.Sentence object at 0x105357b80>
<__main__.Sentence object at 0x105343670>
<__main__.Sentence object at 0x14de066d0>
<__main__.Sentence object at 0x1276bf460>
<__main__.Sentence object at 0x1276bf040>
<__main__.Sentence object at 0x14dbd8e80>
<__main__.Sentence object at 0x14dc36d60>
<__main__.Sentence object at 0x105357a90>
