In [2]:
import json
import pandas as pd
import re

from collections import Counter
# from collections import defaultdict
from scipy.sparse import csr_matrix
import numpy as np
from sklearn.preprocessing import normalize
import math

In [3]:
def preprocess_sentence(article: str) -> str:
    """Clean one piece of text and expand parenthesised aliases.

    Processing order:
      1. Double quotes are turned into single quotes, then a fixed set of
         punctuation characters (which includes the single quote) is removed.
         Parentheses are not in that set, so they are still present for step 2.
      2. Every parenthesised group is examined. Depending on its content,
         later occurrences of the alias inside it are replaced by the text
         that precedes the group; the group itself is then deleted.
      3. Only spaces, Hangul syllables, ASCII letters and digits are kept.

    Returns:
        The cleaned text, or '' when at most one character is left after
        stripping spaces.
    """
    article = re.sub('"',"'", article)
    article = re.sub('[-=+,#/\?:^$.@*※~&%ㆍ!』\\‘|\[\]\<\>`\'…》]', '', article)
    # Every "(...)" group that contains no ')' inside it.
    bracket = re.findall(r'\([^)]*\)', article )
    for i in bracket:
        # Text of the group without its leading/trailing parentheses.
        word = i.strip('()')
        if word.isupper():
            # Upper-case group, presumably an abbreviation such as "(WHO)" — TODO confirm.
            end_index = article.find(i)
            # Walk backwards from the '(' down to index 1 and measure the distance to the
            # nearest preceding space. find() yields -1 when there is none, which makes
            # `origin` below an empty string.
            word_len = article[end_index:0:-1].find(' ')
            start_index = end_index - word_len +1
            # The word sitting directly in front of the group.
            origin = article[start_index : end_index]
            # Leave everything up to the end of the group untouched; after it, swap the
            # abbreviation for the word that preceded the group.
            article = article[:end_index+len(i)] + article[end_index+len(i):].replace(word, origin)
        else:
            # Groups containing '이하' — presumably a "hereinafter X" alias definition; TODO confirm.
            if '이하' in word:
                # Drop the first three characters (presumably '이하' plus a space).
                word = word[3:]
                n_space = word.count(' ')
                # Four characters before the alias's first occurrence; this equals the index of
                # '(' only if that occurrence is the one inside "(이하 ...)" — assumption.
                end_index = article.find(word)-4
                # Words from the (up to) 30 characters before that point, nearest word first.
                # NOTE(review): when end_index < 30 the slice start is negative and counts from
                # the end of the string — confirm this is acceptable.
                range_candidate = article[end_index-30:end_index].split(' ')[::-1]
                # Take as many preceding words as the alias has, restored to reading order.
                origin = ' '.join(range_candidate[:n_space+1][::-1])
                article = article[:end_index+len(i)] + article[end_index+len(i):].replace(word, origin)
        # Remove every occurrence of the parenthesised group, whichever branch ran.
        article = article.replace(i,'')
    # Keep only spaces, Hangul syllables, ASCII letters and digits.
    article = ''.join(re.findall('[ 가-힣a-zA-Z0-9]',  article) )
    if len(article.strip(' ')) > 1:
        return article
    else:
        return ''

In [4]:
!pip install konlpy
# Tokenizer
from konlpy.tag import Komoran

komoran = Komoran()
def komoran_tokenize(sent):
    """Tokenize `sent` with the module-level Komoran tagger and filter by tag marker.

    `komoran.pos(..., join=True)` is presumed to yield 'morpheme/TAG' strings
    (confirm against the konlpy docs). A token is kept when it contains any of
    the substrings '/NN', '/XR', '/VA' or '/VV'.
    """
    kept_markers = ('/NN', '/XR', '/VA', '/VV')
    tagged = komoran.pos(sent, join=True)
    return [token for token in tagged if any(marker in token for marker in kept_markers)]

Collecting konlpy
[?25l  Downloading https://files.pythonhosted.org/packages/85/0e/f385566fec837c0b83f216b2da65db9997b35dd675e107752005b7d392b1/konlpy-0.5.2-py2.py3-none-any.whl (19.4MB)
[K     |████████████████████████████████| 19.4MB 1.2MB/s 
[?25hCollecting beautifulsoup4==4.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/9e/d4/10f46e5cfac773e22707237bfcd51bbffeaf0a576b0a847ec7ab15bd7ace/beautifulsoup4-4.6.0-py3-none-any.whl (86kB)
[K     |████████████████████████████████| 92kB 12.1MB/s 
[?25hCollecting JPype1>=0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/fd/96/1030895dea70855a2e1078e3fe0d6a63dcb7c212309e07dc9ee39d33af54/JPype1-1.1.2-cp36-cp36m-manylinux2010_x86_64.whl (450kB)
[K     |████████████████████████████████| 460kB 50.3MB/s 
Collecting tweepy>=3.7.0
  Downloading https://files.pythonhosted.org/packages/bb/7c/99d51f80f3b77b107ebae2634108717362c059a41384a1810d13e2429a81/tweepy-3.9.0-py2.py3-none-any.whl
Collecting colorama
  Download

In [5]:
def scan_vocabulary(sents, tokenize, min_count=2):
    """Build a frequency-ordered vocabulary from tokenized sentences.

    Args:
        sents: iterable of sentences.
        tokenize: callable turning one sentence into an iterable of tokens.
        min_count: tokens seen fewer times than this are dropped.

    Returns:
        (idx_to_vocab, vocab_to_idx): the kept tokens ordered by descending
        count (ties keep first-seen order), and the inverse token -> index map.
    """
    frequencies = Counter()
    for sentence in sents:
        for token in tokenize(sentence):
            frequencies[token] += 1

    frequent = [(token, seen) for token, seen in frequencies.items() if seen >= min_count]
    # Stable sort: equally frequent tokens stay in first-seen order.
    frequent.sort(key=lambda pair: pair[1], reverse=True)

    idx_to_vocab = [token for token, _ in frequent]
    vocab_to_idx = dict(zip(idx_to_vocab, range(len(idx_to_vocab))))
    return idx_to_vocab, vocab_to_idx

In [6]:
def pagerank(x, df=0.85, max_iter=30):
    """Run a fixed number of PageRank update steps on the weight matrix `x`.

    Args:
        x: weight/adjacency matrix accepted by `normalize` (scipy sparse or
            dense). It is expected to be square, since the rank vector has
            `x.shape[0]` rows and is multiplied by the normalised matrix.
        df: damping factor, strictly between 0 and 1.
        max_iter: number of update steps; there is no convergence check.

    Returns:
        Array of shape (n, 1) holding one score per row of `x`.

    Raises:
        AssertionError: if `df` is not strictly between 0 and 1.
    """
    assert 0 < df < 1

    # Column-wise L1 normalisation of the weights.
    A = normalize(x, axis=0, norm='l1')
    n_nodes = A.shape[0]
    R = np.ones((n_nodes, 1))
    bias = (1 - df) * np.ones((n_nodes, 1))

    for _ in range(max_iter):
        # `.dot` is a matrix product for scipy sparse and dense inputs alike.
        # `A * R` is a matrix product only for scipy.sparse *matrix* classes; for a
        # dense ndarray it broadcasts element-wise and silently gives an (n, n) result.
        R = df * A.dot(R) + bias
    return R

In [7]:
def textrank_sent_sim(s1, s2):
    """TextRank sentence similarity between two token sequences.

    The score is the number of distinct shared tokens divided by
    log(len(s1)) + log(len(s2)). Returns 0 when either sequence has at most
    one token, which also avoids a zero denominator.
    """
    len_first, len_second = len(s1), len(s2)
    if min(len_first, len_second) <= 1:
        return 0
    shared = set(s1) & set(s2)
    return len(shared) / (math.log(len_first) + math.log(len_second))

In [8]:
def cosine_sent_sim(s1, s2):
    """Cosine similarity between the token-count vectors of two token sequences.

    Returns 0 when either sequence is empty.
    """
    if not (s1 and s2):
        return 0

    counts_first = Counter(s1)
    counts_second = Counter(s2)

    def _length(counts):
        # Euclidean length of a token-count vector.
        return math.sqrt(sum(value ** 2 for value in counts.values()))

    # A Counter yields 0 for tokens it has not seen, so unshared tokens add nothing.
    dot = sum(count * counts_second[token] for token, count in counts_first.items())
    return dot / (_length(counts_first) * _length(counts_second))

In [9]:
def sent_graph(sents, tokenize, similarity, min_count=2, min_sim=0.3):
    """Build a sentence-similarity graph as a sparse matrix.

    Args:
        sents: iterable of sentences.
        tokenize: callable turning one sentence into an iterable of tokens.
        similarity: callable taking two token lists and returning a number.
        min_count: tokens seen fewer times than this across all sentences are
            dropped before similarities are computed (the same rule as
            `scan_vocabulary`).
        min_sim: pairs whose similarity is below this value get no edge.

    Returns:
        csr_matrix of shape (n_sents, n_sents). Only entries with row < column
        are filled, so the matrix is strictly upper-triangular.
    """
    # Tokenize each sentence exactly once and reuse the result for both the
    # vocabulary count and the per-sentence token lists. Tokenizing is typically the
    # most expensive step, and a one-shot iterable for `sents` would otherwise be
    # exhausted by the first pass.
    tokenized = [list(tokenize(sent)) for sent in sents]
    frequency = Counter(w for words in tokenized for w in words)
    vocab = {w for w, c in frequency.items() if c >= min_count}
    tokens = [[w for w in words if w in vocab] for words in tokenized]

    rows, cols, data = [], [], []
    n_sents = len(tokens)
    for i in range(n_sents):
        # Each unordered pair is visited once, always with i < j.
        for j in range(i + 1, n_sents):
            sim = similarity(tokens[i], tokens[j])
            if sim < min_sim:
                continue
            rows.append(i)
            cols.append(j)
            data.append(sim)
    return csr_matrix((data, (rows, cols)), shape=(n_sents, n_sents))

In [10]:
def textrank_keysentence(sents, tokenize, min_count, min_sim, similarity, df=0.85, max_iter=30, topk= 3 ):
    """Return the indices of the `topk` highest-ranked sentences, best first.

    A similarity graph is built with `sent_graph`, scored with `pagerank`, and
    the positions of the largest scores are returned as a list of indices
    into `sents`.
    """
    graph = sent_graph(sents, tokenize, similarity, min_count, min_sim)
    scores = pagerank(graph, df, max_iter).reshape(-1)
    # argsort is ascending: keep the last `topk` positions, then flip to best-first.
    best_first = scores.argsort()[-topk:][::-1]
    return list(best_first)

In [11]:
# Colab-only: mount Google Drive at /content/gdrive/ so files stored there can be opened.
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [12]:
# Path of the JSON-lines file that the evaluation cells read (one JSON object per line).
input_file_name = '/content/gdrive/My Drive/extract_summary/train.jsonl'

In [None]:
# Grid search: for every similarity function / min_count / min_sim combination, print
# the mean number of predicted key sentences (textrank_keysentence's default topk)
# that appear in the labelled answer.
# Parsing and preprocessing do not depend on the searched parameters, so the file is
# read and cleaned once up front instead of once per combination.
articles = []
with open(input_file_name, 'r', encoding = 'utf-8', newline = '') as input_file:
    for line in input_file:
        line = json.loads(line)
        # Positional unpacking: skips the first value and expects exactly four more.
        # NOTE(review): relies on the JSON key order — confirm against the dataset schema.
        id_num, sents , _ ,answer_index = list(line.values())[1:]
        articles.append(([ preprocess_sentence(sent) for sent in sents ], answer_index))

for similar in [textrank_sent_sim , cosine_sent_sim ]:
    for mc in range(2,7):
        for ms in np.arange(0.1,1,0.1):
            correct_list = []
            for preprocessed, answer_index in articles:
                key_index = textrank_keysentence(preprocessed , komoran_tokenize , mc , ms , similar )
                correct = len([ind for ind in key_index if ind in answer_index])
                correct_list.append(correct)

            print(f'similarity_function : {similar} , min_count : {mc} , min_sim : {ms} ==> {sum(correct_list)/len(correct_list)}')

similarity_function : <function textrank_sent_sim at 0x7fb88a6fcea0> , min_count : 2 , min_sim : 0.1 ==> 1.6242085835105016
similarity_function : <function textrank_sent_sim at 0x7fb88a6fcea0> , min_count : 2 , min_sim : 0.2 ==> 1.6218022101254586
similarity_function : <function textrank_sent_sim at 0x7fb88a6fcea0> , min_count : 2 , min_sim : 0.30000000000000004 ==> 1.598696353059365
similarity_function : <function textrank_sent_sim at 0x7fb88a6fcea0> , min_count : 2 , min_sim : 0.4 ==> 1.586594397588954
similarity_function : <function textrank_sent_sim at 0x7fb88a6fcea0> , min_count : 2 , min_sim : 0.5 ==> 1.5511996822652618
similarity_function : <function textrank_sent_sim at 0x7fb88a6fcea0> , min_count : 2 , min_sim : 0.6 ==> 1.5162488610611407
similarity_function : <function textrank_sent_sim at 0x7fb88a6fcea0> , min_count : 2 , min_sim : 0.7000000000000001 ==> 1.4647337803424993
similarity_function : <function textrank_sent_sim at 0x7fb88a6fcea0> , min_count : 2 , min_sim : 0.8 ==

In [16]:
# Single evaluation run with cosine similarity.
# The settings are bound once so the call and the printed label cannot drift apart.
similar, mc, ms = cosine_sent_sim, 2, 0.1
with open(input_file_name, 'r', encoding = 'utf-8', newline = '') as input_file:
    correct_list = []
    for line in input_file:
        line = json.loads(line)
        # Positional unpacking: skips the first value and expects exactly four more.
        # NOTE(review): relies on the JSON key order — confirm against the dataset schema.
        id_num, sents , _ ,answer_index = list(line.values())[1:]
        preprocessed = [ preprocess_sentence(sent) for sent in sents ]
        key_index = textrank_keysentence(preprocessed , komoran_tokenize , mc , ms , similar )
        # Number of predicted key sentences that are also in the labelled answer.
        correct = len([ind for ind in key_index if ind in answer_index])
        correct_list.append(correct)

    print(f'similarity_function : {similar} , min_count : {mc} , min_sim : {ms} ==> {sum(correct_list)/len(correct_list)}')

similarity_function : <function cosine_sent_sim at 0x7f902143e8c8> , min_count : 2 , min_sim : 0.1 ==> 1.584678644020279


In [None]:
# test2
# Single evaluation run with the TextRank similarity.
# The settings are bound once so the printed label always names the function that was
# actually used (the label previously said cosine_sent_sim for a textrank_sent_sim run).
similar, mc, ms = textrank_sent_sim, 2, 0.1
with open(input_file_name, 'r', encoding = 'utf-8', newline = '') as input_file:
    correct_list = []
    for line in input_file:
        line = json.loads(line)
        # Positional unpacking: skips the first value and expects exactly four more.
        # NOTE(review): relies on the JSON key order — confirm against the dataset schema.
        id_num, sents , _ ,answer_index = list(line.values())[1:]
        preprocessed = [ preprocess_sentence(sent) for sent in sents ]
        key_index = textrank_keysentence(preprocessed , komoran_tokenize , mc , ms , similar )
        # Number of predicted key sentences that are also in the labelled answer.
        correct = len([ind for ind in key_index if ind in answer_index])
        correct_list.append(correct)

    print(f'similarity_function : {similar} , min_count : {mc} , min_sim : {ms} ==> {sum(correct_list)/len(correct_list)}')

In [31]:
# test 3
# Compare damping factors and iteration counts using the TextRank similarity with
# min_count=2 and min_sim=0.1.
# Parsing and preprocessing do not depend on df or max_iter, so the file is read and
# cleaned once up front instead of once per combination.
articles = []
with open(input_file_name, 'r', encoding = 'utf-8', newline = '') as input_file:
    for line in input_file:
        line = json.loads(line)
        # Positional unpacking: skips the first value and expects exactly four more.
        # NOTE(review): relies on the JSON key order — confirm against the dataset schema.
        id_num, sents , _ ,answer_index = list(line.values())[1:]
        articles.append(([ preprocess_sentence(sent) for sent in sents ], answer_index))

for df in [0.5, 0.85] :
    for max_iter in [60, 30] :
        correct_list = []
        for preprocessed, answer_index in articles:
            key_index = textrank_keysentence(preprocessed , komoran_tokenize , 2 , 0.1 , textrank_sent_sim , df = df, max_iter= max_iter )
            # Number of predicted key sentences that are also in the labelled answer.
            correct = len([ind for ind in key_index if ind in answer_index])
            correct_list.append(correct)

        print(f'df : {df} , max_iter : {max_iter}  ==> {sum(correct_list)/len(correct_list)}')

df : 0.5 , max_iter : 60  ==> 1.6289278788869939
df : 0.5 , max_iter : 30  ==> 1.6289278788869939
df : 0.85 , max_iter : 60  ==> 1.6242085835105016
df : 0.85 , max_iter : 30  ==> 1.6242085835105016


In [32]:
# test 4
# Compare small damping factors using the TextRank similarity with min_count=2 and
# min_sim=0.1.
# max_iter is set here and passed explicitly. The print below previously read a
# `max_iter` left over from another cell, which raises NameError on a fresh kernel and
# could report a value that was never used. 30 is textrank_keysentence's default.
max_iter = 30

# Parsing and preprocessing do not depend on df, so the file is read and cleaned once.
articles = []
with open(input_file_name, 'r', encoding = 'utf-8', newline = '') as input_file:
    for line in input_file:
        line = json.loads(line)
        # Positional unpacking: skips the first value and expects exactly four more.
        # NOTE(review): relies on the JSON key order — confirm against the dataset schema.
        id_num, sents , _ ,answer_index = list(line.values())[1:]
        articles.append(([ preprocess_sentence(sent) for sent in sents ], answer_index))

for df in [0.1, 0.2, 0.3 ] :
    correct_list = []
    for preprocessed, answer_index in articles:
        key_index = textrank_keysentence(preprocessed , komoran_tokenize , 2 , 0.1 , textrank_sent_sim , df = df , max_iter = max_iter )
        # Number of predicted key sentences that are also in the labelled answer.
        correct = len([ind for ind in key_index if ind in answer_index])
        correct_list.append(correct)

    print(f'df : {df} , max_iter : {max_iter}  ==> {sum(correct_list)/len(correct_list)}')

df : 0.1 , max_iter : 30  ==> 1.625843982898395
df : 0.2 , max_iter : 30  ==> 1.6286007990094151
df : 0.3 , max_iter : 30  ==> 1.6293717730065649
