# TextRank 논문 텍스트를 TextRank로 분석하기.

## TextRank 알고리즘 구현

In [14]:
import numpy as np
import pandas as pd
import re

In [3]:
def make_pairs(tokens, window=2):
    
    nodes = list(set(tokens))
    vocab = nodes
    vocab2idx = {k:i for (i,k) in enumerate(nodes)}
    idx2vocab = {i:k for (i,k) in enumerate(nodes)}
    
    idx = list(idx2vocab.keys())
    #print(idx)
    
    res_dict = {k:[] for k in vocab}

    for i, token in enumerate(tokens):
        #print(token)
        leng = len(tokens)
        
        min_idx = max(0, i-window+1)
        max_idx = min(leng, i+window)
        
        for t in tokens[min_idx:max_idx]:  
            if t != token:
                res_dict[token].append(t)
    
    #print('with duplicates', res_dict)
    res_dict = {k:list(set(v)) for k, v in res_dict.items()}
    return res_dict

In [4]:
class trNode:
    def __init__(self, name, damping_factor=0.85):
        self.name = name
        self.d = damping_factor
        self.fnodes = []
        
        # initial score
        self.score = 1
        
    def update_score(self):
        # update node score
        input_weights = [fnode.score/len(fnode.fnodes) for fnode in self.fnodes]
        self.score = (1-self.d) + self.d * sum(input_weights)
        
    def update_fnodes(self, nodes):
        for node in nodes:
            self.fnodes.append(node)

In [5]:
class textRank:
    def __init__(self, nodes):
        self.nodes = nodes
        
    def update_nodes(self, times=None, threshold = 0.0001):
        # limited times
        if times:
            for i in range(times):
                [n.update_score() for n in self.nodes]
                return [(n.name, n.score) for n in self.nodes]
        # until convergence (< threshold)
        else:
            diff_scores = np.ones(len(self.nodes))
            while not (diff_scores < threshold).all():
                current_scores = np.array([n.score for n in self.nodes])
                [n.update_score() for n in self.nodes]
                new_scores = np.array([[n.score for n in self.nodes]])
                diff_scores = np.abs(new_scores - current_scores)
            return [(n.name, n.score) for n in self.nodes]        

In [6]:
def compute_textrank(tokens, window=2):
    # step1. convert a list of tokens into a dicionary of window information
    window_dict = make_pairs(tokens, window)

    # step2. update fnodes to each textrank node
    name2node = {name:trNode(name) for name in list(window_dict.keys())}
    #print(name2node)
    for node_name, node_fnodes in window_dict.items():
        fnodes_ = [name2node[fnode] for fnode in node_fnodes]
        name2node[node_name].update_fnodes(fnodes_)

    # step3. calculate node weight
    tr = textRank(list(name2node.values()))
    scores = tr.update_nodes()
    df_scores_sorted = pd.DataFrame(scores,
             columns = ['token', 'weight']).sort_values(by='weight', ascending=False)
    
    return df_scores_sorted

## Preprocess for Paper text
- nltk 영문 stopwords 적용
- 논문 특성상 나타나는 문자열 특성 파악
    - 전처리 코드 구현

In [8]:
# download stopwords for english
import nltk
nltk.download('stopwords')
stopwords = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/choigww/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## TextRank application to TextRank paper

### Abstract


In [23]:
abstract = '''
In this paper, we introduce TextRank – a graph-based ranking model for text processing, and show how this model can be successfully used in natural language applications. In particular, we propose two innova- tive unsupervised methods for keyword and sentence extraction, and show that the results obtained com- pare favorably with previously published results on established benchmarks.
'''
abstract = abstract.replace("- ", "")
abstract

'\nIn this paper, we introduce TextRank – a graph-based ranking model for text processing, and show how this model can be successfully used in natural language applications. In particular, we propose two innovative unsupervised methods for keyword and sentence extraction, and show that the results obtained compare favorably with previously published results on established benchmarks.\n'

In [24]:
pattern = r'[^a-zA-Z0-9]'
abstract_tokens = re.sub(pattern, ' ',abstract).split()
abstract_tokens_sw = [token.lower() for token in abstract_tokens\
                         if token.lower() not in stopwords]
article_textrank = compute_textrank(abstract_tokens_sw)
article_textrank[:10]

Unnamed: 0,token,weight
6,results,1.678049
0,model,1.613064
31,show,1.586814
5,introduce,1.144579
7,textrank,1.067155
10,graph,1.013384
19,propose,0.994907
20,two,0.994809
16,particular,0.993202
22,innovative,0.992872


### 1. Introduction

In [26]:
intro = '''
Graph-based ranking algorithms like Kleinberg’s HITS algorithm (Kleinberg, 1999) or Google’s PageRank (Brin and Page, 1998) have been success- fully used in citation analysis, social networks, and the analysis of the link-structure of the World Wide Web. Arguably, these algorithms can be singled out as key elements of the paradigm-shift triggered in the field of Web search technology, by providing a Web page ranking mechanism that relies on the col- lective knowledge of Web architects rather than in- dividual content analysis of Web pages. In short, a graph-based ranking algorithm is a way of deciding on the importance of a vertex within a graph, by tak- ing into account global information recursively com- puted from the entire graph, rather than relying only on local vertex-specific information.
Applying a similar line of thinking to lexical or semantic graphs extracted from natural language documents, results in a graph-based ranking model that can be applied to a variety of natural language processing applications, where knowledge drawn from an entire text is used in making local rank- ing/selection decisions. Such text-oriented ranking methods can be applied to tasks ranging from auto- mated extraction of keyphrases, to extractive summa- rization and word sense disambiguation (Mihalcea et al., 2004).
In this paper, we introduce the TextRank graph- based ranking model for graphs extracted from nat- ural language texts. We investigate and evaluate the application of TextRank to two language processing tasks consisting of unsupervised keyword and sen-
tence extraction, and show that the results obtained with TextRank are competitive with state-of-the-art systems developed in these areas.
'''

In [27]:
intro = intro.replace("- ", "")
pattern = r'[^a-zA-Z0-9]'
intro_tokens = re.sub(pattern, ' ',intro).split()
intro_tokens_sw = [token.lower() for token in intro_tokens\
                         if token.lower() not in stopwords]
article_textrank = compute_textrank(intro_tokens_sw)
article_textrank[:10]

Unnamed: 0,token,weight
93,ranking,3.439573
82,web,3.300161
61,graph,2.278878
85,textrank,2.176259
88,analysis,2.068224
74,language,1.751236
7,extraction,1.583042
51,information,1.553126
40,tasks,1.446419
76,kleinberg,1.424731
