In [1]:
from collections import OrderedDict
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import os

In [2]:
nlp = spacy.load('en')

In [3]:
class TextRank4Keyword:
    """Extract keywords from text"""
    def __init__(self):
        self.d = 0.85  # damping coefficient, usually is .85
        self.min_diff = 1e-5  # convergence threshold  0.00001
        self.steps = 10  # iteration steps
        self.node_weight = None  # save keywords and its weight
    def set_stopwords(self, stopwords):
        """Set stop words"""
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True
    def sentence_segment(self, doc, candidate_pos, lower):
        """Store those words only in cadidate_pos"""
        sentences = []
        for sent in doc.sents:
            selected_words = []
            for token in sent:
                if token.pos_ in candidate_pos and token.is_stop is False:
                    if lower is True:
                        selected_words.append(token.text.lower())
                    else:
                        selected_words.append(token.text)
            sentences.append(selected_words)
        return sentences
    def get_vocab(self, sentences):
        """Get all tokens"""
        vocab = OrderedDict()
        i = 0
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = i
                    i += 1
        return vocab
    def get_token_pairs(self, window_size, sentences):
        """Build token_pairs from windows in sentences"""
        token_pairs = list()
        for sentence in sentences:
            for i, word in enumerate(sentence):
                for j in range(i + 1, i + window_size):
                    if j >= len(sentence):
                        break
                    pair = (word, sentence[j])
                    if pair not in token_pairs:
                        token_pairs.append(pair)
        return token_pairs
    def symmetrize(self, a):
        return a + a.T - np.diag(a.diagonal())
    def get_matrix(self, vocab, token_pairs):
        """Get normalized matrix"""
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            i, j = vocab[word1], vocab[word2]
            g[i][j] = 1

        # Get Symmeric matrix
        g = self.symmetrize(g)

        # Normalize matrix by column
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, where=norm != 0)  # this is ignore the 0 element in norm
        return g_norm
    def get_keywords(self, file_name, number=10):
        """Print top number keywords"""
        outFile = open('Input/result_'+file_name, 'w', encoding="utf8")
        node_weight = OrderedDict(sorted(self.node_weight.items(), key=lambda t: t[1], reverse=True))
        for i, (key, value) in enumerate(node_weight.items()):
            print(key + ' - ' + str(value))
            outFile.write(key + ' - ' + str(value))
            outFile.write("\n")
            if i > number:
                break
        outFile.close()                
    def analyze(self, file_name, text, candidate_pos=['NOUN', 'PROPN', 'VERB'], window_size=3, lower=False, stopwords=list()):
        """Main function to analyze text"""

        # Set stop words
        self.set_stopwords(stopwords)

        # Pare text by spaCy
        doc = nlp(text)

        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower)  # list of list of words

        # Build vocabulary
        vocab = self.get_vocab(sentences)

        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)

        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)

        # Initionlization for weight(pagerank value)
        pr = np.array([1] * len(vocab))

        # Iteration
        previous_pr = 0
        for epoch in range(self.steps):
            pr = (1 - self.d) + self.d * np.dot(g, pr)
            if abs(previous_pr - sum(pr)) < self.min_diff:
                break
            else:
                previous_pr = sum(pr)

        # Get weight for each node
        node_weight = dict()
        for word, index in vocab.items():
            node_weight[word] = pr[index]

        self.node_weight = node_weight

In [4]:
def analyze_files():
    dir_path = '.\\Input'
    all_files = os.listdir(dir_path)
    txt_files = list(filter(lambda x: x[-4:] == '.txt', all_files))
    for file in txt_files:
        text = open(dir_path+'\\'+file, "r", errors='ignore')
        tr4w = TextRank4Keyword()
        tr4w.analyze(file, text.read(), candidate_pos = ['NOUN', 'PROPN', 'VERB'], window_size=4, lower=False)
        print("------> Keywords of "+file+" are:")
        tr4w.get_keywords(file, 10)
        print()

In [6]:
analyze_files()

------> Keywords of i1.txt are:
Roundup - 3.474717819589368
California - 3.346399196919054
Monsanto - 2.781928006449759
Bayer - 2.735789458605436
award - 2.5375336733643254
regulators - 2.33947138553843
Smith - 2.333728633670195
based - 2.285202703229221
damages - 2.035912061427344
jury - 1.9095770674633439
said - 1.8933706144878424
case - 1.8264373794404167

------> Keywords of i2.txt are:
Twitter - 5.247483003298711
users - 4.905279028016895
% - 3.2785649339703475
revenue - 3.1249577594239137
platform - 2.9848391408003145
said - 2.887012402269453
quarter - 2.4720092789584727
site - 2.1065516876253
company - 2.0967995060266227
content - 2.0688847019873498
growth - 1.7616043639941767
year - 1.7234142272246213

------> Keywords of i3.txt are:
% - 6.715146696397326
growth - 5.349439414528776
economy - 4.133499903732388
Trump - 3.5557031423111276
tax - 3.1403892904153192
year - 3.074499665148791
quarter - 3.0535080509077646
measure - 2.163811606661924
cuts - 2.1373967221490977
rate - 2.01