### Keyword Extraction using TextRank

In [4]:
from collections import OrderedDict
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [5]:
nlp = spacy.load('en_core_web_sm')

In [6]:
class TextRank4Keyword():
    """Extract keywords from text"""
    
    def __init__(self):
        self.d = 0.85 # damping coefficient, usually is .85
        self.min_diff = 1e-5 # convergence threshold
        self.steps = 10 # iteration steps
        self.node_weight = None # save keywords and its weight

    
    def set_stopwords(self, stopwords):  
        """Set stop words"""
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True
    
    def sentence_segment(self, doc, candidate_pos, lower):
        """Store those words only in cadidate_pos"""
        sentences = []
        for sent in doc.sents:
            selected_words = []
            for token in sent:
                # Store words only with cadidate POS tag
                if token.pos_ in candidate_pos and token.is_stop is False:
                    if lower is True:
                        selected_words.append(token.text.lower())
                    else:
                        selected_words.append(token.text)
            sentences.append(selected_words)
        return sentences
        
    def get_vocab(self, sentences):
        """Get all tokens"""
        vocab = OrderedDict()
        i = 0
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = i
                    i += 1
        return vocab
    
    def get_token_pairs(self, window_size, sentences):
        """Build token_pairs from windows in sentences"""
        token_pairs = list()
        for sentence in sentences:
            for i, word in enumerate(sentence):
                for j in range(i+1, i+window_size):
                    if j >= len(sentence):
                        break
                    pair = (word, sentence[j])
                    if pair not in token_pairs:
                        token_pairs.append(pair)
        return token_pairs
        
    def symmetrize(self, a):
        return a + a.T - np.diag(a.diagonal())
    
    def get_matrix(self, vocab, token_pairs):
        """Get normalized matrix"""
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            i, j = vocab[word1], vocab[word2]
            g[i][j] = 1
            
        # Get Symmeric matrix
        g = self.symmetrize(g)
        
        # Normalize matrix by column
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, where=norm!=0) # this is ignore the 0 element in norm
        
        return g_norm

    
    def get_keywords(self, number=10):
        """Print top number keywords"""
        node_weight = OrderedDict(sorted(self.node_weight.items(), key=lambda t: t[1], reverse=True))
        for i, (key, value) in enumerate(node_weight.items()):
            print(key + ' - ' + str(value))
            if i > number:
                break
        
        
    def analyze(self, text, 
                candidate_pos=['NOUN', 'PROPN'], 
                window_size=4, lower=False, stopwords=list()):
        """Main function to analyze text"""
        
        # Set stop words
        self.set_stopwords(stopwords)
        
        # Pare text by spaCy
        doc = nlp(text)
        
        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower) # list of list of words
        
        # Build vocabulary
        vocab = self.get_vocab(sentences)
        
        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)
        
        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)
        
        # Initionlization for weight(pagerank value)
        pr = np.array([1] * len(vocab))
        
        # Iteration
        previous_pr = 0
        for epoch in range(self.steps):
            pr = (1-self.d) + self.d * np.dot(g, pr)
            if abs(previous_pr - sum(pr))  < self.min_diff:
                break
            else:
                previous_pr = sum(pr)

        # Get weight for each node
        node_weight = dict()
        for word, index in vocab.items():
            node_weight[word] = pr[index]
        
        self.node_weight = node_weight

In [63]:
keyphrase_extractor = TextRank4Keyword()


In [8]:
text = """Disney will have a competitive advantage over Netflix when the entertainment conglomerate launches a competing video streaming platform later this year, according to Wall Street analyst David Trainer.

″[Disney’s] got the ability to merchandise, which is another way to monetize content in a way that Netflix does not have,” Trainer said on CNBC’s “Closing Bell” Wednesday. He’s chief of the New Constructs research firm.

Netflix increased its subscription prices Tuesday, sending the stock up 6.5 percent that day as well.

However, Trainer called it a “key dilemma” for the company and it “makes their competitors more viable.” The dilemma, he explained, is that the streaming company relies too much on its subscribers to generate revenue.

“It’s a Catch-22 for a business model that, when you look at the fundamentals, really just doesn’t work,” Trainer alleged.
The Netflix price increase for U.S. subscribers ranges between 13 percent and 18 percent, which Victory Anthony of Aegis Capital sees as a positive, a view generally shared by much of the investment community.

“It’s all profit for the price increase and so they can either use that to invest in more original content or they can let that drop down to their down to the bottom line,” Anthony said on “Squawk Alley” Thursday.

Aegis put a hold on Neflix at current levels because it’s about 8 percent higher than Anthony’s price target of $325. The stock was trading steady around $351 midday Thursday, up more than 50 percent since the Christmas Eve washout. Netflix releases its fourth quarter earnings after the bell Thursday. Netflix last reported double-digit user growth, with 58 million U.S. and 78 million international subscribers.

Original content aside, Netflix has built its large subscriber base, in part, on licensed content from a number of third-party TV and movie studios that plan to crowd into the video streaming market, which already includes other established online rivals such as Amazon and Hulu.

Disney, which agreed to purchase Twenty-First Century Fox assets last summer, said it would pull its movies from Netflix when it launches Disney+ in late 2019. AT&T’s WarnerMedia announced in October it would release a platform in the fourth quarter of 2019. Apple could be dropping a service this year, Comcast’s NBCUniversal on Monday revealed plans for a free streaming program with ads slated for early 2020.

New Constructs’ Trainer said Netflix is vulnerable because it’s a “one-trick pony” with an online distribution system that is not “defensible.” The company can keep growing its subscriber base, but it will need to address cash flow, he said.

“You can count on one hand the number of firms that have, over time, successfully monetized original content. It’s an expensive, difficult proposition,” he argued. “Disney’s done it and part of the reason they’ve done [it] is because they’ve got better ways of monetizing.”"""
tr4w = TextRank4Keyword()
tr4w.analyze(text, candidate_pos = ['NOUN', 'PROPN',"ADP"], window_size=8, lower=False)
tr4w.get_keywords(10)

Netflix - 5.142884247372746
Trainer - 3.104858212116699
content - 2.8629801477461343
percent - 2.6161740340469946
Anthony - 1.9855734135916085
streaming - 1.898886168209727
Disney - 1.8977665674389792
video - 1.8701483706096886
year - 1.7830143220665544
“ - 1.7500370343901208
Thursday - 1.7118131407082973
platform - 1.7019497829033114


### Another TextRank Implementation

In [42]:
### if you are getting an error in the block below, run the following command
#!conda install -c anaconda nltk

In [43]:
def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    import itertools, nltk, string
    
    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent)) for tagged_sent in tagged_sents))
    # join constituent chunk words into a single chunked phrase
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda word__pos__chunk: word__pos__chunk[2] != 'O') if key]

    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]

In [45]:
def extract_candidate_words(text, good_tags=set(['JJ','JJR','JJS','NN','NNP','NNS','NNPS'])):
    import itertools, nltk, string

    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize and POS-tag words
    tagged_words = itertools.chain.from_iterable(nltk.pos_tag_sents(nltk.word_tokenize(sent)
                                                                    for sent in nltk.sent_tokenize(text)))
    # filter on certain POS tags and lowercase all words
    candidates = [word.lower() for word, tag in tagged_words
                  if tag in good_tags and word.lower() not in stop_words
                  and not all(char in punct for char in word)]

    return candidates

In [46]:
def score_keyphrases_by_tfidf(texts, candidates='chunks'):
    import gensim, nltk
    
    # extract candidates from each text in texts, either chunks or words
    if candidates == 'chunks':
        boc_texts = [extract_candidate_chunks(text) for text in texts]
    elif candidates == 'words':
        boc_texts = [extract_candidate_words(text) for text in texts]
    # make gensim dictionary and corpus
    dictionary = gensim.corpora.Dictionary(boc_texts)
    corpus = [dictionary.doc2bow(boc_text) for boc_text in boc_texts]
    # transform corpus with tf*idf model
    tfidf = gensim.models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    
    return corpus_tfidf, dictionary

In [59]:
def score_keyphrases_by_textrank(text, n_keywords=0.05):
    from itertools import takewhile, tee
    import operator
    import networkx, nltk
    
    # tokenize for all words, and extract *candidate* words
    words = [word.lower()
             for sent in nltk.sent_tokenize(text)
             for word in nltk.word_tokenize(sent)]
    candidates = extract_candidate_words(text)
    # build graph, each node is a unique candidate
    graph = networkx.Graph()
    graph.add_nodes_from(set(candidates))
    # iterate over word-pairs, add unweighted edges into graph
    def pairwise(iterable):
        """s -> (s0,s1), (s1,s2), (s2, s3), ..."""
        a, b = tee(iterable)
        next(b, None)
        return zip(a, b)
    for w1, w2 in pairwise(candidates):
        if w2:
            graph.add_edge(*sorted([w1, w2]))
    # score nodes using default pagerank algorithm, sort by score, keep top n_keywords
    ranks = networkx.pagerank(graph)
    if 0 < n_keywords < 1:
        n_keywords = int(round(len(candidates) * n_keywords))
    word_ranks = {word_rank[0]: word_rank[1]
                  for word_rank in sorted(ranks.items(), key=operator.itemgetter(1), reverse=True)[:n_keywords]}
                  #for word_rank in sorted(ranks.iteritems(), key=lambda x: x[1], reverse=True)[:n_keywords]}
                  
    #sorted(max_value_score.items(), key=operator.itemgetter(1), reverse=True)[:3]
    keywords = set(word_ranks.keys())
    # merge keywords into keyphrases
    keyphrases = {}
    j = 0
    for i, word in enumerate(words):
        if i < j:
            continue
        if word in keywords:
            kp_words = list(takewhile(lambda x: x in keywords, words[i:i+10]))
            avg_pagerank = sum(word_ranks[w] for w in kp_words) / float(len(kp_words))
            keyphrases[' '.join(kp_words)] = avg_pagerank
            # counter as hackish way to ensure merged keyphrases are non-overlapping
            j = i + len(kp_words)
            
    return sorted(keyphrases.items(), key=operator.itemgetter(1), reverse=True)
    #return sorted(keyphrases.iteritems(), key=lambda x: x[1], reverse=True)

In [62]:
text = """Disney will have a competitive advantage over Netflix when the entertainment conglomerate launches a competing video streaming platform later this year, according to Wall Street analyst David Trainer.

″[Disney’s] got the ability to merchandise, which is another way to monetize content in a way that Netflix does not have,” Trainer said on CNBC’s “Closing Bell” Wednesday. He’s chief of the New Constructs research firm.

Netflix increased its subscription prices Tuesday, sending the stock up 6.5 percent that day as well.

However, Trainer called it a “key dilemma” for the company and it “makes their competitors more viable.” The dilemma, he explained, is that the streaming company relies too much on its subscribers to generate revenue.

“It’s a Catch-22 for a business model that, when you look at the fundamentals, really just doesn’t work,” Trainer alleged.
The Netflix price increase for U.S. subscribers ranges between 13 percent and 18 percent, which Victory Anthony of Aegis Capital sees as a positive, a view generally shared by much of the investment community.

“It’s all profit for the price increase and so they can either use that to invest in more original content or they can let that drop down to their down to the bottom line,” Anthony said on “Squawk Alley” Thursday.

Aegis put a hold on Neflix at current levels because it’s about 8 percent higher than Anthony’s price target of $325. The stock was trading steady around $351 midday Thursday, up more than 50 percent since the Christmas Eve washout. Netflix releases its fourth quarter earnings after the bell Thursday. Netflix last reported double-digit user growth, with 58 million U.S. and 78 million international subscribers.

Original content aside, Netflix has built its large subscriber base, in part, on licensed content from a number of third-party TV and movie studios that plan to crowd into the video streaming market, which already includes other established online rivals such as Amazon and Hulu.

Disney, which agreed to purchase Twenty-First Century Fox assets last summer, said it would pull its movies from Netflix when it launches Disney+ in late 2019. AT&T’s WarnerMedia announced in October it would release a platform in the fourth quarter of 2019. Apple could be dropping a service this year, Comcast’s NBCUniversal on Monday revealed plans for a free streaming program with ads slated for early 2020.

New Constructs’ Trainer said Netflix is vulnerable because it’s a “one-trick pony” with an online distribution system that is not “defensible.” The company can keep growing its subscriber base, but it will need to address cash flow, he said.

“You can count on one hand the number of firms that have, over time, successfully monetized original content. It’s an expensive, difficult proposition,” he argued. “Disney’s done it and part of the reason they’ve done [it] is because they’ve got better ways of monetizing.”"""

score_keyphrases_by_textrank(text)

[('’', 0.039220978108779),
 ('netflix', 0.0350876915880868),
 ('“', 0.028118985335236824),
 ('”', 0.02776800702643579),
 ('’ trainer', 0.027651990850738942),
 ('“ disney ’', 0.0268940251718819),
 ('disney ’', 0.026281545090204438),
 ('” trainer', 0.021925505309567335),
 ('” thursday', 0.01998445696552105),
 ('percent', 0.018642227577005545),
 ('trainer', 0.01608300359269888),
 ('content', 0.015398075302705833),
 ('disney', 0.013342112071629868),
 ('subscribers', 0.01272545094646994),
 ('thursday', 0.012200906904606313),
 ('company', 0.012101345634754597)]