In [2]:
def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    """Extract candidate keyphrases from ``text`` as chunks of POS-tagged words.

    The text is split into sentences, tokenized and POS-tagged with NLTK, then
    chunked with an ``nltk`` ``RegexpParser`` built from ``grammar``.  Every
    maximal run of tokens whose chunk tag is not ``'O'`` is joined with single
    spaces and lowercased to form one candidate.

    Note that runs are delimited only by ``'O'``-tagged tokens (the tagged
    sentences are concatenated before grouping), so chunks with no ``'O'``
    token between them are joined into a single candidate.

    Args:
        text: The document to extract candidates from.
        grammar: Chunk grammar passed to ``nltk.chunk.regexp.RegexpParser``.
            The default matches optional adjectives followed by nouns,
            optionally preceded by a similar group ending in a preposition.

    Returns:
        A list of lowercased candidate strings in order of appearance
        (duplicates are kept), excluding candidates that are English stop
        words or that consist entirely of punctuation characters.

    Requires NLTK plus the data its stop-word corpus, tokenizers and POS
    tagger need to be installed.
    """
    import itertools
    import string

    import nltk

    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))
    # join constituent chunk words into a single chunked phrase
    # Each item is a (word, pos, chunk_tag) triple.  The key function indexes
    # the triple instead of unpacking it in the lambda signature, because
    # tuple parameters were removed in Python 3 (PEP 3113).
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda triple: triple[2] != 'O')
                  if key]

    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]

In [None]:
# Sample document for the demo: the opening paragraph of the post's results
# section, written as adjacent string literals that concatenate into exactly
# the same single string.
text = (
    "Okay, now that I’ve scared/bored away all but the truly interested, "
    "let’s dig into some code and results! "
    "As an example document, I’ll use all of the text in this post up to "
    "this results section; as a reference corpus, I’ll use all other posts "
    "on this blog. "
    "In principle, a reference corpus isn’t necessary for single-document "
    "keyphrase extraction (case in point: TextRank), but it’s often helpful "
    "to compare a document’s candidates against other documents’ in order "
    "to characterize its particular content. "
    "Consider that tf*idf reduces to just tf (term frequency) in the case "
    "of a single document, since idf (inverse document frequency) is the "
    "same value for every candidate."
)
# Kept as the cell's final bare expression so the notebook displays the result.
extract_candidate_chunks(text)