# Keyphrases Identification

## 0. Preparation

In this section, the corpus is created from either a text file or a URL, then split into sentences and tokenized. Other helper functions build a tagger trained on the Brown corpus and clean the output, limiting it to a given number of characters.

In [141]:
import nltk
from urllib.request import urlopen
import urllib.parse
import string
from nltk.util import ngrams
import re
from nltk.corpus import brown
from nltk.stem.wordnet import WordNetLemmatizer


def create_corpus_from_file(file):
    '''Read the text file at ``file`` and return its whole content.

    The file is opened in text mode with the platform default encoding.
    This is a best-effort loader: if the file cannot be opened or decoded,
    a message is printed and an empty string is returned instead of raising.

    Only I/O and decoding failures (``OSError``, ``ValueError`` -- the latter
    includes ``UnicodeDecodeError``) are handled; ``KeyboardInterrupt`` and
    ``SystemExit`` are deliberately allowed to propagate.
    '''
    try:
        with open(file, 'r') as fp:
            return fp.read()
    except (OSError, ValueError):
        print("Can't open specified file: {0}".format(file))
        return ""

def create_corpus_from_url(url):
    '''Download ``url`` and return its body decoded as UTF-8.

    This is a best-effort loader: if the URL is malformed, cannot be fetched,
    or its body is not valid UTF-8, a message is printed and an empty string
    is returned instead of raising.

    The response is managed by a ``with`` block so it is closed exactly when
    it was opened; closing an unopened response in a ``finally`` clause would
    itself raise and mask the fallback.
    '''
    # Local import: the file-level imports do not bind the ``http`` package.
    from http.client import HTTPException

    try:
        with urllib.request.urlopen(url) as res:
            return res.read().decode('utf-8')
    except (OSError, ValueError, HTTPException):
        # OSError covers URLError/HTTPError and socket errors; ValueError
        # covers unknown URL types and UnicodeDecodeError.
        print("Can't open url: {0}".format(url))
        return ""


def tokenize_text(corpus):
    '''Turn raw text into a list of sentences, each a list of word tokens.

    Byte-order marks ("\\ufeff") are stripped from the text before it is
    split into sentences with the English Punkt model; every sentence is
    then tokenized with ``nltk.word_tokenize``.
    '''
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')
    without_bom = corpus.replace("\ufeff", "")
    tokenized_sents = []
    for sentence in punkt.tokenize(without_bom):
        tokenized_sents.append(nltk.word_tokenize(sentence))
    return tokenized_sents

def build_backoff_tagger(train_sents):
    '''Build a bigram tagger that backs off to a unigram tagger, then to 'NN'.

    ``train_sents`` is passed as training data to both the unigram and the
    bigram tagger. The returned object is the bigram tagger at the head of
    the backoff chain.
    '''
    # Start from the last-resort tagger and wrap it with each n-gram tagger.
    chain = nltk.DefaultTagger('NN')
    for tagger_cls in (nltk.UnigramTagger, nltk.BigramTagger):
        chain = tagger_cls(train_sents, backoff=chain)
    return chain

def train_tagger(already_tagged_sents):
    '''Return a backoff tagger trained on ``already_tagged_sents``.

    Thin wrapper around ``build_backoff_tagger``; no accuracy evaluation is
    performed here.
    '''
    return build_backoff_tagger(already_tagged_sents)

# Train the module-level ``tagger`` used by the cells below on the tagged
# sentences of the listed Brown corpus categories. This runs at cell/import
# time, so every later use of ``tagger`` relies on this statement.
brown_tagged_sents = brown.tagged_sents(categories=['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies',
    'humor', 'learned', 'lore', 'mystery', 'religion', 'reviews', 'romance', 'science_fiction'])

tagger = train_tagger(brown_tagged_sents)


def clean_output(candidates, limit=0):
    '''Turn (keyphrase, count) pairs into clean phrases within a character budget.

    Each keyphrase string is split on whitespace and only every second token
    (positions 0, 2, 4, ...) is kept, with the characters ``(``, ``'`` and
    ``,`` removed; the kept tokens are re-joined with single spaces. This
    recovers the words from strings such as "('young', 'JJ') ('lady', 'NN')".

    Phrases are collected in order while the running total of their lengths
    stays strictly below ``limit``; the first phrase that reaches the limit
    stops the scan and is not included. With the default ``limit=0`` the
    result is therefore always an empty list.
    '''
    strip_table = str.maketrans("", "", "(',")
    collected = []
    used_chars = 0
    for keyphrase, _count in candidates:
        words = [token.translate(strip_table) for token in keyphrase.split()[::2]]
        phrase = " ".join(words)
        used_chars += len(phrase)
        if not used_chars < limit:
            break
        collected.append(phrase)
    return collected


# Corpus sources: a Project Gutenberg plain-text URL and a local text file.
url = "http://www.gutenberg.org/cache/epub/1342/pg1342.txt"
file = "pride_and_prejudice.txt"
# The local file is the active source; ``corpus`` is "" if it cannot be read.
corpus = create_corpus_from_file(file)
# ``sents`` is a list of sentences, each a list of tokens; later cells use it.
sents = tokenize_text(corpus)
# Alternative source (disabled): load the corpus from ``url`` instead.
# corpus = create_corpus_from_url(url)

## 1. Showing Frequent Terms 

In section 1, I tried to extract the terms that appear most often in the text file using unigram segmentation. The unigram candidates are pruned by removing punctuation, stop words (with some extra modal words), proper nouns, and words shorter than 2 characters, and are then normalized by lemmatization.

In [142]:
def build_stop_words(file="SmartStoplist.txt"):
    '''Load a stop-word list, one word per line, from ``file``.

    The default file is the SMART stop list (Salton, 1971), available at
    ftp://ftp.cs.cornell.edu/pub/smart/english.stop

    Returns the lines of the file in order with their trailing newline
    removed. This is a best-effort loader: if the file cannot be opened or
    decoded, a message is printed and an empty list is returned. Only
    ``OSError`` and ``ValueError`` (which includes ``UnicodeDecodeError``)
    are handled, so ``KeyboardInterrupt``/``SystemExit`` still propagate.
    '''
    stop_words = []
    try:
        with open(file, 'r') as fp:
            stop_words = [line.rstrip('\n') for line in fp]
    except (OSError, ValueError):
        print("Can't open specified file: {0}".format(file))

    return stop_words

def extract_unigram(sents):
    '''Extract unigram candidates from tokenized sentences and prune them.

    ``sents`` is an iterable of sentences, each an iterable of string tokens.
    A token is dropped when any of the following holds:

    * it starts with a digit, an apostrophe, a backtick or an uppercase
      ASCII letter (the latter is used to discard capitalized words);
    * it is a substring of ``string.punctuation``;
    * it is in the stop-word list returned by ``build_stop_words()``;
    * it is shorter than 2 characters.

    Every surviving token is lemmatized as a verb with WordNet. The result
    is a flat list of lemmas in corpus order, duplicates included.
    '''
    lemmatizer = WordNetLemmatizer()
    # A set gives O(1) membership tests; the stop list is checked per token.
    stopwords = set(build_stop_words())
    # ``match`` anchors at the start of the token, so this rejects tokens
    # that *begin* with one of the alternatives.
    reject = re.compile(r"\d+|\'+|\`+|^[A-Z]\w*")

    unigram_candidates = []
    for sent in sents:
        for token in sent:
            if (reject.match(token)
                    or token in string.punctuation
                    or token in stopwords
                    or len(token) < 2):
                continue
            unigram_candidates.append(lemmatizer.lemmatize(token, 'v'))
    return unigram_candidates


# Pruned, lemmatized unigrams from the whole corpus, tagged as one sequence.
raw_candidates = extract_unigram(sents)
tagged_candidates = tagger.tag(raw_candidates)

# Keep only the words whose tag starts with 'NN' (nouns).
candidates = [word for (word, pos) in tagged_candidates if re.match(r'NN.*', pos)]

# Print the most frequent nouns within a 700-character budget.
candidates_1 = nltk.FreqDist(candidates).most_common(700)
print(clean_output(candidates_1, 700))

['time', 'sister', 'family', 'man', 'father', 'day', 'letter', 'mother', 'reply', 'room', 'friend', 'return', 'house', 'manner', 'work', 'love', 'answer', 'pleasure', 'cry', 'subject', 'part', 'aunt', 'daughter', 'dear', 'morning', 'place', 'walk', 'word', 'sisters', 'talk', 'ladies', 'brother', 'happiness', 'surprise', 'party', 'opinion', 'attention', 'eye', 'character', 'moment', 'world', 'uncle', 'marriage', 'town', 'smile', 'conversation', 'kind', 'mind', 'woman', 'affection', 'side', 'point', 'dance', 'life', 'reason', 'turn', 'object', 'behaviour', 'friends', 'person', 'husband', 'lady', 'honour', 'call', 'acquaintance', 'look', 'delight', 'people', 'occasion', 'daughters', 'hope', 'cousin', 'idea', 'pass', 'doubt', 'spirit', 'power', 'business', 'interest', 'wife', 'pride', 'manners', 'heart', 'visit', 'carriage', 'account', 'address', 'girls', 'concern', 'promise', 'fear', 'endeavour', 'question', 'ladyship', 'term', 'thing', 'advantage', 'country', 'civility', 'respect', 'ball

## 2. Collocations with bigram

In [143]:
from nltk.collocations import *
# corpus_words = nltk.word_tokenize(corpus)

# Association measures; PMI is used below to rank the bigrams of a sentence.
bigram_measures = nltk.collocations.BigramAssocMeasures()

# Lemmatize and lower-case every token (WordNet lemmatization, not stemming).
lm = WordNetLemmatizer()
sents_stemmed = []
for sent in sents:
    sents_stemmed.append([lm.lemmatize(word).lower() for word in sent])

# Collect, sentence by sentence, up to 700 top-PMI bigrams of (word, tag)
# pairs produced by the Brown-trained tagger.
candidates_tmp = []
for sent in sents_stemmed:
    finder = BigramCollocationFinder.from_words(tagger.tag(sent))
    tmp = finder.nbest(bigram_measures.pmi, 700)
    candidates_tmp += tmp

# Rank the collected bigrams by how often they were collected.
candidates_2 = nltk.FreqDist(candidates_tmp).most_common(700)

stopwords = build_stop_words()
stopword_set = set(stopwords)  # O(1) membership tests inside the loop
candidates_ = []
limit = 700
# Running character total. Deliberately NOT named ``sum``: a module-level
# ``sum`` would shadow the builtin for every later cell.
total_chars = 0
for candidate in candidates_2:
    # candidate is (((word1, tag1), (word2, tag2)), count)
    (first_word, _first_tag), (second_word, second_tag) = candidate[0]
    # Keep only bigrams whose second word is tagged as a noun ('NN...').
    if not re.match(r'NN.*', second_tag):
        continue
    # Drop bigrams that start with punctuation or contain a stop word.
    if first_word in string.punctuation:
        continue
    if first_word in stopword_set or second_word in stopword_set:
        continue
    kp = first_word + " " + second_word
    total_chars += len(kp)
    if total_chars < limit:
        candidates_.append(kp)
    else:
        # Character budget reached: stop collecting.
        break
print(candidates_)


['mr. darcy', 'mrs. bennet', 'mr. collins', 'lady catherine', 'mr. bingley', 'mr. bennet', 'miss bingley', 'mr. wickham', 'miss bennet', 'elizabeth wa', 'mrs. gardiner', 'sir william', 'young lady', 'project gutenberg-tm', 'de bourgh', 'miss darcy', 'young man', 'bennet wa', 'mr. gardiner', 'colonel fitzwilliam', 'mrs. collins', 'bingley wa', 'colonel forster', 'cried elizabeth', 'electronic work', 'miss lucas', 'great deal', 'mrs. hurst']


## 3. Information obtained from Syntax aka Partial Parsing (Chunking)

In [144]:
# Lemmatize and lower-case the tokens first. Without this normalization the
# top keyphrases contain the same phrase in several inflected forms.
lm = WordNetLemmatizer()
sents_stemmed = [[lm.lemmatize(word).lower() for word in sent] for sent in sents]

# POS-tag every normalized sentence with the Brown-trained tagger.
txt_tagged = [tagger.tag(sent) for sent in sents_stemmed]

# Chunk grammar for descriptive keyphrases: an optional
# "adjectives* nouns+ preposition" prefix followed by adjectives+ nouns+.
dkp = nltk.RegexpParser('DK: {(<JJ>* <NN.*>+ <IN>)? <JJ>+ <NN.*>+}')

# Each DK chunk is stored as the space-joined str() of its (word, tag)
# leaves; clean_output() later recovers the words from that string form.
descriptive_keyphrases = []
for tagged_sent in txt_tagged:
    for subtree in dkp.parse(tagged_sent).subtrees():
        if subtree.label() != 'DK':
            continue
        descriptive_keyphrases.append(" ".join(map(str, subtree)))

# Print the most frequent chunks within a 700-character budget.
candidates = nltk.FreqDist(descriptive_keyphrases).most_common(200)
print(clean_output(candidates, 700))

['young lady', 'young man', 'electronic work', 'great deal', 'dear lizzy', 'whole party', 'sure i', 'literary archive foundation', 'dear sir', 'good humour', 'great pleasure', 'good opinion', 'dear aunt', 'young men', 'good deal', 'young woman', 'dear jane', 'fair cousin', 'own family', 'own feeling', 'dear mr. bennet', 'whole family', 'late mr. darcy', 'own room', 'dear wickham', 'dear lydia', 'dear sister', 'full project gutenberg-tm license', 'real character', 'such thing', 'own child', 'short pause', 'large party', 'dear madam', 'public domain', 'intimate friend', 'perfect indifference', 'good heaven', 'dear father', 'own way', 'humble abode', 'present i', 'good news', 'good spirit', 'such term', 'dear charlotte', 'slight bow', 'such circumstance', 'good sense', 'sweet girl', 'short time', 'dear miss elizabeth', 'different manner', 'dear eliza', 'young person', 'poor mother', 'human nature', 'short silence']


## 4. Reflections

The simple unigram approach does not work well, since a descriptive keyphrase may not be captured by a single noun.

The top output of the collocations approach consists mostly of titles and names (e.g. "mr. darcy"), which can't represent keyphrases.

Chunking seems to work better, but some keyphrases duplicate each other's meaning: for example, "young lady" and "young woman" are both extracted, yet to some extent they mean the same thing.

Of the three approaches I tried, I think chunking works best, but it still needs extra work on thesaurus analysis to merge near-synonymous phrases.