This notebook explores part-of-speech tagging through its impact on keyphrase extraction. Keyphrase extraction is the task of selecting a small number of terms (or phrases) from a document that best represent its content. Here we'll use a tf-idf metric to rank the terms in a document, and use POS information to filter those terms.

In [None]:
import spacy, glob, os, operator, math, random
from collections import Counter

In [None]:
# The cells below only read token.text and token.tag_, so the named-entity
# recognizer and the dependency parser are not needed. Each component name has
# to be its own list entry: a single 'ner,parser' string matches no component.
_UNUSED_PIPES = ['ner', 'parser']
nlp = spacy.load('en', disable=_UNUSED_PIPES)

# Fallback: remove any of those components that is still active in the pipeline
for _pipe_name in _UNUSED_PIPES:
    if _pipe_name in nlp.pipe_names:
        nlp.remove_pipe(_pipe_name)

Here's how you get each word and its POS tag from spaCy.

In [None]:
def get_spacy_tags(text):
    """ Print each token of the input text next to its spacy tag, one token per line """
    for token in nlp(text):
        print(token.text, token.tag_)

# Example: print the tag assigned to each word of a short sentence
get_spacy_tags("Time flies like an arrow")

In [None]:
def read_docs(inputDir):
    """ Read in movie documents (all ending in .txt) from an input folder

    Input: path of the folder to scan (only files directly inside it are matched)
    Returns: a list of (filename, nlp(file contents)) pairs, ordered by filename;
             each filename is the folder path joined with the file's name

    The files are assumed to be UTF-8 encoded.
    """

    docs=[]
    # glob returns paths in arbitrary order, so sort them for a reproducible result
    for filename in sorted(glob.glob(os.path.join(inputDir, '*.txt'))):
        # decode explicitly as UTF-8 instead of relying on the platform default
        with open(filename, encoding="utf-8") as file:
            docs.append((filename, nlp(file.read())))
    return docs

In [None]:
# location of the 2000 movie summaries from Wikipedia
inputDir = "../data/movie_summaries/"

# read and tag every summary in that directory
original_docs = read_docs(inputDir)

Q1. We covered tf-idf in lecture 9 ("lexical semantics") and in the `7.embeddings/TFIDF.ipynb` notebook. Write a method for extracting the 10 terms with highest tf-idf score for each document in a collection.

In [None]:
def random_words(docs):
    """ Pick up to 10 distinct token texts at random from each document.

    Input: a list of (filename, [spacy tokens]) documents
    Returns: a dict mapping "filename" -> [list of at most 10 distinct terms, in random order]

    Only meant to illustrate the output format expected from the functions below """

    sampled={}

    for name, tokens in docs:
        # de-duplicate first so that no term can be picked twice
        vocabulary=list({token.text for token in tokens})
        random.shuffle(vocabulary)
        sampled[name]=vocabulary[:10]

    return sampled

In [None]:
# show the randomly picked terms for three sample movies
terms = random_words(original_docs)
for filename in ("Jaws.txt", "Harry_Potter_and_the_Philosophers_Stone.txt", "Back_to_the_Future.txt"):
    doc_key = os.path.join(inputDir, filename)
    print("\n%s\n" % filename)
    print('\n'.join(terms[doc_key]))

In [None]:
def tf_idf_ranking(docs, top_k=10):
    """
    Function to rank terms in document by tf-idf score, and return the top terms

    tf is the number of times a term occurs in the document; idf is log(N / df), where
    N is the number of documents and df is the number of documents containing the term.
    Terms are the exact token texts, so differently-cased spellings are distinct terms.

    Input: a list of (filename, [spacy tokens]) documents
           top_k: maximum number of terms returned per document (default 10)
    Returns: a dict mapping "filename" -> [list of up to top_k keyphrases, ranked from highest tf-idf score to lowest]
             Terms with equal scores keep the order of their first occurrence in the document.
    Raises: ValueError if top_k is negative

    """

    if top_k < 0:
        raise ValueError("top_k must be non-negative")

    def get_tf(tokens):
        """ Count how often each token text occurs in one document """
        return Counter(token.text for token in tokens)

    def get_idfs(docs):
        """ Map every term to log(number of docs / number of docs containing the term) """
        doc_freqs=Counter()
        for _, doc in docs:
            # a set, so that each term is counted once per document
            doc_freqs.update({token.text for token in doc})

        n_docs=float(len(docs))
        return {term: math.log(n_docs/df) for term, df in doc_freqs.items()}

    idfs=get_idfs(docs)

    keyphrases={}

    for filename, doc in docs:
        tf=get_tf(doc)
        candidates={term: count*idfs[term] for term, count in tf.items()}

        # sorted() is stable, so ties stay in first-occurrence order
        ranked=sorted(candidates.items(), key=operator.itemgetter(1), reverse=True)

        keyphrases[filename]=[term for term, _ in ranked[:top_k]]

    return keyphrases
            

In [None]:
# show the highest-scoring tf-idf terms for three sample movies
terms = tf_idf_ranking(original_docs)
for filename in ("Jaws.txt", "Harry_Potter_and_the_Philosophers_Stone.txt", "Back_to_the_Future.txt"):
    doc_key = os.path.join(inputDir, filename)
    print("\n%s\n" % filename)
    print('\n'.join(terms[doc_key]))

Q2.  Write a method for extracting the 10 terms with highest tf-idf score for each document in a collection that *excludes all proper names*.

In [None]:
def keyphrase_no_proper_nouns(docs):
    """
    Rank the terms of each document by tf-idf score once proper nouns are dropped, and return the top 10 terms.
    Constraint: None of the top 10 terms should be proper nouns.

    Tokens tagged NNP or NNPS are discarded before scoring, so both tf and idf
    are computed over the remaining tokens only.

    Input: a list of (filename, [spacy tokens]) documents
    Returns: a dict mapping "filename" -> [list of 10 keyphrases, ranked from highest tf-idf score to lowest]

    """

    proper_noun_tags=("NNP", "NNPS")

    def drop_proper_nouns(tagged_docs):
        """ Copy the documents, leaving out every token that carries a proper-noun tag """
        return [
            (name, [tok for tok in tokens if tok.tag_ not in proper_noun_tags])
            for name, tokens in tagged_docs
        ]

    return tf_idf_ranking(drop_proper_nouns(docs))

In [None]:
# show the top terms for three sample movies once proper nouns are excluded
terms = keyphrase_no_proper_nouns(original_docs)
for filename in ("Jaws.txt", "Harry_Potter_and_the_Philosophers_Stone.txt", "Back_to_the_Future.txt"):
    doc_key = os.path.join(inputDir, filename)
    print("\n%s\n" % filename)
    print('\n'.join(terms[doc_key]))

Q3.  Write a method for extracting the 10 terms with highest tf-idf score for each document in a collection that *includes only common nouns*.

In [None]:
def keyphrase_only_common_nouns(docs):
    """
    Rank the common nouns of each document by tf-idf score, and return the top 10 terms.
    Constraint: All of the top 10 terms should be common nouns.

    Only tokens tagged NN or NNS are kept before scoring, so both tf and idf
    are computed over common nouns only.

    Input: a list of (filename, [spacy tokens]) documents
    Returns: a dict mapping "filename" -> [list of 10 keyphrases, ranked from highest tf-idf score to lowest]

    """

    common_noun_tags=("NN", "NNS")

    def keep_common_nouns(tagged_docs):
        """ Copy the documents, keeping only the tokens that carry a common-noun tag """
        return [
            (name, [tok for tok in tokens if tok.tag_ in common_noun_tags])
            for name, tokens in tagged_docs
        ]

    return tf_idf_ranking(keep_common_nouns(docs))

In [None]:
# show the top common-noun terms for three sample movies
terms = keyphrase_only_common_nouns(original_docs)
for filename in ("Jaws.txt", "Harry_Potter_and_the_Philosophers_Stone.txt", "Back_to_the_Future.txt"):
    doc_key = os.path.join(inputDir, filename)
    print("\n%s\n" % filename)
    print('\n'.join(terms[doc_key]))