This notebook explores identifying multiword expressions using the part-of-speech filtering technique of Justeson and Katz (1995), "[Technical terminology: some linguistic properties and an algorithm for identification in text](https://brenocon.com/JustesonKatz1995.pdf)".

In [None]:
import spacy, re
from collections import Counter

In [None]:
# Load the small English pipeline without the named-entity recognizer and the
# dependency parser; only the part-of-speech tags are used in this notebook.
# `disable` expects one list entry per component name (a single comma-joined
# string such as 'ner,parser' names no component and disables nothing).
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

In [None]:
def getTokens(filename, top=1000):
    """ Read the first *top* lines of an input file and run each one through `nlp`.

    filename: path of a text file with one document per line (read as UTF-8).
    top: maximum number of lines to process.

    Returns a list with one `nlp(line)` result per processed line, in file order.
    """
    docs=[]
    with open(filename, encoding="utf-8") as file:
        for idx,line in enumerate(file):
            # Check the limit *before* processing so that exactly `top` lines
            # are read (idx is 0-based).
            if idx >= top:
                break
            docs.append(nlp(line))
    return docs

In [None]:
# Run the pipeline over the start of the Wikipedia sample (getTokens' default
# `top` applies); `docs` holds one `nlp(line)` result per line read.
docs=getTokens("../data/wiki.10K.txt")

Let's simplify the POS tags to make the regex easier to understand.

In [None]:
def convert_tokens_to_simple_pos(tokens):
    """ Collapse fine-grained POS tags into the four symbols used by the MWE regex.

    tokens: iterable of objects exposing the string tag as `.tag_`.

    Returns a single space-separated string with one symbol per token:
    "ADJ" (JJ, JJR, JJS), "NOUN" (NN, NNS, NNP, NNPS), "PREP" (IN) or "O"
    (everything else).
    """
    adjectives={"JJ", "JJR", "JJS"}
    nouns={"NN", "NNS", "NNP", "NNPS"}

    tags=[]
    for x in tokens:
        if x.tag_ in adjectives:
            tags.append("ADJ")
        elif x.tag_ in nouns:
            tags.append("NOUN")
        # Compare the *string* tag (`tag_`); `tag` is the integer id and never
        # equals "IN", which would make the PREP branch unreachable.
        elif x.tag_ == "IN":
            tags.append("PREP")
        else:
            tags.append("O")

    return ' '.join(tags)

In [None]:
def getChar2TokenMap(tags):
    """ Map the starting character offset of every tag to its token index.

    We search over the POS-tag sequence as a string, so to recover the word
    tokens behind a regex match we need to translate character offsets in that
    string back to token positions.

    tags: space-separated tag string (one tag per token).

    Returns a dict {char offset of a tag's first character: token index}.
    Every tag is included, the last one too; an empty string yields {}.
    """
    char2token={}
    if not tags:
        return char2token

    offset=0
    for idx, tag in enumerate(tags.split(" ")):
        char2token[offset]=idx
        # Skip over this tag and the single space that follows it.
        offset+=len(tag)+1

    return char2token

def getToken(tokenId, char2token):
    """ Find the token index for a given character offset in the POS sequence.

    tokenId: character offset into the tag string (despite the name, this is
        a character position, not a token index).
    char2token: mapping from tag start offsets to token indices.

    Walks left from the offset until it reaches the start of a tag and returns
    that tag's token index; returns None if no tag start is found.
    """
    # `>= 0` so that offset 0 (the start of the first tag) is checked as well.
    while tokenId >= 0:
        if tokenId in char2token:
            return char2token[tokenId]
        tokenId-=1
    return None

Now let's find all sequences of POS tags that match the Justeson and Katz pattern of `(((ADJ|NOUN) )+|((ADJ|NOUN) )*(NOUN PREP )((ADJ|NOUN) )*)NOUN`

"In words, a candidate term is a multi-word noun phrase; and it either is a string of nouns and/or adjectives, ending in a noun, or it consists of two such strings, separated by a single preposition." (JK 17)

In [None]:
def get_mwes_from_docs(docs, top_mwe=1000):
    """ Count POS-pattern matches over `docs` and return the most frequent phrases.

    Each document is converted to a simplified tag string, the Justeson & Katz
    pattern is searched in it, and every match is mapped back to the underlying
    word tokens. The 10 most frequent phrases are printed with their counts.

    Returns the *top_mwe* most frequent matched phrases, most frequent first.
    """
    jk_pattern = re.compile("(((ADJ|NOUN) )+|((ADJ|NOUN) )*(NOUN PREP )((ADJ|NOUN) )*)NOUN")
    phrase_counts = Counter()

    for doc in docs:
        pos_string = convert_tokens_to_simple_pos(doc)
        offsets = getChar2TokenMap(pos_string)
        surface = [tok.text for tok in doc]

        for match in jk_pattern.finditer(pos_string):
            first = getToken(match.start(), offsets)
            last = getToken(match.end(), offsets)
            phrase = ' '.join(surface[first:last + 1])
            phrase_counts[phrase] += 1

    for phrase, count in phrase_counts.most_common(10):
        print(phrase, count)

    # The MWE dictionary is the *top_mwe* most frequent sequences matched.
    return [phrase for phrase, _ in phrase_counts.most_common(top_mwe)]

In [None]:
# Build the MWE list from the tagged documents; the call also prints the
# 10 most frequent matches with their counts.
my_mwe=get_mwes_from_docs(docs)

Now let's transform each MWE into a single token (e.g., replace `New York City` with `New_York_City`)

In [None]:
def replaceMWE(text, mweList):
    """ Replace all instances of MWEs in text with a single token.

    MWEs are ranked from longest to shortest so that longest replacements are
    made first (e.g., "New York City" is matched first before "New York").

    text: input string with tokens separated by single spaces.
    mweList: iterable of multiword expressions (space-separated strings).

    Returns the list of tokens obtained by splitting the rewritten text on " ";
    every replaced MWE appears as one token with "_" in place of its spaces.
    """
    sorted_by_length = sorted(mweList, key=len, reverse=True)
    for mwe in sorted_by_length:
        # Cheap substring check: most MWEs do not occur in a given text, so
        # skip the regex machinery for them (also ignores empty entries).
        if not mwe or mwe not in text:
            continue

        joined = mwe.replace(" ", "_")
        # Only match when the MWE is not glued to a neighbouring word character,
        # so "art history" is not replaced inside "smart history".
        pattern = r"(?<!\w)" + re.escape(mwe) + r"(?!\w)"
        # A callable replacement inserts `joined` verbatim; a plain string would
        # have its backslashes interpreted as regex template escapes.
        text = re.sub(pattern, lambda _match: joined, text)
    return text.split(" ")

In [None]:
# Quick check on a hand-written sentence; which phrases get joined depends on
# what ended up in `my_mwe`.
processedText=replaceMWE("The New York Times is located in New York City", my_mwe)
print(processedText)

In [None]:
def replace_mwe_docs(docs, my_mwe):
    """ Apply `replaceMWE` to every document.

    Each document's token texts are joined with single spaces and passed to
    `replaceMWE` together with `my_mwe`.

    Returns a list with one `replaceMWE` result per input document.
    """
    return [
        replaceMWE(' '.join(token.text for token in doc), my_mwe)
        for doc in docs
    ]

Q1. When we used topic modeling in `4.topics/TopicModel.ipynb`, we represented a document as a bag of words.  MWEs allow us to add a little more structure (phrases) but still preserve that same basic assumption (a document is a bag of phrases and words).  Combine both of these methods to create a topic model that reasons over words and phrases.  The only thing you should have to change is how the input text is tokenized (i.e., using the `replaceMWE` function above).  Run that topic model on `data/plot_summaries.txt` (the same data used in the original `4.topics/TopicModel.ipynb` notebook). How would you characterize the difference between the topics that each model learns?

(You'll find the code from `4.topics/TopicModel.ipynb` below to get you started.)

In [None]:
import nltk
import re
import gensim
from gensim import corpora
import operator

# Download NLTK's 'stopwords' resource, which backs the `stopwords` import below.
nltk.download('stopwords')
from nltk.corpus import stopwords

import numpy as np
import random

# Seed Python's built-in `random` module (NumPy's generator is not seeded here).
random.seed(1)

In [None]:
def read_stopwords(filename):
    """ Load a stopword file with one entry per line.

    Returns a dict mapping each line (trailing whitespace removed) to 1.
    """
    with open(filename) as handle:
        return {entry.rstrip(): 1 for entry in handle}

In [None]:
# Merge NLTK's English stopwords, the Jockers list and the clitic "'s" into one
# de-duplicated list (dict keys keep first-seen order).
stop_words = list({
    **dict.fromkeys(stopwords.words('english'), 1),
    **read_stopwords("../data/jockers.stopwords"),
    "'s": 1,
})

In [None]:
def filter(word, stopwords):
    """ Decide whether a word is kept in a text.

    A word is kept (True) when its lowercased form is not in `stopwords` and it
    contains at least one ASCII letter; otherwise it is excluded (False).
    """
    lowered = word.lower()
    is_stopword = lowered in stopwords
    has_letter = re.search("[A-Za-z]", lowered) is not None
    return (not is_stopword) and has_letter

In [None]:
def read_docs(plotFile, metadataFile, stopwords, n=5000):
    """ Read and tokenize the plot summaries of the *n* highest-grossing movies.

    plotFile: tab-separated file; column 0 is the movie id, column 1 the text.
    metadataFile: tab-separated file; column 0 is the movie id, column 2 the
        movie name and column 4 the box-office value (parsed with int()).
        Rows with an empty box-office value are ignored.
    stopwords: container of words to drop (passed on to `filter`).
    n: number of movies to keep, ranked by box-office value (default 5000).

    Returns (docs, names): `docs` is a list of token lists (lowercased, passed
    through `filter`), `names` the corresponding movie names, both in the order
    the movies appear in `plotFile`.
    """
    # `filter` runs one membership test per token; a set makes each test O(1)
    # instead of scanning a (potentially long) list every time.
    stopword_set=frozenset(stopwords)

    names_by_id={}
    box={}

    with open(metadataFile, encoding="utf-8") as file:
        for line in file:
            cols=line.rstrip().split("\t")
            # Skip rows too short to carry a box-office column.
            if len(cols) < 5:
                continue
            idd=cols[0]
            boxoffice=cols[4]
            if len(boxoffice) != 0:
                box[idd]=int(boxoffice)
                names_by_id[idd]=cols[2]

    # Keep the n movies with the largest box-office values.
    sorted_box = sorted(box.items(), key=operator.itemgetter(1), reverse=True)
    target_movies={idd: names_by_id[idd] for idd, _ in sorted_box[:n]}

    docs=[]
    doc_names=[]

    with open(plotFile, encoding="utf-8") as file:
        for line in file:
            cols=line.rstrip().split("\t")
            # Skip rows without a text column.
            if len(cols) < 2:
                continue
            idd=cols[0]

            if idd in target_movies:
                tokens=nltk.word_tokenize(cols[1].lower())
                docs.append([x for x in tokens if filter(x, stopword_set)])
                doc_names.append(target_movies[idd])
    return docs, doc_names

In [None]:
# Input locations (relative to this notebook) and the tokenized plot summaries:
# `data` is a list of token lists, `doc_names` the matching movie names.
metadataFile="../data/movie.metadata.tsv"
plotFile="../data/plot_summaries.txt"
data, doc_names=read_docs(plotFile, metadataFile, stop_words)

In [None]:
# Create the vocabulary from the data, then restrict it to at most the top 10K terms
# that show up in at least 5 documents and in no more than 50% of all documents.

dictionary = corpora.Dictionary(data)
dictionary.filter_extremes(no_below=5, no_above=.5, keep_n=10000)

In [None]:
# Replace each document with a bag of (numeric word id, count) pairs for the words
# in the vocab (all other words are excluded)
corpus = [dictionary.doc2bow(text) for text in data]

In [None]:
# Number of topics; used both to train the LDA model and to print its topics below.
num_topics=20

In [None]:
# Train LDA on the bag-of-words corpus. `random_state` pins gensim's own random
# number generator so that re-running this cell yields the same topics; seeding
# Python's `random` module alone does not control it.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=num_topics, 
                                           passes=10,
                                           alpha='auto',
                                           random_state=1)

In [None]:
# Print the 10 highest-weighted terms of every topic, one topic per line.
for topic_id in range(num_topics):
    top_terms = [term for term, _ in lda_model.show_topic(topic_id, topn=10)]
    print("topic %s:\t%s" % (topic_id, ' '.join(top_terms)))