# Purpose
### This notebook precomputes all the query expansion terms from the WordNet API

Uses word sense disambiguation to determine an accurate sense for each ambiguous word in a query.

In [65]:
from nltk.corpus import wordnet as wn
from nltk.wsd import lesk
from nltk.stem import PorterStemmer
import json

In [114]:
ps = PorterStemmer()
stop_words = {"is", "a", "about", "above", "all", "along","also", "although", "am", "an", "and", "any", "are", "aren't", "as", "at","be", "because", "been", "but", "by", "can", "cannot", "could", "couldn't","did", "didn't", "do", "does", "doesn't", "e.g.", "either", "etc", "etc.","even", "ever", "enough", "for", "from", "further", "get", "gets", "got", "had", "have","hardly", "has", "hasn't", "having", "he", "hence", "her", "here","hereby", "herein", "hereof", "hereon", "hereto", "herewith", "him","his", "how", "however", "i", "i.e.", "if", "in", "into", "it", "it's", "its","me", "more", "most", "mr", "my", "near", "nor", "now", "no", "not", "or", "on", "of", "onto","other", "our", "out", "over", "really", "said", "same", "she","should", "shouldn't", "since", "so", "some", "such","than", "that", "the", "their", "them", "then", "there", "thereby","therefore", "therefrom", "therein", "thereof", "thereon", "thereto","therewith", "these", "they", "this", "those", "through", "thus", "to","too", "under", "until", "unto", "upon", "us", "very", "was", "wasn't","we", "were", "what", "when", "where", "whereby", "wherein", "whether","which", "while", "who", "whom", "whose", "why", "with", "without","would", "you", "your", "yours", "yes"}
def trim(words, min_word_len=2):
    """Removes stop words and small words from a string"""
    words = set(words)
    words -= stop_words
    words = {x.lower() for x in words if len(x) >= min_word_len}
    return words

# Word Sense Disambiguation
http://www.nltk.org/howto/wsd.html

Citation: https://dl.acm.org/citation.cfm?id=318728

Performs the classic Lesk algorithm for Word Sense Disambiguation (WSD) using a the definitions of the ambiguous word.

Given an ambiguous word and the context in which the word occurs, Lesk returns a Synset with the highest number of overlapping words between the different definitions from each Synsets of each word in the context sentence and different definitions from each Synset of the ambiguous word.

In [117]:
def getLemmaDict(query):
    if type(query) is not set:
        sent = set(query.split())
    sent = trim(sent)
    lDict = {}
    for term in sent:
        syn = lesk(sent, term)
        s_term = ps.stem(term)
        s_lemmas = None
        if syn is not None:
            # Remove multi term lemmas
            lemmas = [l.name() for l in syn.lemmas() if '_' not in l.name()]
            # Stem and remove terms that stem to the same value
            s_lemmas = set([ps.stem(l) for l in lemmas]) - set([s_term])
            if len(s_lemmas) == 0:
                s_lemmas = None
        lDict[s_term] = s_lemmas
    return lDict

In [118]:
getLemmaDict('what chemical kinetic system is applicable to hypersonic aerodynamic problems')

{'aerodynam': {'flow', 'sleek', 'streamlin'},
 'applic': None,
 'chemic': None,
 'hyperson': None,
 'kinet': None,
 'problem': {'troubl'},
 'system': None}

# Compute query substitution terms for each query

In [125]:
# Gather unique query terms
import re
qSubs = {}
for path, fkey in [('../datasets/cran/cran.qry', 'cran'), ('../datasets/adi/ADI.QRY', 'adi'), 
             ('../datasets/med/MED.QRY', 'med'), ('../datasets/time/TIME_clean.QUE', 'time')]:
    qSubs[fkey] = {}
    with open(path, 'r') as qfile: 
        lineNum = 0
        for line in qfile:
            # Finished processing last query. 
            # Compute substitution terms. 
            # Get ID for next query.
            if '.I' in line:
                if lineNum != 0:
                    print(terms, lineNum)
                    qSubs[fkey][ID] = getLemmaDict(terms)
                    break
                terms = set()
                ID = line.split()[1]
            elif '.W' not in line:
                terms |= set(re.split('[^a-zA-Z]+', line))
            lineNum += 1
        break
    break
qSubs

{'', 'aeroelastic', 'models', 'high', 'be', 'obeyed', 'heated', 'what', 'constructing', 'speed', 'must', 'similarity', 'aircraft', 'laws', 'of', 'when'} 4


UnboundLocalError: local variable 'sent' referenced before assignment