# Purpose
### This notebook precomputes all the query expansion terms from the WordNet API

Uses word sense disambiguation to determine an accurate sense for each ambiguous word in a query.

Found on average 2 query expansion terms per query

In [1]:
from nltk.corpus import wordnet as wn
from nltk.wsd import lesk
from nltk.stem import PorterStemmer
import json, re

In [2]:
ps = PorterStemmer()
stop_words = {"is", "a", "about", "above", "all", "along","also", "although", "am", "an", "and", "any", "are", "aren't", "as", "at","be", "because", "been", "but", "by", "can", "cannot", "could", "couldn't","did", "didn't", "do", "does", "doesn't", "e.g.", "either", "etc", "etc.","even", "ever", "enough", "for", "from", "further", "get", "gets", "got", "had", "have","hardly", "has", "hasn't", "having", "he", "hence", "her", "here","hereby", "herein", "hereof", "hereon", "hereto", "herewith", "him","his", "how", "however", "i", "i.e.", "if", "in", "into", "it", "it's", "its","me", "more", "most", "mr", "my", "near", "nor", "now", "no", "not", "or", "on", "of", "onto","other", "our", "out", "over", "really", "said", "same", "she","should", "shouldn't", "since", "so", "some", "such","than", "that", "the", "their", "them", "then", "there", "thereby","therefore", "therefrom", "therein", "thereof", "thereon", "thereto","therewith", "these", "they", "this", "those", "through", "thus", "to","too", "under", "until", "unto", "upon", "us", "very", "was", "wasn't","we", "were", "what", "when", "where", "whereby", "wherein", "whether","which", "while", "who", "whom", "whose", "why", "with", "without","would", "you", "your", "yours", "yes"}
def trim(words, min_word_len=2):
    """Removes stop words and small words from a string"""
    words = set(words)
    words -= stop_words
    words = {x.lower() for x in words if len(x) >= min_word_len}
    return words

# Word Sense Disambiguation
http://www.nltk.org/howto/wsd.html

Citation: https://dl.acm.org/citation.cfm?id=318728

Performs the classic Lesk algorithm for Word Sense Disambiguation (WSD) using a the definitions of the ambiguous word.

Given an ambiguous word and the context in which the word occurs, Lesk returns a Synset with the highest number of overlapping words between the different definitions from each Synsets of each word in the context sentence and different definitions from each Synset of the ambiguous word.

In [3]:
def getLemmaDict(query):
    if type(query) is not set:
        query = set(query.split())
    sent = trim(query, min_word_len=3)
    lDict = {}
    for term in sent:
        syn = lesk(sent, term)
        s_term = ps.stem(term)
        s_lemmas = None
        if syn is not None:
            # Remove multi term lemmas
            lemmas = [l.name() for l in syn.lemmas() if '_' not in l.name() and '-' not in l.name()]
            # Stem and remove terms that stem to the same value
            s_lemmas = set([ps.stem(l) for l in lemmas]) - set([s_term])
            if len(s_lemmas) == 0:
                s_lemmas = None
        if s_lemmas is None:
            lDict[s_term] = None
        else:
            lDict[s_term] = [l for l in s_lemmas]
    return lDict

In [4]:
getLemmaDict('what chemical kinetic system is applicable to hypersonic aerodynamic problems')

{'aerodynam': ['streamlin', 'sleek', 'flow'],
 'applic': None,
 'chemic': None,
 'hyperson': None,
 'kinet': None,
 'problem': ['troubl'],
 'system': None}

# Compute query substitution terms for each query

In [9]:
# Gather unique query terms
import re
qSubs = {}
for path, fkey in [('../datasets/cran/cran.qry', 'cran'), ('../datasets/adi/ADI.QRY', 'adi'), 
             ('../datasets/med/MED.QRY', 'med'), ('../datasets/time/TIME_clean.QUE', 'time')]:
    qSubs[fkey] = {}
    with open(path, 'r') as qfile: 
        lineNum = 0
        ID = 0
        for line in qfile:
            # Finished processing last query. 
            # Compute substitution terms. 
            # Get ID for next query.
            if '.I' in line:
                if lineNum != 0:
                    qSubs[fkey][ID] = getLemmaDict(terms)
                terms = set()
                ID = int(ID)
                ID += 1
                ID = str(ID)
            elif '.W' not in line:
                terms |= set(re.split('[^a-zA-Z]+', line))
            lineNum += 1
        qSubs[fkey][ID] = getLemmaDict(terms)
qSubs

{'adi': {'1': {'approxim': ['guess', 'judg', 'estim', 'gaug'],
   'articl': None,
   'automat': ['mechan'],
   'concern': ['pertain', 'refer', 'relat', 'touch'],
   'content': ['messag', 'substanc'],
   'descript': None,
   'difficulti': ['troubl'],
   'involv': None,
   'make': None,
   'problem': ['troubl'],
   'relev': None,
   'retriev': None,
   'titl': None,
   'usual': None,
   'what': None},
  '10': {'abstract': ['preci', 'outlin', 'synopsi'],
   'group': ['aggroup'],
   'inform': ['entropi'],
   'mathemat': ['math'],
   'retriev': None,
   'the': None,
   'theori': None,
   'use': None},
  '11': {'consolid': None,
   'evalu': ['valuat', 'rate'],
   'inform': ['entropi'],
   'need': ['requir', 'want'],
   'research': None,
   'retriev': None,
   'scientif': None,
   'what': None},
  '12': {'distribut': None,
   'give': ['gener', 'yield', 'return', 'render'],
   'high': None,
   'journal': None,
   'method': None,
   'print': None,
   'public': ['publish'],
   'scientif': None,


In [10]:
file_name = 'substitutions_wordnet.json'
with open(file_name, 'w') as f:
    f.write(json.dumps(qSubs))

In [177]:
for name, f in qSubs.items():
    numTermSubs = 0
    numTerms = 0
    for q in f.values():
        for key, value in q.items():
            if value is not None:
                numTermSubs += 1
                numTerms += len(value)
    print(name+' Subs='+str(numTermSubs)+' Num Subs='+str(numTerms)+' Avg='+str(numTerms/numTermSubs))

cran Subs=1273 Num Subs=2415 Avg=1.8970934799685781
adi Subs=154 Num Subs=283 Avg=1.8376623376623376
med Subs=159 Num Subs=294 Avg=1.849056603773585
time Subs=357 Num Subs=706 Avg=1.977591036414566
