In [1]:
  
# Make sure the query input is run through the stemmer and the multi-word-expression (MWE)
# tokenizer so that its terms match the vocabulary.

from nltk.tokenize import MWETokenizer
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from unidecode import unidecode




# Multi-word expressions (MWEs) read from the STREUSLE MWE file; each entry
# is the list of words that make up one expression.
MWE = []
with open('../input/STREUSLE2.1-mwes.tsv') as f:
    # Iterate the file object directly instead of loading every line with
    # readlines() first.
    for line in f:
        # First tab-separated column, split on whitespace, minus its leading
        # token; the remaining words form the expression.
        multiword_expression = line.split('\t')[0].split()[1:]
        # Blank or single-token lines yield no words; skip them rather than
        # registering an empty expression.
        if multiword_expression:
            MWE.append(multiword_expression)
# Matched expressions are merged into a single token joined by '-'.
MWE_tokenizer = MWETokenizer(MWE, separator='-')
# Add whatever additional custom multi-word-expressions.
MWE_tokenizer.add_mwe(('dive', 'bar'))
# Stemmer
stemmer = LancasterStemmer()
whitespace_tokenizer = WhitespaceTokenizer()

# Words that must never be treated as stop words.
# ('without' previously carried a leading space and so could never match.)
keep_list = ['after', 'during', 'not', 'between', 'other', 'over', 'under',
             'most', 'without', 'nor', 'no', 'very', 'against', 'don', 'aren']
# English + Spanish stop words, minus the words we want to keep.
stops = set(stopwords.words("english") + stopwords.words("spanish")) - set(keep_list)


def clean_query_nltk(phrase):
    '''
    Handles the NLP cleaning, stemming, and multiword expression work for a given search phrase.

    phrase : a single query string, or a list of query terms.  Each term is
             usually just one word, but can be a multi-word phrase (e.g. the
             contents of an expression surrounded by quotes).
    returns : list with one entry per term; each entry is the list of stemmed
              tokens left after multi-word-expression merging and stop word
              removal (an empty list if every token was a stop word).
    '''
    # A bare string is treated as one term so both call styles work.
    # type(u'') also covers unicode strings on Python 2.
    if isinstance(phrase, (str, type(u''))):
        terms = [phrase]
    else:
        terms = phrase

    cleaned_terms = []
    for term in terms:
        # Split the term into words first: the MWE tokenizer works on a list
        # of tokens, and handing it a raw string makes it walk the string
        # character by character, so no expression can ever be merged.
        words = whitespace_tokenizer.tokenize(term)
        # Multiword expression tokenizer
        merged = MWE_tokenizer.tokenize(words)
        # remove stop words, then stem what is left
        cleaned_terms.append(
            [stemmer.stem(word) for word in merged if word not in stops]
        )
    # Return a concrete list rather than a map object so the result can be
    # indexed and iterated more than once.
    return cleaned_terms



    
    

## The stemmer below can also be used for unstemming (looking up the original words behind a stem).

In [3]:
import nltk
class SnowCastleStemmer(nltk.stem.SnowballStemmer):
    """ A wrapper around snowball stemmer with a reverse lookup table.

    Every word passed to ``stem`` is remembered under the stem it produced,
    so ``unstem`` can later list the original words seen for a given stem.
    """

    def __init__(self, *args, **kwargs):
        """ Build the underlying stemmer (all arguments are forwarded
        unchanged) and install the remembering ``stem`` wrapper. """
        # Imported here so the class does not rely on another cell having
        # brought defaultdict into scope.
        from collections import defaultdict

        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super(SnowCastleStemmer, self).__init__(*args, **kwargs)
        # stem -> set of original words that produced it
        self._stem_memory = defaultdict(set)
        # switch stem and memstem
        # The swap is done per instance, after the parent constructor has
        # run, so self._stem holds whatever `stem` the parent set up.
        # NOTE(review): a class-level `stem` override may be shadowed if the
        # parent assigns `stem` on the instance -- confirm before refactoring.
        self._stem = self.stem
        self.stem = self.memstem

    def memstem(self, word):
        """ Wrapper around stem that remembers.

        Stems ``word`` with the original stemmer, records ``word`` under the
        resulting stem, and returns the stem.
        """
        stemmed_word = self._stem(word)
        self._stem_memory[stemmed_word].add(word)
        return stemmed_word

    def unstem(self, stemmed_word):
        """ Reverse lookup.

        Returns the list of remembered words that stemmed to
        ``stemmed_word``, shortest first; an empty list if none were seen.
        """
        # .get() instead of indexing: indexing a defaultdict would insert an
        # empty entry for every unknown stem that is looked up.
        return sorted(self._stem_memory.get(stemmed_word, ()), key=len)