# Clean the word list in NLTK. 
We clean text in several stages.  Starting with a list of reviews:
1. Divide each review into sentences.
2. Clean words (remove punctuation and extra characters).
3. Tokenize.
4. Multiword tokenize.
5. Remove stop words.
6. Stem words.
7. Remove words that occur under 3 times in the entire corpus.

In [None]:
import pandas as pd
import cPickle as pickle
# Load the yelp review data.
# `review` is used below as a DataFrame with `business_id` and `text` columns.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that were produced by a trusted step of this project.
review = pd.read_pickle('../input/yelp_academic_dataset_review.pickle')

In [55]:
#  Adapted, but much improved from  ----   https://github.com/titipata/yelp_dataset_challenge

import re
import time
import collections
#import scipy.sparse as sp
#import nltk.data
from nltk.tokenize import WhitespaceTokenizer
from unidecode import unidecode
from itertools import chain
import numpy as np
#from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.tokenize import MWETokenizer
from nltk.corpus import stopwords
from collections import defaultdict
from nltk.stem import SnowballStemmer, WordNetLemmatizer
import sys
sys.path.append('../vectorsearch/')
from reverse_stemmer import SnowCastleStemmer
import nltk


# Pre-trained Punkt model used to split a review into sentences.
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
# Splits a cleaned sentence into tokens on whitespace only.
whitespace_tokenizer = WhitespaceTokenizer()
# tb_tokenizer = TreebankWordTokenizer()

# Words listed here are never treated as stop words, so they survive the
# stop-word filter (e.g. the negations 'not', 'no', 'nor').
keep_list = ['after', 'during', 'not', 'between', 'other', 'over', 'under',
             'most', 'without', 'nor', 'no', 'very', 'against', 'don', 'aren']
# Stop words: NLTK's English and Spanish lists combined, minus keep_list.
stops = (set(stopwords.words("english") + stopwords.words("spanish"))
         - set(keep_list))


# Multiword tokenizer list taken from: 
# http://www.cs.cmu.edu/~ark/LexSem/
# http://www.cs.cmu.edu/~ark/LexSem/STREUSLE2.1-mwes.tsv

# This parses the list of multiword expressions from
# http://www.cs.cmu.edu/~ark/LexSem/STREUSLE2.1-mwes.tsv into NLTK format.
# For every line of the file: take the first tab-separated field, split it on
# whitespace and drop its first item; the remaining words form one
# multi-word expression.
with open('../input/STREUSLE2.1-mwes.tsv') as mwe_file:
    MWE = [row.split('\t')[0].split()[1:] for row in mwe_file]
# Matched expressions are merged into a single token joined by '-'.
MWE_tokenizer = MWETokenizer(MWE, separator='-')
# Add whatever additional custom multi-word-expressions.
MWE_tokenizer.add_mwe(('dive', 'bar'))

# Stemmer
# SnowCastleStemmer is a project-local class (../vectorsearch/reverse_stemmer);
# this notebook calls its stem(), unstem() and savestemmer() methods.
stemmer = SnowCastleStemmer("english")
# WordNet lemmatizer, used in clean_and_tokenize alongside the stemmer.
wnl = WordNetLemmatizer()


# Characters stripped by clean_text, compiled once as a single character class.
# A raw string avoids the invalid-escape warnings of patterns such as '\.'.
_PUNCTUATION_RE = re.compile(r'[:,.()!&]')


def clean_text(text):
    """Lower-case a string and strip a fixed set of punctuation characters.

    The characters removed are ``: , . ( ) ! &``.  Every other character,
    including whitespace, apostrophes and hyphens, is left untouched.

    Parameters
    ----------
        text : in string format
    Returns
    -------
        text_clean : clean text input in string format
    """
    return _PUNCTUATION_RE.sub('', text.lower())


def clean_and_tokenize(text):
    """Turn one raw review into cleaned, stemmed tokens grouped by sentence.

    Stages, applied in order to the whole review:
    1. Divide review into sentences (after transliterating with unidecode)
    2. clean words (lower-case and strip punctuation via clean_text)
    3. tokenize on whitespace
    4. multiword tokenize (merge known multi-word expressions)
    5. remove stop words
    6. stem words (or keep the WordNet lemma, see below)
    7. map each stem back to a readable word with the stemmer's reverse lookup

    Parameters
    ----------
        text : raw review text

    Returns
    -------
        text_stemmed : list of sentences, each a list of token strings
    """
    def _stem(word):
        # Keep the WordNet lemma when it ends in 'e'; otherwise fall back to
        # the stemmer.
        lemma = wnl.lemmatize(word)
        return lemma if lemma.endswith('e') else stemmer.stem(word)

    def _unstem(word):
        # Use the first reverse-lookup candidate, or the stem itself when the
        # lookup returns nothing.
        candidates = stemmer.unstem(word)
        return candidates[0] if len(candidates) > 0 else word

    # Splits into sentences.
    sentences = sent_detector.tokenize(unidecode(text))
    # Clean text: remove extra punctuation marks.
    cleaned = [clean_text(sentence) for sentence in sentences]

    # Whitespace tokenizer followed by the multiword expression tokenizer.
    tokenized = [whitespace_tokenizer.tokenize(sentence) for sentence in cleaned]
    tokenized = [MWE_tokenizer.tokenize(tokens) for tokens in tokenized]

    # Remove stop words.
    filtered = [[word for word in tokens if word not in stops]
                for tokens in tokenized]

    # Stem every sentence BEFORE un-stemming any of them.
    # NOTE(review): presumably unstem() consults the history that stem()
    # builds, so the two passes must not be interleaved -- confirm against
    # SnowCastleStemmer.
    stemmed = [[_stem(word) for word in tokens] for tokens in filtered]

    # Unstem with the simplest word.  This helps readability of results...
    return [[_unstem(word) for word in tokens] for tokens in stemmed]


def remove_low_occurence_words(texts, threshold=1):
    """Drop tokens whose corpus-wide count does not exceed ``threshold``.

    A token is kept only if it occurs strictly more than ``threshold`` times
    across all texts; tokens occurring ``threshold`` times or fewer are
    removed.  With ``threshold=0`` nothing is removed.

    Parameters
    ----------
        texts : iterable of texts; each text is an iterable of sentences and
            each sentence an iterable of hashable tokens
        threshold : highest count that is still removed (default 1)

    Returns
    -------
        list of texts, each a list of sentences, each a list of kept tokens.
        Sentences and texts are kept even when they end up empty.
    """
    # The data is traversed twice (count, then filter), so materialise it
    # first; otherwise one-shot iterators would be exhausted by the counting
    # pass and the result would silently come back empty.
    texts = [[list(sentence) for sentence in text] for text in texts]

    frequency = defaultdict(int)
    for text in texts:
        for sentence in text:
            for token in sentence:
                frequency[token] += 1

    return [[[token for token in sentence if frequency[token] > threshold]
             for sentence in text] for text in texts]
    

In [56]:
# Select reviews that correspond to the list of bars
# NOTE(review): pickle.load can execute arbitrary code; only load files
# written by a trusted step of this project.
with open('../output/bar_ids.pickle', 'rb') as bar_ids_file:
    bar_ids = pickle.load(bar_ids_file)
is_bar_review = review.business_id.isin(bar_ids)
# Only the first 1000 bar reviews are processed below, although the count
# printed is for all of them.  The .copy() makes bar_reviews an independent
# frame, so a column can be added to it later without pandas'
# SettingWithCopyWarning.
bar_reviews = review[is_bar_review][:1000].copy()
print('Number of bars (excluding restaurants) %d' % len(bar_ids))
print('Number of bar reviews %d' % np.sum(is_bar_review))

# Clean and tokenize
print('Cleaning and tokenizing')
review_sentences = [clean_and_tokenize(text) for text in bar_reviews.text]

# This is a list of reviews 
# each review contains a list of sentences
# each sentence contains a list of words (tokens)
# With threshold=0 every token is kept, since each occurs at least once.
review_sentences = remove_low_occurence_words(review_sentences, threshold=0)

# They must be flattened for word2vec. 
# review_flatten = list(chain.from_iterable(review_sentences)) # This is the input to word2vec


Number of bars (excluding restaurants) 4655
Number of bar reviews 233041
Cleaning and tokenizing


In [57]:
# Append to df and save to file
# One entry per review: a list of sentences, each a list of tokens.
bar_reviews['cleaned_tokenized'] = review_sentences
bar_reviews.to_pickle('../output/bar_reviews_cleaned_and_tokenized.pickle')

# Save the stemmer history for reverse lookup, so that a fresh
# SnowCastleStemmer can reload it later with loadstemmer().
stemmer.savestemmer('../output/reverse_stemmer_stem_mem.pickle')

#--------------------------------------------
# Can now perform reverse stemmer lookups
#-----------------------------------------
# import sys
# sys.path.append('../vectorsearch/')
# from reverse_stemmer import SnowCastleStemmer
# reverse_stemmer = SnowCastleStemmer('english')
# reverse_stemmer.loadstemmer('../output/reverse_stemmer_stem_mem.pickle')

In [58]:
# Examine some samples....
# Show the review at position 1 before and after cleaning.
sample = bar_reviews.iloc[1]

print('Original')
print(sample['text'])
print('')

print('Tokenized')
print(sample['cleaned_tokenized'])


Original
We checked this place out this past Monday for their wing night. We have heard that their wings are great and decided it was finally time to check it out. Their wings are whole wings and crispy, which is a nice change of pace. I got their wet Cajun sauce and garlic butter wings. The Cajun did not have a bold enough flavor for me and their sauce is too thin. The sauce was also thin for the garlic butter, but that is more expected. They were better than average, but I don't like seeing all the sauce resting at the bottom of the boat. I would definitely come try this place out again to sample some of the other items on the menu, but this will probably not become a regular stop for wings anytime soon.

Tokenized
[['check', 'place', 'past', 'monday', 'wing-night'], ['heard', 'wings', 'great', 'decided', 'finally', 'time', 'check'], ['wings', 'whole', 'wings', 'crispy', 'nice', 'change', 'pace'], ['got', 'wet', 'cajun', 'sauce', 'garlic', 'butter', 'wings'], ['cajun', 'not', 'bold',