# Load bar business ids, and load the review dataset into pandas

In [21]:
import pandas as pd
import pickle
import numpy as np

review = pd.read_pickle('../input/yelp_academic_dataset_review.pickle')
review.head()



In [24]:
import sys
sys.path.append('../../yelp_dataset_challenge/')
import yelp_util

In [31]:

bar_ids = pickle.load(open('../output/bar_ids.pickle', 'r'))
# Select reviews that correspond to the list of bars
bar_reviews = review[review.business_id.isin(bar_ids)]


print 'Number of bars (excluding restaurants)', len(bar_ids)
print 'Number of bar reviews', np.sum(review.business_id.isin(bar_ids))




Number of bars (excluding restaurants) 4655
Number of bar reviews 233041


In [74]:
yelp_review_sample = list(bar_reviews.text.iloc[:])
model = yelp_util.create_word2vec_model(yelp_review_sample, ) # word2vec model

breaking into sentence...
trianing word2vec model...


In [36]:
#From  ----   https://github.com/titipata/yelp_dataset_challenge


# functions for preprocessing various fields of the raw data
import re
import time
import collections
import scipy.sparse as sp
import nltk.data
import tensorflow as tf
from nltk.tokenize import WhitespaceTokenizer
from gensim.models import Word2Vec
from unidecode import unidecode
from itertools import chain
import numpy as np
from nltk.tokenize.treebank import TreebankWordTokenizer



__all__ = ["taglist_to_matrix",
           "create_word2vec_model",
           "clear_tensorflow_graph",
           "get_stream_seq",
           "get_word_embedding",
           "create_vocab",
           "word2id"
           ]


sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
whitespace_tokenizer = WhitespaceTokenizer()
tb_tokenizer = TreebankWordTokenizer()


def taglist_to_matrix(taglist):
    """
    This function
    Args:
        taglist: list of list of tags. For example, each element of the list is the list of tags of a business category:
        [u'Doctors', u'Health & Medical']
    Returns:
        A sparse matrix num_docs x tags where element i, j has the counts of how many time tag j appear in document i
    """

    all_tags = [w for doc in taglist for w in doc]
    counter = collections.Counter(all_tags)
    count_pairs = sorted(counter.items(), key=lambda x: -x[1])
    words, _ = list(zip(*count_pairs))
    word_to_id = dict(zip(words, range(len(words))))
    # sparse matrix indices
    i_indices = [doc_idx for doc_idx in range(len(taglist)) for _ in taglist[doc_idx]]
    j_indices = [word_to_id[w] for doc_idx in range(len(taglist)) for w in taglist[doc_idx]]
    data = [1]*len(all_tags)
    m = sp.csc_matrix((data, (i_indices, j_indices)))
    m.sum_duplicates()
    return m


def clean_text(text):
    """Clean and lower string
    Parameters
    ----------
        text : in string format
    Returns
    -------
        text_clean : clean text input in string format
    """
    text_clean = re.sub(':', '', text.lower())
    text_clean = re.sub(',', '', text_clean)
    text_clean = re.sub('\.', '', text_clean)
    return text_clean


def clean_and_tokenize(text):
    """
    Divide review into sentence, clean words,
    and tokenize it.
    Returns
    ------
        text_tokenize: list of word in sentence
    """
    sentence = sent_detector.tokenize(unidecode(text))
    text_clean = map(clean_text, sentence)
    text_tokenize = map(lambda x: whitespace_tokenizer.tokenize(x), text_clean)
    return text_tokenize


def clean_and_tokenize_word(text):
    """
    Clean and divide text (review) into list of words
    Returns
    ------
        text_clean: list of word in sentence
    """
    if isinstance(text, list):
        text_clean = map(clean_text, text)
        text_tokenize = map(whitespace_tokenizer.tokenize, text_clean)
    elif isinstance(text, basestring):
        text_clean = clean_text(text)
        text_tokenize = whitespace_tokenizer.tokenize(text_clean)
    else:
        text_tokenize = []
    return text_tokenize






def get_stream_seq(review_list, word2vec_model):
    """
    From review list and word2vec model,
    generate output stream of output of review index
    correspond to concatenated review list
    """
    review_list_clean = clean_and_tokenize_word(review_list)
    review_list_flatten = list(chain.from_iterable(review_list_clean))
    review_words_stream = filter(lambda x: x is not None,
                             map(lambda x: word2vec_model.vocab.get(x).index if x in word2vec_model.vocab else None,
                                 review_list_flatten)
                                 )
    return review_words_stream


def get_word_embedding(word2vec_model):
    embeddings = word2vec_model.syn0
    print 'Vocabulary size: ', embeddings.shape[0]
    print 'Word vector dimension: ', embeddings.shape[1]
    return embeddings


def create_vocab(review_list):
    """
    Create dictionary out of review list
    ref: http://deeplearning.net/tutorial/lstm.html
    """

    # Tokenized sentences
    review_list = map(lambda x: x.lower(), review_list)
    tksents = [tb_tokenizer.tokenize(review) for review in review_list]
    print('Building dictionary..')
    wordcount = dict()
    for sent in tksents:
        for w in sent:
            if w.lower() not in wordcount:
                wordcount[w.lower()] = 1
            else:
                wordcount[w.lower()] += 1

    counts = wordcount.values()
    keys = wordcount.keys()
    sorted_idx = np.argsort(counts)[::-1]

    worddict = dict()
    for idx, ss in enumerate(sorted_idx):
        worddict[keys[ss]] = idx+2  # leave 0 and 1 (UNK)

    print(np.sum(counts), ' total words ', len(keys), ' unique words')
    return worddict, tksents


def word2id(tksents, dictionary):
    seqs = [None] * len(tksents)
    for idx, ss in enumerate(tksents):
        seqs[idx] = [dictionary[w.lower()] if w.lower() \
                        in dictionary else 1 for w in ss]
    return seqs


def load_yelp_review(X, labels, nb_words=None, skip_top=10,\
                        maxlen=None, test_split=0.2, seed=113, oov_char=1):
    '''
        Preprocess and load Yelp Reviews word2id sequences and labels for
        polarity analysis
        nb_words : Maximum number of words to index, else assign oov_char
        skip_top : Skip n top most common words
        maxlen   : Maximum sequence length
        oov_char : Out-Of-Vocabulary word id
        test_split : Train-Test split
        ref:https://github.com/fchollet/keras/blob/master/keras/datasets/imdb.py
    '''

    np.random.seed(seed)
    np.random.shuffle(X)
    np.random.seed(seed)
    np.random.shuffle(labels)

    if maxlen:
        new_X = []
        new_labels = []
        for x, y in zip(X, labels):
            if len(x) < maxlen:
                new_X.append(x)
                new_labels.append(y)
        X = new_X
        labels = new_labels

    if not nb_words:
        nb_words = max([max(x) for x in X])

    if oov_char is not None:
        X = [[oov_char if (w >= nb_words or w < skip_top) else w for w in x] for x in X]
    else:
        nX = []
        for x in X:
            nx = []
            for w in x:
                if (w >= nb_words or w < skip_top):
                    nx.append(w)
            nX.append(nx)
        X = nX

    X_train = X[:int(len(X)*(1-test_split))]
    y_train = labels[:int(len(X)*(1-test_split))]

    X_test = X[int(len(X)*(1-test_split)):]
    y_test = labels[int(len(X)*(1-test_split)):]

    return (X_train, y_train), (X_test, y_test)

In [43]:
import gensim

def create_vector_model(model, review_list, **kwargs):
    """
    Create gensim Word2Vec model out of review list
    where each element contains review
    """
    print 'breaking into sentence...'
    review_sentence = map(clean_and_tokenize, review_list)
    review_flatten = list(chain.from_iterable(review_sentence))
    print 'training word2vec model...'
    vec_model = model(review_flatten,**kwargs)
    return vec_model


model_args = {'size':200, 'window':5, 'min_count':10, 'workers':12}
word2vec_model = create_vector_model(model=gensim.models.Word2Vec, review_list=yelp_review_sample, **model_args)

# model_args = {'num_topics':100}
# lda_model = create_vector_model(model=gensim.models.LdaModel, review_list=yelp_review_sample, **model_args)



#model.similarity('bar')
#model.most_similar('bar', topn=20)

breaking into sentence...
training word2vec model...


In [38]:
# from gensim import corpora, models, similarities
# model = models.ldamodel.LdaModel(yelp_review_sample, num_topics=10)

In [75]:
print yelp_review_sample[0]

All the food is great here. But the best thing they have is their wings. Their wings are simply fantastic!!  The "Wet Cajun" are by the best & most popular.  I also like the seasoned salt wings.  Wing Night is Monday & Wednesday night, $0.75 whole wings!

The dining area is nice. Very family friendly! The bar is very nice is well.  This place is truly a Yinzer's dream!!  "Pittsburgh Dad" would love this place n'at!!


In [111]:
word2vec_model.most_similar(positive=['beer','sweet'])


#bar_word = word2vec_model.vocab['bar']


[('spicy', 0.6386415958404541),
 ('margarita', 0.619797945022583),
 ('smooth', 0.5970010161399841),
 ('guinness', 0.5922343730926514),
 ('creamy', 0.5905068516731262),
 ('yummy', 0.585984468460083),
 ('tasty', 0.583294153213501),
 ('thick', 0.5832158327102661),
 ('scotch', 0.583088219165802),
 ('dressing', 0.5767841339111328)]

In [109]:


from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, #max_features=n_features,
                                   stop_words='english')


from time import time
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(yelp_review_sample)
print("done in %0.3fs." % (time() - t0))

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))


print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

done in 23.743s.
Extracting tf features for LDA...


NameError: name 'n_features' is not defined

In [None]:
lda.fit(yelp_review_sample)