# Yelp Data Challenge - NLP


Nov 3rd 2017

In [1]:
import pandas as pd

### Note: I processed review data from 2016-01-01 as stated in Data_preprocessing file.

In [2]:
df = pd.read_csv('last_2_year_restaurant_reviews.csv')

In [3]:
df.head()

Unnamed: 0,business_id,name,categories,ave_stars,cool,date,funny,review_id,stars,text,useful,user_id
0,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Steakhouses, Cajun/Creole, Restaurants]",4.0,0,2017-01-20,0,L8lo5SKXfZRlbn1bpPiC9w,5,Went here for guys weekend. Unbelievable. Ravi...,0,nT8zgjoc-PbdBoQsFEXFLw
1,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Steakhouses, Cajun/Creole, Restaurants]",4.0,1,2017-02-12,0,3cnTdE45VrsS0o4cVhfGog,3,"Located inside my favorite hotel Venetian, Del...",1,rOIrilMC7VFwFVBeQNiKMw
2,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Steakhouses, Cajun/Creole, Restaurants]",4.0,1,2017-04-18,0,QtLQQlmFINUSb2K_gE7J1Q,4,"Great food, great service. Expect to pay a pre...",1,ez3GBw83OIgzzgvc0R4jzw
3,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Steakhouses, Cajun/Creole, Restaurants]",4.0,0,2017-01-03,0,oqQsexnfmYxRO-0NvxJN9A,5,My must stop at Vegas. Highly recommend to any...,0,gJrOPH-DSZWY_NX2j6Bugw
4,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Steakhouses, Cajun/Creole, Restaurants]",4.0,0,2017-02-26,0,aw_5aKHlAzV0PSM7F92YFw,5,This place is top notch. We didn't order any a...,0,3Y_gz3wb7T5ur9FHqxBcgQ


### Define your feature variables, here is the text of the review

In [4]:
# Take the values of the column that contains review text data, save to a variable named "documents"
documents = df['text'].values # here .values convert panda.series to numpy array

In [5]:
# inspect your documents, e.g. check the size, take a peek at elements of the numpy array
documents.dtype, documents.shape

(dtype('O'), (110462,))

In [6]:
documents[3]

"My must stop at Vegas. Highly recommend to anyone who appreciate a good steak. And try the BBQ shrimp as well. You won't be disappointed."

### Define your target variable (any categorical variable that may be meaningful)

#### For example, I am interested in perfect (5 stars) and imperfect (1-4 stars) rating

In [7]:
# Make a column and take the values, save to a variable named "target"
df['favorable'] = (df['stars'] > 4)

In [8]:
target = df['favorable'].values

In [9]:
target[:10]

array([ True, False, False,  True,  True,  True,  True, False,  True, False], dtype=bool)

#### You may want to look at the statistic of the target variable

In [10]:
# To be implemented
target.mean(), target.std()

(0.48462819793232059, 0.49976365184073973)

In [11]:
target.shape, documents.shape

((110462,), (110462,))

## Let's create training dataset and test dataset

In [12]:
from sklearn.cross_validation import train_test_split



In [13]:
# Documents is your X (review text), target is your y (whether it is favorable)
# Now split the data to training set and test set

In [14]:
# Split to documents_train, documents_test, target_train, target_test
documents_train, documents_test, target_train, target_test = train_test_split(
    documents,
    target,
    test_size = 0.8, # large test size to make training faster
    random_state = 42
)

In [15]:
documents_train.shape, documents_test.shape

((22092,), (88370,))

## Let's get NLP representation of the documents

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
# Create TfidfVectorizer, and name it vectorizer
vectorizer = TfidfVectorizer(stop_words = "english", max_features = 5000)

In [18]:
# Train the model with your training data
vectors_train = vectorizer.fit_transform(documents_train).toarray()

In [19]:
# Get the vocab of your tfidf
words = vectorizer.get_feature_names()
len(words)

5000

In [20]:
vectors_train.shape

(22092, 5000)

In [21]:
# Use the trained model to transform your test data
vectors_test = vectorizer.transform(documents_test).toarray()

In [22]:
vectors_test.shape

(88370, 5000)

## Similar review search engine
### Input a review and return a similar review

In [23]:
import numpy as np

# We will need these helper methods pretty soon

def get_top_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Given a list of values, find the indices with the highest n values.
    Return the labels for each of these indices.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["cat", "pig"]
    '''
    return [labels[i] for i in np.argsort(lst)[::-1][:n]]  # np.argsort by default sorts values in ascending order

def get_bottom_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Given a list of values, find the indices with the lowest n values.
    Return the labels for each of these indices.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["mouse", "rabbit"]
    '''
    return [labels[i] for i in np.argsort(lst)[:n]] # return a list of bottom values

In [24]:
# Let's use cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
# Draw an arbitrary review from test (unseen in training) documents
some_random_number = 100
search_query = documents_test[some_random_number]
search_queries = [search_query]
print(search_query)
print(search_queries)

This place is not bad. Customer service was great. They were really quick! :) 

I got the chicken teriyaki bento box (comes with rice, salad, and miso soup) and it's a decent size. It filled me up. I liked it. We also got sashimi salad. That was ok. I think it would be better if they had a different dressing because the one that comes with the salad is really salty in my opinion. But other than that, it was good.
["This place is not bad. Customer service was great. They were really quick! :) \n\nI got the chicken teriyaki bento box (comes with rice, salad, and miso soup) and it's a decent size. It filled me up. I liked it. We also got sashimi salad. That was ok. I think it would be better if they had a different dressing because the one that comes with the salad is really salty in my opinion. But other than that, it was good."]


In [26]:
# Transform the drawn review(s) to vector(s)
vector_search_queries = vectorizer.transform(search_queries).toarray()

In [27]:
vector_search_queries.shape

(1, 5000)

In [28]:
# Calculate the similarity score(s) between vector(s) and training vectors
similarity_scores = cosine_similarity(vector_search_queries, vectors_train)

In [29]:
similarity_scores.shape

(1, 22092)

In [30]:
similarity_scores[0, :10]

array([ 0.01186774,  0.02703682,  0.03445728,  0.01446918,  0.        ,
        0.02135746,  0.00575196,  0.00943934,  0.08724556,  0.03298441])

In [31]:
# Let's find top 5 similar reviews
n = 5
returned_reviews = get_top_values(similarity_scores[0], n, documents_train)

In [32]:
print('Our search query:‘)
print(search_queries[0]) # To be added

Our search query:
This place is not bad. Customer service was great. They were really quick! :) 

I got the chicken teriyaki bento box (comes with rice, salad, and miso soup) and it's a decent size. It filled me up. I liked it. We also got sashimi salad. That was ok. I think it would be better if they had a different dressing because the one that comes with the salad is really salty in my opinion. But other than that, it was good.


In [33]:
print('\nMost %s similar reviews:' % n)
for i, review in enumerate(returned_reviews):
    print('#%s:' % i) # print the similarity ranking
    print(review) # print the review


Most 5 similar reviews:
#0:
Great food we had the Chicken Teriyaki and the Beef Teriyaki with salad and Miso soup. This was our first time there went for Valentines.
#1:
Came in around 7 pm on a Sunday night. They were pretty busy, probably only had two vacant tables. I ate here once before (I think during their soft opening) & wasn't a big fan. However, my friend wanted a place where they serve bento boxes so we came here. 

I ordered the salmon teriyaki bento box. This is actually a very large meal. It comes with soup, salad, two pieces of shumai, salmon teriyaki, rice, few pieces of vegetable tempura, and a California roll. For $15?? That's a pretty good deal. I love the creamy dressing that they put on the salad, though I know that's probably not healthy lol. The miso soup is okay, nothing spectacular. I'm pretty sure the shumai is store bought & the same ones I have from home lol only they fry theirs & I steam mine. The salmon teriyaki is very good, not drenched in teriyaki sauce

**The result makes sense** The input review (review_query) is for a Japanese restaurant and the main dish is "chicken teriyaki bento box". The reviews we found in the train documents are all for Japanese restaurants and "teriyaki"  and "bento box" appears in most reviews.

## Classifying positive/negative review

#### Naive-Bayes Classifier

In [34]:
# Build a Naive-Bayes Classifier

from sklearn.naive_bayes import MultinomialNB

model_nb = MultinomialNB()
model_nb.fit(vectors_train, target_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [35]:
# Get score for training set
model_nb.score(vectors_train, target_train)

0.82821835958718093

In [36]:
# Get score for test set
model_nb.score(vectors_test, target_test)

0.80813624533212625

#### Logistic Regression Classifier

In [37]:
# Build a Logistic Regression Classifier

from sklearn.linear_model import LogisticRegression

model_lrc = LogisticRegression()
model_lrc.fit(vectors_train, target_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [38]:
# Get score for training set
model_lrc.score(vectors_train, target_train)

0.86080934274850629

In [39]:
# Get score for test set
model_lrc.score(vectors_test, target_test)

0.82422767907660965

In [40]:
model_lrc.coef_[0]

array([-0.90888943,  0.19534017,  0.39365173, ...,  0.19473474,
       -0.15943231,  0.49773404])

#### Q: What are the key features(words) that make the positive prediction?

In [41]:
# Let's find it out by ranking
n = 20
get_top_values(model_lrc.coef_[0], n, words)

[u'amazing',
 u'best',
 u'delicious',
 u'awesome',
 u'great',
 u'thank',
 u'perfect',
 u'love',
 u'highly',
 u'excellent',
 u'fantastic',
 u'gem',
 u'wonderful',
 u'vegas',
 u'favorite',
 u'outstanding',
 u'bomb',
 u'wow',
 u'incredible',
 u'superb']

In [42]:
# The coefficients for top 20 features(words) contribute to positive prediction
model_lrc.coef_[0][np.argsort(model_lrc.coef_[0])[::-1][1:10]]

array([ 5.63802546,  4.53400555,  4.36808379,  3.90244479,  3.72657845,
        3.67398152,  3.48741831,  3.46438169,  3.43399677])

A: The top 20 words contributing to 5-star reviews are all very positive words to describe the dishes and/or the restaurants. They have more positive weights in the LOR (log(odd)) term to make it larger. This makes sense as these words indicate that the reviewers are very satisfied with the experience in the restaurants. 

#### Q: What are the key features(words) that make the negative prediction?

In [43]:
# Let's find it out by ranking
n = 20
get_bottom_values(model_lrc.coef_[0], n, words)

[u'ok',
 u'slow',
 u'average',
 u'worst',
 u'bland',
 u'okay',
 u'horrible',
 u'rude',
 u'decent',
 u'terrible',
 u'disappointing',
 u'reason',
 u'bad',
 u'mediocre',
 u'dry',
 u'stars',
 u'overpriced',
 u'poor',
 u'wasn',
 u'didn']

In [44]:
# The coefficients for top 20 features(words) contribute to negative prediction
model_lrc.coef_[0][np.argsort(model_lrc.coef_[0])[::1][1:10]]

array([-4.56412264, -4.55411089, -4.49064088, -4.2868617 , -4.16567211,
       -4.1526603 , -3.96408732, -3.87967873, -3.72603531])

A: The last 20 words contributing to 5-star reviews are almost all negative words. They have more negative weights in the LOR (log(odd)) term to make it smaller. These words indicate that the customers are not very satisfied with the service/dishes/environments and there is still room for the restaurants to improve. 

#### Random Forest Classifier

In [45]:
# Build a Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

model_rfc = RandomForestClassifier(max_depth = None,
                                   n_estimators = 15,
                                   min_samples_leaf = 10)

model_rfc.fit(vectors_train, target_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [46]:
# Get score for training set
model_rfc.score(vectors_train, target_train)

0.82862574687669743

In [47]:
# Get score for test set
model_rfc.score(vectors_test, target_test)

0.78164535475840213

#### Q: What do you see from the training score and the test score?

A: Random Forest model gives relatively low scores on the test data compared to Naive Bayes and Logistic Regression. As I increase n_estimators from 5 to 15(number of tress in the Random Forest model), the test score slowly increases from 0.76, to 0.78, still much lower compared to Naive Bayes and Logistic regression.

But increasing n_estimators also increase the computation time.

#### Q: Can you tell what features (words) are important by inspecting the RFC model?

In [48]:
n = 20
get_top_values(model_rfc.feature_importances_, n, words)

[u'amazing',
 u'great',
 u'best',
 u'delicious',
 u'bad',
 u'vegas',
 u'don',
 u'love',
 u'didn',
 u'pretty',
 u'ok',
 u'rude',
 u'worst',
 u'awesome',
 u'definitely',
 u'perfect',
 u'minutes',
 u'said',
 u'wasn',
 u'good']

### Comment: Unlike Logistic Regression, Random Forest tree model gives the most impotant 20 features, either positive or negative. Comparing this to Logistic regression, the advantage of Logistic regression is clear: it is easy to inteprete the coefficients. 
 


## cross validation

[sklearn cross validation](http://scikit-learn.org/stable/modules/cross_validation.html)

In [49]:
# Cross validation for Logistic regression model
from sklearn.cross_validation import cross_val_score
cv_scores = cross_val_score(model_lrc,
                            vectors_train,
                            target_train,
                            cv = 5,
                            scoring = "accuracy")

In [50]:
cv_scores_lrc = cv_scores

In [51]:
print ('Accuracy scores from cross validation for Logistic Regression model:')
print (cv_scores_lrc)
print ('Std for the scores for Logistic Regression model:')
print (cv_scores_lrc.std())

Accuracy scores from cross validation for Logistic Regression model:
[ 0.81059063  0.82032134  0.82688391  0.82684473  0.81684401]
Std for the scores for Logistic Regression model:
0.00620318334356


In [52]:
# Cross validation for Naive Bayes model
from sklearn.cross_validation import cross_val_score
cv_scores = cross_val_score(model_nb,
                            vectors_train,
                            target_train,
                            cv = 5,
                            scoring = "accuracy")


In [53]:
cv_scores_nb = cv_scores

In [54]:
print ('Accuracy scores from cross validation for Naive Bayes model:')
print (cv_scores_nb)
print ('Std for the scores for Naive Bayes model:')
print (cv_scores_nb.std())

Accuracy scores from cross validation for Naive Bayes model:
[ 0.79882326  0.80810138  0.8119484   0.80828429  0.80167535]
Std for the scores for Naive Bayes model:
0.00479485078044


In [55]:
# Cross validation for Random Forest model
from sklearn.cross_validation import cross_val_score
cv_scores = cross_val_score(model_rfc,
                            vectors_train,
                            target_train,
                            cv = 5,
                            scoring = "accuracy")

In [56]:
cv_scores_rfc = cv_scores

In [57]:
print ('Accuracy scores from cross validation for Random Forest model:')
print (cv_scores_rfc)
print ('Std for the scores for Random Forest model:')
print (cv_scores_rfc.std())

Accuracy scores from cross validation for Random Forest model:
[ 0.77370446  0.77212039  0.79361847  0.77908556  0.77722436]
Std for the scores for Random Forest model:
0.00764353443805


## Looks like the Logistic Regression is the best classifier.
## Do grid search for Logistic Regression.

## Use grid search to find best predictable classifier


[sklearn grid search tutorial (with cross validation)](http://scikit-learn.org/stable/modules/grid_search.html#grid-search)

[sklearn grid search documentation (with cross validation)](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV)

In [58]:
# Let's turn Logistic Regression Regularization parameter C and different penalty 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

C1 = np.linspace(0.1, 1, 10)
C2 = np.linspace(2, 10, 9)

param_grid = [{'penalty': ['l1'], 'C': np.append(C1, C2)},
              {'penalty': ['l2'], 'C': np.append(C1, C2)}]

scores = ['accuracy']

for score in scores:
    print ("# Tuning hyper-parameters for %s" % score + "\n\n")
    clf = GridSearchCV(LogisticRegression(),
                       param_grid,
                       cv = 5,
                       scoring = score)
    clf.fit(vectors_train[:500,:], target_train[:500])
    print ("Best parameters set found on ")
    print (clf.best_params_)
    print ("\nGrid scores on development set:")
    means = clf.cv_results_["mean_test_score"]
    stds = clf.cv_results_["std_test_score"]
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.3f) for %r"
               % (mean, std*2, params))
    
    print ("\nDetailed classificationi report:\n")
    print ("The model is strained on the full develpment set.")
    print ("The scores are computed on the full evaluation set.")
    print ("\n")
    y_true, y_pred = target_test, clf.predict(vectors_test)
    print (classification_report(y_true, y_pred))
    print ("\n")
pass

# Tuning hyper-parameters for accuracy


Best parameters set found on 
{'penalty': 'l2', 'C': 1.0}

Grid scores on development set:
0.520 (+/-0.000) for {'penalty': 'l1', 'C': 0.10000000000000001}
0.520 (+/-0.000) for {'penalty': 'l1', 'C': 0.20000000000000001}
0.520 (+/-0.000) for {'penalty': 'l1', 'C': 0.30000000000000004}
0.586 (+/-0.027) for {'penalty': 'l1', 'C': 0.40000000000000002}
0.622 (+/-0.050) for {'penalty': 'l1', 'C': 0.5}
0.624 (+/-0.045) for {'penalty': 'l1', 'C': 0.59999999999999998}
0.632 (+/-0.034) for {'penalty': 'l1', 'C': 0.70000000000000007}
0.652 (+/-0.046) for {'penalty': 'l1', 'C': 0.80000000000000004}
0.656 (+/-0.048) for {'penalty': 'l1', 'C': 0.90000000000000002}
0.654 (+/-0.055) for {'penalty': 'l1', 'C': 1.0}
0.684 (+/-0.065) for {'penalty': 'l1', 'C': 2.0}
0.684 (+/-0.082) for {'penalty': 'l1', 'C': 3.0}
0.688 (+/-0.085) for {'penalty': 'l1', 'C': 4.0}
0.680 (+/-0.077) for {'penalty': 'l1', 'C': 5.0}
0.680 (+/-0.077) for {'penalty': 'l1', 'C': 6.0}
0.694