In [1]:
import pandas as pd

In [2]:
# Load the reviews CSV into a DataFrame; the path is relative to the kernel's
# working directory.
df = pd.read_csv('last_2_years_restaurant_reviews.csv')

### The feature variable is the text of the review

In [3]:
# Pull the review text out of the frame as an array of documents
documents = df['text'].values
# NOTE: this tuple is evaluated but not displayed, because only the last
# expression of a cell is rendered as its output.
documents.dtype, documents.shape
# Show one sample review
documents[20]

'It\'s hard for me to only give four stars but....  I ate at CUT WHICH IS a two minute walk from Delmonicos two nights ago.   The service was great the food was good but CUT is a masterpiece the Mona Lisa.  I ordered the spinach salad which was very good fresh and delicious.  For my main course I ordered the bone in rib eye "house specialty"  it was great but it did not compare to the magic of my CUT ribeye.  This is an excellent restaurant the only flaw is its two minutes away from CUT!  It\'s like playing basketball next to Jordan baseball next to Ruth.'

### The target variable is derived from the stars (favorable = more than 4 stars)

In [4]:
# Add a boolean 'favorable' column to df (True when stars > 4), then take its
# values and save them to a variable named "target"
df['favorable'] = df['stars'] > 4
target = df['favorable'].values
# Peek at the first ten labels
target[:10]

array([ True,  True,  True, False,  True,  True,  True, False, False,
       False])

In [5]:
# Class balance check: the mean of the boolean target is the fraction of
# favorable reviews; the standard deviation is shown alongside it.
target.mean(), target.std()

(0.4782396579185261, 0.49952626308583337)

### Train test split
 

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Split to documents_train, documents_test, target_train, target_test.
# 20% of the samples are held out for testing, and the fixed random_state
# makes the split reproducible.
(
    documents_train,
    documents_test,
    target_train,
    target_test,
) = train_test_split(documents, target, test_size=0.2, random_state=42)

###  NLP representation of the documents

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Create TfidfVectorizer, and name it vectorizer.
# English stop words are dropped and the vocabulary is capped at 5000 features.
vectorizer = TfidfVectorizer(stop_words = 'english', max_features=5000)

In [9]:
# Train the model with your training data: the vocabulary and IDF weights are
# learned from the training documents only, so nothing leaks from the test set.
# NOTE: .toarray() turns the sparse TF-IDF matrix into a dense array, which is
# memory-hungry for large corpora.
vectors_train = vectorizer.fit_transform(documents_train).toarray()
# Get the vocab of your tfidf.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2, so
# use `get_feature_names_out` when the installed version provides it and fall
# back otherwise. list() keeps `words` a plain list on every version.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()
# Use the trained model to transform your test data
vectors_test = vectorizer.transform(documents_test).toarray()

In [10]:
import numpy as np

# We will need these helper methods pretty soon

def get_top_values(lst, n, labels):
    """Return the labels of the ``n`` largest entries of ``lst``, largest first.

    ``labels[i]`` is taken to be the label of ``lst[i]``. The values are
    arg-sorted in ascending order and that index order is reversed, so the
    result runs from the largest value downwards. If ``lst`` has fewer than
    ``n`` entries, every label is returned.
    """
    descending = np.argsort(lst)[::-1]
    picked = []
    for idx in descending[:n]:
        picked.append(labels[idx])
    return picked

def get_bottom_values(lst, n, labels):
    """Return the labels of the ``n`` smallest entries of ``lst``, smallest first.

    ``labels[i]`` is taken to be the label of ``lst[i]``. If ``lst`` has fewer
    than ``n`` entries, every label is returned.
    """
    ascending = np.argsort(lst)
    lowest = ascending[:n]
    return list(labels[pos] for pos in lowest)

### Classify positive and negative reviews

#### Naive-Bayes Classifier 

In [11]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes classifier with default hyper-parameters
model_nb = MultinomialNB()

# Fit on the TF-IDF training vectors. fit() returns the estimator itself, which
# is why the cell displays the model's repr.
model_nb.fit(vectors_train, target_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [13]:
# Get score (mean accuracy) for the training set and the held-out test set
train_score = model_nb.score(vectors_train, target_train)
test_score = model_nb.score(vectors_test, target_test)
# NOTE: ":3" is a minimum field width, not a precision, so the scores print at
# full length; ":.3f" would round them to three decimals.
print("Train Score {0:3} Test Score {1:3}".format(train_score, test_score))

Train Score 0.8114283049461193 Test Score 0.811195132800505


In [17]:
# Number of words to list
n = 5
# Words with the highest log-probability under the favorable (True) class.
# `MultinomialNB.coef_` was deprecated in scikit-learn 0.24 and later removed.
# For a two-class model, coef_[0] was the row feature_log_prob_[1], which is
# available on every version, so read that row directly.
get_top_values(model_nb.feature_log_prob_[1], n, words)

['great', 'food', 'place', 'service', 'amazing']

In [18]:
# Words with the lowest log-probability under the favorable (True) class.
# `MultinomialNB.coef_` was deprecated in scikit-learn 0.24 and later removed.
# For a two-class model, coef_[0] was the row feature_log_prob_[1], which is
# available on every version, so read that row directly.
get_bottom_values(model_nb.feature_log_prob_[1], n, words)

['ignoring', 'insult', 'rudely', 'diarrhea', 'condescending']

#### Logistic Regression Classifier 

In [14]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier with default hyper-parameters, fitted on the
# TF-IDF training vectors. fit() returns the estimator, so its repr is displayed.
model_lrc = LogisticRegression()
model_lrc.fit(vectors_train, target_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
# Mean accuracy of the logistic-regression model on both splits.
# NOTE: train_score / test_score are the same names used for the Naive Bayes
# scores, so those earlier values are overwritten here.
train_score = model_lrc.score(vectors_train, target_train)
test_score = model_lrc.score(vectors_test, target_test)

In [16]:
# NOTE: ":3" is a minimum field width, not a precision, so the scores print at
# full length.
print("For Logistic Regression: Train Score {0:3} Test Score {1:3}".format(train_score, test_score))

For Logistic Regression: Train Score 0.8416007805885982 Test Score 0.8369086395660845


In [19]:
# Number of words to list
n = 5
# Words with the largest logistic-regression coefficients, i.e. the terms that
# push a prediction most strongly toward the favorable (True) class.
get_top_values(model_lrc.coef_[0], n, words)

['amazing', 'best', 'awesome', 'incredible', 'thank']