In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
import random

In [2]:
!ls ../data

yelp_test.json  yelp_train.json


# Load Dataset

In [3]:
data_train = pd.read_json("../data/yelp_train.json")
data_test = pd.read_json("../data/yelp_test.json")

In [4]:
data_train.head()

Unnamed: 0,review,sentiment
0,"i've been here a couple times, once ordered Po...",0
1,I've been to Dragon Pearl Buffet on countless ...,1
2,Service is Great. Food is excellent and great...,1
3,Went to this place right after Germany's victo...,1
4,Great price point for very tasty soon tofu.\n\...,1


In [5]:
data_train.describe()

Unnamed: 0,sentiment
count,223050.0
mean,0.759695
std,0.42727
min,0.0
25%,1.0
50%,1.0
75%,1.0
max,1.0


In [6]:
data_test.head()

Unnamed: 0,review,sentiment
0,Cute idea for a board game cafe! Definitely a ...,1
1,Pressed Cubano or pork belly medianoche a must...,1
2,"Rooftop has a great vibe, music was phenomenal...",1
3,My friend and I from the area first went here ...,1
4,I had the Dahl with roti for breakfast. It was...,0


In [7]:
data_test.describe()

Unnamed: 0,sentiment
count,74350.0
mean,0.756691
std,0.429083
min,0.0
25%,1.0
50%,1.0
75%,1.0
max,1.0


In [8]:
X_train = data_train['review'].tolist()
y_train = data_train['sentiment'].tolist()

In [9]:
X_test = data_test['review'].tolist()
y_test = data_test['sentiment'].tolist()

# Remove Regular Expressions

In [10]:
# import re

# EMOJI_PATTERNS = re.compile("["
#                             u"\U0001F600-\U0001F64F"  # emoticons
#                             u"\U0001F300-\U0001F5FF"  # symbols & pictographs
#                             u"\U0001F680-\U0001F6FF"  # transport & map symbols
#                             u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
#                             "]+", flags=re.UNICODE)
# REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
# REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)|\n")

# def remove_regs(review):
#     r = EMOJI_PATTERNS.sub(" ", review)
#     r = REPLACE_NO_SPACE.sub(" ", r)
#     r = REPLACE_WITH_SPACE.sub(" ", r)
#     return r

In [11]:
# X_train = [remove_regs(review.lower()) for review in X_train]

In [12]:
# X_test = [remove_regs(review.lower()) for review in X_test]

# Vectorization

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
cv = CountVectorizer(binary=False)

In [15]:
# Fit on the train
cv.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [16]:
# Transform both train and test
X_train = cv.transform(X_train)
X_test = cv.transform(X_test)

In [17]:
len(cv.get_feature_names())

102837

# Logistic Regression

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [19]:
# Cross-validation
X_cv_train, X_cv_val, y_cv_train, y_cv_val = train_test_split(X_train, y_train, train_size = 0.75)

In [20]:
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c, max_iter=1000)
    lr.fit(X_cv_train, y_cv_train)
    print ("Validation Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_cv_val, lr.predict(X_cv_val))))

Validation Accuracy for C=0.01: 0.9470437386797698
Validation Accuracy for C=0.05: 0.9521187884439503
Validation Accuracy for C=0.25: 0.9534637662966483
Validation Accuracy for C=0.5: 0.9533023689543245
Validation Accuracy for C=1: 0.9519753241396625


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [21]:
lr_final = LogisticRegression(C=0.25, max_iter=1000)
lr_final.fit(X_train, y_train)

LogisticRegression(C=0.25, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [22]:
print ("Test Accuracy: %s"
       % accuracy_score(y_test, lr_final.predict(X_test)))

Test Accuracy: 0.9540820443846671


In [23]:
feature_to_coef = {
    word: coef for word, coef in zip(
        cv.get_feature_names(), lr.coef_[0]
    )
}

In [24]:
for best_positive in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1], 
    reverse=True)[:10]:
    print (best_positive)

('disappoint', 2.535154654146146)
('pleasantly', 2.4342050875461934)
('delish', 2.3202852530506135)
('phenomenal', 2.261689371040765)
('delectable', 2.2340676805811666)
('hesitant', 2.219149708956132)
('hesitation', 2.177212708430026)
('perfection', 2.141536367310232)
('superb', 2.1265451067771517)
('gem', 2.085689915765679)


In [25]:
for best_negative in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1])[:10]:
    print (best_negative)

('poisoning', -3.0913442047126227)
('unacceptable', -2.974055833573347)
('worst', -2.910852698497391)
('mediocre', -2.616281861563469)
('overrated', -2.5974706989209433)
('ruined', -2.579755562697012)
('forgettable', -2.5303569579858003)
('disgusting', -2.5148807282417724)
('horrible', -2.498330570661087)
('tasteless', -2.483978803396368)


# Evaluation

In [26]:
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, classification_report, accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve

In [27]:
#tn, fp
#fn, tp
y_pred = lr_final.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[16014,  2076],
       [ 1338, 54922]])

In [28]:
# Example of FP (truth is negative, prediction is positive)
fps = []
for i in range(len(y_pred)):
    if y_pred[i] == 1 and y_test[i] == 0:
        fps.append(i)

In [29]:
r = random.choice(fps)
print("{0} : {1}".format(r, data_test.loc[random.choice(fps),'review']))

8167 : Came to Moroco with my boyfriend for a last-minute brunch in Yorkville as we were staying in the area.  My boyfriend had been to Moroco a few years prior and experienced a great meal, so I was excited to try it out.  First impression- The place is very chic!  Loved the decor (especially the patio- those macaroon-inspired pillows were too cute!).  We sat on the patio and ordered Shakshuka and the Avocado Benny.  My shakshuka came out very hot, but the portion was quite small.. I also found it to be on the salty side.  I had to request extra baguette slices as it only came with 3 thin slices, which was barely enough.  Overall, Moroco was nothing special, and I would hesitate to come back.


In [30]:
lr_final.predict(X_test[30528])

array([1])

In [31]:
# Example of FN (truth is positive, prediction is negative)
fns = []
for i in range(len(y_pred)):
    if y_pred[i] == 0 and y_test[i] == 1:
        fns.append(i)

In [32]:
r = random.choice(fns)
print("{0} : {1}".format(r, data_test.loc[random.choice(fns),'review']))

8863 : This place has ruined  my view of all other Brazilian restaurants that I thought where amazing. Where do I begin the all you can eat food, I'm about to burst as I write this. All I can say for the food is I made one trip to the galleria, after that I was at the table. Why? The frequency of the carvers bringing meat to my table was ridiculously fast,How fast? as I was finishing my meat and trying to eat my side dish  I was being offered more meat. I literally never had an empty plate from the time I sat down until I finally tapped out to sit and enjoy my drink and the entertainment. The entertainment is one of the things about this restaurant that will make others seem bland. From the guitarist, to the capoeira fighters this place is an over all experience, and one I won't soon forget.


In [33]:
lr_final.predict(X_test[38565])

array([0])