In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
import random

In [2]:
!ls ../data

yelp_test.json  yelp_train.json


# Load Dataset

In [3]:
data_train = pd.read_json("../data/yelp_train.json")
data_test = pd.read_json("../data/yelp_test.json")

In [4]:
data_train.head()

Unnamed: 0,review,sentiment
0,"i've been here a couple times, once ordered Po...",0
1,I've been to Dragon Pearl Buffet on countless ...,1
2,Service is Great. Food is excellent and great...,1
3,Went to this place right after Germany's victo...,1
4,Great price point for very tasty soon tofu.\n\...,1


In [5]:
data_train.describe()

Unnamed: 0,sentiment
count,223050.0
mean,0.759695
std,0.42727
min,0.0
25%,1.0
50%,1.0
75%,1.0
max,1.0


In [6]:
data_test.head()

Unnamed: 0,review,sentiment
0,Cute idea for a board game cafe! Definitely a ...,1
1,Pressed Cubano or pork belly medianoche a must...,1
2,"Rooftop has a great vibe, music was phenomenal...",1
3,My friend and I from the area first went here ...,1
4,I had the Dahl with roti for breakfast. It was...,0


In [7]:
data_test.describe()

Unnamed: 0,sentiment
count,74350.0
mean,0.756691
std,0.429083
min,0.0
25%,1.0
50%,1.0
75%,1.0
max,1.0


In [22]:
X_train = data_train['review'].tolist()
y_train = data_train['sentiment'].tolist()

In [23]:
X_test = data_test['review'].tolist()
y_test = data_test['sentiment'].tolist()

# Remove Regular Expressions

In [24]:
# import re

# EMOJI_PATTERNS = re.compile("["
#                             u"\U0001F600-\U0001F64F"  # emoticons
#                             u"\U0001F300-\U0001F5FF"  # symbols & pictographs
#                             u"\U0001F680-\U0001F6FF"  # transport & map symbols
#                             u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
#                             "]+", flags=re.UNICODE)
# REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
# REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)|\n")

# def remove_regs(review):
#     r = EMOJI_PATTERNS.sub(" ", review)
#     r = REPLACE_NO_SPACE.sub(" ", r)
#     r = REPLACE_WITH_SPACE.sub(" ", r)
#     return r

In [25]:
# X_train = [remove_regs(review.lower()) for review in X_train]
X_train = [review.lower() for review in X_train]

In [26]:
# X_test = [remove_regs(review.lower()) for review in X_test]
X_test = [review.lower() for review in X_test]

# Vectorization

In [27]:
from sklearn.feature_extraction.text import CountVectorizer

In [28]:
cv = CountVectorizer(binary=True)

In [29]:
# Fit on the train
cv.fit(X_train)

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [30]:
# Transform both train and test
X_train = cv.transform(X_train)
X_test = cv.transform(X_test)

In [31]:
len(cv.get_feature_names())

102837

# Logistic Regression

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [33]:
# Cross-validation
X_cv_train, X_cv_val, y_cv_train, y_cv_val = train_test_split(X_train, y_train, train_size = 0.75)

In [34]:
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c, max_iter=1000)
    lr.fit(X_cv_train, y_cv_train)
    print ("Validation Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_cv_val, lr.predict(X_cv_val))))

Validation Accuracy for C=0.01: 0.9415920951168337
Validation Accuracy for C=0.05: 0.948227319190144
Validation Accuracy for C=0.25: 0.9505227480587486
Validation Accuracy for C=0.5: 0.9499488908415975
Validation Accuracy for C=1: 0.9494288327385543


In [35]:
lr_final = LogisticRegression(C=0.25, max_iter=1000)
lr_final.fit(X_train, y_train)

LogisticRegression(C=0.25, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [36]:
print ("Test Accuracy: %s"
       % accuracy_score(y_test, lr_final.predict(X_test)))

Test Accuracy: 0.952427706792199


In [37]:
feature_to_coef = {
    word: coef for word, coef in zip(
        cv.get_feature_names(), lr.coef_[0]
    )
}

In [38]:
for best_positive in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1], 
    reverse=True)[:10]:
    print (best_positive)

('disappoint', 2.369433557666123)
('incredible', 2.328761109425859)
('perfection', 2.258298161982495)
('bomb', 2.139433374472415)
('excellent', 2.0720392111498542)
('phenomenal', 2.0450670383815726)
('gem', 2.0310506736009706)
('delicious', 2.0207183342073156)
('impeccable', 1.974891230963787)
('delectable', 1.9724602841392969)


In [39]:
for best_negative in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1])[:10]:
    print (best_negative)

('poisoning', -3.568179173006003)
('unacceptable', -2.9906924157188377)
('worst', -2.9751452387375075)
('slowest', -2.902247175980214)
('horrible', -2.7327165141871546)
('mediocre', -2.6374133096699794)
('terrible', -2.5371866328563772)
('forgettable', -2.523744009074766)
('overrated', -2.520471105179542)
('subpar', -2.451364178878386)


# Evaluation

In [40]:
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, classification_report, accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve

In [41]:
#tn, fp
#fn, tp
y_pred = lr_final.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[15967,  2123],
       [ 1414, 54846]])

In [42]:
# Example of FP (truth is negative, prediction is positive)
fps = []
for i in range(len(y_pred)):
    if y_pred[i] == 1 and y_test[i] == 0:
        fps.append(i)

In [65]:
r = random.choice(fps)
print("{0} : {1}".format(r, data_test.loc[random.choice(fps),'review']))

51637 : Servers were friendly, it's great that they give samples upon entry, however this place leaves something to be desired every time I give it a chance....

Once my mom ordered a curry bowl to go, we ate it in the food court anyways just a few minutes after it was ordered, and for some reason it was cold!!! Doesn't make sense!!

Then I came recently, ordered a butter chicken bowl. The portion was huge and it would have been nice to know beforehand. I could only eat half, so I would probably suggest sharing. It was served with some sort of curried chickpeas and rice. The chickpeas weren't cooked well, I didn't enjoy them at all. Basically didn't touch them. The butter chicken was on the sweeter side. It had good taste but it really wasn't THAT great. I would eat there again, it just wasn't the Indian food I know and love. 

Got a side of garlic naan. This was possibly the only really enjoyable part of the meal. It was actually amazing, but really who could go wrong with naan. 

But

In [60]:
lr_final.predict(X_test[30528])

array([1])

In [44]:
# Example of FN (truth is positive, prediction is negative)
fns = []
for i in range(len(y_pred)):
    if y_pred[i] == 0 and y_test[i] == 1:
        fns.append(i)

In [62]:
r = random.choice(fns)
print("{0} : {1}".format(r, data_test.loc[random.choice(fns),'review']))

38565 : One of my favorite places to eat! I always go for the pad thai, the fritters, tom yum soup and chicken wings. The chicken wings THE BEST I've ever had. One time though, we ordered pad thai and it came out very bland. It was as if there wasn't any sauce on it. I complained and they gave me another one for free(which was good this time) but told me that no one else complained about their Pad Thai that night. Not sure what the waitress was getting at but i think it was unnecessary to say. You could clearly see the first Pad thai we got was dry and pale compared to the second one we got.


In [63]:
lr_final.predict(X_test[38565])

array([0])