In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
import random

In [2]:
# List the files available in the data directory (IPython shell escape).
!ls ../data

yelp_test.json  yelp_train.json


# Load Dataset

In [3]:
# Read the Yelp review splits from the JSON files under ../data into DataFrames.
train_path = "../data/yelp_train.json"
test_path = "../data/yelp_test.json"
data_train = pd.read_json(train_path)
data_test = pd.read_json(test_path)

In [4]:
# Preview the first rows of the training set (review text and sentiment label).
data_train.head()

Unnamed: 0,review,sentiment
0,"i've been here a couple times, once ordered Po...",0
1,I've been to Dragon Pearl Buffet on countless ...,1
2,Service is Great. Food is excellent and great...,1
3,Went to this place right after Germany's victo...,1
4,Great price point for very tasty soon tofu.\n\...,1


In [5]:
# Summary statistics of the training labels; the mean of `sentiment` shown
# below (~0.76) indicates the classes are imbalanced towards label 1.
data_train.describe()

Unnamed: 0,sentiment
count,223050.0
mean,0.759695
std,0.42727
min,0.0
25%,1.0
50%,1.0
75%,1.0
max,1.0


In [6]:
# Preview the first rows of the test set (same columns as the training set).
data_test.head()

Unnamed: 0,review,sentiment
0,Cute idea for a board game cafe! Definitely a ...,1
1,Pressed Cubano or pork belly medianoche a must...,1
2,"Rooftop has a great vibe, music was phenomenal...",1
3,My friend and I from the area first went here ...,1
4,I had the Dahl with roti for breakfast. It was...,0


In [7]:
# Summary statistics of the test labels, for comparison with the training set.
data_test.describe()

Unnamed: 0,sentiment
count,74350.0
mean,0.756691
std,0.429083
min,0.0
25%,1.0
50%,1.0
75%,1.0
max,1.0


In [8]:
# Pull the raw review texts (inputs) and sentiment labels (targets) out of
# the training frame as plain Python lists.
train_reviews = data_train['review']
train_labels = data_train['sentiment']
X_train = train_reviews.tolist()
y_train = train_labels.tolist()

In [9]:
# Same extraction for the test frame: review texts and sentiment labels as lists.
test_reviews = data_test['review']
test_labels = data_test['sentiment']
X_test = test_reviews.tolist()
y_test = test_labels.tolist()

# Text Cleaning with Regular Expressions (currently disabled)

In [10]:
# import re

# EMOJI_PATTERNS = re.compile("["
#                             u"\U0001F600-\U0001F64F"  # emoticons
#                             u"\U0001F300-\U0001F5FF"  # symbols & pictographs
#                             u"\U0001F680-\U0001F6FF"  # transport & map symbols
#                             u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
#                             "]+", flags=re.UNICODE)
# REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
# REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)|\n")

# def remove_regs(review):
#     r = EMOJI_PATTERNS.sub(" ", review)
#     r = REPLACE_NO_SPACE.sub(" ", r)
#     r = REPLACE_WITH_SPACE.sub(" ", r)
#     return r

In [11]:
# X_train = [remove_regs(review.lower()) for review in X_train]

In [12]:
# X_test = [remove_regs(review.lower()) for review in X_test]

# Vectorization

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
# Bag-of-words vectorizer; binary=False keeps per-review term counts rather
# than 0/1 presence flags.
cv = CountVectorizer(binary=False)

In [15]:
# Learn the vocabulary from the training reviews only, so the test reviews
# do not influence which tokens become features.
cv.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [16]:
# Encode both splits with the vocabulary fitted on the training reviews.
# NOTE: this rebinds X_train / X_test from lists of raw strings to the
# vectorizer's output, so the cell is not idempotent -- re-run the cells that
# build the lists before re-running this one (or the fit cell).
X_train, X_test = cv.transform(X_train), cv.transform(X_test)

In [17]:
# Vocabulary size (= number of feature columns). Reading the fitted
# `vocabulary_` mapping avoids `get_feature_names()`, which was deprecated in
# scikit-learn 1.0 and removed in 1.2, so this works on every version.
len(cv.vocabulary_)

102837

# Logistic Regression

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [19]:
# Hold-out validation split (a single 75/25 split, not k-fold cross-validation).
# random_state pins the shuffle so the validation accuracies are reproducible
# across kernel restarts; stratify keeps the class ratio (about 76% positive
# labels) the same in the training and validation parts.
X_cv_train, X_cv_val, y_cv_train, y_cv_val = train_test_split(
    X_train, y_train, train_size=0.75, random_state=42, stratify=y_train
)

In [None]:
# Sweep the inverse regularisation strength C and report the accuracy of each
# fitted model on the held-out validation part.
for c in (0.01, 0.05, 0.25, 0.5, 1):
    lr = LogisticRegression(C=c, max_iter=1000).fit(X_cv_train, y_cv_train)
    val_accuracy = accuracy_score(y_cv_val, lr.predict(X_cv_val))
    print("Validation Accuracy for C=%s: %s" % (c, val_accuracy))

In [None]:
# Final model: refit on the complete training set with the C value chosen
# from the validation sweep.
final_c = 0.25
lr_final = LogisticRegression(C=final_c, max_iter=1000)
lr_final.fit(X_train, y_train)

In [None]:
# Score the final model on the untouched test set.
test_accuracy = accuracy_score(y_test, lr_final.predict(X_test))
print("Test Accuracy: %s" % test_accuracy)

In [None]:
# Map every vocabulary token to its learned weight in the final model.
# The weights must come from `lr_final` (trained on the full training set and
# evaluated on the test set), not from `lr`, which is merely whichever model
# the C-sweep loop fitted last.
# Token names are rebuilt from `cv.vocabulary_` (token -> column index) and
# ordered by column index so they line up with coef_[0]; unlike
# `get_feature_names()` (removed in scikit-learn 1.2) this works on every version.
feature_names = sorted(cv.vocabulary_, key=cv.vocabulary_.get)
feature_to_coef = dict(zip(feature_names, lr_final.coef_[0]))

In [None]:
# Ten tokens with the largest coefficients, i.e. the strongest indicators of
# the positive class (label 1).
coefs_descending = sorted(
    feature_to_coef.items(), key=lambda item: item[1], reverse=True
)
for best_positive in coefs_descending[:10]:
    print(best_positive)

In [None]:
# Ten tokens with the smallest (most negative) coefficients, i.e. the
# strongest indicators of the negative class (label 0).
coefs_ascending = sorted(feature_to_coef.items(), key=lambda item: item[1])
for best_negative in coefs_ascending[:10]:
    print(best_negative)

# Evaluation

In [None]:
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, classification_report, accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve

In [None]:
# Predict labels for the whole test set and tabulate them against the truth.
# Layout of the displayed 2x2 matrix (rows = true label, columns = predicted):
#   [[tn, fp],
#    [fn, tp]]
y_pred = lr_final.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# False positives: positions where the truth is negative (0) but the model
# predicted positive (1).
fps = [
    pos
    for pos, (predicted, actual) in enumerate(zip(y_pred, y_test))
    if predicted == 1 and actual == 0
]

In [None]:
# Pick one false positive at random and show its position together with the
# matching review text. The same draw `r` is used for both the printed index
# and the text (a second random.choice would pair the index with an unrelated
# review), and the lookup is positional (.iloc) because `fps` holds positions
# into y_test / y_pred rather than DataFrame index labels.
r = random.choice(fps)
print("{0} : {1}".format(r, data_test['review'].iloc[r]))

In [None]:
# Re-check the model's prediction for the false positive drawn in the previous
# cell (expected: array([1])). Indexing with `r` keeps this in sync with the
# random draw instead of relying on a hard-coded row number from an old run.
lr_final.predict(X_test[r])

In [None]:
# False negatives: positions where the truth is positive (1) but the model
# predicted negative (0).
fns = [
    pos
    for pos, (predicted, actual) in enumerate(zip(y_pred, y_test))
    if predicted == 0 and actual == 1
]

In [None]:
# Pick one false negative at random and show its position together with the
# matching review text. The same draw `r` is used for both the printed index
# and the text (a second random.choice would pair the index with an unrelated
# review), and the lookup is positional (.iloc) because `fns` holds positions
# into y_test / y_pred rather than DataFrame index labels.
r = random.choice(fns)
print("{0} : {1}".format(r, data_test['review'].iloc[r]))

In [None]:
# Re-check the model's prediction for the false negative drawn in the previous
# cell (expected: array([0])). Indexing with `r` keeps this in sync with the
# random draw instead of relying on a hard-coded row number from an old run.
lr_final.predict(X_test[r])