In [18]:
import scrape
import json
import pandas as pd
import re
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Raw string literals keep the regex escapes (\s, \[, \-, \/) out of Python's
# own string-escape processing, which otherwise warns about invalid escapes.

# Punctuation characters that are deleted outright.
REPLACE_NO_SPACE = re.compile(r"[.;:!'?,\"()\[\]]")
# Doubled HTML line breaks, hyphens and slashes; each match becomes one space.
REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")

def preprocess_reviews(reviews):
    """Normalize review texts before vectorization.

    Each review is lower-cased, has every character matched by
    ``REPLACE_NO_SPACE`` removed, and then has every match of
    ``REPLACE_WITH_SPACE`` replaced by a single space.

    Parameters
    ----------
    reviews : iterable of str
        Raw review texts.

    Returns
    -------
    list of str
        The cleaned texts, in the same order as the input.
    """
    return [
        # Punctuation is stripped first, then separators are turned into spaces.
        REPLACE_WITH_SPACE.sub(" ", REPLACE_NO_SPACE.sub("", line.lower()))
        for line in reviews
    ]

In [3]:
# Load the scraped bar data. The encoding is given explicitly so decoding does
# not depend on the platform's default locale encoding.
with open('scrapes/bars_test.json', 'r', encoding='utf-8') as fp:
    bars_test = json.load(fp)

In [4]:
# Flatten the scrape into one list of review records: for every entry, take
# the value under its first key and collect each item of its 'reviews' list.
reviews_test = [
    review
    for bar in bars_test
    for review in bar[list(bar)[0]]['reviews']
]
# One row per review record.
reviews_df = pd.DataFrame(reviews_test)

In [43]:
# The 'content' column is the model input.
reviews = reviews_df['content']

In [44]:
# Binarize the rating: values up to and including 3 become 0, values above 3
# become 1. Rows matching neither mask are left unassigned.
low_rating = reviews_df['rating'].le(3)
high_rating = reviews_df['rating'].gt(3)
reviews_df.loc[low_rating, 'binary_ratings'] = 0
reviews_df.loc[high_rating, 'binary_ratings'] = 1
ratings = reviews_df['binary_ratings']

In [45]:
# Seeded 80/20 split of the review texts and their binary labels.
review_train, review_test, rating_train, rating_test = train_test_split(
    reviews,
    ratings,
    test_size=0.2,
    random_state=7,
)

In [46]:
# Apply the same text cleaning to both splits.
reviews_train_clean, reviews_test_clean = (
    preprocess_reviews(split) for split in (review_train, review_test)
)
# Labels used to fit the models on the training split.
target = rating_train

In [47]:
# Binary bag-of-words features. The vocabulary is fitted on the training
# reviews only and then reused to encode the test reviews.
cv = CountVectorizer(binary=True).fit(reviews_train_clean)
X, X_test = (
    cv.transform(docs) for docs in (reviews_train_clean, reviews_test_clean)
)

In [48]:
# Hold out 25% of the training matrix for validation. The split is seeded so
# the accuracies printed by this cell are reproducible after a kernel restart.
X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75, random_state = 7
)

# Fit one logistic regression per candidate C (scikit-learn's inverse
# regularization strength) and report its accuracy on the validation part.
for c in [0.01, 0.05, 0.25, 0.5, 1, 100]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))

Accuracy for C=0.01: 1.0
Accuracy for C=0.05: 1.0
Accuracy for C=0.25: 1.0
Accuracy for C=0.5: 1.0
Accuracy for C=1: 1.0
Accuracy for C=100: 1.0




In [49]:
# Refit on the full training matrix with C=100, then score on the test split.
final_model = LogisticRegression(C=100).fit(X, target)
final_accuracy = accuracy_score(rating_test, final_model.predict(X_test))
print ("Final Accuracy: %s" % final_accuracy)

Final Accuracy: 1.0


In [50]:
# Map every vocabulary term to its coefficient in the final model.
# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); prefer the new accessor and fall back to the old
# one so the cell also runs on older installations.
_feature_names_getter = getattr(cv, "get_feature_names_out", None)
if _feature_names_getter is None:
    _feature_names_getter = cv.get_feature_names
feature_to_coef = dict(zip(_feature_names_getter(), final_model.coef_[0]))

In [56]:
# Print the 20 terms with the largest coefficients, highest first.
terms_by_coef_desc = sorted(
    feature_to_coef.items(),
    key=lambda term_coef: term_coef[1],
    reverse=True,
)
for best_positive in terms_by_coef_desc[:20]:
    print (best_positive)

('great', 2.0166229650593537)
('and', 1.4050389044138623)
('spot', 1.3862083723354706)
('service', 1.3687528391743515)
('in', 1.248672889372918)
('cocktails', 1.2352323326610861)
('amazing', 1.169556513907185)
('best', 0.9356000775308336)
('friendly', 0.9257779273952741)
('awesome', 0.8985736336421548)
('drinks', 0.8974541652234439)
('is', 0.8506242117084476)
('classic', 0.8245582900710686)
('from', 0.8189651944151123)
('with', 0.7857152595035227)
('fantastic', 0.7332341127504686)
('bar', 0.7029835659496853)
('we', 0.7015342867456736)
('place', 0.6747743437801043)
('cocktail', 0.6149616956124313)


In [57]:
# Print the 20 terms with the smallest coefficients, lowest first.
terms_by_coef_asc = sorted(
    feature_to_coef.items(),
    key=lambda term_coef: term_coef[1],
)
for best_negative in terms_by_coef_asc[:20]:
    print (best_negative)

('that', -1.5698427328502966)
('werent', -1.3781952298369162)
('if', -1.217980155067944)
('overpriced', -1.101330218131531)
('slow', -1.030044797601466)
('way', -0.9701351658782865)
('are', -0.9563440125290876)
('up', -0.9429240074257292)
('call', -0.9035645735307478)
('inattentive', -0.9035645735307478)
('joke', -0.9035645735307478)
('paaaaaasss', -0.9035645735307478)
('front', -0.8882227127110365)
('bad', -0.8826179269170903)
('how', -0.8783864108030562)
('tell', -0.8607427382142208)
('hit', -0.8566569925935117)
('girlfriend', -0.8411990924805482)
('trying', -0.8411990924805482)
('used', -0.8241884005902941)
