In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

In [7]:
# Load the first 1.5M records of the line-delimited JSON review dump.
df = pd.read_json(
    "review.json",
    lines=True,
    nrows=1_500_000,
)

In [3]:
# Count reviews per business, then rank businesses from most- to least-reviewed.
review_count = df.groupby("business_id").size()
review_count_sorted = review_count.sort_values(ascending=False)
review_count_sorted

business_id
GBTPC53ZrG1ZBY3DT8Mbcw    4661
DcBLYSvOuWcNReolRVr12A    3217
j-qtdD55OLfSqfsWuQTDJg    2884
PY9GRfzr4nTZeINf346QOw    2715
xlMQBBt9wrtahdqiRDcVSg    2618
                          ... 
wEVpM-VjORyYOxatJfB6Ig       1
560cSCfRSGkDPbwZeYnmkA       1
55VhtmQc85cAePrmarn_1g       1
Kcehah5eUGfqJRCg200wvg       1
IuasKM5Wjgj7KycJ1CpUqw       1
Length: 39503, dtype: int64

In [8]:
# Restrict to a single business, discard 3-star reviews, and collapse the
# remaining star ratings into two string labels:
#   1-2 stars -> 'bad', 4-5 stars -> 'good'.
keep_rows = (df["business_id"] == "GBTPC53ZrG1ZBY3DT8Mbcw") & (df["stars"] != 3)
df = df.loc[keep_rows].assign(
    stars=lambda frame: frame["stars"].replace(
        {1: "bad", 2: "bad", 4: "good", 5: "good"}
    )
)
df

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
108,yyrMqY7sNp5gT7KJ1AaYWA,pitYOVSsF8R1gWG1G0qxsA,GBTPC53ZrG1ZBY3DT8Mbcw,good,0,0,0,We have been here twice for brunch and have en...,2016-03-07 17:25:30
263,wEfzqOfbwn4Ohe2ZDOLAzw,VMtyZjaEJB9nfmjr4xdVlw,GBTPC53ZrG1ZBY3DT8Mbcw,good,1,1,0,First meal in New Orleans. I had the $15 lunch...,2012-11-06 22:28:18
618,XT_3UpEhO5eJIFxevnv_Yw,0WqEkKMu03irkMiEtsFxZg,GBTPC53ZrG1ZBY3DT8Mbcw,good,0,0,0,Service is excellent. Atmosphere is great. The...,2015-02-02 07:25:28
865,YpHIixkA2K96wFLbI6FHkQ,tt2qIFKZumubxk_UwXhijA,GBTPC53ZrG1ZBY3DT8Mbcw,good,0,0,0,This was our second choice after finding out R...,2009-03-01 00:24:17
938,gnEmBGyBs7DOtIXGmJQdKQ,XuadFePn8P6l5epbEFnfKw,GBTPC53ZrG1ZBY3DT8Mbcw,good,0,0,0,Great oyster happy hours that go on from 5-7pm...,2015-08-19 18:35:37
...,...,...,...,...,...,...,...,...,...
714770,JmxApNH1H_8DoZdUfU0ocQ,QIkfj7Kkv0Ingg61rW-80g,GBTPC53ZrG1ZBY3DT8Mbcw,good,1,0,0,Ate and licked everything Off the plate. This ...,2018-09-23 21:55:20
715199,F3v8J9gEE22hjTVEU4e63g,J4yF68T9LEUWRvI1v1OQFw,GBTPC53ZrG1ZBY3DT8Mbcw,good,1,0,1,"This is one of John Besh's more ""casual"" resta...",2017-02-14 21:40:45
715479,ZK8JNks8Pf5BxaYr8AWvag,zh9vPYEETbOxivf1Ed4p-Q,GBTPC53ZrG1ZBY3DT8Mbcw,good,6,4,5,Happy hour every day!!!!! Not that I care abou...,2019-03-03 15:08:41
716014,NQSgGa3T0tNA_Gu9A8Vi7g,NmLBU6ow_tAXJbj_x0sa7A,GBTPC53ZrG1ZBY3DT8Mbcw,good,3,1,1,This review was solely based on our experience...,2018-09-05 17:42:22


In [9]:
# Model inputs: raw review text as features, the 'bad'/'good' column as target.
reviews, labels = df["text"], df["stars"]

In [10]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(reviews)

In [11]:
# Hold out 20% of the reviews for evaluation. The split is seeded (same seed
# as the classifier) so the reported accuracy is reproducible on a fresh
# Restart & Run All instead of changing with every execution.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=0.2,
    random_state=42,
)

In [12]:
# L2-regularised logistic regression solved with liblinear in its dual
# formulation; seeded so the fit is repeatable.
clf = LogisticRegression(
    penalty="l2",
    dual=True,
    solver="liblinear",
    intercept_scaling=1,
    max_iter=50000,
    random_state=42,
)

# Train on the training split, then score the held-out split.
y_pred = clf.fit(X_train, y_train).predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)

Accuracy: 0.9470517448856799


In [13]:
# Vocabulary terms, aligned with the columns of the count matrix.
feature_names = vectorizer.get_feature_names_out()

# One weight per vocabulary term (single row for this two-class model).
coefs = clf.coef_[0]
# Ascending order: the most negative weights come first. With labels
# 'bad'/'good' these should be the terms pushing hardest towards 'bad'
# (assumes scikit-learn's sorted class order -- confirm via clf.classes_).
sorted_idx = np.argsort(coefs)
for term in feature_names[sorted_idx[:10000]]:
    print(term)


disappointed
mediocre
bland
told
poor
cold
salty
looked
overrated
average
not
ok
rude
maybe
disappointing
wasn
money
overpriced
asked
worst
no
many
tasted
corner
appetizers
expensive
sick
instead
sadly
horrible
better
much
sliders
disappointment
places
part
pearl
ordered
dry
overcooked
awful
biscuit
tough
butter
don
19
after
ham
wanted
does
pass
customers
service
empty
time
why
shell
nothing
checked
won
worse
forgettable
managers
fries
spend
dollar
staying
pay
uncomfortable
over
bring
okay
greasy
literally
left
boyfriend
okra
hostess
limited
terrible
attention
felt
cook
sad
subpar
wrong
linh
call
tasteless
amount
under
never
upstairs
covid
planet
guess
seat
should
been
skip
trenasse
note
failed
cake
seems
use
long
didn
let
burnt
about
way
theirs
bummer
flavorless
old
11
ribeye
besh
mean
going
management
alcoholic
decided
attitude
stand
worth
whole
tables
shells
said
dish
scone
roasted
saving
arrived
eating
expectations
served
confused
bad
order
location
used
spice
ignored
fine
underwhe