In [3]:
import pandas as pd
import numpy as np
import re
import json
import string
import spacy

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English


In [4]:
# Shared text-processing resources used by the normalizer below.
# ASCII punctuation characters, kept as one string.
punctuations = string.punctuation
# spaCy's built-in English stop-word collection.
stop_words = spacy.lang.en.stop_words.STOP_WORDS
# Bare English language object (no trained model is loaded here).
parser = English()
# Model load goes last so the lightweight names above exist even if it fails.
# NOTE(review): the 'en' shortcut only resolves on spaCy 2.x installs, and
# `nlp` is not referenced in the visible cells -- confirm it is still needed.
nlp = spacy.load('en')

In [5]:

def spacy_normalizer(sent):
    """Normalize raw text into a space-joined string of content tokens.

    The text is tokenized with the module-level ``parser``. Each token is
    replaced by its lower-cased, stripped lemma; tokens whose lemma is the
    ``-PRON-`` placeholder keep their lower-cased surface form instead.
    Empty tokens, stop words (``stop_words``) and tokens made up solely of
    punctuation characters (``punctuations``) are then dropped.

    Args:
        sent: Raw text to normalize.

    Returns:
        str: Surviving tokens joined by single spaces, or ``""`` when no
        token survives.
    """
    normalized = [
        tok.lower_ if tok.lemma_ == "-PRON-" else tok.lemma_.lower().strip()
        for tok in parser(sent)
    ]

    # The previous `word not in punctuations` check was a substring test
    # against one long string, so multi-character runs such as "--" or "..."
    # were kept. Testing every character removes them. Empty strings (left by
    # whitespace-only tokens after strip()) are still dropped explicitly.
    kept = [
        tok
        for tok in normalized
        if tok
        and tok not in stop_words
        and not all(ch in punctuations for ch in tok)
    ]
    return ' '.join(kept)


def clean_text(text):
    """Return ``text`` without surrounding whitespace, converted to lower case."""
    trimmed = text.strip()
    return trimmed.lower()


In [6]:
# Smoke-test the normalizer on a single review; the literal is split across
# lines for readability only (adjacent literals concatenate to the same text).
spacy_normalizer(
    'Total bill for this horrible service? Over $8Gs. '
    'These crooks actually had the nerve to charge us $69 for 3 pills. '
    'I checked online the pills can be had for 19 cents EACH! '
    'Avoid Hospital ERs at all costs.'
)

'total bill horrible service 8gs crooks actually nerve charge 69 3 pills checked online pills 19 cents avoid hospital ers costs'

In [7]:
# Read the file as UTF-8 text (undecodable bytes are dropped by
# errors='ignore'), then parse it; strict=False lets the JSON decoder accept
# control characters inside strings.
with open("_data/reviews.json", encoding='utf-8', errors='ignore') as json_data:
    data = json.loads(json_data.read(), strict=False)

In [16]:
#data

In [17]:
# Build one row per entry of data['reviews']; the list() call keeps the
# shallow copy the original comprehension made.
reviews = pd.DataFrame(list(data['reviews']))

In [18]:
#reviews.head()

# Print the column names, non-null counts, dtypes and memory usage of `reviews`.
reviews.info()

#reviews.stars.value_counts()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 659554 entries, 0 to 659553
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   review_id    659554 non-null  object 
 1   user_id      659554 non-null  object 
 2   business_id  659554 non-null  object 
 3   stars        659554 non-null  float64
 4   useful       659554 non-null  int64  
 5   funny        659554 non-null  int64  
 6   cool         659554 non-null  int64  
 7   text         659554 non-null  object 
 8   date         659554 non-null  object 
dtypes: float64(1), int64(3), object(5)
memory usage: 45.3+ MB


In [19]:
# Keep only the rows whose star rating differs from 3.
reviews = reviews.loc[reviews['stars'].ne(3)]

In [20]:
# Binary label: 1 when stars > 3, otherwise 0.
# `reviews` is a filtered slice at this point, so writing a column into it
# with `reviews[...] = ...` raises SettingWithCopyWarning; `.assign` returns
# a new frame with the extra column instead.
reviews = reviews.assign(sentiment=np.where(reviews.stars > 3, 1, 0))

In [21]:
# Narrow the frame to the review text and its label.
reviews = reviews.loc[:, ['text', 'sentiment']]

In [22]:
# Display the labelled frame (pandas truncates the rendering of long frames).
reviews

Unnamed: 0,text,sentiment
0,Total bill for this horrible service? Over $8G...,0
1,I *adore* Travis at the Hard Rock's new Kelly ...,1
2,I have to say that this office really has it t...,1
3,Went in for a lunch. Steak sandwich was delici...,1
4,Today was my second out of three sessions I ha...,0
...,...,...
659548,Nothing bad about the food. Quick service. Pri...,1
659549,Finally a place to eat something other than fr...,1
659550,"I'm not close to Jamaican, nor am i an expert ...",1
659552,30 July 2013\n\nGreat customer service by the ...,1


In [31]:
# Take the first 10000 rows (by position) of each sentiment class.
reviews_pos = reviews.loc[reviews['sentiment'] == 1].iloc[:10000]
reviews_neg = reviews.loc[reviews['sentiment'] == 0].iloc[:10000]

In [32]:
# Display the subset of reviews whose sentiment label is 1.
reviews_pos

Unnamed: 0,text,sentiment
1,I *adore* Travis at the Hard Rock's new Kelly ...,1
2,I have to say that this office really has it t...,1
3,Went in for a lunch. Steak sandwich was delici...,1
5,I'll be the first to admit that I was not exci...,1
10,"Like walking back in time, every Saturday morn...",1
...,...,...
15037,Jun's Korean is a great place to come for lunc...,1
15038,Two of my friends bought their Hyundais from C...,1
15040,B with A Twist catered my daughters wedding th...,1
15043,Las Vegas is riddled with restaurants that loo...,1


In [33]:
# Add a `text_norm` column holding the normalized review text.
# Both frames are slices of `reviews`, so in-place column assignment raises
# SettingWithCopyWarning; `.assign` produces new frames and keeps this cell
# safe to re-run.
reviews_pos = reviews_pos.assign(text_norm=reviews_pos['text'].apply(spacy_normalizer))
reviews_neg = reviews_neg.assign(text_norm=reviews_neg['text'].apply(spacy_normalizer))

In [38]:
# Collapse each class into one long space-separated document.
pos = " ".join(reviews_pos['text_norm'].tolist())
neg = " ".join(reviews_neg['text_norm'].tolist())

['total bill horrible service 8gs crooks actually nerve charge 69 3 pills checked online pills 19 cents avoid hospital ers costs',
 'today second sessions paid session went tell meredith particular enjoyment male clients female returned teeth fine pleased results went today whitening room gentlemen appointment started person service industry attend female clientele couple arrives unbothered signs waited turn checked original 30 minute timer ask ok attended boyfriend numerous occasions men exit room asking looking irritation half way woman showed explaining deals lobby admits timers reset half way process reset boyfriends left rest gentleman furthest away time come redeem deal set gave timer left point time 10 minutes reset 5 minutes ago according sat patiently time major pain gums watched time lamp shut reset explained deals guest checked time light turned released stance mouth relaxed state assuming getting thirty minute session instead usual 45 come point teeth formula burning gum ne

In [33]:
# NOTE(review): `text_pos` is not assigned in any visible cell, and this
# cell's execution count (33) duplicates an earlier cell -- it presumably
# relies on leftover kernel state; TODO confirm, then define it or drop the cell.
text_pos

['adore travis hard rock new kelly cardenas salon fan great blowout stranger chains offer service travis taken flawless blowout new level travis greets perfectly green swoosh perfectly styled black hair vegas worthy rockstar outfit comes relaxing incredible shampoo -- head message cure worst migraine minutes --- scented shampoo room travis freakishly strong fingers good way use perfect pressure superb starts glorious blowout ... people involved best round brush action hair seen team stylists clearly gets extremely evident way talk help genuine corporate requirement fun travis started flat iron way flipped wrist volume making look like texas pagent girl admirable worth noting fry hair -- happen skilled stylists end blowout style hair perfectly bouncey looked terrific thing better awesome blowout lasted days travis single time vegas feel beauuuutiful',
 'office organized friendly dr. j. phillipp great dentist friendly professional dental assistants helped procedure amazing jewel bailey h

In [39]:
# Fit TF-IDF on exactly two documents: row 0 is `pos`, row 1 is `neg`.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([pos, neg])

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# when present and fall back for older installs. list() keeps the same
# container type as before.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# `dense` / `denselist` are kept under their original names in case other
# cells read them.
dense = vectors.todense()
denselist = dense.tolist()
# One row per document, one column per vocabulary term.
df = pd.DataFrame(denselist, columns=feature_names)


In [42]:
# Display the TF-IDF frame: one row per fitted document, one column per term.
df

Unnamed: 0,00,000,0000000,00a,00am,00cad,00pm,01,0135,014,...,我们刚在这里吃午饭,我把它展示给服务员,日本人にとっても快適,気が利きます,牛肉のタルタル,盛り付けも丁寧で美味しい,真好,聞いたことには丁寧に答えて下さいます,这意味着他们的厨房里有很多蟑螂,这里很好吃
0,0.005947,0.001127,0.0,0.0,0.000563,0.0,0.001127,0.000125,0.0,8.8e-05,...,0.0,0.0,8.8e-05,8.8e-05,8.8e-05,8.8e-05,8.8e-05,8.8e-05,0.0,8.8e-05
1,0.021205,0.003772,7.2e-05,7.2e-05,0.000561,0.000143,0.000816,0.000408,7.2e-05,0.0,...,7.2e-05,7.2e-05,0.0,0.0,0.0,0.0,0.0,0.0,7.2e-05,0.0


In [41]:
# Flip to one row per term so terms can be sorted by a document's score.
dft = df.T

In [47]:
# Sort terms by their score in column 0, skip the ten highest-scoring terms,
# and keep the next 50 as a two-column (word, tfidf) table.
tf_idf_pos = (
    dft.sort_values(by=0, ascending=False)
    .iloc[10:]
    .reset_index()
    .loc[:, ['index', 0]]
    .head(50)
    .rename(columns={'index': 'word', 0: 'tfidf'})
)

In [48]:
# Sort terms by their score in column 1, skip the ten highest-scoring terms,
# and keep the next 50 as a two-column (word, tfidf) table.
tf_idf_neg = (
    dft.sort_values(by=1, ascending=False)
    .iloc[10:]
    .reset_index()
    .loc[:, ['index', 1]]
    .head(50)
    .rename(columns={'index': 'word', 1: 'tfidf'})
)

In [49]:
# Display the (word, tfidf) table built from column 0 of `dft`.
tf_idf_pos

Unnamed: 0,word,tfidf
0,amazing,0.114298
1,friendly,0.111231
2,definitely,0.10685
3,got,0.106224
4,staff,0.10566
5,delicious,0.103094
6,try,0.091076
7,chicken,0.090137
8,little,0.087257
9,recommend,0.081937


In [50]:
# Display the (word, tfidf) table built from column 1 of `dft`.
tf_idf_neg

Unnamed: 0,word,tfidf
0,asked,0.118359
1,came,0.115708
2,went,0.114944
3,ordered,0.108725
4,minutes,0.10689
5,people,0.106737
6,know,0.094809
7,going,0.092923
8,customer,0.089763
9,come,0.087622


NameError: name 'pipe' is not defined

AttributeError: Can't get attribute 'predictors' on <module '__main__'>