In [41]:
import spacy
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from textblob import TextBlob
import matplotlib as mpl
from _get_adj_phrases import get_adjective_phrases, get_list_of_phrases
from collections import Counter, defaultdict
import math

In [42]:
nlp = spacy.load("en_core_web_sm")

In [43]:
reviews = pd.read_json('../data/reviewSelected100.json', encoding='ISO-8859-1', lines=True)

A sample of what we get from real life data

In [44]:
# get reviews for a random business
random_business = reviews.sample(random_state=42)
random_business_id = random_business.iloc[0]['business_id']
small_business_dataset = reviews.loc[reviews['business_id'] == random_business_id]
small_business_dataset.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
11200,wysFaMq5S88mF6HLxdh3Vw,4nQ7MOkbc5u-DYUOgjiDsg,yEZn1XpLsEC9uBa-X4xAZw,4,1,0,0,Cute little vietnamese sandwich place on St De...,2016-03-17 14:29:44
11204,z81YK3CXQJb48BU7ZAcf7A,Ht8iGitRu8kynEubcIhsTQ,yEZn1XpLsEC9uBa-X4xAZw,4,7,2,8,"Hey Hey, it's the SAMMIES!\n\nSuper Yummy here...",2011-11-30 03:26:01
11209,tqajcnZA97HlXkIYrdNVlg,-a0XgJCXoJln2Ue_JMnfIQ,yEZn1XpLsEC9uBa-X4xAZw,3,0,0,0,Sandwich was good but service was terrible. We...,2016-07-14 18:52:34
11212,wXz1c5kz0QB2nvZVTnnkIg,F9ivL7-mzKbfnLO1rKUFFQ,yEZn1XpLsEC9uBa-X4xAZw,5,0,0,0,"Super fresh, delicious and cheap Banh Mi in th...",2017-07-28 17:48:01
11258,89J69mHgTTF25Ln97ONTrQ,La1ggB37gvq6UWpEWaMtMA,yEZn1XpLsEC9uBa-X4xAZw,4,2,0,2,Que ce soit pour leurs délicieux bubbles tea/s...,2012-12-04 21:25:13


In [45]:
small_business_dataset_reviews = list(small_business_dataset['text'])
# lowercase and correct the text
# this probably works but it takes too long to run so just run this when we want to submit it
# small_business_dataset_reviews = [TextBlob(text).correct() for text in small_business_dataset_reviews]
small_business_dataset_reviews = [sent_tokenize(text) for text in small_business_dataset_reviews]
small_business_dataset_reviews = [item for sublist in small_business_dataset_reviews for item in sublist]

In [46]:
small_business_phrases = [get_adjective_phrases(text.lower()) for text in small_business_dataset_reviews]
small_business_phrases = [array for array in small_business_phrases if len(array) > 0]
small_business_phrases = [text for sublist in small_business_phrases for text in sublist]
small_business_phrases

also
usually
up
duplicate adverb in subtree: usually
too
also
here
before
duplicate adverb in subtree: here
actually
also
back
since
maybe
still
why
rarely
also
also
also
also
actually
though
duplicate adverb in subtree: actually
so
overall
already
also
also
here
duplicate adverb in subtree: also
definitely
however
also
so
also
upstairs
still
too
duplicate adverb in subtree: still
so
always
duplicate adverb in subtree: so
also
only
duplicate adjective in subtree: ok
possibly
here
undoubtedly
also
most
then
ever
also
here
duplicate adverb in subtree: also
here
also
also
just
duplicate adjective in subtree: sweet
so
actually
even
really
probably
sometimes
duplicate adverb in subtree: probably
better
though
duplicate adverb in subtree: better
also
together
even
also
perhaps
definitely
also
probably
also


['really big',
 'very reasonable',
 'really good',
 'very clear',
 'so good',
 'so fresh',
 'so good',
 'so cheap',
 'so good',
 'quite hungry',
 'very little',
 'too way sweet',
 'just good',
 'how delicious',
 'so addictive',
 'really disappointing',
 'very sad',
 'tremendously tasty',
 'so good',
 'pretty cheap',
 'quite cheap',
 'too lazy',
 'very healthy',
 'as good',
 'how friendly',
 'as good',
 'so tasty',
 'pretty small',
 'really excited',
 'so terrible',
 'very reasonable',
 'quite tasty',
 'very tender',
 'too many',
 'cloyingly sweet',
 'completely foreign',
 'just hungry',
 'less strong',
 'too salty',
 'so delicious',
 'so good',
 'most familiar',
 'very unusual',
 'definitely best',
 'very enjoyable',
 'very good',
 'very good',
 'more flavorful',
 'too bad',
 'just average',
 'just average',
 'so affordable',
 'so good',
 'so worth',
 'very good',
 'very welcoming',
 'very too clean',
 'very good',
 'very smart',
 'very friendly',
 'very small',
 'most legit',
 'beauti

In [39]:
phrase_counts = Counter(small_business_phrases)
freq_phrases = phrase_counts.most_common(10)
freq_phrases


[('so good', 8),
 ('very good', 5),
 ('as good', 3),
 ('too sweet', 3),
 ('very reasonable', 2),
 ('pretty cheap', 2),
 ('just average', 2),
 ('much better', 2),
 ('so much', 2),
 ('really big', 1)]

## Doing it for all of the data that we have

In [40]:
reviews

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,8aoJJdKEO3ypoZNszpPu7Q,bGgAL09pxLnV_FFgR4ZADg,ZBE-H_aUlicix_9vUGQPIQ,5,0,0,0,We had my Mother's Birthday Party here on 10/2...,2016-11-09 20:07:25
1,J5NOCLdhuhor7USRhtYZ8w,pFCb-1j6oI3TDjr26h2cJQ,e-YnECeZNt8ngm0tu4X9mQ,4,0,0,0,Good Korean grill near Eaton Centre. The marin...,2015-12-05 05:06:43
2,PXiLWAYRt3xnHaJ8MB4rzw,mEzc6LeTNiQgIVsq3poMbg,j7HO1YeMQGYo3KibMXZ5vg,5,2,1,3,Was recommended to try this place by few peopl...,2014-10-11 05:16:15
3,VrLarvxZYJm74yAqtpe9PQ,o-zUN2WEZgjQS7jnNsec0g,7e3PZzUpG5FYOTGt3O3ePA,3,0,0,0,Ambience: Would not expect something this nice...,2016-07-25 03:45:26
4,C1CUpidlVFprUCkApqzCmA,Wlx0iBXJvk4x0EeOt2Bz1Q,vuHzLZ7nAeT-EiecOkS5Og,1,11,0,3,Absolutely the WORST pool company that I have ...,2016-04-11 18:49:11
...,...,...,...,...,...,...,...,...,...
15295,qknwFVEh_0KSuexigYBI_A,YMGmyPOU65SMs4H60ltYiw,shIPnFoXrL3dFo5HLH1_HA,1,2,0,0,This was the worst experience ever. So much so...,2014-07-12 21:58:15
15296,Y-ZRoyAXCukBK1uK1ZcZCA,JLhOWQiWtGbr14K_KmoWxA,zPEYgVqJ2QNKi45FJi2jvg,5,0,0,0,We come here every time we hit Vegas! A giant ...,2018-11-10 21:38:49
15297,A8HdjBfhj3pgQuSbwNtDEw,6CoiKFDFXIACJZvv_I_8mQ,zPEYgVqJ2QNKi45FJi2jvg,1,0,1,0,As locals we used to the this place when it w...,2018-10-13 22:11:22
15298,2n1QdrYBRAAe6GKaxEV0jA,_fH4s3ls08eSl_PfX38KIA,etzDsNjkCyQBoJcU2a3U-g,5,0,0,0,The food was delicious. We were seated in 15 m...,2015-02-15 08:43:46


In [29]:
reviews_list = reviews[['business_id', 'text']]
reviews_list = reviews_list.groupby(['business_id'])['text'].apply(','.join).reset_index()
reviews_list

Unnamed: 0,business_id,text
0,--I7YYLada0tSLkORTHb5Q,"Had to get my wing fix, I like dry rubs on win..."
1,-7XWJYkutqhIxLen7Grg1g,Definite recommend. \n\nBut I never would have...
2,0Rni7ocMC_Lg2UH0lDeKMQ,We love Barros!! Usually go to other locations...
3,0kPm1zEpeXFRg8D2phqgCQ,"Coffee is exponentially better than Starbucks,..."
4,1Fpk8ibHhZYnCw8fnGny8w,Really love the food here! I was a HUGE fan of...
...,...,...
148,yHHVKa9joZAKiBDUp2SkKw,"This coffee shop has great coffee, but more im..."
149,z8Em-bhZI3Mmspml7tj6tg,This was the first time and probably the last ...
150,zPEYgVqJ2QNKi45FJi2jvg,Stopped by here for lunch yesterday. What a d...
151,zZ7KDK3GAkBUZzsaqB1A4Q,"Staff is SO friendly, I was checked in on once..."


In [30]:
# reviews_dict: list of dicts, one for each business
reviews_dict = reviews_list.to_dict('records')

In [31]:
#trial code
reviews_test = reviews_dict[:2]
test = [get_list_of_phrases(review['text']) for review in reviews_test]
test[0]

always
necessarily
here
also
though
here
twice
duplicate adverb in subtree: here
later
duplicate adverb in subtree: here
always
basically
there
ago
duplicate adverb in subtree: there
,do
back
back
however
back
again
duplicate adverb in subtree: back
always
too
also
otherwise
so
back
all
all
back
here
probably
usually
well
duplicate adverb in subtree: usually
always
too
course
maybe
yet
now
however
also
extremely
duplicate adverb in subtree: also
also
definitely
recently
also
also
off
duplicate adverb in subtree: also
however
probably
always
back
definitely
back
duplicate adverb in subtree: definitely
again
duplicate adverb in subtree: definitely
also
always
though
inside
best
instead
hardly
duplicate adverb in subtree: instead
first
here
too
really
nearby
duplicate adverb in subtree: really
definitely
back!,there
duplicate adverb in subtree: definitely
back
really
back
however
oily
so
so
again
duplicate adverb in subtree: so
back
overall
always
always
really
maybe
just
duplicate adverb

Counter({'very pleased': 3,
         'always good always good always good ': 1,
         'necessarily bad necessarily bad ': 1,
         'just average': 1,
         'also fantastic also fantastic also fantastic ': 1,
         'how good': 1,
         'how delicious': 1,
         'very good': 9,
         'very friendly': 4,
         'really good': 7,
         'inside great': 1,
         'very prompt': 1,
         'even fewer': 1,
         'real good': 1,
         'too much': 2,
         'pretty decent': 1,
         'pretty good': 9,
         'most recent': 1,
         'pretty average': 1,
         'way friendlier': 1,
         'exceptionally good': 1,
         'much better': 2,
         'extremely greasy': 1,
         'very empty': 1,
         'enough large': 1,
         'barely hot': 1,
         'extremely attentive': 1,
         'very busy': 4,
         'too many': 1,
         'super busy': 1,
         'really nice': 1,
         'so excited': 1,
         'too low': 1,
         'very ni

In [32]:
# this cell will take some time to run
phrases_dict = [get_list_of_phrases(review['text']) for review in reviews_dict]

always
necessarily
here
also
though
here
twice
duplicate adverb in subtree: here
later
duplicate adverb in subtree: here
always
basically
there
ago
duplicate adverb in subtree: there
,do
back
back
however
back
again
duplicate adverb in subtree: back
always
too
also
otherwise
so
back
all
all
back
here
probably
usually
well
duplicate adverb in subtree: usually
always
too
course
maybe
yet
now
however
also
extremely
duplicate adverb in subtree: also
also
definitely
recently
also
also
off
duplicate adverb in subtree: also
however
probably
always
back
definitely
back
duplicate adverb in subtree: definitely
again
duplicate adverb in subtree: definitely
also
always
though
inside
best
instead
hardly
duplicate adverb in subtree: instead
first
here
too
really
nearby
duplicate adverb in subtree: really
definitely
back!,there
duplicate adverb in subtree: definitely
back
really
back
however
oily
so
so
again
duplicate adverb in subtree: so
back
overall
always
always
really
maybe
just
duplicate adverb

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



here
now
duplicate adverb in subtree: here
occasionally
here
always
here
definitely
again
duplicate adverb in subtree: definitely
simultaneously
always
then
here
also
overall
least
forever
also
here
also
so
much
really
so
all
quite
maybe
always
where
usually
also
harder
so
so
again
better
duplicate adverb in subtree: again
here
definitely
back
duplicate adverb in subtree: definitely
only
once
duplicate adverb in subtree: only
also
definitely
so
matter
duplicate adverb in subtree: so
so
actually
duplicate adverb in subtree: so
off
next
there
so
then
here
much
so
now
duplicate adverb in subtree: so
only
once
duplicate adverb in subtree: only
later
though
then
always
also
well
overall
then
just
often
either
apparently
here
before
duplicate adverb in subtree: here
maybe
back
sadly
much
duplicate adverb in subtree: sadly
just
loud
back
again
duplicate adverb in subtree: back
all
always
wise
still
definitely
here
legitimately
definitely
back
duplicate adverb in subtree: definitely
here
once


TypeError: object of type 'NoneType' has no len()

we can then get the most common phrases. this is where the fun begins

In [None]:
most_common_phrases = [counter.most_common(5) for counter in phrases_dict]
most_common_phrases

since this does not show any good results as they are not indicative of anything, we look into TF-IDF

In [17]:
# get the most common phrases from all the datasets
all_phrases = dict(sum(phrases_dict, Counter()))
all_phrases

{'very pleased': 47,
 'always good always good always good ': 15,
 'just typical': 1,
 'just average': 13,
 'also fantastic also fantastic also fantastic also fantastic ': 1,
 'at least': 75,
 'how good': 44,
 'how delicious': 11,
 'very good': 620,
 'very friendly': 366,
 'really good': 453,
 'very prompt': 5,
 'even fewer': 1,
 'real good': 6,
 'too much': 193,
 'pretty decent': 57,
 'no better': 9,
 'pretty good': 437,
 'most recent': 34,
 'pretty average': 9,
 'way friendlier': 1,
 'exceptionally good': 4,
 'much better': 206,
 'extremely greasy': 3,
 'very empty': 2,
 'enough large': 9,
 'barely hot': 2,
 'extremely attentive': 8,
 'very busy': 84,
 'too many': 70,
 'super busy': 30,
 'really nice': 125,
 'so excited': 46,
 'too low': 2,
 'more interested': 9,
 'very nice': 380,
 'too busy': 71,
 'how busy': 32,
 'just wonderful': 5,
 'pretty awesome': 16,
 'ridiculously good': 4,
 'pretty surprisingly darn': 1,
 'just sad': 5,
 'very spacious': 16,
 'all exceptional all exception

In [18]:
common_phrases_list = [counter.most_common() for counter in phrases_dict]
common_phrases = [tuples for sublist in common_phrases_list for tuples in sublist]
common_phrases

[('very good', 9),
 ('pretty good', 9),
 ('really good', 7),
 ('very friendly', 4),
 ('very busy', 4),
 ('very pleased', 3),
 ('at least', 2),
 ('too much', 2),
 ('much better', 2),
 ('too busy', 2),
 ('very spacious', 2),
 ('very large', 2),
 ('very attentive', 2),
 ('super friendly', 2),
 ('so good', 2),
 ('always good always good always good ', 1),
 ('just typical', 1),
 ('just average', 1),
 ('also fantastic also fantastic also fantastic also fantastic ', 1),
 ('how good', 1),
 ('how delicious', 1),
 ('very prompt', 1),
 ('even fewer', 1),
 ('real good', 1),
 ('pretty decent', 1),
 ('no better', 1),
 ('most recent', 1),
 ('pretty average', 1),
 ('way friendlier', 1),
 ('exceptionally good', 1),
 ('extremely greasy', 1),
 ('very empty', 1),
 ('enough large', 1),
 ('barely hot', 1),
 ('extremely attentive', 1),
 ('too many', 1),
 ('super busy', 1),
 ('really nice', 1),
 ('so excited', 1),
 ('too low', 1),
 ('more interested', 1),
 ('very nice', 1),
 ('how busy', 1),
 ('just wonderful

In [19]:
DF = {}
for i, phrase_tuple in enumerate(common_phrases):
    phrase: str = phrase_tuple[0]
    try:
        DF[phrase].add(i)
    except:
        DF[phrase] = {i}

In [20]:
# count across the entire document
for i in DF:
    DF[i] = len(DF[i])
DF

{'very good': 135,
 'pretty good': 121,
 'really good': 129,
 'very friendly': 124,
 'very busy': 51,
 'very pleased': 39,
 'at least': 53,
 'too much': 101,
 'much better': 99,
 'too busy': 51,
 'very spacious': 14,
 'very large': 34,
 'very attentive': 56,
 'super friendly': 81,
 'so good': 111,
 'always good always good always good ': 15,
 'just typical': 1,
 'just average': 12,
 'also fantastic also fantastic also fantastic also fantastic ': 1,
 'how good': 34,
 'how delicious': 10,
 'very prompt': 5,
 'even fewer': 1,
 'real good': 6,
 'pretty decent': 43,
 'no better': 8,
 'most recent': 28,
 'pretty average': 7,
 'way friendlier': 1,
 'exceptionally good': 4,
 'extremely greasy': 3,
 'very empty': 2,
 'enough large': 9,
 'barely hot': 2,
 'extremely attentive': 8,
 'too many': 53,
 'super busy': 23,
 'really nice': 83,
 'so excited': 31,
 'too low': 2,
 'more interested': 9,
 'very nice': 129,
 'how busy': 27,
 'just wonderful': 5,
 'pretty awesome': 16,
 'ridiculously good': 4,

In [21]:
total_vocab = [x for x in DF]
print(total_vocab)

['very good', 'pretty good', 'really good', 'very friendly', 'very busy', 'very pleased', 'at least', 'too much', 'much better', 'too busy', 'very spacious', 'very large', 'very attentive', 'super friendly', 'so good', 'always good always good always good ', 'just typical', 'just average', 'also fantastic also fantastic also fantastic also fantastic ', 'how good', 'how delicious', 'very prompt', 'even fewer', 'real good', 'pretty decent', 'no better', 'most recent', 'pretty average', 'way friendlier', 'exceptionally good', 'extremely greasy', 'very empty', 'enough large', 'barely hot', 'extremely attentive', 'too many', 'super busy', 'really nice', 'so excited', 'too low', 'more interested', 'very nice', 'how busy', 'just wonderful', 'pretty awesome', 'ridiculously good', 'pretty surprisingly darn', 'just sad', 'all exceptional all exceptional ', 'very reasonable', 'very marginal', 'pretty empty', 'always friendly always friendly ', 'decent(not worth decent(not worth decent(not worth d

In [22]:
DocumentTF = []
for i, sublist in enumerate(common_phrases_list):
    TF = defaultdict(int)
    adj_phrase_count: int = 0
    for tuples in sublist:
        adj_phrase_count += tuples[1]
    
    for tuples in sublist:
        TF[tuples[0]] = tuples[1]/adj_phrase_count
    DocumentTF.append(TF)

In [23]:
def computeIDF(word: str) -> float:
    N = len(common_phrases_list)
    num_review = 0
    for reviews in common_phrases_list:
        for review_tuples in reviews:
            if word == review_tuples[0]:
                num_review += 1
    return math.log(N / float(num_review))

In [24]:
temp_idf_dict = defaultdict(int)
for word in total_vocab:
    temp_idf_dict[word] = computeIDF(word)
# idf_list = [{word: computeIDF(word)} for word in total_vocab]

In [25]:
DocumentTF[0].keys()

dict_keys(['very good', 'pretty good', 'really good', 'very friendly', 'very busy', 'very pleased', 'at least', 'too much', 'much better', 'too busy', 'very spacious', 'very large', 'very attentive', 'super friendly', 'so good', 'always good always good always good ', 'just typical', 'just average', 'also fantastic also fantastic also fantastic also fantastic ', 'how good', 'how delicious', 'very prompt', 'even fewer', 'real good', 'pretty decent', 'no better', 'most recent', 'pretty average', 'way friendlier', 'exceptionally good', 'extremely greasy', 'very empty', 'enough large', 'barely hot', 'extremely attentive', 'too many', 'super busy', 'really nice', 'so excited', 'too low', 'more interested', 'very nice', 'how busy', 'just wonderful', 'pretty awesome', 'ridiculously good', 'pretty surprisingly darn', 'just sad', 'all exceptional all exceptional ', 'very reasonable', 'very marginal', 'pretty empty', 'always friendly always friendly ', 'decent(not worth decent(not worth decent(n

In [26]:
DocumentTF[0]['very good']

0.05806451612903226

In [27]:
temp_idf_dict['very good']

0.125163142954006

In [28]:
def computeTFIDF(TF, idf_list):
    TFIDF = defaultdict(float)
    for phrase in TF.keys():
        phrase_tf_idf = TF[phrase] * idf_list[phrase]
        TFIDF[phrase] = phrase_tf_idf
    return TFIDF

In [29]:
TFIDFList = [computeTFIDF(TF, temp_idf_dict) for TF in DocumentTF]

In [30]:
TFIDFList[0]

defaultdict(float,
            {'very good': 0.007267537332813251,
             'pretty good': 0.013624686336524191,
             'really good': 0.007705668511066736,
             'very friendly': 0.005423389826771577,
             'very busy': 0.028351284868854448,
             'very pleased': 0.026455669843795916,
             'at least': 0.013679303326971788,
             'too much': 0.005358934252273239,
             'much better': 0.005617007371068974,
             'too busy': 0.014175642434427224,
             'very spacious': 0.0308565237648668,
             'very large': 0.019407450280984184,
             'very attentive': 0.012968854589126276,
             'super friendly': 0.008206306667354796,
             'so good': 0.004140744775227113,
             'always good always good always good ': 0.014983146582517582,
             'just typical': 0.03245443820253184,
             'just average': 0.016422782397447967,
             'also fantastic also fantastic also fantastic also 

In [31]:
max_0 = max(TFIDFList[0], key=TFIDFList[0].get)

In [32]:
max_0

'just typical'

### Cloud tries in jeth.ipynb

In [34]:
import gensim
from gensim import models
from gensim.models import Word2Vec, KeyedVectors

from gensim.models import Phrases
# Train a bigram detector.
bigram_transformer = Phrases(small_business_phrases)
# remove duplicates
common_phrases_list1 = []
for i in common_phrases_list:
    for j in i:
        common_phrases_list1.append(j[0])
common_phrases_list1 = set(common_phrases_list1)
common_phrases_list1 = list(common_phrases_list1)
# Apply the trained MWE detector to a corpus, using the result to train a Word2vec model.
model = Word2Vec([common_phrases_list1], min_count=1)

In [35]:
import random
random_word = random.choice(model.wv.index_to_key)

In [36]:
random_word

'especially impressed'

In [37]:
model.wv.most_similar('more creamy')[:5]

[('very cute', 0.3349790871143341),
 ('really hilarious', 0.3323480486869812),
 ('really close', 0.32198747992515564),
 ('quite experienced', 0.3188299834728241),
 ('extra loud', 0.3151163160800934)]

In [38]:
rare_phrase = []
for i in common_phrases_list1:
    j = model.wv.most_similar(i)[0][1]
    if j < 0.30:
        rare_phrase.append(i)
rare_phrase

['overall better']

### claudia_tried, claudia_gave_up
### jeth_continues

In [1]:
X = model[model.vocab]

NameError: name 'model' is not defined

In [None]:
from nltk.cluster import KMeansClusterer
import nltk
NUM_CLUSTERS=3
kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance, repeats=25)
assigned_clusters = kclusterer.cluster(X, assign_clusters=True)
print (assigned_clusters)