In [25]:
import spacy
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from textblob import TextBlob
import matplotlib as mpl
from get_adj_phrases import get_adjective_phrases, get_list_of_phrases
from collections import Counter, defaultdict
import math

In [2]:
# Load spaCy's small English pipeline.
# NOTE(review): `nlp` is not referenced by any other cell visible in this notebook;
# presumably get_adj_phrases loads its own model - confirm before removing this cell.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Read the review dump as JSON Lines (lines=True: one JSON object per line),
# decoding the file as ISO-8859-1.
reviews = pd.read_json('../data/reviewSelected100.json', encoding='ISO-8859-1', lines=True)

A sample of what we get from real-life data: the adjective phrases found in the reviews of one randomly selected business.

In [4]:
# Draw a single review at random (fixed seed, so the pick is reproducible) and
# keep every review that was written for the same business.
random_business = reviews.sample(random_state=42)
random_business_id = random_business['business_id'].iloc[0]
is_selected_business = reviews['business_id'] == random_business_id
small_business_dataset = reviews[is_selected_business]
small_business_dataset.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
11200,wysFaMq5S88mF6HLxdh3Vw,4nQ7MOkbc5u-DYUOgjiDsg,yEZn1XpLsEC9uBa-X4xAZw,4,1,0,0,Cute little vietnamese sandwich place on St De...,2016-03-17 14:29:44
11204,z81YK3CXQJb48BU7ZAcf7A,Ht8iGitRu8kynEubcIhsTQ,yEZn1XpLsEC9uBa-X4xAZw,4,7,2,8,"Hey Hey, it's the SAMMIES!\n\nSuper Yummy here...",2011-11-30 03:26:01
11209,tqajcnZA97HlXkIYrdNVlg,-a0XgJCXoJln2Ue_JMnfIQ,yEZn1XpLsEC9uBa-X4xAZw,3,0,0,0,Sandwich was good but service was terrible. We...,2016-07-14 18:52:34
11212,wXz1c5kz0QB2nvZVTnnkIg,F9ivL7-mzKbfnLO1rKUFFQ,yEZn1XpLsEC9uBa-X4xAZw,5,0,0,0,"Super fresh, delicious and cheap Banh Mi in th...",2017-07-28 17:48:01
11258,89J69mHgTTF25Ln97ONTrQ,La1ggB37gvq6UWpEWaMtMA,yEZn1XpLsEC9uBa-X4xAZw,4,2,0,2,Que ce soit pour leurs délicieux bubbles tea/s...,2012-12-04 21:25:13


In [5]:
# Split every review of the selected business into sentences and collect them
# in one flat list (one entry per sentence).
#
# Optional spelling correction is disabled because it takes too long to run;
# enable it only for the final submission by correcting each review first:
#     TextBlob(text).correct()
# NOTE(review): lowercasing is not done here - it happens when the phrases are
# extracted in the next cell.
small_business_dataset_reviews = [
    sentence
    for review_text in small_business_dataset['text']
    for sentence in sent_tokenize(review_text)
]

In [6]:
# Extract the adjective phrases of every (lowercased) sentence and flatten them
# into a single list. Sentences that yield no phrase simply contribute nothing.
small_business_phrases = [
    phrase
    for sentence in small_business_dataset_reviews
    for phrase in get_adjective_phrases(sentence.lower())
]
small_business_phrases

['really big',
 'very reasonable',
 'really good',
 'very clear',
 'so good',
 'so fresh',
 'so good',
 'so cheap',
 'so good',
 'quite hungry',
 'very little',
 'too way sweet',
 'just good',
 'how delicious',
 'so addictive',
 'really disappointing',
 'very sad',
 'tremendously tasty',
 'so good',
 'pretty cheap',
 'quite cheap',
 'too lazy',
 'very healthy',
 'as good',
 'how friendly',
 'as good',
 'so tasty',
 'pretty small',
 'really excited',
 'so terrible',
 'very reasonable',
 'quite tasty',
 'very tender',
 'too many',
 'cloyingly sweet',
 'completely foreign',
 'just hungry',
 'less strong',
 'too salty',
 'so delicious',
 'so good',
 'most familiar',
 'very unusual',
 'definitely best',
 'very enjoyable',
 'very good',
 'very good',
 'more flavorful',
 'too bad',
 'just average',
 'just average',
 'so affordable',
 'so good',
 'so worth',
 'very good',
 'very welcoming',
 'very too clean',
 'very good',
 'very smart',
 'very friendly',
 'very small',
 'most legit',
 'beauti

In [7]:
# Tally how often each adjective phrase occurs for the selected business and
# show the ten most frequent ones.
phrase_counts = Counter()
phrase_counts.update(small_business_phrases)
freq_phrases = phrase_counts.most_common(10)
freq_phrases


[('so good', 8),
 ('very good', 5),
 ('as good', 3),
 ('too sweet', 3),
 ('very reasonable', 2),
 ('pretty cheap', 2),
 ('just average', 2),
 ('much better', 2),
 ('so much', 2),
 ('really big', 1)]

## Doing it for all of the data that we have

In [8]:
# Display the full review table loaded above.
reviews

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,8aoJJdKEO3ypoZNszpPu7Q,bGgAL09pxLnV_FFgR4ZADg,ZBE-H_aUlicix_9vUGQPIQ,5,0,0,0,We had my Mother's Birthday Party here on 10/2...,2016-11-09 20:07:25
1,J5NOCLdhuhor7USRhtYZ8w,pFCb-1j6oI3TDjr26h2cJQ,e-YnECeZNt8ngm0tu4X9mQ,4,0,0,0,Good Korean grill near Eaton Centre. The marin...,2015-12-05 05:06:43
2,PXiLWAYRt3xnHaJ8MB4rzw,mEzc6LeTNiQgIVsq3poMbg,j7HO1YeMQGYo3KibMXZ5vg,5,2,1,3,Was recommended to try this place by few peopl...,2014-10-11 05:16:15
3,VrLarvxZYJm74yAqtpe9PQ,o-zUN2WEZgjQS7jnNsec0g,7e3PZzUpG5FYOTGt3O3ePA,3,0,0,0,Ambience: Would not expect something this nice...,2016-07-25 03:45:26
4,C1CUpidlVFprUCkApqzCmA,Wlx0iBXJvk4x0EeOt2Bz1Q,vuHzLZ7nAeT-EiecOkS5Og,1,11,0,3,Absolutely the WORST pool company that I have ...,2016-04-11 18:49:11
...,...,...,...,...,...,...,...,...,...
15295,qknwFVEh_0KSuexigYBI_A,YMGmyPOU65SMs4H60ltYiw,shIPnFoXrL3dFo5HLH1_HA,1,2,0,0,This was the worst experience ever. So much so...,2014-07-12 21:58:15
15296,Y-ZRoyAXCukBK1uK1ZcZCA,JLhOWQiWtGbr14K_KmoWxA,zPEYgVqJ2QNKi45FJi2jvg,5,0,0,0,We come here every time we hit Vegas! A giant ...,2018-11-10 21:38:49
15297,A8HdjBfhj3pgQuSbwNtDEw,6CoiKFDFXIACJZvv_I_8mQ,zPEYgVqJ2QNKi45FJi2jvg,1,0,1,0,As locals we used to the this place when it w...,2018-10-13 22:11:22
15298,2n1QdrYBRAAe6GKaxEV0jA,_fH4s3ls08eSl_PfX38KIA,etzDsNjkCyQBoJcU2a3U-g,5,0,0,0,The food was delicious. We were seated in 15 m...,2015-02-15 08:43:46


In [9]:
# Merge all review texts of a business into one comma-joined string, giving a
# frame with one row per business_id.
# NOTE(review): ','.join puts no whitespace between consecutive reviews, which
# may blur the sentence boundary where two reviews meet - confirm that
# get_list_of_phrases copes with that.
reviews_list = (
    reviews[['business_id', 'text']]
    .groupby(['business_id'])['text']
    .apply(','.join)
    .reset_index()
)
reviews_list

Unnamed: 0,business_id,text
0,--I7YYLada0tSLkORTHb5Q,"Had to get my wing fix, I like dry rubs on win..."
1,-7XWJYkutqhIxLen7Grg1g,Definite recommend. \n\nBut I never would have...
2,0Rni7ocMC_Lg2UH0lDeKMQ,We love Barros!! Usually go to other locations...
3,0kPm1zEpeXFRg8D2phqgCQ,"Coffee is exponentially better than Starbucks,..."
4,1Fpk8ibHhZYnCw8fnGny8w,Really love the food here! I was a HUGE fan of...
...,...,...
148,yHHVKa9joZAKiBDUp2SkKw,"This coffee shop has great coffee, but more im..."
149,z8Em-bhZI3Mmspml7tj6tg,This was the first time and probably the last ...
150,zPEYgVqJ2QNKi45FJi2jvg,Stopped by here for lunch yesterday. What a d...
151,zZ7KDK3GAkBUZzsaqB1A4Q,"Staff is SO friendly, I was checked in on once..."


In [10]:
# reviews_dict: list of dicts, one for each business.
# Each dict maps the column names of reviews_list ('business_id', 'text') to that row's values.
reviews_dict = reviews_list.to_dict('records')

In [11]:
# Trial run: extract the phrases of the first two businesses only, to check the
# output of get_list_of_phrases before processing the whole dataset.
reviews_test = reviews_dict[:2]
test = [get_list_of_phrases(business['text']) for business in reviews_test]
test[0]

Counter({'very pleased': 3,
         'just average': 1,
         'how good': 1,
         'how delicious': 1,
         'very good': 9,
         'very friendly': 4,
         'really good': 7,
         'inside great': 1,
         'very prompt': 1,
         'even fewer': 1,
         'real good': 1,
         'too much': 2,
         'pretty decent': 1,
         'pretty good': 9,
         'most recent': 1,
         'pretty average': 1,
         'way friendlier': 1,
         'exceptionally good': 1,
         'much better': 2,
         'extremely greasy': 1,
         'very empty': 1,
         'enough large': 1,
         'barely hot': 1,
         'extremely attentive': 1,
         'very busy': 4,
         'too many': 1,
         'super busy': 1,
         'really nice': 1,
         'so excited': 1,
         'too low': 1,
         'very nice': 1,
         'too busy': 2,
         'how busy': 1,
         'just wonderful': 1,
         'pretty awesome': 1,
         'ridiculously good': 1,
         'ju

In [12]:
# this cell will take some time to run
# Despite its name, phrases_dict is a list: one entry per business, in the same
# order as reviews_dict, each being the result of get_list_of_phrases on that
# business's joined review text (later cells call .most_common() on each entry).
phrases_dict = [get_list_of_phrases(review['text']) for review in reviews_dict]

We can then get the most common phrases for each business. This is where the fun begins.

In [13]:
# Top five (phrase, count) pairs of each business, most frequent first.
most_common_phrases = [
    business_counter.most_common(5) for business_counter in phrases_dict
]
most_common_phrases

[[('very good', 9),
  ('pretty good', 9),
  ('really good', 7),
  ('very friendly', 4),
  ('very busy', 4)],
 [('so good', 6),
  ('very nice', 4),
  ('too much', 4),
  ('really good', 4),
  ('much better', 4)],
 [('pretty good', 6),
  ('really good', 3),
  ('as good', 3),
  ('pretty tasty', 2),
  ('very nice', 2)],
 [('very clean', 3),
  ('really good', 3),
  ('very good', 2),
  ('anywhere close', 2),
  ('too sweet', 2)],
 [('very good', 5),
  ('really good', 4),
  ('very tasty', 4),
  ('very attentive', 3),
  ('very reasonable', 2)],
 [('pretty good', 4),
  ('really nice', 4),
  ('really good', 3),
  ('too crowded', 3),
  ('nearly empty', 3)],
 [('even worse', 3),
  ('so much', 3),
  ('very thorough', 2),
  ('very friendly', 2),
  ('so many', 2)],
 [('very relaxing', 8),
  ('very professional', 4),
  ('very good', 3),
  ('very pleasant', 3),
  ('absolutely amazing', 3)],
 [('pretty good', 6),
  ('really good', 6),
  ('very good', 4),
  ('so many', 3),
  ('too bad', 3)],
 [('absolutely

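These per-business top phrases are not very informative: the same generic phrases (e.g. 'very good', 'really good', 'pretty good') dominate almost every business. We therefore look into TF-IDF to surface the phrases that are distinctive for a given business.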

In [14]:
# Total number of occurrences of every phrase across all businesses.
# The per-business counters are accumulated in place with Counter.update:
# sum(phrases_dict, Counter()) would build a brand-new Counter for every
# business, which is needlessly quadratic in the number of businesses.
phrase_totals = Counter()
for business_counter in phrases_dict:
    phrase_totals.update(business_counter)
all_phrases = dict(phrase_totals)
all_phrases

{'very pleased': 46,
 'just average': 13,
 'how good': 43,
 'how delicious': 11,
 'very good': 597,
 'very friendly': 335,
 'really good': 442,
 'inside great': 3,
 'very prompt': 3,
 'even fewer': 1,
 'real good': 2,
 'too much': 131,
 'pretty decent': 52,
 'pretty good': 426,
 'most recent': 33,
 'pretty average': 8,
 'way friendlier': 1,
 'exceptionally good': 3,
 'much better': 175,
 'extremely greasy': 3,
 'very empty': 2,
 'enough large': 9,
 'barely hot': 2,
 'extremely attentive': 7,
 'very busy': 77,
 'too many': 64,
 'super busy': 22,
 'really nice': 114,
 'so excited': 47,
 'too low': 2,
 'very nice': 368,
 'too busy': 70,
 'how busy': 32,
 'just wonderful': 5,
 'pretty awesome': 16,
 'ridiculously good': 4,
 'just sad': 5,
 'very spacious': 16,
 'very reasonable': 104,
 'very marginal': 1,
 'pretty empty': 13,
 'very disappointing': 30,
 'very perticular': 1,
 'so bizarre': 1,
 'very expensive': 14,
 'very loud': 15,
 'just ok': 75,
 'very accommodating': 38,
 'really small

In [15]:
# Per business: every (phrase, count) pair, ordered from most to least frequent.
common_phrases_list = [
    business_counter.most_common() for business_counter in phrases_dict
]
# Flatten the per-business lists into one long list of (phrase, count) pairs.
common_phrases = []
for business_phrases in common_phrases_list:
    common_phrases.extend(business_phrases)
common_phrases

[('very good', 9),
 ('pretty good', 9),
 ('really good', 7),
 ('very friendly', 4),
 ('very busy', 4),
 ('very pleased', 3),
 ('too much', 2),
 ('much better', 2),
 ('too busy', 2),
 ('very spacious', 2),
 ('just ok', 2),
 ('very large', 2),
 ('very attentive', 2),
 ('super friendly', 2),
 ('so good', 2),
 ('just average', 1),
 ('how good', 1),
 ('how delicious', 1),
 ('inside great', 1),
 ('very prompt', 1),
 ('even fewer', 1),
 ('real good', 1),
 ('pretty decent', 1),
 ('most recent', 1),
 ('pretty average', 1),
 ('way friendlier', 1),
 ('exceptionally good', 1),
 ('extremely greasy', 1),
 ('very empty', 1),
 ('enough large', 1),
 ('barely hot', 1),
 ('extremely attentive', 1),
 ('too many', 1),
 ('super busy', 1),
 ('really nice', 1),
 ('so excited', 1),
 ('too low', 1),
 ('very nice', 1),
 ('how busy', 1),
 ('just wonderful', 1),
 ('pretty awesome', 1),
 ('ridiculously good', 1),
 ('just sad', 1),
 ('very reasonable', 1),
 ('very marginal', 1),
 ('pretty empty', 1),
 ('very disappo

In [16]:
# Map every phrase to the set of positions at which it occurs in the flattened
# `common_phrases` list. Each business contributes a phrase at most once (the
# pairs come from Counter.most_common()), so the size of a set equals the number
# of businesses whose reviews contain the phrase, i.e. its document frequency.
DF = {}
for i, (phrase, _count) in enumerate(common_phrases):
    # setdefault creates the set the first time a phrase is seen; this replaces
    # a bare `except:` that would also have hidden unrelated errors.
    DF.setdefault(phrase, set()).add(i)

In [17]:
# Collapse each phrase's set of positions to its size: the number of businesses
# (documents) in which the phrase occurs.
# The isinstance guard keeps the cell safe to re-run: once a value has already
# been converted to an int it is left untouched instead of raising
# "TypeError: object of type 'int' has no len()".
DF = {
    phrase: len(positions) if isinstance(positions, set) else positions
    for phrase, positions in DF.items()
}
DF

{'very good': 134,
 'pretty good': 121,
 'really good': 127,
 'very friendly': 121,
 'very busy': 47,
 'very pleased': 39,
 'too much': 80,
 'much better': 90,
 'too busy': 51,
 'very spacious': 14,
 'just ok': 58,
 'very large': 31,
 'very attentive': 52,
 'super friendly': 72,
 'so good': 109,
 'just average': 12,
 'how good': 34,
 'how delicious': 10,
 'inside great': 3,
 'very prompt': 3,
 'even fewer': 1,
 'real good': 2,
 'pretty decent': 39,
 'most recent': 28,
 'pretty average': 6,
 'way friendlier': 1,
 'exceptionally good': 3,
 'extremely greasy': 3,
 'very empty': 2,
 'enough large': 9,
 'barely hot': 2,
 'extremely attentive': 7,
 'too many': 50,
 'super busy': 19,
 'really nice': 79,
 'so excited': 31,
 'too low': 2,
 'very nice': 128,
 'how busy': 27,
 'just wonderful': 5,
 'pretty awesome': 16,
 'ridiculously good': 4,
 'just sad': 5,
 'very reasonable': 59,
 'very marginal': 1,
 'pretty empty': 9,
 'very disappointing': 27,
 'very perticular': 1,
 'so bizarre': 1,
 'ver

In [18]:
# The vocabulary: every distinct phrase, in the insertion order of DF's keys.
total_vocab = list(DF)
print(total_vocab)

['very good', 'pretty good', 'really good', 'very friendly', 'very busy', 'very pleased', 'too much', 'much better', 'too busy', 'very spacious', 'just ok', 'very large', 'very attentive', 'super friendly', 'so good', 'just average', 'how good', 'how delicious', 'inside great', 'very prompt', 'even fewer', 'real good', 'pretty decent', 'most recent', 'pretty average', 'way friendlier', 'exceptionally good', 'extremely greasy', 'very empty', 'enough large', 'barely hot', 'extremely attentive', 'too many', 'super busy', 'really nice', 'so excited', 'too low', 'very nice', 'how busy', 'just wonderful', 'pretty awesome', 'ridiculously good', 'just sad', 'very reasonable', 'very marginal', 'pretty empty', 'very disappointing', 'very perticular', 'so bizarre', 'very expensive', 'very loud', 'very accommodating', 'really small', 'less complacent', 'pleasantly surprised', 'very overpriced', 'very average', 'mostly attentive', 'really sure', 'so great', 'very comfortable', 'really plentiful', '

In [22]:
# Term frequencies, one mapping per business: a phrase's count divided by the
# total number of adjective-phrase occurrences of that business.
DocumentTF = []
for business_phrases in common_phrases_list:
    adj_phrase_count: int = sum(count for _, count in business_phrases)
    TF = defaultdict(int)
    TF.update(
        (phrase, count / adj_phrase_count) for phrase, count in business_phrases
    )
    DocumentTF.append(TF)

In [23]:
def computeIDF(word: str) -> float:
    """Return the inverse document frequency of ``word``.

    Reads the module-level ``common_phrases_list`` and treats each inner list
    of ``(phrase, count)`` tuples as one document. The result is
    ``log(N / n)`` (natural logarithm), where ``N`` is the number of documents
    and ``n`` is the number of documents that contain ``word``.

    A document is counted at most once, even if it lists ``word`` several
    times, so ``n`` never exceeds ``N`` and the result is never negative.

    Args:
        word: Phrase to score; compared by exact equality with the first
            element of each tuple.

    Returns:
        The natural logarithm of ``N / n``.

    Raises:
        ZeroDivisionError: If ``word`` occurs in no document.
    """
    N = len(common_phrases_list)
    # Count documents, not matching tuples: `any` stops at the first hit.
    num_review = sum(
        1
        for document in common_phrases_list
        if any(word == phrase_tuple[0] for phrase_tuple in document)
    )
    return math.log(N / float(num_review))

In [56]:
# Pre-compute the IDF of every phrase in the vocabulary once, so the TF-IDF
# step can look the values up by phrase.
temp_idf_dict = defaultdict(int)
temp_idf_dict.update((word, computeIDF(word)) for word in total_vocab)

In [47]:
# Phrases of the first business (the keys of its term-frequency mapping).
DocumentTF[0].keys()

dict_keys(['very good', 'pretty good', 'really good', 'very friendly', 'very busy', 'very pleased', 'too much', 'much better', 'too busy', 'very spacious', 'just ok', 'very large', 'very attentive', 'super friendly', 'so good', 'just average', 'how good', 'how delicious', 'inside great', 'very prompt', 'even fewer', 'real good', 'pretty decent', 'most recent', 'pretty average', 'way friendlier', 'exceptionally good', 'extremely greasy', 'very empty', 'enough large', 'barely hot', 'extremely attentive', 'too many', 'super busy', 'really nice', 'so excited', 'too low', 'very nice', 'how busy', 'just wonderful', 'pretty awesome', 'ridiculously good', 'just sad', 'very reasonable', 'very marginal', 'pretty empty', 'very disappointing', 'very perticular', 'so bizarre', 'very expensive', 'very loud', 'very accommodating', 'really small', 'less complacent', 'pleasantly surprised', 'very overpriced', 'very average', 'mostly attentive', 'really sure', 'so great', 'very comfortable', 'really ple

In [48]:
# Term frequency of 'very good' for the first business: its count divided by
# that business's total phrase count.
DocumentTF[0]['very good']

0.07258064516129033

In [57]:
# Inverse document frequency of 'very good', as computed by computeIDF.
temp_idf_dict['very good']

0.13259812144152416

In [61]:
def computeTFIDF(TF, idf_list):
    """Combine term frequencies with IDF values into TF-IDF scores.

    Args:
        TF: Mapping from phrase to its term frequency.
        idf_list: Mapping from phrase to its IDF value; despite the name it is
            indexed by phrase, not by position.

    Returns:
        A ``defaultdict(float)`` holding ``TF[phrase] * idf_list[phrase]`` for
        every phrase in ``TF``, in the iteration order of ``TF``.
    """
    scores = defaultdict(float)
    scores.update(
        (phrase, tf_value * idf_list[phrase]) for phrase, tf_value in TF.items()
    )
    return scores

In [62]:
# TF-IDF scores for every business: one mapping per entry of DocumentTF, in the
# same order, each scored against the shared IDF table.
TFIDFList = [computeTFIDF(TF, temp_idf_dict) for TF in DocumentTF]

In [67]:
# TF-IDF score of every phrase for the first business.
TFIDFList[0]

defaultdict(float,
            {'very good': 0.009624057201400948,
             'pretty good': 0.01703085792065524,
             'really good': 0.010514160036587982,
             'very friendly': 0.0075692701869578835,
             'very busy': 0.038073881280076674,
             'very pleased': 0.033069587304744893,
             'too much': 0.010458246559976675,
             'much better': 0.008558520178422102,
             'too busy': 0.017719553043034027,
             'very spacious': 0.0385706547060835,
             'just ok': 0.015645079207193806,
             'very large': 0.02574920511140789,
             'very attentive': 0.01740635810985497,
             'super friendly': 0.012157609715748066,
             'so good': 0.005469194180053093,
             'just average': 0.020528477996809957,
             'how good': 0.012129656425615113,
             'how delicious': 0.02199881313224508,
             'inside great': 0.03170827123164779,
             'very prompt': 0.03170827123164

In [69]:
# Phrase with the highest TF-IDF score for the first business: max iterates the
# dict's keys and ranks them by their values (key=...get).
max_0 = max(TFIDFList[0], key=TFIDFList[0].get)

In [70]:
# Show the top-scoring phrase of the first business.
max_0

'even fewer'