# Import libraries

In [26]:
import pandas as pd
import spacy
import en_core_web_sm
import random
from spacy import displacy

# Explore data

In [11]:
df = pd.read_json('../data/reviewSelected100.json', lines=True, encoding = "ISO-8859-1")

In [12]:
df.describe()

Unnamed: 0,stars,useful,funny,cool
count,15300.0,15300.0,15300.0,15300.0
mean,3.646601,1.292745,0.43,0.514575
std,1.45513,3.241261,1.866658,2.33405
min,1.0,0.0,0.0,0.0
25%,3.0,0.0,0.0,0.0
50%,4.0,0.0,0.0,0.0
75%,5.0,1.0,0.0,0.0
max,5.0,191.0,122.0,180.0


In [13]:
df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,8aoJJdKEO3ypoZNszpPu7Q,bGgAL09pxLnV_FFgR4ZADg,ZBE-H_aUlicix_9vUGQPIQ,5,0,0,0,We had my Mother's Birthday Party here on 10/2...,2016-11-09 20:07:25
1,J5NOCLdhuhor7USRhtYZ8w,pFCb-1j6oI3TDjr26h2cJQ,e-YnECeZNt8ngm0tu4X9mQ,4,0,0,0,Good Korean grill near Eaton Centre. The marin...,2015-12-05 05:06:43
2,PXiLWAYRt3xnHaJ8MB4rzw,mEzc6LeTNiQgIVsq3poMbg,j7HO1YeMQGYo3KibMXZ5vg,5,2,1,3,Was recommended to try this place by few peopl...,2014-10-11 05:16:15
3,VrLarvxZYJm74yAqtpe9PQ,o-zUN2WEZgjQS7jnNsec0g,7e3PZzUpG5FYOTGt3O3ePA,3,0,0,0,Ambience: Would not expect something this nice...,2016-07-25 03:45:26
4,C1CUpidlVFprUCkApqzCmA,Wlx0iBXJvk4x0EeOt2Bz1Q,vuHzLZ7nAeT-EiecOkS5Og,1,11,0,3,Absolutely the WORST pool company that I have ...,2016-04-11 18:49:11


# Implement extractor

Most frequent ⟨ Noun - Adjective ⟩ pairs for each rating. Each review has a “star” rating in the range of
1 to 5. Randomly select 50 reviews (one from each business) of rating 1, extract the top-10 most frequent
noun-adjective pairs from the sentences in these selected reviews. Example noun-adjective pairs are servicegreat,
food-delicious, that appear in the same sentence. Do the same for 20 reviews of ratings 2, 3, 4, and 5,
respectively. Discuss your results and limitations of your method.

### Functions to get random 20 reviews from random business that have a certain rating

In [20]:
# random 50 reviews (not the same biz) from rating 1
# just get 50 random review, then if there is any other biz that overlap then find random again, repeat until all
# review is from different biz (this)


def get_all_biz(df):
    biz = df['business_id'].unique()
    sorted(biz)
    return biz

def get_random(df_len, seed=10086):
    return random.randint(0,df_len)    

def extract_random_reviews_from_different_biz(num_reviews, star_rating, df):
    # filter star rating
    filter_df = df.loc[df['stars'] == star_rating]
    
    # get random review
    random_review = {}
    while (len(random_review.keys()) < num_reviews):
        random_index = get_random(len(filter_df)-1)
        biz_name = filter_df.iloc[[random_index]]['business_id'].values[0]
        if biz_name in random_review:
            continue
        else:
            random_review[biz_name] = filter_df.iloc[[random_index]]['text'].values[0]
            #filter_df = filter_df.drop([filter_df.index[random_index]])
    
    
    # convert to list
    return list(random_review.values())

# descending
def sort_dict(data_dict):
    return {k: v for k, v in sorted(data_dict.items(), key=lambda item: item[1], reverse=True)}


### Function to extract adj-noun pair

In [4]:
nlp = spacy.load('en_core_web_sm')

In [6]:
def get_noun_adj_pair(reviews):
    noun_adj_pair_dict = {}
    for review in reviews:
        doc = nlp(review)
        for np in doc.noun_chunks: # np refers to noun phrases
            np_adj = []
            np_noun_only = []
            extracted_np_adj_temp = []   # before adding to the noun_adj_pair_dict

            # start extraction of noun adj pair

            # adjective right before noun eg 'good food'
            for token in np:
                if token.pos_ in ("ADV","ADJ","PART"):
                    extracted_np_adj_temp.append(token)
                elif (token.pos_ in ("NOUN", "PROPN") and str(token) != 'one'):   # track those with noun/pronoun
                    np_noun_only.append(token)
            if len(extracted_np_adj_temp)>0:
                np_adj.append(extracted_np_adj_temp)

        
            # sentences with auxiliary words such as "food was good"
            extracted_np_adj_temp = []

            # find "was, were, is, are"
            for j in range(token.i+1,len(doc)): 
                if doc[j].pos_ in ('AUX'):
                    aux_found = False
                    neg_conj_found = False

                    # find in the children of the aux
                    for child in list(doc[j].children): 
                        if neg_conj_found == False:
                            extracted_np_adj_temp = []
                        else:
                            neg_conj_found = False
                        if child.pos_ in ("PART","CCONJ"):
                            neg_conj_found = True
                        if (child.pos_ in ("NOUN","PROPN") and str(child) != 'one') and (child in np):
                            aux_found = True 
                            
                        # find corresponding adjective for the noun
                        if aux_found == True and child.pos_ in ("ADJ","ADV","PART"): # PART refers to negation
                            for grandchild in child.children: # extracts the "very" in "very happy atmosphere"
                                if grandchild.pos_ in ("ADJ","ADV","PART") and grandchild.i<child.i:
                                    extracted_np_adj_temp.append(grandchild)
                            extracted_np_adj_temp.append(child)
                            if extracted_np_adj_temp not in np_adj:
                                np_adj.append(extracted_np_adj_temp)

            if (len(np_noun_only)!=0 and len(np_adj)!=0):
                for extracted_np_adj_temp in np_adj:
                    noun_adj_pair = (" ".join([str(x).lower() for x in np_noun_only])," ".join([str(x).lower() for x in extracted_np_adj_temp]))
                    if noun_adj_pair in noun_adj_pair_dict:
                        noun_adj_pair_dict[noun_adj_pair] += 1
                    else:
                        noun_adj_pair_dict[noun_adj_pair] = 1
                        
    return noun_adj_pair_dict

In [72]:
def preprocessing(reviews):
    return [i.lower() for i in reviews]

In [43]:
def postprocessing(data_dict):
    new_dict = {}
    # k is tuple, (noun, adj)
    for k,v in data_dict.items():
#         print(k)
        doc = nlp(k[0])
        lemma = [token.lemma_.lower() for token in doc]
        new_tuple = tuple([' '.join(lemma), k[1].lower()])
#         print(new_tuple)
        if new_tuple in new_dict:
            new_dict[new_tuple] += v
        else:
            new_dict[new_tuple] = v
    return new_dict
        

# Run on data

In [73]:
## Rating 1, 50 random data from different business

def extract_adj_noun_pair(rating, num_review, df, top_n_most_frequent):
    # get random reviews
    random_reviews = extract_random_reviews_from_different_biz(num_review, rating, df)
    
    # preprocessing
    random_reviews = preprocessing(random_reviews)
    
    # extract noun-adj pair
    noun_adj_pairs = get_noun_adj_pair(random_reviews)
    
    # postprocessing
    print('count before postprocessing {}'.format(len(noun_adj_pairs)))
    noun_adj_pairs = postprocessing(noun_adj_pairs)
    noun_adj_pairs = sort_dict(noun_adj_pairs)
    print('count after postprocessing {}'.format(len(noun_adj_pairs)))
    
    # convert & return to dataframe
    noun_adj_pairs_df = pd.DataFrame(noun_adj_pairs, index=['count']).transpose()
    return noun_adj_pairs_df.head(10)

## Rating 1, 50 random reviews

In [74]:
extract_adj_noun_pair(1, 50, df, 10)

count before postprocessing 298
count after postprocessing 298


Unnamed: 0,Unnamed: 1,count
people,other,3
service,terrible,3
minute,several,3
soup,hot sour,3
chance,second,2
cheese,big,2
minute,few,2
visit,last,2
fry,french,2
service,good,2


## Rating 2, 20 random reviews

In [75]:
extract_adj_noun_pair(2, 20, df, 10)

count before postprocessing 131
count after postprocessing 130


Unnamed: 0,Unnamed: 1,count
service,horrible,2
thing,only,2
decor,nice,2
decor,best,2
option,ayce,2
flavor,little,2
side,east,2
food,bland,1
maple syrup,real,1
problem,real,1


## Rating 3, 20 random reviews

In [76]:
extract_adj_noun_pair(3, 20, df, 10)

count before postprocessing 113
count after postprocessing 112


Unnamed: 0,Unnamed: 1,count
service,great,2
time,first,2
home,nice,2
place,great,2
place,other,1
check process,slower,1
hour,less,1
nurse,nice,1
menu,full,1
bakery,other,1


## Rating 4, 20 random reviews

In [77]:
extract_adj_noun_pair(4, 20, df, 10)

count before postprocessing 141
count after postprocessing 140


Unnamed: 0,Unnamed: 1,count
service,great,3
pot,hot,2
breakfast,delicious,1
puppy,hush,1
option,many,1
egg,large,1
food price,great good,1
thing,good,1
setup,kind of,1
bar area,actual,1


## Rating 5, 20 random reviews

In [79]:
extract_adj_noun_pair(5, 20, df, 10)

count before postprocessing 96
count after postprocessing 96


Unnamed: 0,Unnamed: 1,count
tire,new,2
service,great,2
experience,great,2
food,great,2
shop,best,1
michelle,best,1
vip admission,free,1
club,other,1
hour wait,ridiculous,1
entry,extremely fast,1


## Testing extractor

In [80]:
noun_adj_pairs = get_noun_adj_pair(['Environment, and atmosphere was thoroughly enjoyable.'])

In [81]:
noun_adj_pairs

{('environment', 'thoroughly enjoyable'): 1,
 ('atmosphere', 'thoroughly enjoyable'): 1}

In [84]:
noun_adj_pairs = get_noun_adj_pair(['Quick and attentive service, despite being continuously busy.'])

In [85]:
noun_adj_pairs

{('service', 'attentive'): 1}

In [101]:
noun_adj_pairs = get_noun_adj_pair(['Potatoes were extremely well seasoned'])

In [102]:
noun_adj_pairs

{}

In [95]:
noun_adj_pairs = get_noun_adj_pair(['The fresh mozzarella was fantastic and the pizzas were great'])

In [96]:
noun_adj_pairs

{('mozzarella', 'fresh'): 1, ('mozzarella', 'fantastic'): 1}

In [55]:
print('count before postprocessing {}'.format(len(noun_adj_pairs)))
noun_adj_pairs = postprocessing(noun_adj_pairs)
noun_adj_pairs = sort_dict(noun_adj_pairs)
print('count after postprocessing {}'.format(len(noun_adj_pairs)))

count before postprocessing 239
count after postprocessing 238


In [56]:
noun_adj_pairs_df = pd.DataFrame(noun_adj_pairs, index=['count']).transpose()
noun_adj_pairs_df

Unnamed: 0,Unnamed: 1,count
water,hot cold,3
order,wrong,2
people,asian,2
service,poor,2
food,cold,2
...,...,...
shift,1st 2nd,1
place,first,1
time,first,1
time,certain,1


In [None]:
# sort and get top 5
# Coreference Resolution?

In [102]:
reviews = extract_random_reviews_from_different_biz(2, 1, df)

In [107]:
print(get_noun_adj_pair(reviews))

{('time', 'same'): 1, ('wings', 'alright'): 1, ('wings', 'at best'): 1, ('napkins', 'more'): 1, ('credit card terminal', 'handheld'): 1, ('food', 'somewhere'): 1, ('service', 'somewhere'): 1, ('fees', 'extra'): 1, ('toenail', 'loose'): 1, ('stone massage', "'"): 1, ('stone massage', 'barely'): 1, ('legs', 'lower'): 1, ('lines', 'few'): 1, ('dollars', 'more'): 1, ('place', 'appalling'): 1}


In [109]:
print(get_noun_adj_pair(reviews))

{1: [('time', 'same'), ('wings', 'alright'), ('wings', 'at best'), ('napkins', 'more'), ('credit card terminal', 'handheld'), ('food', 'somewhere'), ('service', 'somewhere')], 2: [('fees', 'extra'), ('toenail', 'loose'), ('stone massage', "'"), ('stone massage', 'barely'), ('legs', 'lower'), ('lines', 'few'), ('dollars', 'more'), ('place', 'appalling')]}


In [105]:
print(get_noun_adj_pair1(reviews))

{('', 'more'): 1, ('', 'extra'): 1}


In [80]:
get_all_biz(df)

array(['eU_713ec6fTGNO4BegRaww', '3fw2X5bZYeW9xCz_zGhOHg',
       '6lj2BJ4tJeu7db5asGHQ4w', 'XTzKRvWciP_BZ9imk8mmPQ',
       'CDpoJiqgM04wqQTZ6QkTqQ', 'm7ommfJFalI47UksyX04Iw',
       '6sATfFam9_q9uod0I2aCsg', 'SU56w479vUfFHsvmvQIf7A',
       'CGUK3cd2gxp2q3KPY19Oog', 'vhIJ91MDgUuk4Cr9Kpj1Nw',
       '8KmqWgL0UEdxJFwTZ_YZvQ', 'EgwGTDZ705TwudPJwAY0yQ',
       'GdCRQU3VCh_x8fY84mbwYQ', 'qaPSbg690KaX5av6xsSV4Q',
       '9mIm1ef-NVDQHFE39Faxxg', '9nHpqlu7be1466wdo_t5kQ',
       'FStFa2esHFgsAjVzIGJcQA', 'Z66xO_B7trDah8F0PKwRqg',
       'Nc7cqq5k0WZsVhu25QrmNQ'], dtype=object)

In [14]:
df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,fdiNeiN_hoCxCMy2wTRW9g,w31MKYsNFMrjhWxxAb5wIw,eU_713ec6fTGNO4BegRaww,4,0,0,0,I'll be the first to admit that I was not exci...,2013-01-20 13:25:59
1,G7XHMxG0bx9oBJNECG4IFg,jlu4CztcSxrKx56ba1a5AQ,3fw2X5bZYeW9xCz_zGhOHg,3,5,4,5,Tracy dessert had a big name in Hong Kong and ...,2016-05-07 01:21:02
2,rEITo90tpyKmEfNDp3Ou3A,6Fz_nus_OG4gar721OKgZA,6lj2BJ4tJeu7db5asGHQ4w,5,0,0,0,We've been a huge Slim's fan since they opened...,2017-05-26 01:23:19
3,bjD0Dqn3k-fi00BXatrytg,1fi6x4tnJtlVWaJmoIO9XA,XTzKRvWciP_BZ9imk8mmPQ,1,5,1,0,I tried this place because my girls are away f...,2014-06-27 21:32:31
4,CelUWzp-GnJIiiV1mDUb-g,tFICmdLtwgFIRcwtlbYQOg,CDpoJiqgM04wqQTZ6QkTqQ,1,0,0,0,Love this place downtown but the Scottsdale lo...,2015-12-05 02:37:03


In [46]:
biz = df.iloc[[2]]
print(biz['business_id'].values[0])

6lj2BJ4tJeu7db5asGHQ4w


In [29]:
type(df.iloc[[2]]['business_id'])

pandas.core.series.Series