# Import libraries

In [26]:
import pandas as pd
import spacy
import en_core_web_sm
import random
from spacy import displacy

In [187]:
# Create the output directory that the per-rating CSV exports further down
# write into. parents=True also creates any missing intermediate directories,
# and exist_ok=True keeps this cell safe to re-run when the directory exists.
from pathlib import Path
Path("../result/").mkdir(parents=True, exist_ok=True)

# Explore data

In [11]:
# Load the review dataset; lines=True means the file holds one JSON object per line.
# NOTE(review): the file is decoded as ISO-8859-1 rather than pandas' default
# UTF-8 - confirm this matches the file's real encoding, otherwise non-ASCII
# characters in the review text will be mis-decoded.
df = pd.read_json('../data/reviewSelected100.json', lines=True, encoding = "ISO-8859-1")

In [12]:
df.describe()

Unnamed: 0,stars,useful,funny,cool
count,15300.0,15300.0,15300.0,15300.0
mean,3.646601,1.292745,0.43,0.514575
std,1.45513,3.241261,1.866658,2.33405
min,1.0,0.0,0.0,0.0
25%,3.0,0.0,0.0,0.0
50%,4.0,0.0,0.0,0.0
75%,5.0,1.0,0.0,0.0
max,5.0,191.0,122.0,180.0


In [13]:
df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,8aoJJdKEO3ypoZNszpPu7Q,bGgAL09pxLnV_FFgR4ZADg,ZBE-H_aUlicix_9vUGQPIQ,5,0,0,0,We had my Mother's Birthday Party here on 10/2...,2016-11-09 20:07:25
1,J5NOCLdhuhor7USRhtYZ8w,pFCb-1j6oI3TDjr26h2cJQ,e-YnECeZNt8ngm0tu4X9mQ,4,0,0,0,Good Korean grill near Eaton Centre. The marin...,2015-12-05 05:06:43
2,PXiLWAYRt3xnHaJ8MB4rzw,mEzc6LeTNiQgIVsq3poMbg,j7HO1YeMQGYo3KibMXZ5vg,5,2,1,3,Was recommended to try this place by few peopl...,2014-10-11 05:16:15
3,VrLarvxZYJm74yAqtpe9PQ,o-zUN2WEZgjQS7jnNsec0g,7e3PZzUpG5FYOTGt3O3ePA,3,0,0,0,Ambience: Would not expect something this nice...,2016-07-25 03:45:26
4,C1CUpidlVFprUCkApqzCmA,Wlx0iBXJvk4x0EeOt2Bz1Q,vuHzLZ7nAeT-EiecOkS5Og,1,11,0,3,Absolutely the WORST pool company that I have ...,2016-04-11 18:49:11


# Implement extractor

Most frequent ⟨ Noun - Adjective ⟩ pairs for each rating. Each review has a “star” rating in the range of
1 to 5. Randomly select 50 reviews (one from each business) of rating 1, extract the top-10 most frequent
noun-adjective pairs from the sentences in these selected reviews. Example noun-adjective pairs are service-great,
food-delicious, that appear in the same sentence. Do the same for 20 reviews of ratings 2, 3, 4, and 5,
respectively. Discuss your results and limitations of your method.

### Functions to randomly pick N reviews with a given star rating, each from a different business

In [20]:
# Sample N random reviews with a given star rating, at most one review per business
# (e.g. 50 reviews for rating 1, 20 reviews for each of the other ratings).
# Reviews are drawn at random; a review whose business has already been picked is skipped.


def get_all_biz(df):
    """Return the distinct business ids of ``df`` in ascending order.

    Args:
        df: DataFrame with a ``business_id`` column.

    Returns:
        A NumPy array holding each ``business_id`` value once, sorted ascending.
    """
    # The previous version called sorted(biz) and threw the result away, so the
    # ids came back in first-appearance order. Sort them for real here.
    return df['business_id'].drop_duplicates().sort_values().to_numpy()

def get_random(df_len, seed=10086):
    """Draw a random integer between 0 and ``df_len``, both ends included.

    ``seed`` is accepted but never read: the value comes from the current
    state of the global ``random`` generator. Callers that index a sequence
    with the result must therefore pass ``len(sequence) - 1``.
    """
    lowest, highest = 0, df_len
    return random.randint(lowest, highest)

def extract_random_reviews_from_different_biz(num_reviews, star_rating, df):
    """Pick ``num_reviews`` random review texts with a given rating, one per business.

    Args:
        num_reviews: how many reviews to return.
        star_rating: only rows whose ``stars`` equals this value are eligible.
        df: DataFrame with ``stars``, ``business_id`` and ``text`` columns.

    Returns:
        A list of ``num_reviews`` review texts, each from a different business.
        An empty list when ``num_reviews`` is zero or negative.

    Raises:
        ValueError: if fewer than ``num_reviews`` distinct businesses have a
            review with this rating. The earlier rejection-sampling loop
            never terminated in that situation.
    """
    if num_reviews <= 0:
        return []

    # filter star rating
    filter_df = df.loc[df['stars'] == star_rating]
    biz_ids = filter_df['business_id'].tolist()
    texts = filter_df['text'].tolist()

    # Walk the rows in a random order and keep the first review seen for each
    # business. This is equivalent to drawing rows at random and rejecting
    # businesses that were already picked, but it always terminates.
    positions = list(range(len(biz_ids)))
    random.shuffle(positions)

    random_review = {}
    for pos in positions:
        biz_name = biz_ids[pos]
        if biz_name in random_review:
            continue
        random_review[biz_name] = texts[pos]
        if len(random_review) == num_reviews:
            break

    if len(random_review) < num_reviews:
        raise ValueError(
            'requested {} reviews from different businesses, but only {} '
            'businesses have a review with rating {}'.format(
                num_reviews, len(random_review), star_rating))

    # convert to list
    return list(random_review.values())

# descending
def sort_dict(data_dict):
    """Return a new dict with the entries of ``data_dict`` ordered by value, largest first.

    Entries with equal values keep their original relative order.
    """
    def value_of(entry):
        return entry[1]

    ordered_entries = list(data_dict.items())
    ordered_entries.sort(key=value_of, reverse=True)
    return dict(ordered_entries)


### Function to extract adj-noun pair

In [4]:
# Load the 'en_core_web_sm' spaCy pipeline once; get_noun_adj_pair and
# postprocessing both call this module-level `nlp` object to parse text.
nlp = spacy.load('en_core_web_sm')

In [6]:
def get_noun_adj_pair(reviews):
    """Count (noun, modifier) pairs across a list of review texts.

    Every review is parsed with the module-level ``nlp`` pipeline and each
    noun chunk is checked for two patterns:

    * modifiers inside the chunk itself, e.g. "good food";
    * modifiers that are children of a later auxiliary which also has one of
      the chunk's nouns as a child, e.g. "the food was good".

    Tokens tagged ADJ, ADV or PART count as modifiers (PART covers negation);
    tokens tagged NOUN or PROPN count as nouns, except the word "one".

    Args:
        reviews: iterable of review strings.

    Returns:
        dict mapping ``(noun text, modifier text)`` tuples to the number of
        times the pair was seen. Both strings are lower-cased and multi-token
        phrases are joined with single spaces.
    """
    modifier_tags = ("ADJ", "ADV", "PART")
    noun_tags = ("NOUN", "PROPN")

    noun_adj_pair_dict = {}
    for review in reviews:
        doc = nlp(review)
        for chunk in doc.noun_chunks:
            np_adj = []                  # modifier phrases found for this chunk
            np_noun_only = []            # noun tokens of this chunk
            extracted_np_adj_temp = []   # phrase being built before it joins np_adj

            # adjective right before noun eg 'good food'
            for token in chunk:
                if token.pos_ in modifier_tags:
                    extracted_np_adj_temp.append(token)
                elif token.pos_ in noun_tags and str(token) != 'one':
                    np_noun_only.append(token)
            if len(extracted_np_adj_temp) > 0:
                np_adj.append(extracted_np_adj_temp)

            # sentences with auxiliary words such as "food was good"
            extracted_np_adj_temp = []

            # Scan the tokens after the chunk for auxiliaries ("was", "were",
            # "is", "are"). chunk.end is the index right after the chunk's
            # last token, so no leaked loop variable is needed.
            for j in range(chunk.end, len(doc)):
                # Exact comparison: the old `in ('AUX')` was a substring test
                # on the string 'AUX' and also accepted tokens tagged 'X'.
                if doc[j].pos_ == 'AUX':
                    aux_found = False
                    neg_conj_found = False

                    # find in the children of the aux
                    for child in list(doc[j].children):
                        # Start a fresh phrase unless the previous child was a
                        # negation/conjunction. In that case keep extending
                        # the same list (which may already sit inside np_adj),
                        # so that "not" + "good" ends up as one phrase.
                        if not neg_conj_found:
                            extracted_np_adj_temp = []
                        else:
                            neg_conj_found = False
                        if child.pos_ in ("PART", "CCONJ"):
                            neg_conj_found = True
                        if (child.pos_ in noun_tags and str(child) != 'one') and (child in chunk):
                            aux_found = True

                        # find corresponding adjective for the noun
                        if aux_found and child.pos_ in modifier_tags:
                            # pick up preceding intensifiers, e.g. the "very" in "very happy"
                            for grandchild in child.children:
                                if grandchild.pos_ in modifier_tags and grandchild.i < child.i:
                                    extracted_np_adj_temp.append(grandchild)
                            extracted_np_adj_temp.append(child)
                            if extracted_np_adj_temp not in np_adj:
                                np_adj.append(extracted_np_adj_temp)

            if len(np_noun_only) != 0 and len(np_adj) != 0:
                noun_text = " ".join([str(x).lower() for x in np_noun_only])
                for adj_tokens in np_adj:
                    noun_adj_pair = (noun_text, " ".join([str(x).lower() for x in adj_tokens]))
                    noun_adj_pair_dict[noun_adj_pair] = noun_adj_pair_dict.get(noun_adj_pair, 0) + 1

    return noun_adj_pair_dict

In [72]:
def preprocessing(reviews):
    """Return a new list with every review text converted to lower case."""
    lowered = []
    for text in reviews:
        lowered.append(text.lower())
    return lowered

In [43]:
def postprocessing(data_dict):
    """Merge pair counts whose nouns share the same lemma.

    Args:
        data_dict: dict mapping ``(noun, adj)`` tuples to counts.

    Returns:
        A new dict keyed by ``(lemmatised noun, lower-cased adj)``. The noun
        is run through the module-level ``nlp`` pipeline and its lower-cased
        token lemmas are joined with spaces. Counts of pairs that collapse to
        the same key are summed.
    """
    merged = {}
    for pair, count in data_dict.items():
        lemmas = []
        for token in nlp(pair[0]):
            lemmas.append(token.lemma_.lower())
        key = (' '.join(lemmas), pair[1].lower())
        merged[key] = merged.get(key, 0) + count
    return merged
        

# Run on data

In [73]:
## Generic pipeline: sample reviews for one rating (each from a different business) and return the most frequent noun-adjective pairs

def extract_adj_noun_pair(rating, num_review, df, top_n_most_frequent, seed=None):
    """Sample reviews for one rating and return their most frequent noun-adjective pairs.

    Pipeline: random sampling (one review per business) -> lower-casing ->
    pair extraction -> noun lemmatisation/merging -> sort by count.

    Args:
        rating: star rating whose reviews are sampled.
        num_review: number of reviews to sample.
        df: review DataFrame handed to the sampling helper.
        top_n_most_frequent: number of rows to keep in the result. This was
            previously ignored in favour of a hard-coded 10.
        seed: optional seed for the global ``random`` generator, so a run can
            be reproduced. ``None`` (the default) leaves the generator untouched.

    Returns:
        DataFrame indexed by (noun, adjective) with a single ``count`` column,
        holding the ``top_n_most_frequent`` most frequent pairs.
    """
    if seed is not None:
        random.seed(seed)

    # get random reviews
    random_reviews = extract_random_reviews_from_different_biz(num_review, rating, df)

    # preprocessing
    random_reviews = preprocessing(random_reviews)

    # extract noun-adj pair
    noun_adj_pairs = get_noun_adj_pair(random_reviews)

    # postprocessing
    print('count before postprocessing {}'.format(len(noun_adj_pairs)))
    noun_adj_pairs = postprocessing(noun_adj_pairs)
    noun_adj_pairs = sort_dict(noun_adj_pairs)
    print('count after postprocessing {}'.format(len(noun_adj_pairs)))

    # convert & return to dataframe
    noun_adj_pairs_df = pd.DataFrame(noun_adj_pairs, index=['count']).transpose()
    return noun_adj_pairs_df.head(top_n_most_frequent)

## Rating 1, 50 random reviews

In [129]:
rating_1 = extract_adj_noun_pair(1, 50, df, 10)

count before postprocessing 298
count after postprocessing 295


In [130]:
rating_1

Unnamed: 0,Unnamed: 1,count
money,hard,3
service,horrible,3
tool,hot,3
place,other,3
time,long,2
service,bad,2
cook,new,2
wait,long,2
deal,big,2
information,personal,2


In [132]:
rating_1.to_csv('../result/rating_1_top_10_frequently_used.csv')

## Rating 2, 20 random reviews

In [161]:
rating_2 = extract_adj_noun_pair(2, 20, df, 10)

count before postprocessing 163
count after postprocessing 162


In [162]:
rating_2

Unnamed: 0,Unnamed: 1,count
service,slow,2
meat sandwich,loose,2
part,best,2
restaurant,clean,2
dish,deep,2
food,vietnamese,1
curry soup basis,very flavourful,1
beef,impossible,1
time,absolute worse,1
soup curry,good,1


In [163]:
rating_2.to_csv('../result/rating_2_top_10_frequently_used.csv')

## Rating 3, 20 random reviews

In [170]:
rating_3 = extract_adj_noun_pair(3, 20, df, 10)

count before postprocessing 116
count after postprocessing 116


In [171]:
rating_3

Unnamed: 0,Unnamed: 1,count
service,good,2
service,fast,2
restaurant,other,2
trio,awesome,2
food,average,2
pancake,good,1
breakfast,fast,1
breakfast,speedy,1
guy,great,1
wine sommelier,onsite,1


In [172]:
rating_3.to_csv('../result/rating_3_top_10_frequently_used.csv')

## Rating 4, 20 random reviews

In [175]:
rating_4 = extract_adj_noun_pair(4, 20, df, 10)

count before postprocessing 124
count after postprocessing 124


In [176]:
rating_4

Unnamed: 0,Unnamed: 1,count
food,good,3
place,great,2
fudge,hot,2
place,good,2
strip mall,small,1
parking lot,small,1
monkey waffle,funky,1
waffle,terrific,1
edge,crisp,1
vehicle,excellent,1


In [177]:
rating_4.to_csv('../result/rating_4_top_10_frequently_used.csv')

## Rating 5, 20 random reviews

In [184]:
rating_5 = extract_adj_noun_pair(5, 20, df, 10)

count before postprocessing 103
count after postprocessing 103


In [185]:
rating_5

Unnamed: 0,Unnamed: 1,count
food,great,2
staff,friendly,2
stuff,other,2
good,own,2
occasion,multiple,1
experience,good,1
vibe,great,1
place,favorite,1
service,always clean fast,1
food,delicious,1


In [186]:
rating_5.to_csv('../result/rating_5_top_10_frequently_used.csv')

## Testing extractor

In [80]:
noun_adj_pairs = get_noun_adj_pair(['Environment, and atmosphere was thoroughly enjoyable.'])

In [81]:
noun_adj_pairs

{('environment', 'thoroughly enjoyable'): 1,
 ('atmosphere', 'thoroughly enjoyable'): 1}

In [84]:
noun_adj_pairs = get_noun_adj_pair(['Quick and attentive service, despite being continuously busy.'])

In [85]:
noun_adj_pairs

{('service', 'attentive'): 1}

In [101]:
noun_adj_pairs = get_noun_adj_pair(['Potatoes were extremely well seasoned'])

In [102]:
noun_adj_pairs

{}

In [95]:
noun_adj_pairs = get_noun_adj_pair(['The fresh mozzarella was fantastic and the pizzas were great'])

In [96]:
noun_adj_pairs

{('mozzarella', 'fresh'): 1, ('mozzarella', 'fantastic'): 1}