In [1]:
import pandas as pd
import spacy

In [2]:
import time

In [3]:
# The first CSV column is an unnamed, saved row index (read as data it shows
# up as an "Unnamed: 0" column), so load it as the frame's index instead.
df_raw = pd.read_csv('../Data/review.csv', index_col=0)

In [4]:
# Preview the first rows to check which columns were loaded.
df_raw.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,8aoJJdKEO3ypoZNszpPu7Q,bGgAL09pxLnV_FFgR4ZADg,ZBE-H_aUlicix_9vUGQPIQ,5.0,0,0,0,We had my Mother's Birthday Party here on 10/2...,2016-11-09 20:07:25
1,J5NOCLdhuhor7USRhtYZ8w,pFCb-1j6oI3TDjr26h2cJQ,e-YnECeZNt8ngm0tu4X9mQ,4.0,0,0,0,Good Korean grill near Eaton Centre. The marin...,2015-12-05 05:06:43
2,PXiLWAYRt3xnHaJ8MB4rzw,mEzc6LeTNiQgIVsq3poMbg,j7HO1YeMQGYo3KibMXZ5vg,5.0,2,1,3,Was recommended to try this place by few peopl...,2014-10-11 05:16:15
3,VrLarvxZYJm74yAqtpe9PQ,o-zUN2WEZgjQS7jnNsec0g,7e3PZzUpG5FYOTGt3O3ePA,3.0,0,0,0,Ambience: Would not expect something this nice...,2016-07-25 03:45:26
4,C1CUpidlVFprUCkApqzCmA,Wlx0iBXJvk4x0EeOt2Bz1Q,vuHzLZ7nAeT-EiecOkS5Og,1.0,11,0,3,Absolutely the WORST pool company that I have ...,2016-04-11 18:49:11


## Tokenization using split

In [5]:
# Work on a copy so the frame loaded into df_raw is left unmodified.
df = df_raw.copy()

# Noun-Adjective Pairs

In [6]:
import random
from collections import Counter

# One seed value, used both for the stdlib `random` module here and as the
# `random_state` passed to pandas sampling in the analysis loop.
RANDOM_STATE = 4
random.seed(RANDOM_STATE)

# spaCy pipeline used for sentence splitting and part-of-speech tagging.
nlp = spacy.load("en_core_web_sm")

In [7]:
# Star rating -> number of businesses to sample for that rating.
# `ids` and `num_reviews` are the parallel lists consumed by the analysis loop.
reviews_per_rating = {
    1.0: 50,
    2.0: 20,
    3.0: 20,
    4.0: 20,
    5.0: 20,
}
ids = list(reviews_per_rating.keys())
num_reviews = list(reviews_per_rating.values())

In [8]:
NOUN_TAGS = ('NOUN', 'PROPN')
ADJ_TAGS = ('ADJ',)


def _first_following(doc, start, wanted_tags):
    """Return the first token after index `start` whose `pos_` is in `wanted_tags`.

    Returns None when no later token in `doc` carries one of those tags.
    """
    for position in range(start + 1, len(doc)):
        if doc[position].pos_ in wanted_tags:
            return doc[position]
    return None


def extract_noun_adj_pairs(doc):
    """Collect (noun, adjective) string pairs from one parsed sentence.

    Two passes are made, in this order:
      1. every NOUN/PROPN token is paired with the first ADJ that follows it;
      2. every ADJ token is paired with the first NOUN/PROPN that follows it.
    A token with no matching partner later in the sentence adds nothing.
    The same noun/adjective combination can therefore be emitted by both
    passes when it occurs in both orders within the sentence.

    Returns a list of `(noun_text, adjective_text)` tuples of plain strings.
    """
    pairs = []
    for position, token in enumerate(doc):
        if token.pos_ in NOUN_TAGS:
            adjective = _first_following(doc, position, ADJ_TAGS)
            if adjective is not None:
                pairs.append((str(token), str(adjective)))
    for position, token in enumerate(doc):
        if token.pos_ in ADJ_TAGS:
            noun = _first_following(doc, position, NOUN_TAGS)
            if noun is not None:
                pairs.append((str(noun), str(token)))
    return pairs


# Dedicated generator seeded here, so re-running this cell selects the same
# businesses instead of continuing from wherever the global `random` state is.
rng = random.Random(RANDOM_STATE)

for star_rating, num_review in zip(ids, num_reviews):
    print(star_rating, " Star Reviews")
    print("-------------")

    # Reviews with the current star rating
    x_star_df = df.loc[df['stars'] == star_rating]

    # Distinct businesses that received this rating
    business_ids = x_star_df['business_id'].unique().tolist()

    # Sample `num_review` businesses; cap at what is available so a sparse
    # rating does not raise ValueError in sample().
    sample_size = min(num_review, len(business_ids))
    selected_ids = rng.sample(business_ids, sample_size)

    # Look up only the sampled businesses rather than materialising a
    # DataFrame for every business in the rating.
    reviews_by_business = x_star_df.groupby('business_id')

    # One review text per sampled business
    x_star_reviews = [
        reviews_by_business.get_group(business_id)
        .sample(n=1, random_state=RANDOM_STATE)
        .iloc[0]["text"]
        for business_id in selected_ids
    ]

    noun_adj_pairs = []
    for review_text in x_star_reviews:
        review_doc = nlp(review_text)
        for sent in review_doc.sents:
            # Each sentence is re-parsed on its own, as in earlier runs of
            # this analysis, so the tags used for pairing stay unchanged.
            sentence_doc = nlp(sent.text.strip())
            noun_adj_pairs.extend(extract_noun_adj_pairs(sentence_doc))

    pair_counts = Counter(noun_adj_pairs)
    print(pair_counts.most_common(10))
    print("-------------")


1.0  Star Reviews
-------------
[(('time', 'first'), 5), (('reviews', 'good'), 3), (('time', 'second'), 3), (('appointment', 'able'), 3), (('fly', 'dead'), 3), (('service', 'horrible'), 3), (('night', 'last'), 2), (('Charlotte', 'local'), 2), (('food', 'fast'), 2), (('quality', 'poor'), 2)]
-------------
2.0  Star Reviews
-------------
[(('night', 'entire'), 2), (('experience', 'bad'), 2), (('pepper', 'red'), 2), (('Lasagne', 'fresh'), 2), (('variation', 'small'), 2), (('table', 'small'), 2), (('order', 'accurate'), 2), (('estate', 'real'), 2), (('thing', 'Next'), 2), (('day', 'next'), 2)]
-------------
3.0  Star Reviews
-------------
[(('food', 'good'), 4), (('dogs', 'other'), 3), (('sauce', 'cranberry'), 3), (('food', 'overall'), 2), (('Staff', 'friendly'), 2), (('sangria', 'regular'), 2), (('orleans', 'new'), 2), (('table', 'double'), 2), (('tip', 'double'), 2), (('seat', 'hard'), 2)]
-------------
4.0  Star Reviews
-------------
[(('room', 'recreational'), 3), (('Service', 'fast'),