In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#### import data and libraries 

In [2]:
import json
import pandas as pd
import numpy as np
import math
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk import Tree
from nltk import pos_tag
from nltk.chunk import RegexpParser
from nltk import ne_chunk

In [3]:
with open('/content/drive/MyDrive/Y3S1/CZ4045 Natural language processing/reviewSelected100.json', encoding = "ISO-8859-1") as f:
    data = f.readlines()
data = [json.loads(line) for line in data] #convert string to dict format
data = pd.DataFrame(data)
data = data[['business_id', 'stars', 'text']]
data

Unnamed: 0,business_id,stars,text
0,ZBE-H_aUlicix_9vUGQPIQ,5.0,We had my Mother's Birthday Party here on 10/2...
1,e-YnECeZNt8ngm0tu4X9mQ,4.0,Good Korean grill near Eaton Centre. The marin...
2,j7HO1YeMQGYo3KibMXZ5vg,5.0,Was recommended to try this place by few peopl...
3,7e3PZzUpG5FYOTGt3O3ePA,3.0,Ambience: Would not expect something this nice...
4,vuHzLZ7nAeT-EiecOkS5Og,1.0,Absolutely the WORST pool company that I have ...
...,...,...,...
15295,shIPnFoXrL3dFo5HLH1_HA,1.0,This was the worst experience ever. So much so...
15296,zPEYgVqJ2QNKi45FJi2jvg,5.0,We come here every time we hit Vegas! A giant ...
15297,zPEYgVqJ2QNKi45FJi2jvg,1.0,As locals we used to the this place when it w...
15298,etzDsNjkCyQBoJcU2a3U-g,5.0,The food was delicious. We were seated in 15 m...


#### Generate Random reviews and tag

In [4]:
# Get random reviews from 50 business id and do POS tagging, return a list of tagged text
def get_random(rating):
  seed = 85
  sample_size = 50
  # Randomly select 1 review from 50 distinct business id with rating 1
  rating_1 = data[data['stars'] == rating]
  rating_1 = rating_1.groupby('business_id').apply(lambda x: x.sample(1)).reset_index(drop=True)
  # Convert all text to lowercase
  # lower_rating_1 = rating_1['text'].str.lower()
  random_rating_1 = rating_1.sample(sample_size, random_state = seed)['text'].reset_index(drop=True)
  tagged_low_rating = random_rating_1.str.split().map(pos_tag)
  return list(tagged_low_rating)

NN	noun, singular (cat, tree)

NNS	noun plural (desks)

NNP	proper noun, singular (sarah)

JJ	This NLTK POS Tag is an adjective (large)

JJR	adjective, comparative (larger)

JJS	adjective, superlative (largest)

Reference used: 

https://www.learntek.org/blog/named-entity-recognition-with-nltk/

https://www.guru99.com/pos-tagging-chunking-nltk.html

#### Find most frequent <noun - adj> pairs

In [5]:
from collections import Counter
def get_n_adj_pairs(tagged_sentences):
    noun_list = []
    adj_list = []
    grammar = r"""
    CHUNK1:
        {<NN.*><.*>?<JJ.*>}  # Any noun end with any Adjective, eg. cream is melting, service extremely slow, place poorly managed
    
    CHUNK2:
        {<JJ.*><.*>?<NN.*>}  # Nouns or adjectives, end with Nouns eg. perfect time, particular person

    CHUNK3: 
        {<NN.*><NN.*>}   # Noun as Adjectives, terminated with Nouns eg. school trip

    """
    cp = RegexpParser(grammar)
    for sentence in tagged_sentences:
        tree = cp.parse(sentence)
        for subtree in tree.subtrees(filter = lambda x: x.label() in ['CHUNK1', 'CHUNK2']):
          if (str(subtree).find('NN') > 0 or str(subtree).find('NNS') > 0 or str(subtree).find('NNP') > 0) and (str(subtree).find('JJ')> 0 or str(subtree).find('JJS')> 0 or str(subtree).find('JJR')> 0):
              nouns = [word for word, tag in subtree.leaves() if tag in ['NN', 'NNS', 'NNP']]
              adjss = [word for word, tag in subtree.leaves() if tag in ['JJ','JJR','JJS']]
              noun_list.extend([nouns])
              adj_list.extend([adjss])
    pair_list = [(m[0]+": "+n[0]) for m,n in zip(noun_list, adj_list)]
    c = Counter(pair_list)
    print(c.most_common(10))

In [6]:
for i in range(1, 6):
  print('For rating = ', i, 'top 10 most common pairs are')
  get_n_adj_pairs(get_random(i))
  print()

For rating =  1 top 10 most common pairs are
[('time: first', 5), ('day: next', 2), ('review,: negative', 2), ('taste: bad', 2), ('side: west', 2), ('cheese: big', 2), ('review: last', 2), ('thing: only', 2), ('place: same', 1), ('years: several', 1)]

For rating =  2 top 10 most common pairs are
[('time: second', 3), ('bit: little', 3), ('time: long', 2), ('time: first', 2), ('places: many', 2), ('time: next', 2), ('service: bad', 2), ("time: she'll", 2), ('time: last', 2), ('something: different', 1)]

For rating =  3 top 10 most common pairs are
[('amount: little', 3), ('service: good', 3), ('nicer: much', 2), ('dogs: other', 2), ('years: few', 2), ('bit: little', 2), ('anything: special', 2), ('food: Japanese', 2), ("didn't: much", 2), ('Foodland: former', 1)]

For rating =  4 top 10 most common pairs are
[('day,: next', 4), ('food: good', 3), ('time: first', 2), ('light: little', 2), ('strawberry: fresh', 2), ('Service: quick', 1), ('ambience: quiet', 1), ('key!: low', 1), ('brunc