# CZ4045 Part 3.3

In [1]:
import json
import math
import random
import re
import string
from collections import Counter

import nltk.corpus
import pandas as pd
from nltk.tokenize import PunktSentenceTokenizer

In [2]:
# Importing dataset
# Each line of the file is parsed as one JSON object (one review per line), so
# the lines are decoded while streaming instead of being buffered with
# readlines() first.
# The encoding name was previously spelled 'ISO=8859-1'; the canonical
# spelling is used here so the lookup does not depend on codec-name
# normalisation.
with open('reviewSelected100.json', 'r', encoding='ISO-8859-1') as f:
    review = [json.loads(line) for line in f]
review = pd.DataFrame(review)

## Text cleaning

In [3]:
# function to remove short forms
def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    The irregular forms "won't" and "can't" are handled first; the generic
    suffix rules are then applied in a fixed order.

    Args:
        phrase: The text to expand.

    Returns:
        The text with every matched contraction replaced by its long form.
    """
    # specific
    # The first letter is captured and re-emitted so that a sentence-initial
    # "Won't" / "Can't" becomes "Will not" / "Can not". Without this they fall
    # through to the generic "n't" rule and come out as "Wo not" / "Ca not".
    phrase = re.sub(r"([Ww])on\'t", r"\1ill not", phrase)
    phrase = re.sub(r"([Cc])an\'t", r"\1an not", phrase)

    # general
    # Order matters: "n't" must run before the bare "'t" rule.
    # Note that the "'s" rule also rewrites possessives ("Denny's" -> "Denny is").
    for pattern, expansion in (
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    ):
        phrase = re.sub(pattern, expansion, phrase)
    return phrase

In [4]:
# Translation table that deletes every character listed in string.punctuation.
# It never changes, so it is built once here instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


# Function to remove punctuations
def remove_punctuation(text):
    """Return ``text`` with all ``string.punctuation`` characters deleted.

    Characters are removed, not replaced by a space, so "it's" becomes "its".

    Args:
        text: The string to clean.

    Returns:
        The string without punctuation marks.
    """
    return text.translate(_PUNCTUATION_TABLE)

In [5]:
# Number of distinct business ids in the dataset
review['business_id'].nunique()

153

In [6]:
# Distinct business ids; this is the pool the random draw below samples from
business_id = review['business_id'].unique()

## Selecting random business

In [7]:
# Randomly generate business_id
# NOTE(review): no random seed is set, so every run draws a different id, and
# the result is only displayed - the ids used in the next cell are hard-coded.
random.choices(business_id)[0]

'p6FPcgLymnpk_gAyQuW_Mw'

In [8]:
# Three hard-coded business ids serve as the demo samples.
# For each one, keep only its reviews and renumber the rows from zero
# (reset_index() keeps the old row labels as an extra 'index' column).
b1, b2, b3 = (
    review[review['business_id'] == sample_id].reset_index()
    for sample_id in (
        'QqGMtc24VdCzYAajw1g4bA',
        'I6MRD0JkDDa74TbVOJ1ykw',
        'Rii85bzYKGC9P0zOyAem6A',
    )
)

In [9]:
# Pre-process the texts
def _clean_reviews(texts):
    """Normalise a Series of raw review strings.

    Steps, in order: newlines become spaces, contractions are expanded,
    punctuation is deleted.

    Args:
        texts: pandas Series holding one review string per row.

    Returns:
        A new Series of cleaned strings with the same index.
    """
    return (
        texts
        # Series.replace("\n", " ") would only replace cells that are exactly
        # "\n"; the .str accessor replaces the newline inside each string.
        .str.replace("\n", " ", regex=False)
        .apply(decontracted)
        .apply(remove_punctuation)
    )


test1 = _clean_reviews(b1['text'])
test2 = _clean_reviews(b2['text'])
test3 = _clean_reviews(b3['text'])

In [31]:
# Print every cleaned review of the first business for manual inspection
for review_text in test1:
    print(review_text)

This is a chain that I am pretty sure is new to the Pittsburgh area  Mercifully this was built right behind the Denny is  If you go to Denny is instead of The Original for breakfast you are a moron

Portions are definitely big You might be just fine with a short stack

Who does not love chocolate chip pancakes If you get sick of them try the chocolate chip waffle

The Dutch Baby intrigued us for a while  We finally tried it and were underwhelmed  The Apple Pancakes are awesome though
Solid breakfast  Great service  Our waitress was a little older and so good  Bought the food out fast due to only two arms she would serve a couple of plates and then acknowledge what would be out next Know when to refill our cups and glasses  The Blueberry Pancakes were good with plump fresh blueberries and came with a small pitcher of fresh blueberries with some blueberry sauce   From experience know to order the short stack and our waitress also advised that it was 3 pancakes  Plenty to eat  The bacon w

## Define rules to extract adj. phrase

In [10]:
# Define grammar rules for adjective phrases
# NOTE(review): "RBS*" in the second alternative looks like a typo for "RBS";
# the string is left untouched here - confirm against the intended tag set.
chunk_grammar = """ADJP: {<JJ|JJS|JJR|RB><CC>*<JJ|JJS|JJR|VBG|VBN>+ | <RB|RBR|RBS*>+<JJ|JJR|JJS|VBN|VBG>+<CC>*<JJ|VBN|VBG>* | <JJR|VBN|VBD><IN><JJ|NN> | <DT><NN><JJ|JJR|JJS> | <JJ|JJR|JJS><JJ|JJR|JJS> }"""

# Define functions to extract adjective phrases
def adj_phr(business_review):
    """Collect every ADJP chunk found in a collection of review texts.

    Each review is contraction-expanded, tokenised, POS-tagged and chunked
    with ``chunk_grammar``. Errors are reported with ``print`` and never
    raised, so the function always returns a list.

    Args:
        business_review: Iterable of review strings.

    Returns:
        A list of the ADJP subtrees, in the order they were found.
    """
    adj_phrase = []
    try:
        # The grammar is constant, so the chunker is built once, not per review.
        parser = nltk.RegexpParser(chunk_grammar)
        reviews = enumerate(business_review)
    except Exception as e:
        print(str(e))
        return adj_phrase

    for position, text in reviews:
        try:
            tokens = nltk.word_tokenize(decontracted(text))
            tree = parser.parse(nltk.pos_tag(tokens))
        except LookupError as e:
            # Typically a missing NLTK resource; every later review would fail
            # the same way, so report once and stop.
            print(str(e))
            break
        except Exception as e:
            # One malformed review must not discard all the reviews after it.
            print(f"review {position}: {e}")
            continue
        adj_phrase.extend(tree.subtrees(filter=lambda t: t.label() == 'ADJP'))
    return adj_phrase

In [11]:
from collections import Counter

# Functions to return adjective phrases lists and dataframe
def extract_adj(input_data):
    """Extract adjective phrases and tabulate how often each one occurs.

    Args:
        input_data: Iterable of review strings, passed through to ``adj_phr``.

    Returns:
        A 3-tuple ``(adj_phrase, subtexts, adjcount_df)``:
        ``adj_phrase`` is the list of chunks returned by ``adj_phr``;
        ``subtexts`` holds the same chunks as space-joined strings, in the
        same order with duplicates kept; ``adjcount_df`` is a DataFrame with
        columns ``adj_phrase`` and ``count``, sorted by ``count`` descending.
    """
    adj_phrase = adj_phr(input_data)
    # Element 0 of each child of a chunk is joined with spaces to rebuild the phrase.
    subtexts = [' '.join(leaf[0] for leaf in chunk) for chunk in adj_phrase]

    adjcount = Counter(subtexts)
    # Naming the columns up front keeps 'count' present even when no phrase was
    # found, so the sort below no longer raises KeyError on an empty result.
    adjcount_df = pd.DataFrame(list(adjcount.items()), columns=['adj_phrase', 'count'])
    adjcount_df = adjcount_df.sort_values(by=['count'], ascending=False)

    return adj_phrase, subtexts, adjcount_df

In [12]:
# Run the extraction once per sampled business, in the order b1, b2, b3
(
    (b1_phrase, b1_subtexts, b1_adjcount_df),
    (b2_phrase, b2_subtexts, b2_adjcount_df),
    (b3_phrase, b3_subtexts, b3_adjcount_df),
) = map(extract_adj, (test1, test2, test3))

In [13]:
# Phrases extracted for the first business, in extraction order (duplicates kept)
b1_subtexts

['definitely big',
 'just fine',
 'little older',
 'so good',
 'fast due',
 'very thick',
 'quite noisy',
 'not amazing',
 'too thick',
 'thin and crispy',
 'very clean',
 'else Going',
 'quite large',
 'not been disappointed',
 'well done',
 'not cooked',
 'enough Granted',
 'completely cooked',
 'much better',
 'really easy',
 'much better',
 'too sweet',
 'right next',
 'always full',
 'always really good',
 'very nice',
 'little slow',
 'very busy',
 'very large',
 'absolutely horrible',
 'not so thick and good',
 'always good',
 'sweet and adorable',
 'outwardly rude',
 'not as good',
 'rude or condescending',
 'good and most',
 'not crazy',
 'awesome tropical',
 'adorable and talkative',
 'really fresh',
 'made from scratch',
 'pretty good',
 'extremely knowledgeable',
 'ever had',
 'always good',
 'very clean',
 'not rushed',
 'never really long',
 'not very busy',
 'very quick',
 'So good',
 'reasonably priced',
 'phenomenal sourdough',
 'bit much',
 'Not sure',
 'always sendin

In [14]:
# First ten rows of the count table for the first business (sorted by count, descending)
b1_adjcount_df.head(10)

Unnamed: 0,adj_phrase,count
98,very friendly,3
43,ever had,3
24,very nice,3
41,pretty good,3
60,so delicious,3
188,gluten free,3
193,too much,3
102,really good,3
138,not sure,3
20,too sweet,2


## Create Bag of Words

In [15]:
import numpy as np
# Shared vocabulary: union of the phrases of b1 and b2, then of that and b3
phraseset12 = np.union1d(b1_subtexts, b2_subtexts)
phraseset = np.union1d(phraseset12, b3_subtexts)

In [16]:
# Functions to generate the vector table
def calculateBOW(wordset,l_doc):
    """Count how often each vocabulary entry occurs in one document.

    Args:
        wordset: Iterable of vocabulary entries; each starts at count 0.
        l_doc: Iterable of the entries that occur in the document
            (duplicates allowed).

    Returns:
        A dict mapping every entry of ``wordset`` to its number of
        occurrences in ``l_doc``. Entries of ``l_doc`` that are missing from
        ``wordset`` are added after the vocabulary keys.
    """
    tf_diz = dict.fromkeys(wordset, 0)
    # One counting pass replaces calling l_doc.count() for every element,
    # which was quadratic in the document length.
    tf_diz.update(Counter(l_doc))
    return tf_diz

In [17]:
# One count vector per business over the shared vocabulary,
# stacked into a frame with one row per business
bow1, bow2, bow3 = (
    calculateBOW(phraseset, phrases)
    for phrases in (b1_subtexts, b2_subtexts, b3_subtexts)
)
df_bow = pd.DataFrame([bow1, bow2, bow3])
df_bow.head()

Unnamed: 0,Absolutely amazing,Absolutely delightful,Also eaten,As simple,Busy but seated,Finally dropped,However most importantlythe,Just ordered,Native American,Never been,...,well worth,well young,went for breakfast,went in midweek,were at first,were in trouble,worried about caloric,worst dining,yeasty sweet,yummy and aesthetic looking
0,1,0,1,0,0,0,1,0,0,0,...,1,1,1,1,1,0,1,0,0,1
1,0,1,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,1,0,1,1,0,...,0,0,0,0,0,1,0,1,1,0


In [18]:
# Flip the table: one row per phrase, one column per business.
# Re-running this cell on its own transposes the frame back again.
df_bow = df_bow.transpose()
df_bow

Unnamed: 0,0,1,2
Absolutely amazing,1,0,0
Absolutely delightful,0,1,0
Also eaten,1,0,0
As simple,0,1,0
Busy but seated,0,0,1
...,...,...,...
were in trouble,0,0,1
worried about caloric,1,0,0
worst dining,0,0,1
yeasty sweet,0,0,1


In [19]:
# Order the phrases by their count in the first business,
# then give the positional columns readable names
df_bow = (
    df_bow
    .sort_values(by=[0], ascending=False)
    .rename(columns={0: 'b1', 1: 'b2', 2: 'b3'})
)

In [20]:
# Phrase counts per business, ordered by the b1 column
df_bow

Unnamed: 0,b1,b2,b3
so delicious,3,0,1
ever had,3,0,1
very friendly,3,5,0
pretty good,3,3,2
very nice,3,4,0
...,...,...,...
not eaten,0,0,1
not done,0,1,2
not doing,0,0,1
not disappointed,0,1,0


## Calculate TF-IDF

In [21]:
# Obtain number of documents that contain each adjective phrase
# Only the three count columns are inspected, so re-running this cell (when
# 'freq' or later derived columns already exist) still gives the same result.
df_bow['freq'] = df_bow[['b1', 'b2', 'b3']].astype(bool).sum(axis=1)
df_bow

Unnamed: 0,b1,b2,b3,freq
so delicious,3,0,1,2
ever had,3,0,1,2
very friendly,3,5,0,2
pretty good,3,3,2,3
very nice,3,4,0,2
...,...,...,...,...
not eaten,0,0,1,1
not done,0,1,2,2
not doing,0,0,1,1
not disappointed,0,1,0,1


In [22]:
# Spot check: the row labels are the phrases themselves (here the third row)
df_bow.index[2]

'very friendly'

In [23]:
# Pool the phrases of all three businesses and count them across the pool
phrase_corpus = [*b1_subtexts, *b2_subtexts, *b3_subtexts]
phrase_count = Counter(phrase_corpus)

**Term Frequency (TF)** measures how frequently a term t appears in a document d.

$$\mathrm{TF}(t, d) = \frac{\text{number of occurrences of } t \text{ in } d}{\text{total number of terms in } d}$$

In [24]:
# Term frequency per business: the phrase count divided by the number of
# phrases extracted for that business.
# Computed column-wise. The previous row loop indexed a phrase-labelled Series
# with an integer (df_bow['b1'][i]), which relies on pandas' deprecated
# positional fallback.
tf_b1 = (df_bow['b1'] / len(b1_subtexts)).tolist()
tf_b2 = (df_bow['b2'] / len(b2_subtexts)).tolist()
tf_b3 = (df_bow['b3'] / len(b3_subtexts)).tolist()

In [25]:
# Attach the three term-frequency lists as new columns, then show the table
df_bow['TF(b1)'], df_bow['TF(b2)'], df_bow['TF(b3)'] = tf_b1, tf_b2, tf_b3
df_bow

Unnamed: 0,b1,b2,b3,freq,TF(b1),TF(b2),TF(b3)
so delicious,3,0,1,2,0.009677,0.000000,0.003003
ever had,3,0,1,2,0.009677,0.000000,0.003003
very friendly,3,5,0,2,0.009677,0.018868,0.000000
pretty good,3,3,2,3,0.009677,0.011321,0.006006
very nice,3,4,0,2,0.009677,0.015094,0.000000
...,...,...,...,...,...,...,...
not eaten,0,0,1,1,0.000000,0.000000,0.003003
not done,0,1,2,2,0.000000,0.003774,0.006006
not doing,0,0,1,1,0.000000,0.000000,0.003003
not disappointed,0,1,0,1,0.000000,0.003774,0.000000


**Inverse Document Frequency (IDF)** measures how important a term is. 

$$\mathrm{IDF}(t) = \log\left(\frac{\text{number of documents}}{\text{number of documents containing } t + 1}\right)$$

In [26]:
# Inverse document frequency: log(N / (df + 1)), where df is the 'freq' column.
# With N = 3 a phrase found in two documents scores 0 and one found in all
# three scores below 0.
N_DOCS = 3  # the three businesses b1, b2 and b3
# Iterating over the column values avoids integer lookups on the
# phrase-labelled index (deprecated positional fallback in pandas).
IDF = [math.log(N_DOCS / (doc_freq + 1)) for doc_freq in df_bow['freq']]
df_bow['IDF'] = IDF

**TF-IDF**: A high score means the term is frequent in a single document but rare across the other documents.

In [27]:
# TF-IDF of every phrase for the first business: TF(b1) times IDF, row by row
df_bow['TF-IDF(b1)'] = df_bow['TF(b1)'].mul(df_bow['IDF'])

In [28]:
# Table with the counts, TF, IDF and the TF-IDF score for b1
df_bow

Unnamed: 0,b1,b2,b3,freq,TF(b1),TF(b2),TF(b3),IDF,TF-IDF(b1)
so delicious,3,0,1,2,0.009677,0.000000,0.003003,0.000000,0.000000
ever had,3,0,1,2,0.009677,0.000000,0.003003,0.000000,0.000000
very friendly,3,5,0,2,0.009677,0.018868,0.000000,0.000000,0.000000
pretty good,3,3,2,3,0.009677,0.011321,0.006006,-0.287682,-0.002784
very nice,3,4,0,2,0.009677,0.015094,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...
not eaten,0,0,1,1,0.000000,0.000000,0.003003,0.405465,0.000000
not done,0,1,2,2,0.000000,0.003774,0.006006,0.000000,0.000000
not doing,0,0,1,1,0.000000,0.000000,0.003003,0.405465,0.000000
not disappointed,0,1,0,1,0.000000,0.003774,0.000000,0.405465,0.000000


Obtain the most indicative phrases by sorting the dataframe by TF-IDF score in descending order. Here we select the 10 phrases with the highest TF-IDF scores.

In [29]:
# Rank the phrases by their TF-IDF score for b1 (highest first) and show the top ten
most_indicative = df_bow.sort_values(by=['TF-IDF(b1)'], ascending=False)
most_indicative.head(10)

Unnamed: 0,b1,b2,b3,freq,TF(b1),TF(b2),TF(b3),IDF,TF-IDF(b1)
gluten free,3,0,0,1,0.009677,0.0,0.0,0.405465,0.003924
very large,2,0,0,1,0.006452,0.0,0.0,0.405465,0.002616
deep fried,2,0,0,1,0.006452,0.0,0.0,0.405465,0.002616
not as good,2,0,0,1,0.006452,0.0,0.0,0.405465,0.002616
fresh squeezed,2,0,0,1,0.006452,0.0,0.0,0.405465,0.002616
relatively small,2,0,0,1,0.006452,0.0,0.0,0.405465,0.002616
too sweet,2,0,0,1,0.006452,0.0,0.0,0.405465,0.002616
sometimes crunchy,1,0,0,1,0.003226,0.0,0.0,0.405465,0.001308
somewhat mixed,1,0,0,1,0.003226,0.0,0.0,0.405465,0.001308
still coldpartially frozen,1,0,0,1,0.003226,0.0,0.0,0.405465,0.001308
