# CZ4045 Part 3.3

In [1]:
import re
import math
import json
import string
import random
import pandas as pd
import nltk.corpus
from nltk.tokenize import PunktSentenceTokenizer

In [2]:
with open('reviewSelected100.json','r', encoding = 'ISO=8859-1') as f:
    review = f.readlines()
review = [json.loads(line) for line in review]
review = pd.DataFrame(review)

## Text cleaning

In [3]:
# function to remove short forms
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [4]:
import nltk
from nltk.corpus import stopwords
sw = stopwords.words('english')

In [5]:
def stopwords(text):
    '''a function for removing the stopword'''
    # removing the stop words and lowercasing the selected words
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    # joining the list of words with space separator
    return " ".join(text)

In [6]:
review['business_id'].nunique()

153

In [7]:
business_id = review['business_id'].unique()

## Selecting random business

In [8]:
random.choices(business_id)[0]

'paCgjLEUEVM38VZ6Sjghgg'

In [9]:
#samples = review[review['business_id'] != '7e3PZzUpG5FYOTGt3O3ePA'].reset_index()
#sample = review['text'].head(20)
#sample = sample.apply(remove_punctuation)
#sample
#for i in range(len(sample)):
#    sample[i] = sample[i].replace("\n"," ")
#test_sample = samples['text']
#test_sample = test_sample.replace("\n", " ")

In [10]:
b1 = review[review['business_id'] == random.choices(business_id)[0]].reset_index()
b2 = review[review['business_id'] == random.choices(business_id)[0]].reset_index()
b3 = review[review['business_id'] == random.choices(business_id)[0]].reset_index()

In [11]:
test1 = b1['text']
test1 = test1.replace("\n"," ")

test2 = b2['text']
test2 = test2.replace("\n", " ")

test3 = b3['text']
test3 = test3.replace("\n", " ")


In [12]:
#for i in test1:
#    print(decontracted(i))

In [13]:
#test1 = test1.apply(stopwords)
#test2 = test2.apply(stopwords)
#test3 = test3.apply(stopwords)

## Define rules to extract adj. phrase

In [14]:
# INCLUDE NEGATION
chunk_grammar = """ADJP: {<JJ|JJS|JJR><,>*<CC>*<JJ|JJS|JJR|VBG>+ | <RB|RBR|RBS.*>+<JJ|JJR|JJS|VBN|VBG>+<CC>*<JJ|VBN|VBG>* | <JJ|RB><CC><JJ|VBN|VBG> | <JJR|VBN><IN><JJ|NN> | <DT><NN><JJ|JJR|JJS> | <IN><NN><IN><DT><NN> | <VBD><IN><NN|NNS>+}"""

def adj_phr(business_review):
    adj_phrase = []
    try:
        for i in business_review:
            i = decontracted(i)
            tokens = nltk.word_tokenize(i)
            tags = nltk.pos_tag(tokens)
            parser = nltk.RegexpParser(chunk_grammar)
            tree = parser.parse(tags)
            
            for subtree in tree.subtrees(filter=lambda t: t.label() == 'ADJP'):
                adj_phrase.append(subtree)
                #print(subtree)
                     
    except Exception as e:
        print(str(e))
    return adj_phrase

In [15]:
from collections import Counter
# Functions to return adjective phrases lists and dataframe
def extract_adj(input_data):
    adj_phrase = adj_phr(input_data)
    subtexts = []
    for i in range(len(adj_phrase)):
        res =  [' '.join(tups[0] for tups in adj_phrase[i])]
        subtexts.append(res)
    
    for i in range(len(subtexts)):
        subtexts[i] = " ".join(subtexts[i])
        
    adjcount = Counter(subtexts)
    adjcount_df = pd.DataFrame.from_dict(adjcount, orient='index').reset_index()
    adjcount_df = adjcount_df.rename(columns={'index':'adj_phrase', 0:'count'})
    adjcount_df = adjcount_df.sort_values(by=['count'], ascending = False)
    
    return adj_phrase, subtexts, adjcount_df

In [16]:
b1_phrase, b1_subtexts, b1_adjcount_df = extract_adj(test1)
b2_phrase, b2_subtexts, b2_adjcount_df = extract_adj(test2)
b3_phrase, b3_subtexts, b3_adjcount_df = extract_adj(test3)

In [17]:
for i in test1:
    print(i)

We bring our two dogs (German Shepherd and Golden Retriever) here all the time! Usually on the weekends, we drive 45 min just to come here!
We love it & so do our pups!
Great drink deals too!
Kiddie pools in the summer to keep your pup nice and cool!
Great atmosphere! but you MUST be a dog lover!
Never seen a fight or anything. Sometimes you see one or two dogs trying to boss around others (there is a grey pitbull, that I think belongs to an employee that always does that) but nothing serious. 
Wish there was one of these closer to Gastonia/Kings Mountain!
This is my favorite dog bar in the area! I miss living closer to it. The staff is super nice, and atmosphere is very laid back. My dog has a blast here. It's wonderful!
Great concept in a great location, that being said there is a ton of room for improvement, the fence isn't very tall and there are a lot of areas that there is a bench pushed up to the fence making it very easy for any dog to escape. 
They allow any dog in no question

In [18]:
b1_subtexts

['nice and cool',
 'super nice',
 'very laid',
 'not very tall and',
 'very easy',
 'good looking',
 'very little',
 'Just moved',
 'a dozen other',
 'not blocking',
 'the dog urine',
 'fine overall',
 'no longer',
 'still confused',
 'too close',
 'illegally parked',
 'not towed',
 'very frustrated',
 'clearly marked parking',
 'least 2-3',
 'so so friendly and',
 'so many',
 'this place more',
 'a pet friendly',
 'not necessarily bad',
 'not quite sure',
 'was in heaven',
 'around exploring',
 'so fun',
 'so much',
 'really great',
 'pretty good',
 'other fur',
 'few human',
 'many fur',
 'really adorable',
 'really fun',
 'just sprayed',
 'not sure',
 'identified as staff',
 'watched by thier',
 'really cute',
 'especially fantastic',
 'fenced in area',
 'of enjoyment for this place',
 'much more reasonably priced',
 'super friendly',
 'always provided',
 'so comfy',
 'too much',
 'not so efficient',
 'well behaved',
 'not having',
 'were in town',
 'desperately missing',
 'so happy

## Create Bag of Words

In [19]:
import numpy as np
phraseset12 = np.union1d(b1_subtexts, b2_subtexts)
phraseset = np.union1d(phraseset12, b3_subtexts)

In [20]:
def calculateBOW(wordset,l_doc):
  tf_diz = dict.fromkeys(wordset,0)
  for word in l_doc:
      tf_diz[word]=l_doc.count(word)
  return tf_diz

In [21]:
bow1 = calculateBOW(phraseset,b1_subtexts)
bow2 = calculateBOW(phraseset,b2_subtexts)
bow3 = calculateBOW(phraseset,b3_subtexts)
df_bow = pd.DataFrame([bow1,bow2,bow3])
df_bow.head()

Unnamed: 0,AAA black,Also dogs,As bad,By recommendation of the staff,Clean and relaxing,Definately coming,Definitely been happy,Extremely clean,Friendly and able,Greek and Italian,...,were in town,wet pee,whole better,with top of the line,"wonderful , courteous","wonderful , friendly",worked in detail,worst bar/dog,yet so greasy,yet warm and easy going
0,0,1,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,1,0,0
1,0,0,1,0,1,0,1,1,0,0,...,0,0,0,1,1,1,1,0,0,1
2,1,0,0,1,0,1,0,0,1,1,...,0,0,1,0,0,0,0,0,1,0


In [22]:
df_bow = df_bow.T
df_bow

Unnamed: 0,0,1,2
AAA black,0,0,1
Also dogs,1,0,0
As bad,0,1,0
By recommendation of the staff,0,0,1
Clean and relaxing,0,1,0
...,...,...,...
"wonderful , friendly",0,1,0
worked in detail,0,1,0
worst bar/dog,1,0,0
yet so greasy,0,0,1


In [23]:
df_bow = df_bow.sort_values(by=[0], ascending = False)
df_bow = df_bow.rename(columns={0:'b1', 1:'b2', 2:'b3'})

In [24]:
df_bow

Unnamed: 0,b1,b2,b3
so much,4,3,2
well behaved,3,0,0
a bit unsanitary,3,0,0
not paying,3,0,0
really great,3,0,0
...,...,...,...
lacked in mass,0,0,1
least bit heavy,0,0,1
less than perfect,0,0,1
limp and soggy,0,0,1


## Calculate TF-IDF

In [25]:
df_bow['freq'] = df_bow.astype(bool).sum(axis=1)
df_bow

Unnamed: 0,b1,b2,b3,freq
so much,4,3,2,3
well behaved,3,0,0,1
a bit unsanitary,3,0,0,1
not paying,3,0,0,1
really great,3,0,0,1
...,...,...,...,...
lacked in mass,0,0,1,1
least bit heavy,0,0,1,1
less than perfect,0,0,1,1
limp and soggy,0,0,1,1


In [26]:
df_bow.index[2]

'a bit unsanitary'

In [27]:
phrase_corpus = b1_subtexts + b2_subtexts + b3_subtexts
phrase_count = Counter(phrase_corpus)

**Term Frequency (TF)** measures how frequently a term t appears in a document d.

TF = No. of t in d/No. of terms in d

In [28]:
tf_b1 = []
tf_b2 = []
tf_b3 = []
for i in range(len(df_bow)):
    tf_b1.append(df_bow['b1'][i]/len(b1_subtexts))
    tf_b2.append(df_bow['b2'][i]/len(b2_subtexts))
    tf_b3.append(df_bow['b3'][i]/len(b3_subtexts))

In [29]:
df_bow['TF(b1)'] = tf_b1
df_bow['TF(b2)'] = tf_b2
df_bow['TF(b3)'] = tf_b3
df_bow

Unnamed: 0,b1,b2,b3,freq,TF(b1),TF(b2),TF(b3)
so much,4,3,2,3,0.012085,0.007712,0.004975
well behaved,3,0,0,1,0.009063,0.000000,0.000000
a bit unsanitary,3,0,0,1,0.009063,0.000000,0.000000
not paying,3,0,0,1,0.009063,0.000000,0.000000
really great,3,0,0,1,0.009063,0.000000,0.000000
...,...,...,...,...,...,...,...
lacked in mass,0,0,1,1,0.000000,0.000000,0.002488
least bit heavy,0,0,1,1,0.000000,0.000000,0.002488
less than perfect,0,0,1,1,0.000000,0.000000,0.002488
limp and soggy,0,0,1,1,0.000000,0.000000,0.002488


**Inverse Document Frequency (IDF)** measures how important a term is. 

IDF = log(No. of documents/No. of documents with term t)

smoothing denominator +1

In [30]:
IDF = []
for i in range(len(df_bow)):
    IDF.append(math.log(3/df_bow['freq'][i]))
df_bow['IDF'] = IDF

**TF-IDF**: A term with high score means the term is both rare in all documents but also frequent in a single document

In [31]:
df_bow['TF-IDF(b1)'] = df_bow['TF(b1)']*df_bow['IDF'] 

In [32]:
df_bow

Unnamed: 0,b1,b2,b3,freq,TF(b1),TF(b2),TF(b3),IDF,TF-IDF(b1)
so much,4,3,2,3,0.012085,0.007712,0.004975,0.000000,0.000000
well behaved,3,0,0,1,0.009063,0.000000,0.000000,1.098612,0.009957
a bit unsanitary,3,0,0,1,0.009063,0.000000,0.000000,1.098612,0.009957
not paying,3,0,0,1,0.009063,0.000000,0.000000,1.098612,0.009957
really great,3,0,0,1,0.009063,0.000000,0.000000,1.098612,0.009957
...,...,...,...,...,...,...,...,...,...
lacked in mass,0,0,1,1,0.000000,0.000000,0.002488,1.098612,0.000000
least bit heavy,0,0,1,1,0.000000,0.000000,0.002488,1.098612,0.000000
less than perfect,0,0,1,1,0.000000,0.000000,0.002488,1.098612,0.000000
limp and soggy,0,0,1,1,0.000000,0.000000,0.002488,1.098612,0.000000


In [33]:
most_indicative = df_bow
most_indicative = most_indicative.sort_values(by=['TF-IDF(b1)'], ascending = False)
most_indicative.head(10)

Unnamed: 0,b1,b2,b3,freq,TF(b1),TF(b2),TF(b3),IDF,TF-IDF(b1)
a bit unsanitary,3,0,0,1,0.009063,0.0,0.0,1.098612,0.009957
not paying,3,0,0,1,0.009063,0.0,0.0,1.098612,0.009957
really great,3,0,0,1,0.009063,0.0,0.0,1.098612,0.009957
well behaved,3,0,0,1,0.009063,0.0,0.0,1.098612,0.009957
reasonably priced,2,0,0,1,0.006042,0.0,0.0,1.098612,0.006638
bit nervous,2,0,0,1,0.006042,0.0,0.0,1.098612,0.006638
really fun,2,0,0,1,0.006042,0.0,0.0,1.098612,0.006638
not ready,2,0,0,1,0.006042,0.0,0.0,1.098612,0.006638
good local,2,0,0,1,0.006042,0.0,0.0,1.098612,0.006638
super friendly,2,0,0,1,0.006042,0.0,0.0,1.098612,0.006638
