In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Change to any path
path = 'yelp_academic_dataset_review.json'
reviews = pd.read_json(path, lines=True, chunksize=10000)
reviews_for_filter = pd.read_json(path, lines=True, chunksize=1000)

In [3]:
for r in reviews_for_filter:
    subset = r
    break

In [4]:
subset.shape

(1000, 9)

In [5]:
subset.dtypes

review_id              object
user_id                object
business_id            object
stars                   int64
useful                  int64
funny                   int64
cool                    int64
text                   object
date           datetime64[ns]
dtype: object

In [6]:
subset.head(5)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


In [7]:
sortedSubset = subset.sort_values(by=['business_id'])
x = subset['text']

In [8]:
# make list of review text
reviewText = []
for index in enumerate(x):
    reviewText.append(index[1])

In [9]:
print(len(reviewText))

1000


# Overview of Pros and Cons Analysis

## Create a Filter
- reviews sorted by business_id
- combine the text of the reviews
- remove stop words
- count phrases
- filter similar phrases

## Potential Issues of using machine learning to identify positive and negative phrases
- Although sentiment analysis is a solution to this task, using a model on individual phrases to identify if it is a positive or negative phrase may take an exponential computation time.
- A band-aid solution is to review the most common phrases and manually filter relevant phrases and assign each phrase a pos or neg classification.
- Two files will need to be created, one as mentioned before, to classify each filtered phrase as a pro or a con.
- The second file maps phrases to a "display" phrase that will show on the webapp. For example, there can be two phrases that will be considered the same pro. "Service is Good" and "Great service" both mean the same thing and we don't want redundancy.
- If there was more time to do this project, I would have tested a machine learning solution, but we'll use the manual solution for now.

## Use the filter
- group reviews by restaurant
- get most common phrases
- filter out phrases by the list made
- save a list of pros and cons by business_id


In [10]:
import nltk
from nltk.corpus import stopwords

In [11]:
#nltk.download('stopwords')
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'each', 'other', 'some', 'such', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'will', 'just', 'don', 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'ma', 'mightn', "mightn't", 

Following function is inspired by:
https://dev.to/mattschwartz/quickly-find-common-phrases-in-a-large-list-of-strings-9in

In [12]:
stopwords = stopwords.words('english')

In [13]:
import re

def get_common_phrases(texts, maximum_length=3, minimum_repeat=2) -> dict:
    phrases = {}
    
    for text in texts:
        # clean text
        text = re.sub(r'[.!?,:;/\-\s]',' ', text)
        text = re.sub(r'[\\|@#$&~%\(\)*\"]', '', text)
        
        words = text.split(' ')
        # filter stopwords
        words = [w for w in words if len(w) and w.lower() not in stopwords]
        length = len(words)
        
        # phrases can only be maximum_length words long
        size = length if length <= maximum_length else maximum_length
        while size > 0:
            pos = 0
            # walk over sets of words
            while pos + size <= length:
                phrase = words[pos:pos+size]
                phrase = tuple(w.lower() for w in phrase)
                if phrase in phrases:
                    phrases[phrase] += 1
                else:
                    phrases[phrase] = 1
                pos += 1
            size -= 1
    # remove phrases that are less than the length set by minimum_repeat
    phrases = {k: v for k, v in phrases.items() if v >= minimum_repeat}
    # Loop through the dictionary of phrases and remove phrases that are too similar to each other.
    longest_phrases = {}
    keys = list(phrases.keys())
    keys.sort(key=len, reverse=True)
    for phrase in keys:
        found = False
        for l_phrase in longest_phrases:
            intersection = set(l_phrase).intersection(phrase)
            if len(intersection) == len(phrase):
                difference = (phrases[phrase] - longest_phrases[l_phrase]) / longest_phrases[l_phrase]
                if difference < 0.25:
                    found = True
                    break
        if not found:
            longest_phrases[phrase] = phrases[phrase]
    # return a filtered dict of phrases
    return longest_phrases
    

## Process for creating a custom filter for relevant phrases

In [14]:
# Analyze 1000 reviews to create the filter
common_phrases = get_common_phrases(reviewText)

In [15]:
keys = list(common_phrases.keys()) # businessID
values = list(common_phrases.values()) # reviewText

In [16]:
import numpy as np

# sort common phrases by highest-> lowest
sorted_value_index = np.argsort(values)
sorted_value_index_desc= sorted_value_index[::-1]
sortedPhrases = {keys[i]: values[i] for i in sorted_value_index_desc}

In [18]:
# perform some text cleaning before saving the output
outputList = []
for key in sortedPhrases:
    txt = str(key)
    txt = txt.replace("(", "").replace(")","").replace("'","").replace(",","")
    outputList.append(txt)

In [24]:
import os.path
save_path = 'output/commonPhrases.txt'
with open(save_path, 'w') as f:
    for word in outputList:
        f.write(f"{word}\n")

## Process for finding pros and cons

In [25]:
# read filter list
filterFile = 'output/combinePhrases.txt'
mappedFile = 'output/uniqueProsandCons.txt'
mapPhrases = {}
with open(filterFile) as file:
    for line in file:
        x = line.split(',')
        mapPhrases[x[0]] = x[1].strip('\n')

In [26]:
mapValue = {}
with open (mappedFile) as file:
    for line in file:
        x = line.split(',')
        mapValue[x[0]] = x[1].strip('\n')

In [27]:
from pyspark.sql import SparkSession

In [28]:
# Create Spark Session
spark = SparkSession.builder.master("local").appName("findProsAndCons").getOrCreate()

In [29]:
# group reviews by business_id
reviewKeys = sortedSubset['business_id']
reviewValues = sortedSubset['text']


In [31]:
#pair busines_id with text
reviewPairs = []
for idx, key in enumerate(reviewKeys):
    reviewPairs.append((reviewKeys[idx], reviewValues[idx]))

In [32]:
# Distribute the data
reviews_rdd = spark.sparkContext.parallelize(reviewPairs, numSlices=8)


In [33]:
reviews_rdd.take(1)

[('XQfwVwDr-v0ZS3_CbbE5Xw',
  "If you decide to eat here, just be aware it is going to take about 2 hours from beginning to end. We have tried it multiple times, because I want to like it! I have been to it's other locations in NJ and never had a bad experience. \n\nThe food is good, but it takes a very long time to come out. The waitstaff is very young, but usually pleasant. We have just had too many experiences where we spent way too long waiting. We usually opt for another diner or restaurant on the weekends, in order to be done quicker.")]

In [34]:
# Group pairs by the key
groupedPairs = reviews_rdd.groupByKey()

In [35]:
groupedPairs = groupedPairs.map(lambda x: (x[0], list(x[1])))

In [36]:
groupedPairs.take(5)

[('gmjsEdUsKpj9Xxu6pdjH0g',
  ["Loved this tour! I grabbed a groupon and the price was great. It was the perfect way to explore New Orleans for someone who'd never been there before and didn't know a lot about the history of the city. Our tour guide had tons of interesting tidbits about the city, and I really enjoyed the experience. Highly recommended tour. I actually thought we were just going to tour through the cemetery, but she took us around the French Quarter for the first hour, and the cemetery for the second half of the tour. You'll meet up in front of a grocery store (seems strange at first, but it's not terribly hard to find, and it'll give you a chance to get some water), and you'll stop at a visitor center part way through the tour for a bathroom break if needed. This tour was one of my favorite parts of my trip!"]),
 ('EQ-TZ2eeD_E0BHuvoaeG5Q',
  ["Locals recommended Milktooth, and it's an amazing jewel of Indianapolis. I'm glade I had the chance to experience this.",
   "M

In [37]:
def analyzeBusiness(businessId, texts):
    # restaurant[0] = business_id
    # restaurant[1] = text
    maximum_length=3
    minimum_repeat=2
    
    phrases = {}
    
    for text in texts:
        # clean text
        text = re.sub(r'[.!?,:;/\-\s]',' ', text)
        text = re.sub(r'[\\|@#$&~%\(\)*\"]', '', text)
        
        words = text.split(' ')
        # filter stopwords
        words = [w for w in words if len(w) and w.lower() not in stopwords]
        length = len(words)
        
        # phrases can only be maximum_length words long
        size = length if length <= maximum_length else maximum_length
        while size > 0:
            pos = 0
            # walk over sets of words
            while pos + size <= length:
                phrase = words[pos:pos+size]
                phrase = tuple(w.lower() for w in phrase)
                if phrase in phrases:
                    phrases[phrase] += 1
                else:
                    phrases[phrase] = 1
                pos += 1
            size -= 1
    # remove phrases that are less than the length set by minimum_repeat
    phrases = {k: v for k, v in phrases.items() if v >= minimum_repeat}
    longest_phrases = {}
    keys = list(phrases.keys())
    keys.sort(key=len, reverse=True)
    for phrase in keys:
        found = False
        for l_phrase in longest_phrases:
            intersection = set(l_phrase).intersection(phrase)
            if len(intersection) == len(phrase):
                difference = float(phrases[phrase] - len(longest_phrases[l_phrase])) / len(longest_phrases[l_phrase])
                if difference < 0.25:
                    found = True
                    break
        if not found:
            currentPhrase = ' '.join(phrase)
            if currentPhrase in mapPhrases:
                if currentPhrase not in longest_phrases:
                    longest_phrases[mapPhrases[currentPhrase]] = mapValue[mapPhrases[currentPhrase]]
    returnObj = dict()
    returnObj['business_id'] = businessId
    returnObj['pos'] = []
    returnObj['neg'] = []
    for phrase in longest_phrases:
        if longest_phrases[phrase] == 'pos':
            returnObj['pos'].append(phrase)
        else:
            returnObj['neg'].append(phrase)
    return returnObj

In [39]:
findProsAndCons = groupedPairs.map(lambda x: analyzeBusiness(x[0],x[1]))

# final output
# {
#    business_id: string
#    pros: []
#    cons: []
# }

In [41]:
# take output and store into a list
examples = groupedPairs.take(10000)
finalOutput = []
for idx, example in enumerate(examples):
    analyzeBusiness(example[0],example[1])
    finalOutput.append(analyzeBusiness(example[0],example[1]))

In [45]:
# Save output into json file
import json

In [47]:
output_save_path = 'output/finalOutput.json'
with open(output_save_path, 'w') as f:
    json.dump(finalOutput, f)

In [48]:
output_save_path = 'output/reviewID.txt'
with open(output_save_path, 'w') as f:
    for word in reviewKeys:
        f.write('"' + f"{word}" + '",')

# Results and Observations

## Current Results and how to improve in the future
- The results are decent, there are a good amount of pros and cons that are discovered by the filter and most restaurants have a decent list of pros and cons.
- After getting the results there are clear limitations with this method.
- First, due to only taking a subset of the data, there are many instances where some businesses have very few reviews, therefore those businesses may get little to no pros and cons. If that small handful of reviews do not contain any of the common phrases then the pros and cons arrays will be empty. As a default, if there are no pros and cons found, the attributes row of the business works well as a default pros and cons array. Attributes contain key,value pairs of business attributes such as if the restaurant has take out, what type of parking, etc. A solution to this issue is to increase the subset size, but there was not enough time to test what is the maximum size subset we can use on our machine.
- This filter is general, because the function iterates through review texts of all restaurants, the most common phrases will be phrases that are shared across all reviews. Therefore, specific pros and cons that may only apply to that restaurant in particular may not be caught. A solution to this is to find pros and cons of each type of restaurant at a time(burger, sushi, etc.) this way, we can get more specific pros and cons.
- Cons are harder to detect than pros. Reviews who have negative things to say about a restaurant tend to be more descriptive and specific about their issue, this leads to less negative common phrases than positive ones. I believe a machine learning method to classify the pros and cons may alleviate this issue.
