In [6]:
### TODOs:
###  - extract the restaurant name and get rid of weird formatting in the review text (e.g. "<br&gt;<br&gt;")
###  - find a way to get all reviews (not just the first page)

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

matplotlib.rcParams['figure.figsize'] = [15.0, 10.0]
matplotlib.rcParams['font.size'] = 15

import requests
from bs4 import BeautifulSoup

In [7]:
def extract_data(filename, timeout=30):
    """
    Function to get the information from restaurant's Yelp page and put each review in a list.

    The page is parsed by plain string slicing of the <title> text and of the
    <script type="application/json"> elements, so every lookup is guarded: any
    missing piece makes the function print a message and return empty results
    instead of raising, which lets the caller move on to the next site.

    param filename: address of restaurant's Yelp site
    param timeout: seconds to wait for the HTTP response before giving up
    return: (restaurant_info, reviews)
        restaurant_info: list of strings [name, street, category, price tags, average rating]
        reviews: list of raw review chunks (strings), one per piece obtained by
                 splitting the first JSON script on 'comment', each cut at 'photos'
        ([], []) when the page cannot be fetched or the basic info cannot be parsed;
        (restaurant_info, []) when the basic info is fine but the reviews cannot be read
    """
    # without a timeout a single unresponsive site would hang the whole scrape
    try:
        page = requests.get(filename, timeout=timeout)
    except requests.RequestException as err:
        print('Could not fetch page :/', err)
        return [], []
    soup = BeautifulSoup(page.content, 'html.parser')
    review_lines = soup.find_all('script', type='application/json')

    #get the pieces of the page title (split on '-')
    try:
        info = soup.title.string.split('-')
    except AttributeError:
        # soup.title or soup.title.string is None when the page has no usable <title>
        print('Could not get soup :/')
        return [], []

    try:
        #restaurant name and street come from the page title
        restaurant = info[0]
        title_address = info[2]
        # drop the leading character and the last 17 characters of the third title piece
        address = title_address[1:len(title_address)-17]
        try:
            street = address[address.index(' ')+1:]
        except ValueError:
            # no space in the address, so there is nothing after a first word
            street = 'NA'

        #food category and price come from the first JSON script
        string_review_line = str(review_lines[0])
        category_start = string_review_line.index('category_aliases')
        category_end = string_review_line.index('review_count')
        info2 = string_review_line[category_start:category_end].split(':')
        category = info2[1][1:len(info2[1])-9]
        price_tags = info2[6][1:len(info2[6])-9]

        #overall restaurant rating: the 4 characters right before 'stars from'
        string_rating_line = str(review_lines[1])
        stars_at = string_rating_line.index('stars from')
        average_rating = string_rating_line[stars_at-4:stars_at]
    except (IndexError, ValueError) as err:
        # IndexError: title/script/field list shorter than expected;
        # ValueError: an expected marker string is not on the page
        print('Could not parse restaurant info :/', err)
        return [], []

    restaurant_info = [restaurant, street, category, price_tags, average_rating]

    #find reviews: at least five 'comment'-separated pieces are required
    split_start = string_review_line.split('comment')
    if len(split_start) < 5:
        print("could not read reviews :(")
        return restaurant_info, []

    #create list of all the different reviews/ratings of customers
    reviews = [piece.split('photos')[0] for piece in split_start]
    return restaurant_info, reviews


In [8]:
def clean_reviews(info, new_reviews, all_reviews):
    """
    Function separate reviews into their features (restaurant name, street, category, price,
    average rating, review, date, rating) and add them to the running list of reviews.

    param info: list whose first five items are the restaurant's
                name, street, category, price tags and average rating
    param new_reviews: list of raw review chunks (strings); the first item is always skipped
    param all_reviews: list the cleaned rows are appended to (modified in place)
    return: all_reviews, with one 8-item row appended per chunk that could be parsed

    A chunk is skipped (not an error) when any of the markers needed to slice it
    (',"language"', 'Date":"', '","localized', '"rating":') is missing.
    """
    # the first chunk is never parsed -- presumably it is the page text that
    # precedes the first review; TODO confirm against extract_data's output
    for raw_review in new_reviews[1:]:
        text_end = raw_review.find(',"language"')
        date_start = raw_review.find('Date":"')
        date_end = raw_review.find('","localized')
        rating_start = raw_review.find('"rating":')
        # find() returns -1 for a missing marker; slicing with it would give
        # garbage, so such chunks are skipped instead
        if -1 in (text_end, date_start, date_end, rating_start):
            continue

        # review text sits between a fixed 11-character prefix and the
        # character just before ',"language"'
        review_text = raw_review[11:text_end-1]
        # 7 == len('Date":"'), 9 == len('"rating":')
        date = raw_review[date_start+7:date_end]
        # the last two characters of the chunk are dropped from the rating
        rating = raw_review[rating_start+9:-2]
        all_reviews.append([info[0], info[1], info[2], info[3], info[4], review_text, date, rating])
    return all_reviews
    

In [9]:
def create_dataframe(clean_reviews):
    """
    Build a pandas DataFrame out of the cleaned review rows.

    param clean_reviews: list of rows, one per review, each with eight values in
                         the column order listed below
    return: dataframe with one row per review
    """
    #column order must match the order of the values inside each row
    column_names = [
        'Name',
        'Street',
        'Category',
        'Price Tags',
        'Average Rating',
        'Review',
        'Date',
        'Rating',
    ]
    return pd.DataFrame(clean_reviews, columns=column_names)

In [None]:
#load in the website data
#file with all the restaurant's Yelp sites that we will be using - each line has a web address
#('with' closes the file even if reading fails)
with open('ultimate_websites.txt', 'r') as infile:
    lines = infile.readlines()

#initialize list to put all the cleaned reviews in
all_reviews = []

#loop through all restaurant Yelp websites in the file
for website in lines:
    url = website.strip()
    #blank lines (e.g. a trailing newline at the end of the file) are not addresses
    if not url:
        continue
    print(url)
    info, new_reviews = extract_data(url)

    #if this doesn't work, go to the next website
    if not new_reviews:
        continue

    all_reviews = clean_reviews(info, new_reviews, all_reviews)

#create a dataframe
df = create_dataframe(all_reviews)
df

https://www.yelp.com/biz/the-farmhouse-tap-and-grill-burlington



In [None]:

#save every scraped review to disk as CSV
df.to_csv(path_or_buf='reviews_data_humongous.csv')