# Yelp Restaurant Scraping

In [1]:
import pandas as pd
import numpy as np

## Upload NYC Open Data Restaurants Inspection Results

In [2]:
restaurants_df = pd.read_csv('DOHMH_New_York_City_Restaurant_Inspection_Results.csv', encoding='UTF-8')

In [3]:
# Glimpse of the raw data
restaurants_df

Unnamed: 0,CAMIS,DBA,BORO,BUILDING,STREET,ZIPCODE,PHONE,CUISINE DESCRIPTION,INSPECTION DATE,ACTION,VIOLATION CODE,VIOLATION DESCRIPTION,CRITICAL FLAG,SCORE,GRADE,GRADE DATE,RECORD DATE,INSPECTION TYPE
0,40545130,SPEEDY'S DELI,MANHATTAN,1271,BROADWAY,10001.0,2126838997,American,07/16/2018,Violations were cited in the following area(s).,02B,Hot food item not held at or above 140Âº F.,Critical,12.0,A,07/16/2018,11/21/2018,Cycle Inspection / Re-inspection
1,50001204,MON CHER MARKET,MANHATTAN,339,BROADWAY,10013.0,2129650007,American,04/06/2018,Violations were cited in the following area(s).,02B,Hot food item not held at or above 140Âº F.,Critical,21.0,,,11/21/2018,Cycle Inspection / Initial Inspection
2,50002474,THE LOYAL,MANHATTAN,289,BLEECKER ST,10014.0,2124885800,American,04/12/2018,Violations were cited in the following area(s).,02B,Hot food item not held at or above 140Âº F.,Critical,18.0,,,11/21/2018,Cycle Inspection / Initial Inspection
3,41067502,CARLOS PIZZERIA,QUEENS,575,SENECA AVENUE,11385.0,7183868526,Pizza/Italian,04/27/2017,Violations were cited in the following area(s).,02B,Hot food item not held at or above 140Âº F.,Critical,26.0,,,11/21/2018,Cycle Inspection / Initial Inspection
4,41353290,THANH DA,BROOKLYN,6008,7 AVENUE,11220.0,7184922717,Vietnamese/Cambodian/Malaysia,08/22/2017,Violations were cited in the following area(s).,20F,Current letter grade card not posted.,Not Critical,,,,11/21/2018,Administrative Miscellaneous / Initial Inspection
5,50042180,MARIO'S PIZZA 2,BRONX,1011,WESTCHESTER AVE,10459.0,7189916142,Pizza,09/14/2018,Violations were cited in the following area(s).,02B,Hot food item not held at or above 140Âº F.,Critical,28.0,,,11/21/2018,Cycle Inspection / Initial Inspection
6,50050878,"COLUMBIA UNIVERSITY BAKER ATHLETICS COMPLEX, S...",MANHATTAN,533,W 218TH ST,10034.0,2128548324,CafÃ©/Coffee/Tea,09/28/2018,No violations were recorded at the time of thi...,,,Not Applicable,0.0,,,11/21/2018,Pre-permit (Operational) / Compliance Inspection
7,50046096,JACKSON GYRO,QUEENS,8530,37TH AVE,11372.0,7186852542,American,10/10/2018,Violations were cited in the following area(s).,09C,Food contact surface not properly maintained.,Not Critical,12.0,,,11/21/2018,Cycle Inspection / Second Compliance Inspection
8,50001089,BLANCA,BROOKLYN,261,MOORE,,7184171118,Continental,05/26/2016,Violations were cited in the following area(s).,02B,Hot food item not held at or above 140Âº F.,Critical,29.0,,,11/21/2018,Cycle Inspection / Initial Inspection
9,50018553,TEXAS FRIED CHICKEN,MANHATTAN,2144,8TH AVE,10026.0,3472812013,Chicken,02/09/2015,Violations were cited in the following area(s).,02B,Hot food item not held at or above 140Âº F.,Critical,9.0,A,02/09/2015,11/21/2018,Pre-permit (Operational) / Initial Inspection


In [4]:
# Clean data by removing duplicate CAMIS Records (keep most recent), removing restaurants without a valid rating, and removing restaurants located in Queens.
restaurants_df = restaurants_df[restaurants_df.BORO != 'QUEENS'].sort_values(['CAMIS', 'INSPECTION DATE'], ascending=[True, False]).drop_duplicates(subset=['CAMIS'], keep='first').dropna(subset=['GRADE'])

In [6]:
# Define a helper method that obtains the Yelp URL for a given restuarant name and borough
def get_url(df):
    url = str(df['DBA']).replace(' ','-')
    if '&' in url:
        url = url.replace('&', 'AND')
    if '\'' in url:
        url = url.replace('\'', '')
    if df['BORO'] == 'MANHATTAN':
        url += '-new-york'
    else:
        url += "-" + str(df['BORO']).replace(" ", "-")
    url = "https://www.yelp.com/biz/" + url
    return url

In [7]:
# Apply the get_url function on the dataframe and add as a new column
restaurants_df['URL'] = restaurants_df.apply(get_url, axis = 1)

## Scrape Yelp data for reviews

In [8]:
# Make a copy of the first 10 restaurants to test
test_df = restaurants_df.head(10)
test_df

Unnamed: 0,CAMIS,DBA,BORO,BUILDING,STREET,ZIPCODE,PHONE,CUISINE DESCRIPTION,INSPECTION DATE,ACTION,VIOLATION CODE,VIOLATION DESCRIPTION,CRITICAL FLAG,SCORE,GRADE,GRADE DATE,RECORD DATE,INSPECTION TYPE,URL
91711,30075445,MORRIS PARK BAKE SHOP,BRONX,1007,MORRIS PARK AVE,10462.0,7188924968,Bakery,05/18/2017,Violations were cited in the following area(s).,10F,Non-food contact surface improperly constructe...,Not Critical,7.0,A,05/18/2017,11/21/2018,Cycle Inspection / Initial Inspection,https://www.yelp.com/biz/MORRIS-PARK-BAKE-SHOP...
151360,30112340,WENDY'S,BROOKLYN,469,FLATBUSH AVENUE,11225.0,7182875005,Hamburgers,10/27/2016,Violations were cited in the following area(s).,04N,Filth flies or food/refuse/sewage-associated (...,Critical,11.0,A,10/27/2016,11/21/2018,Cycle Inspection / Re-inspection,https://www.yelp.com/biz/WENDYS-BROOKLYN
120414,30191841,DJ REYNOLDS PUB AND RESTAURANT,MANHATTAN,351,WEST 57 STREET,10019.0,2122452912,Irish,12/14/2017,Violations were cited in the following area(s).,04H,"Raw, cooked or prepared food is adulterated, c...",Critical,10.0,A,12/14/2017,11/21/2018,Cycle Inspection / Re-inspection,https://www.yelp.com/biz/DJ-REYNOLDS-PUB-AND-R...
300624,40356018,RIVIERA CATERERS,BROOKLYN,2780,STILLWELL AVENUE,11224.0,7183723031,American,10/30/2018,Violations were cited in the following area(s).,10F,Non-food contact surface improperly constructe...,Not Critical,10.0,A,10/30/2018,11/21/2018,Cycle Inspection / Initial Inspection,https://www.yelp.com/biz/RIVIERA-CATERERS-BROO...
8840,40356483,WILKEN'S FINE FOOD,BROOKLYN,7114,AVENUE U,11234.0,7184443838,Delicatessen,06/03/2017,Violations were cited in the following area(s).,09A,Canned food product observed dented and not se...,Not Critical,13.0,A,06/03/2017,11/21/2018,Cycle Inspection / Initial Inspection,https://www.yelp.com/biz/WILKENS-FINE-FOOD-BRO...
126087,40356731,TASTE THE TROPICS ICE CREAM,BROOKLYN,1839,NOSTRAND AVENUE,11226.0,7188560821,"Ice Cream, Gelato, Yogurt, Ices",06/26/2015,Violations were cited in the following area(s).,10F,Non-food contact surface improperly constructe...,Not Critical,5.0,A,06/26/2015,11/21/2018,Cycle Inspection / Initial Inspection,https://www.yelp.com/biz/TASTE-THE-TROPICS-ICE...
22234,40357217,WILD ASIA,BRONX,2300,SOUTHERN BOULEVARD,10460.0,7182207846,American,06/14/2017,Violations were cited in the following area(s).,10F,Non-food contact surface improperly constructe...,Not Critical,2.0,A,06/14/2017,11/21/2018,Cycle Inspection / Initial Inspection,https://www.yelp.com/biz/WILD-ASIA-BRONX
72155,40359480,1 EAST 66TH STREET KITCHEN,MANHATTAN,1,EAST 66 STREET,10065.0,2128793900,American,09/28/2018,Violations were cited in the following area(s).,10F,Non-food contact surface improperly constructe...,Not Critical,7.0,A,09/28/2018,11/21/2018,Cycle Inspection / Initial Inspection,https://www.yelp.com/biz/1-EAST-66TH-STREET-KI...
72110,40359705,NATHAN'S FAMOUS,BROOKLYN,1310,SURF AVENUE,11224.0,7183332202,Hotdogs,09/08/2015,Violations were cited in the following area(s).,08A,Facility not vermin proof. Harborage or condit...,Not Critical,11.0,A,09/08/2015,11/21/2018,Cycle Inspection / Re-inspection,https://www.yelp.com/biz/NATHANS-FAMOUS-BROOKLYN
22959,40360045,SEUDA FOODS,BROOKLYN,705,KINGS HIGHWAY,11223.0,7183751500,Jewish/Kosher,11/27/2017,Violations were cited in the following area(s).,10F,Non-food contact surface improperly constructe...,Not Critical,13.0,A,11/27/2017,11/21/2018,Cycle Inspection / Initial Inspection,https://www.yelp.com/biz/SEUDA-FOODS-BROOKLYN


In [9]:
from bs4 import BeautifulSoup
from requests import get 
import re

In [10]:
# Checking the validity of the Yelp URL
def check_validity(df):
    response = get(df['URL'])
    
    html_soup = BeautifulSoup(response.text, 'html.parser')

    if html_soup.find('body', class_ = 'error-page'):
        return False
    else:
        return True

In [11]:
test_df['VALID'] = test_df.apply(check_validity, axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [12]:
test_df = test_df[test_df.VALID != False]

In [27]:
test_df

Unnamed: 0,CAMIS,DBA,BORO,BUILDING,STREET,ZIPCODE,PHONE,CUISINE DESCRIPTION,INSPECTION DATE,ACTION,VIOLATION CODE,VIOLATION DESCRIPTION,CRITICAL FLAG,SCORE,GRADE,GRADE DATE,RECORD DATE,INSPECTION TYPE,URL,VALID
151360,30112340,WENDY'S,BROOKLYN,469,FLATBUSH AVENUE,11225.0,7182875005,Hamburgers,10/27/2016,Violations were cited in the following area(s).,04N,Filth flies or food/refuse/sewage-associated (...,Critical,11.0,A,10/27/2016,11/21/2018,Cycle Inspection / Re-inspection,https://www.yelp.com/biz/WENDYS-BROOKLYN,True
300624,40356018,RIVIERA CATERERS,BROOKLYN,2780,STILLWELL AVENUE,11224.0,7183723031,American,10/30/2018,Violations were cited in the following area(s).,10F,Non-food contact surface improperly constructe...,Not Critical,10.0,A,10/30/2018,11/21/2018,Cycle Inspection / Initial Inspection,https://www.yelp.com/biz/RIVIERA-CATERERS-BROO...,True
126087,40356731,TASTE THE TROPICS ICE CREAM,BROOKLYN,1839,NOSTRAND AVENUE,11226.0,7188560821,"Ice Cream, Gelato, Yogurt, Ices",06/26/2015,Violations were cited in the following area(s).,10F,Non-food contact surface improperly constructe...,Not Critical,5.0,A,06/26/2015,11/21/2018,Cycle Inspection / Initial Inspection,https://www.yelp.com/biz/TASTE-THE-TROPICS-ICE...,True
72110,40359705,NATHAN'S FAMOUS,BROOKLYN,1310,SURF AVENUE,11224.0,7183332202,Hotdogs,09/08/2015,Violations were cited in the following area(s).,08A,Facility not vermin proof. Harborage or condit...,Not Critical,11.0,A,09/08/2015,11/21/2018,Cycle Inspection / Re-inspection,https://www.yelp.com/biz/NATHANS-FAMOUS-BROOKLYN,True
22959,40360045,SEUDA FOODS,BROOKLYN,705,KINGS HIGHWAY,11223.0,7183751500,Jewish/Kosher,11/27/2017,Violations were cited in the following area(s).,10F,Non-food contact surface improperly constructe...,Not Critical,13.0,A,11/27/2017,11/21/2018,Cycle Inspection / Initial Inspection,https://www.yelp.com/biz/SEUDA-FOODS-BROOKLYN,True


In [28]:
def scrape_reviews(df):
    response = get(df['URL'])
    
    html_soup = BeautifulSoup(response.text, "html.parser")
    review_containers = html_soup.find_all("div", class_ = "review review--with-sidebar")
    review_list = []
    review_list.append(df['CAMIS'])
    
    for i in range(len(review_containers)):
        review = str((review_containers[i]).p)
        clean = re.compile('<.*?>')
        review = re.sub(clean, '', review)
        review_list.append(review)
    
    new_df = pd.DataFrame(review_list)
    
    return new_df

In [48]:
# Define new reviews_df to store 20 most recent Yelp Reviews
reviews_df = pd.DataFrame()

for index, row in test_df.iterrows():
    reviews_df = reviews_df.append(scrape_reviews(row).T)

                                                    0
0                                            30112340
1   So I never really tried Wendy's food until rec...
2   This Wendy's has an odd smell, and the staff i...
3   The ventilation in this store sucks , you can ...
4   I am Boycotting this Restaurant all together!!...
5   Horrible last experiences with this place .......
6   Like all Wendy's (or fast food for that matter...
7   I had a very disappointing experience at this ...
8   The customer service is so horrible they are s...
9   The sign says "No Loitering".  Why are all the...
10  So I went to this wendys today &amp; placed my...
11  This place is not suitable for dining in. You ...
12  They honestly have the best burger ever create...
                                                    0
0                                            40356018
1   Magical!!!!!!!!! No need to say more!!!My cous...
2   I had my sweet sixteen here and everything was...
3   Shout Out to @tommychias

In [55]:
reviews_df = pd.DataFrame(reviews_df)
reviews_df = reviews_df.set_index(0)

In [59]:
reviews_df.index.names = ['CAMIS']

In [60]:
reviews_df

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
CAMIS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
30112340,So I never really tried Wendy's food until rec...,"This Wendy's has an odd smell, and the staff i...","The ventilation in this store sucks , you can ...",I am Boycotting this Restaurant all together!!...,Horrible last experiences with this place .......,Like all Wendy's (or fast food for that matter...,I had a very disappointing experience at this ...,The customer service is so horrible they are s...,"The sign says ""No Loitering"". Why are all the...",So I went to this wendys today &amp; placed my...,This place is not suitable for dining in. You ...,They honestly have the best burger ever create...,,,,,,,,
40356018,Magical!!!!!!!!! No need to say more!!!My cous...,I had my sweet sixteen here and everything was...,Shout Out to @tommychiassonjr for making sure ...,Just had my wedding at Riviera last week and i...,Just had my wedding here last weekend. Everyon...,If you are planning to marry and you need a pl...,Went here for my graduate luncheon today. The ...,"Beautiful place inside and out, but I had mult...",I was going down memory lane and I decided to ...,TIME FOR AN UPDATE!I was sad to see the Rivier...,I was at a wedding at The Riviera on Friday fo...,My best friend had her wedding here a few year...,"Jack and I were married here on June 7, 2009 ....",Have always heard about this place and drove b...,"Came here for a wedding, the cocktail hour was...",I attended a night wedding last night 7/25/201...,Don't let the somewhat dingy Coney Island loca...,"Place is beautiful, service is good.Music is g...","If there were a 0, that would have been picked...",I've been to numerous events at the Riviera an...
40356731,I had to go out to East Flatbush to deal with ...,,,,,,,,,,,,,,,,,,,
40359705,"I mean, NATHANS FAMOUS how can you go wrong??C...",The hot dogs are pretty good. The corn dogs ar...,"When one thinks of all time greats, I don't t...",I expected it to be a lot better. That include...,THIS IS THE BEST CORNDOG I HAVE EVER HAD.Wow. ...,Nathan's Famous Hotdogs. It's a Coney Island f...,I came to this iconic place to eat the famous ...,Well ... it's not every day that you get to ex...,$5 for a regular hot dog? No thanks! Unless yo...,Haven't been back in this area for years! The ...,"I've been to the boardwalk location, now we ge...",Nothing beats a classic Nathan's dog with beac...,I didn't know the original nathans hot dog was...,It had been a minute since I had been to Coney...,Had to come here to visit as its a historical ...,I found it hyped.. but chicken philly was good...,Food is decent...service not so much. This pla...,When I'm in the mood for some deep fried seafo...,"Huge lines, minimal parking, staff moves at th...",Trip to Coney Island has completed a world win...
40360045,Wonderful food.authentic sephardic food.i love...,More delicious food you cannot find. Prices re...,This is THE place to sample kosher Syrian Jewi...,Excellent kosher sephardic appetizers. Excell...,,,,,,,,,,,,,,,,
