# Python Web Crawler for BBQ restaurants in Austin (TX)

Importing relevant packages:

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

## Retrieving the restaurant names and links

In [2]:
# Yelp search URL for BBQ places in Austin, TX. The result offset is appended
# after "start=" to select a page of results.
url = 'https://www.yelp.com/search?cflt=bbq&find_loc=Austin%2C%20TX&start='

# Paging parameters (previously hard-coded inside the loop)
N_PAGES = 23            # number of search-result pages to walk
RESULTS_PER_PAGE = 10   # step of the "start" offset between consecutive pages
REQUEST_TIMEOUT = 30    # seconds; without a timeout a stalled request hangs forever

# Parallel lists: name_restaurant[i] belongs to link_restaurant[i]
name_restaurant = []
link_restaurant = []

# For each page, collect the restaurant names and links
for j in range(N_PAGES):

    html = requests.get(url + str(j * RESULTS_PER_PAGE), timeout=REQUEST_TIMEOUT)
    # Stop on an HTTP error instead of parsing an error page as "no results"
    html.raise_for_status()

    soup = BeautifulSoup(html.content, 'lxml')
    links = soup.select(".css-1pxmz4g .css-166la90")

    for link in links:
        href = link.get('href')
        if href is None:
            # An anchor without a target cannot be turned into a URL later on;
            # skipping it keeps the two lists aligned.
            continue
        # get_text() always returns a plain str, whereas .string yields None
        # for anchors with several children and otherwise a NavigableString
        # that keeps the whole parse tree alive.
        name_restaurant.append(link.get_text())
        link_restaurant.append(href)

In [3]:
# Check the names: the bare expression on the last line is shown as the cell output
name_restaurant

['Terry Black’s Barbecue',
 'Franklin Barbecue',
 'la Barbecue',
 'The Salt Lick BBQ',
 'The Original Black’s Barbecue',
 'Cooper’s Old Time Pit Bar-B-Que',
 'Lamberts Downtown Barbecue',
 'SLAB BBQ & Beer',
 'Micklethwait Craft Meats',
 'Stiles Switch BBQ & Brew',
 'Fowler’s Smokin Soul Food',
 'Rollin Smoke BBQ',
 'Bigg Belly BBQ',
 'Rudy’s “Country Store” and Bar-B-Q',
 'King’s BBQ and Soul',
 'County Line On The Lake',
 'Big Vinny’s Bbq',
 'Brown’s Bar-B-Que',
 'Iron Works Barbecue',
 'Stubb’s Bar-B-Q',
 'Jim’s Smokehouse',
 'Scotty’s BBQ',
 'Rudy’s “Country Store” and Bar-B-Q',
 'CM Smokehouse',
 'Valentina’s Tex Mex BBQ',
 'Metcalf Barbecue',
 'The Mean Eyed Cat',
 'House Park Bar-B-Q',
 'Interstellar BBQ',
 'Mum Foods',
 'Kerlin BBQ',
 'Louie Mueller Barbecue',
 'Moreno Barbecue',
 'The Salt Lick BBQ',
 'Rudy’s “Country Store” and Bar-B-Q',
 'Metcalf Barbecue',
 'The Salt Lick BBQ - Austin Airport',
 'The Mean Eyed Cat',
 'Bar-B-Q Heaven',
 'Texas Ranch BBQ',
 'The County Line',

### Prepending the site root (https://www.yelp.com) to all links

In [4]:
# Complete the restaurant links.
# The scraped hrefs are site-relative ("/biz/..."), so the site root is put in
# front of them. Entries that already start with the root are left alone, which
# makes this cell safe to re-run: without the check a second run would yield
# "https://www.yelp.comhttps://www.yelp.com/biz/...".
YELP_ROOT = "https://www.yelp.com"
link_restaurant = [
    s if s.startswith(YELP_ROOT) else YELP_ROOT + s
    for s in link_restaurant
]
print(link_restaurant[0:5])

['https://www.yelp.com/biz/terry-blacks-barbecue-austin', 'https://www.yelp.com/biz/franklin-barbecue-austin', 'https://www.yelp.com/biz/la-barbecue-austin-3', 'https://www.yelp.com/biz/the-salt-lick-bbq-driftwood', 'https://www.yelp.com/biz/the-original-black-s-barbecue-austin']


### Creating a dataframe with all restaurant names and links

In [5]:
# Pair every restaurant name with its link, position by position
RestaurantDataSet = [
    (name, link) for name, link in zip(name_restaurant, link_restaurant)
]

# Turn the (name, url) pairs into a two-column table
df_restaurants = pd.DataFrame(RestaurantDataSet, columns=['name', 'url'])

In [9]:
# Preview the top of the table (at most the first 15 rows)
df_restaurants.head(15)

Unnamed: 0,name,url
0,Terry Black’s Barbecue,https://www.yelp.com/biz/terry-blacks-barbecue...
1,Franklin Barbecue,https://www.yelp.com/biz/franklin-barbecue-austin
2,la Barbecue,https://www.yelp.com/biz/la-barbecue-austin-3
3,The Salt Lick BBQ,https://www.yelp.com/biz/the-salt-lick-bbq-dri...
4,The Original Black’s Barbecue,https://www.yelp.com/biz/the-original-black-s-...
5,Cooper’s Old Time Pit Bar-B-Que,https://www.yelp.com/biz/coopers-old-time-pit-...
6,Lamberts Downtown Barbecue,https://www.yelp.com/biz/lamberts-downtown-bar...
7,SLAB BBQ & Beer,https://www.yelp.com/biz/slab-bbq-and-beer-austin
8,Micklethwait Craft Meats,https://www.yelp.com/biz/micklethwait-craft-me...
9,Stiles Switch BBQ & Brew,https://www.yelp.com/biz/stiles-switch-bbq-and...


In [10]:
# Persist the name/url table as CSV: header row included, index column omitted
df_restaurants.to_csv(
    'Austin_BBQ_Restaurants.csv',
    header=True,
    index=False,
    encoding='utf8',
)

## Going through the subpages

In [18]:
# Use the row labelled 0 of the restaurant table as the worked example
name_business = df_restaurants.loc[0, 'name']
url_sub = df_restaurants.loc[0, 'url']
print(name_business, url_sub)

Terry Black’s Barbecue https://www.yelp.com/biz/terry-blacks-barbecue-austin


In [19]:
# Download and parse the example restaurant's page.
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops here on an HTTP error instead of letting the CSS
# selectors used on soup_sub silently match nothing on an error page.
html_sub = requests.get(url_sub, timeout=30)
html_sub.raise_for_status()
soup_sub = BeautifulSoup(html_sub.content, 'lxml')
soup_sub

<!DOCTYPE HTML>
<!--[if lt IE 7 ]> <html xmlns:fb="http://www.facebook.com/2008/fbml" class="ie6 ie ltie9 ltie8 no-js" lang="en"> <![endif]--><!--[if IE 7 ]>    <html xmlns:fb="http://www.facebook.com/2008/fbml" class="ie7 ie ltie9 ltie8 no-js" lang="en"> <![endif]--><!--[if IE 8 ]>    <html xmlns:fb="http://www.facebook.com/2008/fbml" class="ie8 ie ltie9 no-js" lang="en"> <![endif]--><!--[if IE 9 ]>    <html xmlns:fb="http://www.facebook.com/2008/fbml" class="ie9 ie no-js" lang="en"> <![endif]--><!--[if (gt IE 9)|!(IE)]><!--><html class="no-js" lang="en" xmlns:fb="http://www.facebook.com/2008/fbml"> <!--<![endif]-->
<head>
<script nonce="7366194a">
            (function() {
                var main = null;

                var main=function(){window.onerror=function(k,a,c,i,f){var j=(document.getElementsByTagName("html")[0].getAttribute("webdriver")==="true"||navigator.userAgent==="selenium");var h=f&&(f.name==="ServerSideRenderingError"||f.name==="CSRFallbackError");if(j&&!h){documen

### Finding usernames of first 10 reviews

In [25]:
# Anchors that carry the reviewer names (they are reused for the profile links)
soup_username = soup_sub.select('.css-m6anxm .css-166la90')

# Text content of each anchor
username = [anchor.string for anchor in soup_username]

# Show the first ten names
username[:10]

['Patricia N.',
 'Kirstin N.',
 'Ivy C.',
 'Rayand G.',
 'Abby P.',
 'Dan S.',
 'Billy Y.',
 'Nancy L.',
 'Desiree N.',
 'Samira S.']

### Finding ratings of first 10 reviews

In [28]:
# Get ratings
soup_stars = soup_sub.select('.margin-b1-5__373c0__2Wblx .overflow--hidden__373c0__2B0kz')

# The score is stored in each element's aria-label attribute
# (presumably of the form "5 star rating" — confirm against the live page).
rating_labels = [stars.attrs['aria-label'] for stars in soup_stars]

# Keep the leading numeric token and convert it to a number.
# The previous code sliced each label by the length of the *list* and then took
# only its first character, which would truncate a value such as "4.5" to "4".
rating = [float(label.split()[0]) for label in rating_labels]

rating

[5.0, 5.0, 5.0, 5.0, 4.0, 1.0, 5.0, 5.0, 4.0, 5.0]

### Find dates of first 10 reviews

In [30]:
# Elements holding the date shown next to each rating
soup_date = soup_sub.select('.margin-b1-5__373c0__2Wblx .css-e81eai')

# Date strings exactly as they appear on the page
date_review = [tag.string for tag in soup_date]

date_review

['4/14/2021',
 '4/15/2021',
 '4/4/2021',
 '5/23/2021',
 '5/26/2021',
 '5/23/2021',
 '5/16/2021',
 '5/9/2021',
 '5/24/2021',
 '5/6/2021']

### Find text of first 10 reviews

In [31]:
# Elements that wrap the body of each review
html_texts = soup_sub.select('.css-n6i4z7 .raw__373c0__3rcx7')

# Flatten every review body to plain text
html_text = [node.get_text() for node in html_texts]

# Show the first review as an example
html_text[0]

"Absolutely delicious bbq. Hubby and I were driving through Austin on one of our road trips and we knew we had to get some Texas-style bbq. We were able to do an online preorder the previous day, and were lucky enough that they weren't super busy so that we were able to enjoy the bbq fresh on one of their picnic tables outside. Might be easier to do a bullet point for what we ordered and what we thought, so here goes!Meats:Beef ribs - These things are ginormous. You'll end up paying around $40.00 for one rib, but honestly this one rib is a full meal for one or can be shared by up to 3 people and will be enough meat for everyone to appreciate it. It was tender, super juicy, and just so satisfying. Pork ribs - Literally fell off the bone when I bit into it. The dry rub was fantastic. Honestly, you wouldn't need to add any bbq sauce because it's packed with flavor and juicy. Excellent choice. Cheese and jalapeño sausage - This is like a little treat. You can taste the spices used in the s

### Get reviewer origin location

In [34]:
# Elements that hold each reviewer's home location
html_origin = soup_sub.select('.arrange-unit-fill__373c0__17z0h .border-color--default__373c0__2oFDT .border-color--default__373c0__2oFDT .border-color--default__373c0__2oFDT .css-n6i4z7')

# Plain-text location of every reviewer
soup_origin = [node.get_text() for node in html_origin]

# Show all collected locations
soup_origin

['Jacksonville, FL',
 'Salt Lake City, UT',
 'Orange County, CA',
 'Santa Maria, CA',
 'West Covina, CA',
 'Sacramento, CA',
 'Cary, NC',
 'Murrieta, CA',
 'El Paso, TX',
 'Williamsville, NY']

### Get reviewer friend count

In [38]:
# Elements placed right after the "friends" icon in each reviewer's stats
soup_friends = soup_sub.select('.arrange-unit-fill__373c0__17z0h .border-color--default__373c0__2oFDT .border-color--default__373c0__2oFDT .border-color--default__373c0__2oFDT .border-color--default__373c0__2oFDT .border-color--default__373c0__2oFDT .icon--16-friends-v2+ .border-color--default__373c0__2oFDT .css-1dgkz3l')

# Read each element's text and parse it as a number in one pass
friendcount = [float(tag.get_text()) for tag in soup_friends]

friendcount

[412.0, 29.0, 1735.0, 15.0, 0.0, 27.0, 21.0, 94.0, 666.0, 180.0]

### Get reviewer review count

In [39]:
# Elements placed right after the "review" icon in each reviewer's stats
soup_reviews = soup_sub.select('.arrange-unit-fill__373c0__17z0h .border-color--default__373c0__2oFDT .border-color--default__373c0__2oFDT .border-color--default__373c0__2oFDT .border-color--default__373c0__2oFDT .border-color--default__373c0__2oFDT .icon--16-review-v2+ .border-color--default__373c0__2oFDT .css-1dgkz3l')

# Read each element's text and parse it as a number in one pass
reviewcount = [float(tag.get_text()) for tag in soup_reviews]

reviewcount

[358.0, 78.0, 575.0, 11.0, 7.0, 50.0, 108.0, 93.0, 176.0, 112.0]

### Get reviewer picture count

In [40]:
# Elements placed right after the "photos" icon in each reviewer's stats
soup_pictures = soup_sub.select('.icon--16-photos-v2+ .border-color--default__373c0__2oFDT .css-1dgkz3l')

# Read each element's text and parse it as a number in one pass
picturecount = [float(tag.get_text()) for tag in soup_pictures]

picturecount

[651.0, 204.0, 1410.0, 15.0, 22.0, 18.0, 303.0, 563.0, 875.0, 209.0]

## Finding the user profile information
That is, the information that is only available on a reviewer's profile page and cannot be seen alongside their review.

### Finding the links

In [43]:
# Absolute profile URL for each reviewer: site root + the href of the
# username anchor collected earlier
link_user = ["https://www.yelp.com" + anchor.get('href') for anchor in soup_username]

link_user

['https://www.yelp.com/user_details?userid=JY6IepgSDkESrtMl1GCD_Q',
 'https://www.yelp.com/user_details?userid=KWoKuuL6m1zCCLLEc6IgBg',
 'https://www.yelp.com/user_details?userid=pE_rKN0nhg1QRbr9dxiDew',
 'https://www.yelp.com/user_details?userid=X4ov-Nb-cIgRFMY1q_kolQ',
 'https://www.yelp.com/user_details?userid=nyE4-wjrH4iJEAgaXxBYtw',
 'https://www.yelp.com/user_details?userid=BB20SM4_y6vKVB4mfnlZzA',
 'https://www.yelp.com/user_details?userid=_4dx5wBsmyv4FAQ-6Qp_Lw',
 'https://www.yelp.com/user_details?userid=FWVS68burzsuaavhBdc3qQ',
 'https://www.yelp.com/user_details?userid=F4i7RdxvkfNRTDEHWARTMA',
 'https://www.yelp.com/user_details?userid=M8eIIxg4zVGeaRFcHNrFCQ']

In [45]:
# WORK IN PROGRESS
# Download and parse every reviewer's profile page.
# Previously each iteration overwrote html_user / soup_user, so only the last
# page survived the loop, and the bare `soup_user` expression inside the loop
# body had no effect. The parsed pages are now kept in soups_user.
soups_user = []  # one parsed page per entry of link_user, in the same order
for userlink in link_user:
    # The timeout avoids hanging on a stalled connection; raise_for_status()
    # stops on an HTTP error instead of storing a parsed error page.
    html_user = requests.get(userlink, timeout=30)
    html_user.raise_for_status()
    soup_user = BeautifulSoup(html_user.content, 'lxml')
    soups_user.append(soup_user)

<!DOCTYPE HTML>
<!--[if lt IE 7 ]> <html xmlns:fb="http://www.facebook.com/2008/fbml" class="ie6 ie ltie9 ltie8 no-js" lang="en"> <![endif]--><!--[if IE 7 ]>    <html xmlns:fb="http://www.facebook.com/2008/fbml" class="ie7 ie ltie9 ltie8 no-js" lang="en"> <![endif]--><!--[if IE 8 ]>    <html xmlns:fb="http://www.facebook.com/2008/fbml" class="ie8 ie ltie9 no-js" lang="en"> <![endif]--><!--[if IE 9 ]>    <html xmlns:fb="http://www.facebook.com/2008/fbml" class="ie9 ie no-js" lang="en"> <![endif]--><!--[if (gt IE 9)|!(IE)]><!--><html class="no-js" lang="en" xmlns:fb="http://www.facebook.com/2008/fbml"> <!--<![endif]-->
<head>
<script nonce="353ee337">
            (function() {
                var main = null;

                var main=function(){window.onerror=function(k,a,c,i,f){var j=(document.getElementsByTagName("html")[0].getAttribute("webdriver")==="true"||navigator.userAgent==="selenium");var h=f&&(f.name==="ServerSideRenderingError"||f.name==="CSRFallbackError");if(j&&!h){documen

### Finding the HTML page of user 1

In [None]:
# Download and parse the profile page of the first reviewer (hard-coded user id).
# The timeout avoids hanging on a stalled connection; raise_for_status() stops
# on an HTTP error instead of letting the selectors applied to soup_user
# silently match nothing on an error page.
html_user = requests.get(
    'https://www.yelp.com/user_details?userid=JY6IepgSDkESrtMl1GCD_Q',
    timeout=30,
)
html_user.raise_for_status()
soup_user = BeautifulSoup(html_user.content, 'lxml')

### Finding the rating distribution

In [51]:
# One element per matched `.histogram_count` entry on the profile page
soup_ratedist = soup_user.select('.histogram_count')

# Read each entry's text and parse it as a number in one pass
distribution = [float(cell.get_text()) for cell in soup_ratedist]

distribution

[84.0, 157.0, 78.0, 29.0, 10.0]

### Finding the "Yelping since" date

In [52]:
# Paragraph(s) inside the second list item of the `.ysection` block
soup_yelpsince = soup_user.select('.ysection li:nth-child(2) p')

# Plain text of every matched paragraph
yelpsince = [para.get_text() for para in soup_yelpsince]

yelpsince

['September 2009']