In [2]:
import os
import re
import time
from datetime import date

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

### Purpose: Scrape the first 3 pages of Yelp search results in the "bagels" category for every zip code in NYC

In [116]:
# Load the CSV listing every NYC zip code.
# dtype=str makes pandas read every column as strings instead of inferring numeric types.
zip_nyc = pd.read_csv(filepath_or_buffer="zipcodes_nyc.csv", dtype=str)

In [119]:
# Partition the zip codes into 10 roughly equal chunks so the scraper
# can be run on one small chunk at a time.
zip_split = np.array_split(zip_nyc["zip_nyc"].tolist(), 10)

In [None]:
# Module-level accumulators, one list per scraped field.
# get_bagel_data appends to these, so they keep growing across calls
# until this cell is run again.
(
    name,
    phone,
    rev_count,
    price,
    food_type,
    rat,
    add,
    town,
    zip_search,
) = ([] for _ in range(9))

In [120]:
# Scrape the first three Yelp result pages for each zip code and return everything collected so far as one DataFrame


def get_bagel_data(zips):
    """Scrape Yelp "bagels" search results for every zip code in ``zips``.

    For each zip code the function sleeps for a randomly chosen number of
    seconds, then fetches three result pages (``start`` offsets 0, 10 and 20)
    through the Scrapingdog API and extracts one row per listing.

    Rows are appended to the module-level lists (``name``, ``phone``,
    ``rev_count``, ``price``, ``food_type``, ``rat``, ``add``, ``town``,
    ``zip_search``), so the returned frame also contains rows collected by
    earlier calls unless those lists have been reset.

    Parameters
    ----------
    zips : iterable
        Zip codes to search; each one is inserted into the Yelp ``find_loc``
        query parameter.

    Returns
    -------
    pandas.DataFrame
        Columns: name, phone, review_count, price, food_type, rating,
        address, town, zip_search. Fields missing from a listing are NaN.

    Raises
    ------
    KeyError
        If the ``SCRAPE_API`` environment variable is not set.
    """
    for z in zips:
        # Read the key before sleeping so a missing key fails immediately
        # rather than after a delay of up to several minutes.
        api_key = os.environ["SCRAPE_API"]

        # Random pause between zip codes (seconds).
        delay = np.random.choice(
            [0, 0, 0, 0, 7, 4, 6, 2, 10, 19, 7, 8, 34, 7, 100, 345, 542]
        )
        print(delay)
        print(z)
        time.sleep(delay)

        for i in [0, 10, 20]:
            target = (
                "https://www.yelp.com/search?cflt=bagels&find_loc={}&start={}".format(
                    z, i
                )
            )
            # Pass the Yelp URL via `params` so requests percent-encodes it.
            # Pasted in raw, its `&find_loc=...&start=...` would be parsed as
            # parameters of the Scrapingdog call instead of the Yelp URL.
            r = requests.get(
                "https://api.scrapingdog.com/scrape",
                params={"api_key": api_key, "url": target},
            )
            soup = BeautifulSoup(r.content, "lxml")

            for item in soup.select("[class*=container]"):
                heading = item.find("h4")
                if heading is None:
                    # Containers without an <h4> are skipped.
                    continue

                # Extract every field BEFORE appending anything, so the
                # module-level lists always stay the same length.
                shop_name = heading.get_text()
                shop_phone = _first_text(item, "[class*=secondaryAttributes]")
                shop_reviews = _first_text(item, "[class*=reviewCount]")
                shop_price = _first_text(item, "[class*=priceRange]")
                shop_food_type = _first_text(item, "[class*=priceCategory]")

                rating_tags = item.select("[aria-label*=rating]")
                shop_rating = rating_tags[0]["aria-label"] if rating_tags else np.nan

                address_tag = item.find("address")
                shop_address = (
                    address_tag.get_text() if address_tag is not None else np.nan
                )

                # NOTE(review): this class name looks auto-generated and may
                # change when Yelp redeploys; confirm it still matches.
                shop_town = _first_text(item, "[class*=margin-b1__09f24__1647o]")

                name.append(shop_name)
                phone.append(shop_phone)
                rev_count.append(shop_reviews)
                price.append(shop_price)
                food_type.append(shop_food_type)
                rat.append(shop_rating)
                add.append(shop_address)
                town.append(shop_town)
                zip_search.append(z)

    df = pd.DataFrame(
        {
            "name": name,
            "phone": phone,
            "review_count": rev_count,
            "price": price,
            "food_type": food_type,
            "rating": rat,
            "address": add,
            "town": town,
            "zip_search": zip_search,
        }
    )

    return df


def _first_text(item, selector):
    """Return the text of the first element under ``item`` matching the CSS
    ``selector``, or NaN when nothing matches."""
    matches = item.select(selector)
    return matches[0].get_text() if matches else np.nan

In [123]:
# Save the bagel data as both a CSV and a pickle file.
# The dated file stem is built once so both files are guaranteed to share the
# same date prefix, even if the cell runs across midnight.
# NOTE(review): `bagel_nyc_data` is not assigned in any cell visible here --
# presumably it is the result of get_bagel_data(...); confirm before running.
output_stem = date.today().strftime("%Y%m%d") + "_bagel_nyc_data_10"

bagel_nyc_data.to_csv(output_stem + ".csv", index=False)
bagel_nyc_data.to_pickle(output_stem + ".pkl")