In [1]:
from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException

from bs4 import BeautifulSoup
import pandas as pd
import re
load_dotenv()

True

In [2]:
# grab the zip code we've saved as an environment variable
zip_code = os.environ.get('ZIP')

driver = webdriver.Firefox()
driver.maximize_window()
# access Whole Foods' products that are on sale via selenium
driver.get("https://www.wholefoodsmarket.com/products/all-products?featured=on-sale")
assert "Whole Foods" in driver.title

# wait for the page to load
wait = WebDriverWait(driver, 3)

# try/except in case selenium opens us logged in for some reason
try:
    # we create a variable to grab the search field
    store = driver.find_element(by=By.ID, value="pie-store-finder-modal-search-field")
    # ensure it's empty
    store.clear()
    # selenium adds our zip code into the text field
    store.send_keys(zip_code)
    store.send_keys(Keys.RETURN)
    
    # wait until a nearby Whole Foods shows up as a clickable option
    closest = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'li.wfm-search-bar--list_item:nth-child(1)')))
    closest.click()
except:
    NoSuchElementException('Either you already have the lcoation or there\'s an error')

# the on sale products page does not show all of the products
# in fact, there are so many deals on sale that I have no idea
# what the exact amount is, so I limit the amount of extra
# product pages that we load more to 5

# we wait for the load more button to load since it doesn't initially
load_more = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button.w-button:nth-child(5)")))
for _ in range(5):
    # we click 5 times
    load_more.click()

In [3]:
# grabbing the html source after loading more deals
# than we would have gotten after initially loading
# the page
content = driver.page_source

# make working with the html easier with BeautifulSoup
dom = BeautifulSoup(content, 'html.parser')
body = dom.body

In [5]:
# select the products by grabbing all content in the body
# that have the class for a product item

# we need to use a dictionary to select this css atrribute
# because python already has a class keyword

# EDIT: after doing a ton of work with this dictionary-type
# attribute search style, I learned BeautifulSoup can just
# use `class_` in order to search by CSS class :/
products = body.find_all(attrs={'class': 'w-pie--product-tile'})

In [92]:
data = []

# loop through every product
for i in products:
    # search for elements with the pricing information
    info = i.find(attrs={'class': 'w-pie--product-tile__content'})
    
    brand = info.find(attrs={'class': 'w-cms--font-disclaimer'}).text
    item = info.find(attrs={'class': 'w-cms--font-body__sans-bold'}).text
    
    # need to use regex to find numbers following dollar sign
    # or if there's nothing following the numbers
    # or if there's `/lb` following the numbers
    regular = info.find(attrs={'class': 'regular_price has_sale'}).text
    # print(regular)
    re_regular = re.search(r'(\$)(.*)', sale)[2]
    # print(re_regular)
    
    sale = info.find(attrs={'class': 'sr-only'}).next_sibling
    re_sale = re.search(r'(\$)(.*)', sale)[2]
    # print(re_sale)
    try:
        content_prime = info.find(attrs={'class': 'prime_price'})
        prime = content_prime.find(class_='sr-only').next_sibling
        re_prime = re.search(r'(\$)(.*)', prime)[2]
        
    except:
        AttributeError('There may not be a prime price, so lets set it to the sale price')
        re_prime = sale
        
    try:
        lb = re_sale.index('/lb')
        re_sale = re_sale[:lb]
    except:
        ValueError("Looks like this isn't measured in lb")
    try:    
        lb2 = re_prime.index('/lb')
        re_prime = re_prime[:lb2]
    except:
        ValueError("Looks like this isn't measured in lb")
    try:    
        lb3 = re_regular.index('/lb')
        re_regular = re_regular[:lb3]
    except:
        ValueError("Looks like this isn't measured in lb")
    print(re_regular)
    
    data.append({'Brand': brand,
                'Item': item,
               'Regular': float(re_regular),
               'Sale': float(re_sale),
               'Prime': float(re_prime)})

14.99
14.99
3.49
2.49
4.99
13.99
12.99
2.49
4.49
5
7.99
5.99
4.49
21.99
4.79
5.99
5.59
2.99
3.49
5.99
3.99
5.99
4.99
3.69
5.99
9.99
5.99
3.49
4.79
4.99
5.99
5.69
4.99
4.99
3.99
12.99
4.99
4.99
4.49
2.99
3.69
4.99
8.99
7.49
4.99
5.59
7.49
5.99
8.99
3.79
5.59
7.19
3.99
6.99
8.99
6.49
8
10.49
2.19
7.99
6.79
4.49
10.99
4.99
5.59
4.99
4.99
4.49
4.99
10
6
8.99
7.99
3.99
3.99
12.99
14.99
3.99
4.99
2.79
3.99
7.99
3.99
6.49
8
8
5.59
10
5.99
4.69
30.49
36.49
4.99
4.19
4.79
1.79
8.99
6.49
2.99
12.49
8.99
5.99
4.99
3.79
4.79
2.99
8
5.99
4
10
3.99
3.99
6.99
8
8.99
7.49
2.99
16.49
4
20.99
3.99
5.39
10
4.49
4.49
2.99
1.89
8
8.99
8.99
3.99
5.49
8.99
4.99
8.99
5.59
9.49
5.79
8
5.29
4.99
3.79
4.19
7.69
7.69
5.99
3.39
2.99
5.39
3.99
3.99
4.79
7.69
2.49
2.99
5.99
4.19
14.29
11.99
3.99
9
8
6.99
5
4
5
10.79
3.99
5.29
4.49
5.99
1.89
6.99
9.69
6
5.99
36.49
3.99
36.49
5
9
5
8.99
5
5.79
1.59
6
7.49
1.59
3.19
3.49
6
6.99
5.99
6.99
14.69
3.99
54.99
5
8.99
5
8.49
6
2.39
8
10
8
6.39
5.49
3.99
4.99
7.79
6.49
5
3.99


ValueError: could not convert string to float: '$12.99'