In [1]:
import os
import re

import pandas as pd
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# load variables from a local .env file (if one exists) into the process
# environment, so later cells can read them through os.environ
load_dotenv()

ModuleNotFoundError: No module named 'dotenv'

In [59]:
# grab the zip code we've saved as an environment variable
zip_code = os.environ.get('ZIP')

# the on sale products page does not show all of the products, and the
# total number of deals is unknown, so we cap how many extra pages of
# products we request through the "load more" button
LOAD_MORE_CLICKS = 10
LOAD_MORE_SELECTOR = "button.w-button:nth-child(5)"

driver = webdriver.Firefox()
driver.maximize_window()
# access Whole Foods' products that are on sale via selenium
driver.get("https://www.wholefoodsmarket.com/products/all-products?featured=on-sale")
assert "Whole Foods" in driver.title

# explicit-wait helper: every wait.until(...) below polls for up to 3 seconds
wait = WebDriverWait(driver, 3)

# the store finder is not shown when a store is already selected,
# so a missing search field is expected and not an error
try:
    store = driver.find_element(by=By.ID, value="pie-store-finder-modal-search-field")
except NoSuchElementException:
    store = None
    print("Store finder not shown; assuming a store is already selected")

if store is not None:
    if not zip_code:
        # without a zip code we cannot pick a store; stop early with a clear
        # message and do not leave the browser session running
        driver.quit()
        raise RuntimeError("Set the ZIP environment variable to choose a store")

    # ensure the field is empty, then type the zip code and submit it
    store.clear()
    store.send_keys(zip_code)
    store.send_keys(Keys.RETURN)

    try:
        # wait until a nearby Whole Foods shows up as a clickable option
        closest = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'li.wfm-search-bar--list_item:nth-child(1)')))
        closest.click()
    except TimeoutException:
        print("No store suggestion appeared for the given zip code; continuing without selecting one")

for _ in range(LOAD_MORE_CLICKS):
    try:
        # look the button up again before every click: it is not there
        # initially and the reference may go stale once more products load
        load_more = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, LOAD_MORE_SELECTOR)))
    except TimeoutException:
        print("Load more button not available; using the products loaded so far")
        break
    load_more.click()

In [60]:
# grab the html source now that the extra deals have been loaded,
# which is more than the page shows right after opening it
content = driver.page_source
# quit() ends the whole WebDriver session; close() would only close the
# current window and leave the session behind
driver.quit()

# make working with the html easier with BeautifulSoup
dom = BeautifulSoup(content, 'html.parser')
body = dom.body

In [61]:
# collect every element in the body that carries the product-tile class

# `class` is a reserved word in python, so BeautifulSoup exposes the
# CSS class filter through the `class_` keyword argument
products = body.find_all(class_='w-pie--product-tile')

In [62]:
# dollar amount inside strings such as "$3.49", "$3.49/lb" or "2 for $5";
# anything after the number (for example "/lb") is ignored
PRICE_PATTERN = re.compile(r'\$\s*(\d+(?:\.\d+)?|\.\d+)')
# leading "N for" that marks a multi-item deal such as "2 for $5"
MULTI_ITEM_PATTERN = re.compile(r'\s*(\d+)\s*for\b', re.IGNORECASE)


def parse_dollars(text):
    """Return the first dollar amount found in ``text`` as a float.

    Raises:
        ValueError: if ``text`` contains no ``$<number>``.
    """
    match = PRICE_PATTERN.search(str(text))
    if match is None:
        raise ValueError(f'no dollar price found in {text!r}')
    return float(match.group(1))


def per_item_price(text):
    """Return the price of a single item described by ``text``.

    A multi-item deal ("2 for $5") is divided by its item count so that
    it is comparable with single-item prices; any other text is read as
    a plain price.

    Raises:
        ValueError: if ``text`` contains no ``$<number>``.
    """
    text = str(text)
    price = parse_dollars(text)
    multi_item = MULTI_ITEM_PATTERN.match(text)
    quantity = int(multi_item.group(1)) if multi_item else 1
    return price / quantity


data = []

# loop through every product
for tile in products:
    # the element holding the brand, name and pricing information
    info = tile.find(attrs={'class': 'w-pie--product-tile__content'})

    brand = info.find(attrs={'class': 'w-cms--font-disclaimer'}).text
    item = info.find(attrs={'class': 'w-cms--font-body__sans-bold'}).text

    regular_text = info.find(attrs={'class': 'regular_price has_sale'}).text
    # the sale price is the node right after the first screen-reader label
    sale_text = info.find(attrs={'class': 'sr-only'}).next_sibling

    regular_price = parse_dollars(regular_text)
    sale_price = per_item_price(sale_text)

    # there may not be a prime price, so it defaults to the sale price;
    # each price decides on its own text whether it is a multi-item deal
    prime_price = sale_price
    prime_tag = info.find(attrs={'class': 'prime_price'})
    if prime_tag is not None:
        prime_label = prime_tag.find(class_='sr-only')
        prime_text = prime_label.next_sibling if prime_label is not None else None
        if prime_text is not None:
            try:
                prime_price = per_item_price(prime_text)
            except ValueError:
                # unreadable prime price: keep the sale price as the fallback
                prime_price = sale_price

    data.append({'Brand': brand,
                 'Item': item,
                 'Regular': regular_price,
                 'Sale': sale_price,
                 'Prime': prime_price})

In [63]:
# build the table from the scraped records; naming the columns keeps the
# schema intact even when no products were scraped, so later cells that
# select these columns still work
df = pd.DataFrame(data, columns=['Brand', 'Item', 'Regular', 'Sale', 'Prime'])

In [64]:
# show the scraped products (the notebook abbreviates long tables)
df

Unnamed: 0,Brand,Item,Regular,Sale,Prime
0,PRODUCE,Organic Green Asparagus,5.99,3.49,3.14
1,SEAFOOD,Atlantic Cod Fillet,15.99,12.99,11.69
2,Oatly,"Original Oatmilk, 64 fl oz",5.49,4.49,4.04
3,PRODUCE,Red Seedless Grapes,2.99,2.49,2.24
4,PRODUCE,Organic Tomato On The Vine,3.49,2.99,2.69
...,...,...,...,...,...
535,Mt. Vikos,Barrel Aged Feta,16.99,13.99,12.59
536,Oatly,"Chocolate Frozen Dessert, 1 pint",5.99,3.99,3.59
537,Tom's of Maine,"Cool Mountain Mint Mouthwash, 16 fl oz",6.49,5.49,4.94
538,Tony's Chocolonely,"32% Milk Chocolate Honey Almond Nougat Bar, 6....",4.99,3.99,3.59


In [65]:
# keep only the price columns, ordered from highest to lowest price tier
price_columns = ['Regular', 'Sale', 'Prime']
prices = df[price_columns]

In [66]:
# fractional change of each price column relative to the column on its
# left, so 'Sale' holds the change from the regular price
discount = prices.pct_change(axis=1)

In [67]:
# store the regular -> sale change next to the prices
df['Sale_Discount'] = discount.loc[:, 'Sale']

In [68]:
# fractional change going straight from the regular price to the prime price
regular_and_prime = df[['Regular', 'Prime']]
prime_discount = regular_and_prime.pct_change(axis=1)

In [69]:
# store the regular -> prime change next to the prices
df['Prime_Discount'] = prime_discount.loc[:, 'Prime']

In [70]:
# show the table again, now including the Sale_Discount and Prime_Discount columns
df

Unnamed: 0,Brand,Item,Regular,Sale,Prime,Sale_Discount,Prime_Discount
0,PRODUCE,Organic Green Asparagus,5.99,3.49,3.14,-0.417362,-0.475793
1,SEAFOOD,Atlantic Cod Fillet,15.99,12.99,11.69,-0.187617,-0.268918
2,Oatly,"Original Oatmilk, 64 fl oz",5.49,4.49,4.04,-0.182149,-0.264117
3,PRODUCE,Red Seedless Grapes,2.99,2.49,2.24,-0.167224,-0.250836
4,PRODUCE,Organic Tomato On The Vine,3.49,2.99,2.69,-0.143266,-0.229226
...,...,...,...,...,...,...,...
535,Mt. Vikos,Barrel Aged Feta,16.99,13.99,12.59,-0.176574,-0.258976
536,Oatly,"Chocolate Frozen Dessert, 1 pint",5.99,3.99,3.59,-0.333890,-0.400668
537,Tom's of Maine,"Cool Mountain Mint Mouthwash, 16 fl oz",6.49,5.49,4.94,-0.154083,-0.238829
538,Tony's Chocolonely,"32% Milk Chocolate Honey Almond Nougat Bar, 6....",4.99,3.99,3.59,-0.200401,-0.280561


In [78]:
# mean and median of the sale discount (negative values are price drops)
sale_column = df['Sale_Discount']
print(sale_column.mean())
print(sale_column.median())

-0.1833603872305773
-0.18214936247723135


The mean discount on an on-sale product is `18.34%`
The median discount on an on-sale product is `18.21%`

In [72]:
# mean and median of the prime discount (negative values are price drops)
prime_column = df['Prime_Discount']
print(prime_column.mean())
print(prime_column.median())

-0.2648014214482892
-0.2641165755919854


The mean discount after using prime is `26.48%`
The median discount after using prime is `26.41%`

In [85]:
# look at the deals of a single brand
is_organic_valley = df['Brand'] == 'Organic Valley'
df.loc[is_organic_valley]

Unnamed: 0,Brand,Item,Regular,Sale,Prime,Sale_Discount,Prime_Discount
5,Organic Valley,Organic Whole Milk Grassmilk,5.79,5.29,4.76,-0.086356,-0.177893
19,Organic Valley,Organic 2% Reduced Fat Grassmilk,5.79,5.29,4.76,-0.086356,-0.177893
20,Organic Valley,Lactose Free 2% Reduced Fat Milk,5.29,4.99,4.49,-0.056711,-0.151229
29,Organic Valley,Organic Lactose Free Whole Milk (1/2 Gl),5.29,4.99,4.49,-0.056711,-0.151229
56,Organic Valley,"Organic Low Fat Lactose Free Milk (1/2 Gl), 0....",5.29,4.99,4.49,-0.056711,-0.151229
57,Organic Valley,"Organic Nonfat Lactose Free Milk (1/2 Gl), 0.5...",5.29,4.99,4.49,-0.056711,-0.151229
95,Organic Valley,"Organic Mozzarella Cheese Stringles, 1 each",5.99,4.99,4.49,-0.166945,-0.250417
153,Organic Valley,Organic Fat Free Milk,5.79,5.29,4.76,-0.086356,-0.177893


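Given these discounts, we can estimate how much a person would have to buy in order to earn back their Amazon Prime payment. A Prime membership costs `$140` per year, and the discount for on-sale products with Prime is roughly `25%` off the regular price (the measured mean is `26.48%`; `25%` keeps the arithmetic simple and slightly conservative). The savings therefore reach `$140` once the regular prices of the purchases add up to `140 × 4 = 560`. Thus, a person would need to buy on-sale products worth `$560` at regular prices (paying roughly `$420` after the discount) at Whole Foods in order to earn back their Amazon Prime subscription. Note that this estimate treats the entire discount as a Prime benefit, even though the `Sale` column shows that about `18%` is already taken off before the Prime price applies.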

A student will only need to buy half as much: the student membership is `$70` per year, so they need to buy on-sale products worth `$280` at regular prices per year.