# Coffee Lovers Unite!
If caffeine is one of the most popular drugs, then coffee is likely one of the most popular delivery systems for it. Aside from caffeine, people enjoy the wonderful variety of coffee-related drinks. Let’s do a rough investigation of the “market share” held by two of the top coffee chains in the United States!

World Population Review provides some great data on store locations and chain prevalence. Check out this page for the Starbucks Coffee locations in the United States. Notice that this page only really gives the name of the state and the number of locations in that state.

# Scrape the Location Counts
1. Use the beautifulsoup library to scrape the data (from the link above) on state names and corresponding number of store locations, for the following chains:

Starbucks

Dunkin’ Donuts

2. Parse, merge and tidy your data. Think carefully about what the tidy version of this dataset is with multiple years represented on the website.

In [178]:
# import libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [179]:
# Download the Starbucks-by-state page and locate its data table
response = requests.get("https://worldpopulationreview.com/state-rankings/starbucks-stores-by-state")
soup = BeautifulSoup(response.content, "html.parser")

table = soup.find("table", class_ = "wpr-table")

# The first four <th> cells are skipped; every later <th> is read as a state name
states_list = [header.get_text(strip = True) for header in table.find_all("th")[4:]]

# Index of each year's count among the <td> cells of a row
year_positions = (("2023", 0), ("2021", 1), ("2024", 2))

rows = []
for row_number, table_row in enumerate(table.find_all("tr")[1:]):
    # rows and state names are paired by position (first <tr> is skipped)
    record = {"state" : states_list[row_number]}

    data_cells = table_row.find_all("td")
    for year_label, position in year_positions:
        cell = data_cells[position]
        # use the link text when the value is wrapped in an <a> tag
        record[year_label] = (cell.find("a") or cell).get_text(strip = True)

    rows.append(record)

# one row per state, one column per year
starbucks = pd.DataFrame(rows)

# reshape to one row per (state, year) and tag the chain
starbucks_clean = starbucks.melt(id_vars = ["state"], var_name = "year", value_name = "store_count")
starbucks_clean["location"] = "Starbucks"


In [233]:
# Dunkin Donuts

# Download the Dunkin-by-state page and locate its data table
response = requests.get("https://worldpopulationreview.com/state-rankings/dunkin-donuts-by-state")
soup = BeautifulSoup(response.content, "html.parser")

table = soup.find("table", class_ = "wpr-table")

# The first three <th> cells are skipped; every later <th> is read as a state name
states_list = [header.get_text(strip = True) for header in table.find_all("th")[3:]]

# Index of each year's count among the <td> cells of a row
# (2024 is the first cell and 2023 the second on this page)
year_positions = (("2023", 1), ("2024", 0))

rows = []
for row_number, table_row in enumerate(table.find_all("tr")[1:]):
    # rows and state names are paired by position (first <tr> is skipped)
    record = {"state" : states_list[row_number]}

    data_cells = table_row.find_all("td")
    for year_label, position in year_positions:
        cell = data_cells[position]
        # use the link text when the value is wrapped in an <a> tag
        record[year_label] = (cell.find("a") or cell).get_text(strip = True)

    rows.append(record)

# one row per state, one column per year
dunkin = pd.DataFrame(rows)

# reshape to one row per (state, year) and tag the chain
dunkin_clean = dunkin.melt(id_vars = ["state"], var_name = "year", value_name = "store_count")
dunkin_clean["location"] = "Dunkin"


In [250]:
# combine dunkin and starbucks datasets

# Both tidy frames have the same four columns (state, year, store_count,
# location), so they are stacked row-wise instead of joined on every column.
merged_coffee = pd.concat([starbucks_clean, dunkin_clean], ignore_index = True)

# melt() leaves the year labels as text; store them as integers
merged_coffee["year"] = merged_coffee["year"].astype(int)

# Store counts are scraped as text and may carry thousands separators;
# convert them to numbers so they can be summed and compared.
# Anything that is not a number becomes NaN instead of raising.
merged_coffee["store_count"] = pd.to_numeric(
    merged_coffee["store_count"].astype(str).str.replace(",", "", regex = False),
    errors = "coerce",
)

# Supplemental Data
4. Scrape the state names and populations from this wikipedia page. Merge these data with your coffee dataset.

In [251]:

response = requests.get("https://simple.wikipedia.org/wiki/List_of_U.S._states_by_population")
# fail here with a clear HTTP error rather than later on a missing table
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

tables = soup.find_all("table", class_ = "wikitable")

# extract table: the first "wikitable" on the page is the one scraped
table = tables[0]

# scrape population data
rows = []

for tr in table.find_all("tr")[1:]:
    cells = tr.find_all("td")

    # rows with fewer than four <td> cells cannot hold both values; skip them
    if len(cells) < 4:
        continue

    # state name is read from the third <td> (link text when present)
    state_tag = cells[2].find("a") or cells[2]
    state_name = state_tag.get_text(strip = True)

    # population is read from the fourth <td> (link text when present)
    pop_tag = cells[3].find("a") or cells[3]
    pop_text = pop_tag.get_text(strip = True)

    rows.append({
        "state" : state_name,
        "population" : pop_text
    })

# convert to a df (columns are named explicitly so an empty scrape still has them)
population = pd.DataFrame(rows, columns = ["state", "population"])

# The scraped values are text; drop any bracketed footnote markers and
# thousands separators, then convert to numbers (unparseable values -> NaN).
population["population"] = pd.to_numeric(
    population["population"]
        .str.replace(r"\[.*?\]", "", regex = True)
        .str.replace(",", "", regex = False),
    errors = "coerce",
)

# merge dataframes: keep every coffee row, add population where the state matches
merged_pop = merged_coffee.merge(population, on = "state", how = "left")


5. Find the revenue, stock price, or your financial metric of choice for each of the companies listed above (if you can find a website to scrape these from that’s great!…but it’s okay if you manually enter these). Merge these values into your big dataset. Note: these values may be repeated for each state.

In [252]:
# scrape starbucks revenue table from website
response = requests.get("https://companiesmarketcap.com/starbucks/revenue/")
# fail here with a clear HTTP error rather than later on a missing table
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# the first <table> on the page is the one scraped
table = soup.find("table")

rows = []

for tr in table.find_all("tr")[1:]:
    cells = tr.find_all("td")

    # get year values (first cell; link text when present)
    year_tag = cells[0].find("a") or cells[0]
    year = year_tag.get_text(strip = True)

    # get revenue values (second cell; link text when present)
    revenue_tag = cells[1].find("a") or cells[1]
    revenue = revenue_tag.get_text(strip = True)

    # append rows
    rows.append({
        "year" : year,
        "revenue" : revenue
    })

# convert to a df
starbucks_revenue = pd.DataFrame(rows)

# Keep only the four-digit year. This handles a trailing-twelve-months label
# such as "2024(TTM)" for whichever year is current, instead of hard-coding it.
starbucks_revenue["year"] = (
    starbucks_revenue["year"].str.extract(r"(\d{4})", expand = False).astype(int)
)

# Strip the "$" prefix and " B" suffix as literal text. regex=False is explicit
# because "$" is a regex metacharacter and the default of `regex` differs
# between pandas versions.
# NOTE(review): assumes every revenue value is written with a " B" suffix - confirm.
starbucks_revenue["revenue"] = (
    starbucks_revenue["revenue"]
        .str.replace("$", "", regex = False)
        .str.replace(" B", "", regex = False)
        .astype(float)
)

# add location column
starbucks_revenue["location"] = "Starbucks"


In [253]:
# Dunkin Donuts revenue, typed in by hand - *** CHECK 2024 REVENUE FOR DUNKIN
# NOTE(review): presumably in the same unit as the Starbucks revenue column - confirm.
dunkin_revenue = pd.DataFrame(
    [
        {"year" : 2023, "revenue" : 1.4, "location" : "Dunkin"},
        {"year" : 2024, "revenue" : 1.6, "location" : "Dunkin"},
    ]
)

# stack the Starbucks and Dunkin revenue tables into one frame with a fresh 0..n-1 index
revenue_frames = [starbucks_revenue, dunkin_revenue]
merged_revenue = pd.concat(revenue_frames, ignore_index=True)


In [254]:
# merge revenue data with the coffee + population data
# merged_pop (the coffee rows with population attached) is the left table so
# the population column is carried into the result; revenue is matched per
# chain and year, and rows without a matching revenue keep NaN.
merged_rev_coffee = merged_pop.merge(merged_revenue, on = ["location", "year"], how = "left")

6. Create a region variable in your dataset according to the scheme on this wikipedia page: Northeast, Midwest, South, West. You do not need to scrape this information.

In [267]:
# create region variable
# Four-region scheme (Northeast, Midwest, South, West) as requested by the
# exercise. The District of Columbia is listed under both spellings so the
# lookup matches whichever one the scraped state column uses.
regions_dict = {
    "Northeast": [
        "Connecticut", "Maine", "Massachusetts", "New Hampshire", "Rhode Island", "Vermont",
        "New Jersey", "New York", "Pennsylvania",
    ],
    "Midwest": [
        "Illinois", "Indiana", "Michigan", "Ohio", "Wisconsin",
        "Iowa", "Kansas", "Minnesota", "Missouri", "Nebraska", "North Dakota", "South Dakota",
    ],
    "South": [
        "Delaware", "Florida", "Georgia", "Maryland", "North Carolina", "South Carolina",
        "Virginia", "West Virginia", "District of Columbia", "Washington, D.C.",
        "Alabama", "Kentucky", "Mississippi", "Tennessee",
        "Arkansas", "Louisiana", "Oklahoma", "Texas",
    ],
    "West": [
        "Arizona", "Colorado", "Idaho", "Montana", "Nevada", "New Mexico", "Utah", "Wyoming",
        "Alaska", "California", "Hawaii", "Oregon", "Washington",
    ],
}

# create regions df: one (state, Region) row per state name listed above
regions = pd.DataFrame(
    [(state, region) for region, states in regions_dict.items() for state in states],
    columns=['state', 'Region'],
)

# attach a Region to every row by state name; rows whose state is not in
# `regions` are kept (left join) with a missing Region
all_data = pd.merge(merged_rev_coffee, regions, how = "left", on = "state")

# Analyze
7. Assess and comment on the prevalence of each chain. Some questions to consider (you don’t need to answer all of these and you may come up with your own):

Are some of these chains more prevalent in certain states than others, possibly despite having fewer stores overall? The same questions apply to regions instead of states.

How does your chosen financial metric change by state and region for each chain? For example, having 5 stores in California is very different from having 5 stores in Wyoming.

Does the distribution of each chain’s stores match population distribution, by both state/region?

Do the financial data match what you’d expect based on the number and locations of the stores? Why or why not?

# Automate
Convert your code for Exercises 1-3 above to a function that takes a single argument: the URL. This function should

Scrape the information on state names and corresponding number of store locations on the webpage specified (assume the page has a table in the same form and placement as the ones you scraped above)

Extract the name of the company from either the URL specified or the webpage (assume the URL will have the same format as the ones used above)

Return a clean, organized and tidy dataset. Find a page other than Starbucks and Dunkin’ Donuts to test this on to confirm that it works. It’s fine if this is not related to coffee.

In [None]:
def get_data(url):
    """Scrape a "<chain>-by-state" store-count table into a tidy DataFrame.

    The page is expected to contain a ``<table class="wpr-table">`` laid out
    like the Starbucks and Dunkin pages scraped above: the first ``<tr>`` is a
    header row whose first ``<th>`` labels the state column and whose other
    ``<th>`` cells label the value columns; every later ``<th>`` holds a state
    name and every later ``<tr>`` holds that state's ``<td>`` values.
    NOTE(review): each value column is assumed to be one year of store
    counts - confirm for any new page.

    Parameters
    ----------
    url : str
        Address of the page to scrape.

    Returns
    -------
    pandas.DataFrame
        One row per (state, value column) with the columns ``state``,
        ``year``, ``store_count`` and ``location``. ``year`` is the four-digit
        year found in the column header (stored as int when every header has
        one), otherwise the header text. ``store_count`` is numeric, with NaN
        where the cell is missing or not a number. ``location`` is the company
        name derived from the URL.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page has no usable ``wpr-table`` table.
    """
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    table = soup.find("table", class_ = "wpr-table")
    if table is None:
        raise ValueError(f"no table with class 'wpr-table' found at {url}")

    table_rows = table.find_all("tr")
    if not table_rows:
        raise ValueError(f"the table at {url} has no rows")

    # Header row: first label is the state column, the rest are value columns.
    header_cells = table_rows[0].find_all("th")
    headers = [th.get_text(strip = True) for th in header_cells]
    value_columns = headers[1:]
    if not value_columns:
        raise ValueError(f"the table at {url} has no value columns")

    # Every <th> after the header cells holds the state name of one data row.
    states_list = [
        th.get_text(strip = True)
        for th in table.find_all("th")[len(header_cells):]
    ]

    rows = []
    for state, tr in zip(states_list, table_rows[1:]):
        record = {"state" : state}
        for column, cell in zip(value_columns, tr.find_all("td")):
            # use the link text when the value is wrapped in an <a> tag
            tag = cell.find("a") or cell
            record[column] = tag.get_text(strip = True)
        rows.append(record)

    wide = pd.DataFrame(rows, columns = ["state"] + value_columns)

    # tidy form: one row per state and value column
    tidy = wide.melt(id_vars = ["state"], var_name = "year", value_name = "store_count")

    # Reduce each header to its four-digit year when it contains one.
    year_text = tidy["year"].astype(str)
    year_text = year_text.str.extract(r"(\d{4})", expand = False).fillna(year_text)
    if len(year_text) and year_text.str.fullmatch(r"\d{4}").all():
        tidy["year"] = year_text.astype(int)
    else:
        tidy["year"] = year_text

    # Counts are scraped as text and may carry thousands separators.
    tidy["store_count"] = pd.to_numeric(
        tidy["store_count"].astype(str).str.replace(",", "", regex = False),
        errors = "coerce",
    )

    tidy["location"] = _company_from_url(url)
    return tidy


def _company_from_url(url):
    """Turn ".../dunkin-donuts-by-state" into a display name like "Dunkin Donuts"."""
    # last path segment, ignoring any query string, fragment or trailing slash
    slug = url.split("?", 1)[0].split("#", 1)[0].rstrip("/").rsplit("/", 1)[-1]
    for suffix in ("-by-state", "-stores"):
        if slug.endswith(suffix):
            slug = slug[: -len(suffix)]
    return slug.replace("-", " ").title()


    

In [306]:
# test function
# Scrapes a page other than Starbucks/Dunkin with the same steps and builds
# the tidy table, labelling the value columns from the page's own headers
# instead of hard-coded years.

url = "https://worldpopulationreview.com/state-rankings/chick-fil-a-by-state"

response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

table = soup.find("table", class_ = "wpr-table")

# column headers are the <th> cells carrying the "datatable-th" class
header_cells = table.find_all("th", class_ = "datatable-th")
headers = [th.get_text(strip = True) for th in header_cells]
index = len(header_cells)

# get state names: every <th> after the column headers
states_list = [th.get_text(strip = True) for th in table.find_all("th")[index:]]

# NOTE(review): assumes the first header labels the state column, so the
# remaining headers line up with a row's <td> cells - confirm on this page.
value_columns = headers[1:]

rows = []
for i, tr in enumerate(table.find_all("tr")[1:]):
    state = states_list[i]

    cells = tr.find_all("td")

    record = {"state" : state}
    for column, cell in zip(value_columns, cells):
        # use the link text when the value is wrapped in an <a> tag
        tag = cell.find("a") or cell
        record[column] = tag.get_text(strip = True)

    rows.append(record)

# convert to a dataframe
chick_fil_a = pd.DataFrame(rows, columns = ["state"] + value_columns)

# clean and tidy data: one row per state and value column
chick_fil_a_clean = chick_fil_a.melt(id_vars = ["state"], var_name = "year", value_name = "store_count")

# keep just the four-digit year from each header when it has one
year_text = chick_fil_a_clean["year"].astype(str)
chick_fil_a_clean["year"] = year_text.str.extract(r"(\d{4})", expand = False).fillna(year_text)

chick_fil_a_clean["location"] = "Chick-fil-A"

# Appendix

ChatGPT was used for help with:
- the `enumerate` function
- converting a list of dictionaries to a DataFrame



Revenue data sources:

Starbucks yearly revenue: https://companiesmarketcap.com/starbucks/revenue/

Dunkin' Donuts yearly revenue: 
https://www.zippia.com/dunkin-donuts-careers-554008/revenue/

