In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from time import sleep
from tqdm.notebook import trange, tqdm

In [2]:

# Download the McDonald's full-menu page; its HTML is kept in mc_page for parsing.
mc_url = "https://www.mcdonalds.com/us/en-us/full-menu.html"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() fails loudly on an HTTP error instead of letting an error
# page be parsed as if it were the menu.
mc_page = requests.get(mc_url, timeout=30)
mc_page.raise_for_status()

In [3]:
# Parse the full-menu page and collect every anchor whose href points at a product
# or a meal page; these anchors are the pages to scrape for nutritional data.

mc_soup = BeautifulSoup(mc_page.content, 'html.parser')
mc_items = mc_soup.find_all('a', href=re.compile("^/us/en-us/product"))
mc_meals = mc_soup.find_all('a', href=re.compile("^/us/en-us/meal"))

In [4]:
mc_main = 'https://www.mcdonalds.com'
# Relative hrefs of every product anchor, followed by those of every meal anchor.
mc_menu_item_names = [anchor.get('href') for anchor in [*mc_items, *mc_meals]]
# Prefix the site root to turn each relative href into a full page URL.
mc_links = [mc_main + href for href in mc_menu_item_names]

In [5]:
# Reduce each href to a bare item name (no path prefix, no '.html') for use as
# DataFrame labels later. Product names come first, then meal names.
mc_item_names = [
    href.removeprefix(prefix).removesuffix('.html')
    for prefix in ('/us/en-us/product/', '/us/en-us/meal/')
    for href in mc_menu_item_names
    if href.startswith(prefix)
]
# The two counts should match, since names and links are zipped together later.
print(len(mc_item_names))
print(len(mc_links))

129
129


In [6]:
# BeautifulSoup alone cannot reach the nutrition data: on the McDonald's website it is
# hidden behind a "Nutrition Summary" button that has to be pressed first, so Selenium
# drives a real browser and BeautifulSoup parses the page once the summary is open.

_NUMBER_RE = re.compile(r"[-+]?(?:\d*\.\d+|\d+)")


def _first_numbers(soup, item_class, span_class):
    """Collect the first number from each matching nutrition entry.

    Args:
        soup: BeautifulSoup tree of a page whose nutrition summary has been opened.
        item_class: class of the ``<li>`` elements to visit.
        span_class: class of the ``<span>`` inside each ``<li>`` whose text is read.

    Returns:
        A list of strings, one per matching ``<li>``: the first number found in the
        span's text (the text is a long sentence; only the leading figure is kept).

    Raises:
        AttributeError: if an ``<li>`` contains no span with ``span_class``.
        IndexError: if a span's text contains no number.
    """
    values = []
    for entry in soup.find_all('li', class_=item_class):
        text = entry.find('span', class_=span_class).text
        values.append(_NUMBER_RE.findall(text)[0])
    return values


driver = webdriver.Chrome('chromedriver')
mc_dict = {}
try:
    # zip() has no length, so the total is passed explicitly to give tqdm a real progress bar.
    pairs = zip(mc_links, mc_item_names)
    for (mc_link, mc_name) in tqdm(pairs, total=min(len(mc_links), len(mc_item_names))):
        # Open the page and press the nutrition summary button so the data is in the DOM.
        driver.get(mc_link)
        nutrition_summary_button = driver.find_element(By.XPATH, "//button[normalize-space()='Nutrition Summary']")
        # Accessing this property asks the browser to scroll the element into view.
        nutrition_summary_button.location_once_scrolled_into_view
        nutrition_summary_button.click()
        sleep(2)

        # Parse the page now that the nutrition summary is available.
        soup = BeautifulSoup(driver.page_source, 'lxml')

        # The nutrition data is separated into two sections that differ only in the
        # classes of their <li> and <span> elements.
        mc_dict[mc_name] = (
            _first_numbers(soup, 'cmp-nutrition-summary__heading-primary-item', 'sr-only sr-only-pd')
            + _first_numbers(soup, 'label-item', 'sr-only')
        )
finally:
    # Always shut the browser down, even if a page fails, so no Chrome/chromedriver
    # process is left running. Results gathered so far stay in mc_dict.
    driver.quit()

NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=105.0.5195.127)
Stacktrace:
Backtrace:
	Ordinal0 [0x0032DF13+2219795]
	Ordinal0 [0x002C2841+1779777]
	Ordinal0 [0x001D423D+803389]
	Ordinal0 [0x001BC70A+706314]
	Ordinal0 [0x00221502+1119490]
	Ordinal0 [0x0022E8FA+1173754]
	Ordinal0 [0x0021E616+1107478]
	Ordinal0 [0x001F7F89+950153]
	Ordinal0 [0x001F8F56+954198]
	GetHandleVerifier [0x00622CB2+3040210]
	GetHandleVerifier [0x00612BB4+2974420]
	GetHandleVerifier [0x003C6A0A+565546]
	GetHandleVerifier [0x003C5680+560544]
	Ordinal0 [0x002C9A5C+1808988]
	Ordinal0 [0x002CE3A8+1827752]
	Ordinal0 [0x002CE495+1827989]
	Ordinal0 [0x002D80A4+1867940]
	BaseThreadInitThunk [0x7511FA29+25]
	RtlGetAppContainerNamedObjectPath [0x771F7B5E+286]
	RtlGetAppContainerNamedObjectPath [0x771F7B2E+238]


In [273]:
# Spot-check one scraped item to confirm the process worked, especially the re.findall
# extraction. In the output recorded below, the ten values after the first four appear
# twice (24 entries in total).

mc_dict['deluxe-crispy-chicken-sandwich-meal']

['1050',
 '41',
 '142',
 '32',
 '6',
 '6',
 '47',
 '41',
 '61',
 '3.5',
 '65',
 '0',
 '1160',
 '1410',
 '6',
 '6',
 '47',
 '41',
 '61',
 '3.5',
 '65',
 '0',
 '1160',
 '1410']

In [274]:
expected_len = 14
# Keep only the first expected_len values of every item's list.
for name, values in mc_dict.items():
    mc_dict[name] = values[:expected_len]

In [275]:
# Print the name of every item whose list does not hold exactly expected_len values.
for name, values in mc_dict.items():
    if len(values) == expected_len:
        continue
    print(name)

mccaf-mocha-small
egg-mcmuffin-meal


In [276]:
# These items had issues during scraping or have no nutritional information on the
# McDonald's website, so each one is handled by hand using the website.

mc_dict['egg-mcmuffin-meal'] = [450, 21, 49, 21, 7, 4, 186, 21, 3, 3.5, 250, 2, 630, 1100]
# The default makes the cell safe to re-run: without it, a second run raises KeyError
# because the key is already gone. On the first run the popped list is still displayed.
mc_dict.pop('mccaf-mocha-small', None)

['0', '0', '0', '0']

In [277]:
# Print the length of every item's value list.
for values in mc_dict.values():
    print(len(values))

14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14


In [1]:
# Build a DataFrame from mc_dict: after the transpose there is one row per menu item
# and one column per nutrition value, labelled with mc_column_names.

mc_column_names = ['Calories', 'Total_Fat', 'Total_Carbs', 'Protein', 'Saturated_Fat', 'Dietary_Fiber', 'Calcium', 'Total_Fat2', 'Total_Sugars', 'Iron', 'Cholesterol', 'Vitamin_D', 'Potassium', 'Sodium' ]
mc_df = pd.DataFrame(mc_dict).transpose()
mc_df.columns = mc_column_names
# Drop duplicated rows. This previously called drop_duplicates() on `df`, a name that
# is never defined in the notebook, so the cell could not run on a fresh kernel.
mc_df = mc_df.drop_duplicates()
# Total_Fat2 is a duplicate column, so it is removed.
mc_df = mc_df.drop(columns='Total_Fat2')

NameError: name 'pd' is not defined

In [30]:
# Preview the first ten rows of the nutrition table.
mc_df.head(10)

Unnamed: 0,Calories,Total_Fat,Total_Carbs,Protein,Saturated_Fat,Dietary_Fiber,Calcium,Total_Sugars,Iron,Cholesterol,Vitamin_D,Potassium,Sodium
spicy-crispy-chicken-sandwich,530.0,26.0,48.0,27.0,4.0,2.0,30.0,9.0,2.5,65.0,0.0,440.0,1320.0
big-mac,550.0,30.0,45.0,25.0,11.0,3.0,120.0,9.0,4.5,80.0,0.0,380.0,1010.0
chicken-mcnuggets-4-piece,170.0,10.0,10.0,9.0,1.5,0.0,6.0,0.0,0.5,25.0,0.0,140.0,330.0
small-french-fries,230.0,11.0,31.0,3.0,1.5,3.0,10.0,0.0,0.5,0.0,0.0,470.0,190.0
quarter-pounder-with-cheese,520.0,26.0,42.0,30.0,12.0,2.0,190.0,10.0,4.0,95.0,0.0,420.0,1140.0
iced-coffee-small,140.0,5.0,24.0,2.0,3.5,0.0,40.0,22.0,0.0,20.0,0.0,140.0,50.0
egg-mcmuffin,310.0,13.0,30.0,17.0,6.0,2.0,170.0,3.0,3.0,250.0,2.0,200.0,770.0
sausage-burrito,310.0,17.0,25.0,13.0,7.0,1.0,140.0,2.0,2.5,170.0,0.0,170.0,800.0
strawberry-watermelon-slushie-small,190.0,0.0,50.0,0.0,0.0,0.0,6.0,49.0,0.0,0.0,0.0,15.0,25.0
tropical-mango-slushie-small,170.0,0.0,47.0,0.0,0.0,0.0,6.0,46.0,0.0,0.0,0.0,10.0,20.0


In [34]:
# Save the table to mc_nutrition.csv so the Selenium web scraping does not have to be
# re-run every time the notebook is used.
mc_df.to_csv('mc_nutrition.csv')

In [3]:
# Start here if mc_nutrition.csv has already been created; the first CSV column
# is used as the index.
mc_df = pd.read_csv('mc_nutrition.csv', index_col=0)

In [5]:
# Add extra columns that should be useful later when parsing through the data.

# Hand-made list of type codes, in mc_df row order. The website did not expose this
# information in an easily usable form, so it was entered manually.
item_type_list = [2,2,2,3,2,1,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,3,3,2,2,2,2,2,2,2,2,2,2,2,2,4,4,4,4,4,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,4,4,4,4,3,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]

# Type code -> readable label.
item_type_labels = {1: 'Drink', 2: 'Entree', 3: 'Side', 4: 'Dessert', 5: 'Meal'}

# Codes outside the table are skipped and contribute no label.
item_type_name_list = [
    item_type_labels[code] for code in item_type_list if code in item_type_labels
]
mc_df['Item_Type'] = item_type_name_list

# Same restaurant name for every entry in item_type_list.
mc_mcdonald = ['McDonalds'] * len(item_type_list)
mc_df['Restaurant_Name'] = mc_mcdonald




In [6]:
# Display the table with the new Item_Type and Restaurant_Name columns.
mc_df

Unnamed: 0,Calories,Total_Fat,Total_Carbs,Protein,Saturated_Fat,Dietary_Fiber,Calcium,Total_Sugars,Iron,Cholesterol,Vitamin_D,Potassium,Sodium,Item_Type,Restaurant_Name
spicy-crispy-chicken-sandwich,530.0,26.0,48.0,27.0,4.0,2.0,30.0,9.0,2.5,65.0,0.0,440.0,1320.0,Entree,McDonalds
big-mac,550.0,30.0,45.0,25.0,11.0,3.0,120.0,9.0,4.5,80.0,0.0,380.0,1010.0,Entree,McDonalds
chicken-mcnuggets-4-piece,170.0,10.0,10.0,9.0,1.5,0.0,6.0,0.0,0.5,25.0,0.0,140.0,330.0,Entree,McDonalds
small-french-fries,230.0,11.0,31.0,3.0,1.5,3.0,10.0,0.0,0.5,0.0,0.0,470.0,190.0,Side,McDonalds
quarter-pounder-with-cheese,520.0,26.0,42.0,30.0,12.0,2.0,190.0,10.0,4.0,95.0,0.0,420.0,1140.0,Entree,McDonalds
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
sausage-mcgriddles-meal,570.0,32.0,60.0,15.0,10.0,4.0,96.0,14.0,3.0,35.0,0.0,610.0,1320.0,Meal,McDonalds
sausage-burrito-meal,450.0,25.0,44.0,17.0,8.0,3.0,156.0,2.0,3.0,170.0,0.0,600.0,1130.0,Meal,McDonalds
hamburger-happy-meal,475.0,16.0,62.0,22.0,5.5,2.0,296.0,20.0,3.5,40.0,2.0,815.0,680.0,Meal,McDonalds
4-piece-chicken-mcnuggets-happy-meal,225.0,7.0,31.0,10.0,2.0,1.0,276.0,14.0,0.5,10.0,2.0,615.0,170.0,Meal,McDonalds


In [18]:
# Work with entrees only from here on, because the next step requires every price
# to be entered manually.
is_entree = mc_df['Item_Type'] == 'Entree'
entree_df = mc_df.where(is_entree).dropna().drop(columns=['Item_Type', 'Restaurant_Name'])

# One hand-entered flag per entree row, in row order; only rows flagged 1 are kept.
# Breakfast entrees are removed because the app does not show breakfast pricing
# outside of breakfast hours.
time_flags = [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
entree_df["Time"] = time_flags
kept_rows = entree_df['Time'] == 1
entree_df = entree_df.where(kept_rows).dropna()

# Hand-entered price for each remaining row, in row order.
prices = [6.09,5.99,2.49,5.99,7.39,7.39,3.09,7.59,1.99,3.19,1.79,7.09,6.09,7.09,2.69,5.89]
entree_df["Price"] = prices
entree_df

Unnamed: 0,Calories,Total_Fat,Total_Carbs,Protein,Saturated_Fat,Dietary_Fiber,Calcium,Total_Sugars,Iron,Cholesterol,Vitamin_D,Potassium,Sodium,Time,Price
spicy-crispy-chicken-sandwich,530.0,26.0,48.0,27.0,4.0,2.0,30.0,9.0,2.5,65.0,0.0,440.0,1320.0,1.0,6.09
big-mac,550.0,30.0,45.0,25.0,11.0,3.0,120.0,9.0,4.5,80.0,0.0,380.0,1010.0,1.0,5.99
chicken-mcnuggets-4-piece,170.0,10.0,10.0,9.0,1.5,0.0,6.0,0.0,0.5,25.0,0.0,140.0,330.0,1.0,2.49
quarter-pounder-with-cheese,520.0,26.0,42.0,30.0,12.0,2.0,190.0,10.0,4.0,95.0,0.0,420.0,1140.0,1.0,5.99
double-quarter-pounder-with-cheese,740.0,42.0,43.0,48.0,20.0,2.0,200.0,10.0,6.0,165.0,0.0,660.0,1360.0,1.0,7.39
deluxe-quarter-pounder-with-cheese,630.0,37.0,44.0,30.0,14.0,3.0,200.0,11.0,4.5,105.0,0.0,500.0,1210.0,1.0,7.39
mcdouble,400.0,20.0,33.0,22.0,9.0,2.0,100.0,7.0,3.5,70.0,0.0,330.0,920.0,1.0,3.09
bacon-quarter-pounder-with-cheese,630.0,35.0,43.0,36.0,15.0,3.0,190.0,10.0,4.5,115.0,0.0,490.0,1470.0,1.0,7.59
cheeseburger,300.0,13.0,32.0,15.0,6.0,2.0,100.0,7.0,3.0,40.0,0.0,220.0,720.0,1.0,1.99
double-cheeseburger,450.0,24.0,34.0,25.0,11.0,2.0,180.0,7.0,3.5,85.0,0.0,350.0,1120.0,1.0,3.19


In [19]:
# Write the entree table, including the Time and Price columns, to CSV.
entree_df.to_csv("mc_entree_with_price.csv")