In [105]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By

In [7]:

mc_url = "https://www.mcdonalds.com/us/en-us/full-menu.html"
mc_page = requests.get(mc_url)

In [55]:
# creating a BeautifulSoup object from the full menu in order to create a list of websites to scrape over for nutritional data

mc_soup = BeautifulSoup(mc_page.content, 'html.parser')
mc_items = mc_soup.find_all('a', attrs={'href': re.compile("^/us/en-us/product")})
mc_meals = mc_soup.find_all('a', attrs={'href': re.compile("^/us/en-us/meal")})

In [250]:
mc_main = 'https://www.mcdonalds.com'
# turning html text into full web pages
mc_links = []
mc_menu_item_names = []
for mc_item in mc_items:
    mc_link = mc_item.get('href')
    mc_links.append(mc_main + mc_link)
    mc_menu_item_names.append(mc_link)
for mc_meal in mc_meals:
    mc_link = mc_meal.get('href')
    mc_links.append(mc_main + mc_link)
    mc_menu_item_names.append(mc_link)

In [266]:
# cleaning up the item names for the dataframe later
mc_item_names = []
for item, name in enumerate(mc_menu_item_names):
    if mc_menu_item_names[item].startswith('/us/en-us/product/'):
        mc_item_names.append(mc_menu_item_names[item].removeprefix('/us/en-us/product/').removesuffix('.html'))
for item, name in enumerate(mc_menu_item_names):
    if mc_menu_item_names[item].startswith('/us/en-us/meal/'):
        mc_item_names.append(mc_menu_item_names[item].removeprefix('/us/en-us/meal/').removesuffix('.html'))
print(len(mc_item_names))
print(len(mc_links))

134
134


In [271]:
# getting into selenium webdriver because BeautifulSoup is unable to access hidden information. All nutritional data
# on the mcdonalds website is behind a button that will have to get pressed

driver = webdriver.Chrome('chromedriver')
mc_dict = {}
for (mc_link, mc_name) in zip(mc_links, mc_item_names):
    # going through each link and opening the nutrition summary button in order to then scrape the info
    driver.get(mc_link)
    nutrition_summary_button = driver.find_element(By.XPATH, "//button[normalize-space()='Nutrition Summary']")
    nutrition_summary_button.location_once_scrolled_into_view
    nutrition_summary_button.click()
    
    # openning BeautifulSoup now that nutrition summary is available
    
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'lxml')
    
    list_of_nutrition_values = []
    
    # the nutrition data is seperated into two sections, 
    # each for loop is nearly identical it is just slightly altered for the two classes
    nutrition_info = soup.find_all('li', class_='cmp-nutrition-summary__heading-primary-item')
    for info in nutrition_info:
        
        value = info.find('span', class_='sr-only sr-only-pd').text
        # turning the very long string into the important piece of data (grams, milligrams, etc.)
        numbers_list = re.findall(r"[-+]?(?:\d*\.\d+|\d+)", value)
        nutrition_data_number = 0
        list_of_nutrition_values.append(numbers_list[nutrition_data_number])
    
    nutrition_info = soup.find_all('li', class_='label-item')
    for info in nutrition_info:
        
        value = info.find('span', class_="sr-only").text
        numbers_list = re.findall(r"[-+]?(?:\d*\.\d+|\d+)", value)
        nutrition_data_number = 0
        list_of_nutrition_values.append(numbers_list[nutrition_data_number])
        
    mc_dict[mc_name] = list_of_nutrition_values

In [273]:
# double checking to make sure the process was done properly, especially the findall command

mc_dict['deluxe-crispy-chicken-sandwich-meal']

['1050',
 '41',
 '142',
 '32',
 '6',
 '6',
 '47',
 '41',
 '61',
 '3.5',
 '65',
 '0',
 '1160',
 '1410',
 '6',
 '6',
 '47',
 '41',
 '61',
 '3.5',
 '65',
 '0',
 '1160',
 '1410']

In [274]:
expected_len = 14
for item in mc_dict:
    mc_dict[item] = mc_dict[item][:expected_len]

In [275]:
for item in mc_dict:
    if len(mc_dict[item]) != expected_len:
        print(item)

mccaf-mocha-small
egg-mcmuffin-meal


In [276]:
# these links had issues in the process or do not have nutritional information on the mcdonalds website.
# below is me handling each one by hand using the mcdonalds website

mc_dict['egg-mcmuffin-meal'] = [450, 21, 49, 21, 7, 4, 186, 21, 3, 3.5, 250, 2, 630, 1100]
mc_dict.pop('mccaf-mocha-small')

['0', '0', '0', '0']

In [277]:
for item in mc_dict:
    print(len(mc_dict[item]))

14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14


In [278]:
# creating a dataframe from dict with list of column names

mc_column_names = ['Calories', 'Total_Fat', 'Total_Carbs', 'Protein', 'Saturated_Fat', 'Dietary_Fiber', 'Calcium', 'Total_Fat2', 'Total_Sugars', 'Iron', 'Cholesterol', 'Vitamin_D', 'Potassium', 'Sodium' ]
mc_df = pd.DataFrame(mc_dict).transpose()
mc_df.columns = mc_column_names
# removing duplicate column
del mc_df['Total_Fat2']

In [279]:
mc_df

Unnamed: 0,Calories,Total_Fat,Total_Carbs,Protein,Saturated_Fat,Dietary_Fiber,Calcium,Total_Sugars,Iron,Cholesterol,Vitamin_D,Potassium,Sodium
spicy-crispy-chicken-sandwich,530,26,48,27,4,2,30,9,2.5,65,0,440,1320
big-mac,550,30,45,25,11,3,120,9,4.5,80,0,380,1010
chicken-mcnuggets-4-piece,170,10,10,9,1.5,0,6,0,0.5,25,0,140,330
small-french-fries,230,11,31,3,1.5,3,10,0,0.5,0,0,470,190
quarter-pounder-with-cheese,520,26,42,30,12,2,190,10,4,95,0,420,1140
...,...,...,...,...,...,...,...,...,...,...,...,...,...
sausage-mcgriddles-meal,570,32,60,15,10,4,96,14,3.0,35,0,610,1320
sausage-burrito-meal,450,25,44,17,8,3,156,2,3.0,170,0,600,1130
hamburger-happy-meal,475,16,62,22,5.5,2,296,20,3.5,40,2,815,680
4-piece-chicken-mcnuggets-happy-meal,225,7,31,10,2.0,1,276,14,0.5,10,2,615,170


In [280]:
mc_df.to_csv('mc_nutrition.csv')