In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Root of the recipe category listing; this is the first page the notebook fetches.
base_url = "https://www.allrecipes.com/recipes/"

In [3]:
# One shared requests.Session, reused for every HTTP request made in this notebook.
session = requests.Session()

In [4]:
# Fetch the landing page once as a smoke test before crawling.
response = session.get(base_url)
# Fail loudly on an HTTP error instead of parsing an error page as if it were content.
response.raise_for_status()
# Name the parser explicitly so the parse tree does not depend on which optional
# parsers happen to be installed (and to avoid bs4's "no parser specified" warning).
soup = BeautifulSoup(response.text, "html.parser")
soup.title

<title>Recipes</title>

## Using a web crawler to get Recipe Links

In [5]:
# Crawler state:
#   stack       - category pages still waiting to be fetched; seeded with the root page
#   recipes     - recipe entries found so far
#   visited     - category entries already handled
#   visited_ids - ids of the entries in `visited`
stack = [
    {"id": 0, "name": "All Recipes", "url": base_url},
]
recipes = dict()
visited, visited_ids = [], []

In [6]:
def scrape_recipe_list(max_recipes=1000):
    """Crawl category pages and collect recipe links into ``recipes``.

    Entries are popped from the module-level ``stack`` (last in, first out),
    fetched through ``session``, and every anchor on the page is inspected:

    * a link under ``base_url`` (other than ``base_url`` itself) is pushed on
      ``stack`` as another category page, unless its id was already visited;
    * a link under ``https://www.allrecipes.com/recipe/`` is stored in
      ``recipes`` keyed by its numeric id.

    The numeric id is the path segment right after ``recipes/`` or ``recipe/``.
    Links whose segment is not an integer are skipped.

    The crawl stops when ``stack`` is empty or when ``recipes`` holds more
    than ``max_recipes`` entries.

    Args:
        max_recipes: Keep crawling while ``len(recipes) <= max_recipes``.

    Returns:
        None. ``stack``, ``recipes``, ``visited`` and ``visited_ids`` are
        updated in place, and progress is printed for every page fetched.
    """
    recipe_prefix = "https://www.allrecipes.com/recipe/"
    # Set mirror of visited_ids for O(1) membership tests; the list itself is
    # still appended to so code reading it keeps working.
    seen_ids = set(visited_ids)

    # `while stack` guards against IndexError when the crawl runs out of pages.
    while stack and len(recipes) <= max_recipes:
        top = stack.pop()
        if top["id"] in seen_ids:
            # The same category can be pushed several times before its first
            # visit; skip the leftovers instead of fetching the page again.
            continue
        seen_ids.add(top["id"])
        visited_ids.append(top["id"])
        visited.append(top)
        print("Parsing", top["url"])
        response = session.get(top["url"])
        # Explicit parser: results no longer depend on which parsers are installed.
        soup = BeautifulSoup(response.text, "html.parser")
        # href=True skips anchors without an href, for which .get("href") is None.
        for a in soup.find_all("a", href=True):
            link = a["href"]
            is_category = link.startswith(base_url) and link != base_url
            is_recipe = not is_category and link.startswith(recipe_prefix)
            if not (is_category or is_recipe):
                continue
            try:
                link_id = int(link.split("/")[4])
            except (IndexError, ValueError):
                # Not a numeric id (e.g. a query string or a named page): ignore.
                continue
            if is_category:
                if link_id not in seen_ids:
                    stack.append({"id": link_id, "name": a.text.strip(), "url": link})
            else:
                recipes[link_id] = {"name": a.text.strip(), "url": link}
        print("Visited:{}\tRecipe Lists: {}\tRecipes: {}".format(len(visited), len(stack), len(recipes)), end="\n\n\n")

In [7]:
# Run the crawl; one progress block is printed per category page fetched.
scrape_recipe_list()

Parsing https://www.allrecipes.com/recipes/
Visited:1	Recipe Lists: 127	Recipes: 2


Parsing https://www.allrecipes.com/recipes/1642/everyday-cooking/
Visited:2	Recipe Lists: 274	Recipes: 3


Parsing https://www.allrecipes.com/recipes/86/world-cuisine/
Visited:3	Recipe Lists: 406	Recipes: 11


Parsing https://www.allrecipes.com/recipes/85/holidays-and-events/
Visited:4	Recipe Lists: 547	Recipes: 25


Parsing https://www.allrecipes.com/recipes/17567/ingredients/
Visited:5	Recipe Lists: 676	Recipes: 28


Parsing https://www.allrecipes.com/recipes/17562/dinner/
Visited:6	Recipe Lists: 791	Recipes: 81


Parsing https://www.allrecipes.com/recipes/96/salad/
Visited:7	Recipe Lists: 915	Recipes: 140


Parsing https://www.allrecipes.com/recipes/95/pasta-and-noodles/
Visited:8	Recipe Lists: 1018	Recipes: 156


Parsing https://www.allrecipes.com/recipes/94/soups-stews-and-chili/
Visited:9	Recipe Lists: 1119	Recipes: 194


Parsing https://www.allrecipes.com/recipes/93/seafood/
Visited:10	Recipe Li

In [8]:
# Inspect the collected mapping: recipe id -> {"name", "url"}.
recipes

{20144: {'name': 'Banana Banana Bread\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n16,328\n\nRatings',
  'url': 'https://www.allrecipes.com/recipe/20144/banana-banana-bread/'},
 245686: {'name': 'Spicy Lemon Ginger Switchel\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n9\n\nRatings',
  'url': 'https://www.allrecipes.com/recipe/245686/spicy-lemon-ginger-switchel/'},
 270939: {'name': 'Air Fryer Chicken Nuggets\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n12\n\nRatings',
  'url': 'https://www.allrecipes.com/recipe/270939/air-fryer-chicken-nuggets/'},
 238510: {'name': 'Homemade Arepas\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n30 mins',
  'url': 'https://www.allrecipes.com/recipe/238510/homemade-arepas/'},
 8536477: {'name': 'Torta Caprese (Italian Flourless Chocolate Torte)\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n9\n\nRatings',
  'url': 'https://www.allrecipes.com/recipe/8536477/italian-flourless-chocolate-torte-torta-caprese/'},
 8532800: {'name': 'Spotted Dick\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n4

In [9]:
# Flatten the {id: fields} mapping into one row per recipe, write it to JSON,
# and preview the first rows.
recipes_df = pd.DataFrame(
    [
        {"id": recipe_id, **fields}
        for recipe_id, fields in recipes.items()
    ]
)
recipes_df.to_json("recipes1000.json")
recipes_df.head()

Unnamed: 0,id,name,url
0,20144,Banana Banana Bread\n\n\n\n\n\n\n\n\n\n\n\n\n\...,https://www.allrecipes.com/recipe/20144/banana...
1,245686,Spicy Lemon Ginger Switchel\n\n\n\n\n\n\n\n\n\...,https://www.allrecipes.com/recipe/245686/spicy...
2,270939,Air Fryer Chicken Nuggets\n\n\n\n\n\n\n\n\n\n\...,https://www.allrecipes.com/recipe/270939/air-f...
3,238510,Homemade Arepas\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,https://www.allrecipes.com/recipe/238510/homem...
4,8536477,Torta Caprese (Italian Flourless Chocolate Tor...,https://www.allrecipes.com/recipe/8536477/ital...


## Scraping a Recipe

In [10]:
# Fetch a single recipe page to explore its markup in the cells below.
# The parser is named explicitly so the parse tree does not depend on which
# optional parsers happen to be installed.
soup = BeautifulSoup(
    session.get("https://www.allrecipes.com/recipe/236432/brown-sugar-ham-steak/").text,
    "html.parser",
)
soup.title

<title>Brown Sugar Ham Steak Recipe</title>

### Recipe Title

In [11]:
# The recipe title is the text of the <h1> carrying the article-heading classes.
recipe_title = soup.find("h1", attrs={"class": "article-heading type--lion"}).get_text()
recipe_title

'Brown Sugar Ham Steak'

### Recipe Subheading

In [12]:
# The subheading is the text of the <p> with the article-subheading class.
recipe_subheading = soup.find("p", attrs={"class": "article-subheading"}).get_text()
recipe_subheading

'A sweet ham steak recipe. Pairs great with roasted potatoes and green beans.'

### Recipe Details

In [13]:
# Every details item holds a label div and a value div; build a
# {label: value} mapping with colons removed from the label.
detail_list = soup.find("div", attrs={"class": "mntl-recipe-details__content"})
details = {
    item.find("div", attrs={"class": "mntl-recipe-details__label"}).get_text().strip().replace(":", ""):
        item.find("div", attrs={"class": "mntl-recipe-details__value"}).get_text().strip()
    for item in detail_list.find_all("div", attrs={"class": "mntl-recipe-details__item"})
}
details

{'Prep Time': '5 mins',
 'Cook Time': '20 mins',
 'Total Time': '25 mins',
 'Servings': '2'}

### Ingredients

In [14]:
# One ingredient per <li> of the structured-ingredients list, whitespace-trimmed.
ul = soup.find("ul", attrs={"class": "mntl-structured-ingredients__list"})
ingredients = [li.get_text().strip() for li in ul.find_all("li")]
ingredients

['1 (8 ounce) bone-in fully cooked ham steak',
 '5 tablespoons butter, cubed',
 '5 tablespoons brown sugar']

### Directions

In [15]:
# All text inside the steps container, with surrounding whitespace trimmed.
directions = soup.find("div", attrs={"class": "recipe__steps-content"}).get_text().strip()
directions

'Cook ham steak in a large skillet over medium heat until browned, 3 to 4 minutes per side.\n\n\n\n\n Remove ham from skillet; drain off any fat.\n\n\n\n\n Melt butter in the same skillet over medium-low heat. Stir in brown sugar.\n\n\n\n\n Return ham to skillet. Cook, turning ham often, until heated through and brown sugar has dissolved, about 10 minutes. Reduce heat if brown sugar/butter mixture starts to pop or splatter.\n\n \n\n\n\n\n\n \n Zansheree Knight'

### Scrape Recipe function

In [16]:
def _text_of(tag, strip=False):
    """Return the text of ``tag`` (optionally stripped), or None when ``tag`` is None."""
    if tag is None:
        return None
    text = tag.get_text()
    return text.strip() if strip else text


def scrape_recipe(recipe_url):
    """Download one recipe page and extract its fields into a dict.

    Args:
        recipe_url: URL of the recipe page, fetched through the module-level
            ``session``.

    Returns:
        A dict with, in this order:

        * ``"title"`` - text of the ``h1.article-heading`` element
        * ``"sub_heading"`` - text of the ``p.article-subheading`` element
        * one key per recipe-details item, named after its label with colons
          removed, holding the stripped value text
        * ``"ingredients"`` - list with the stripped text of every ``<li>`` in
          the structured-ingredients list
        * ``"directions"`` - stripped text of the steps container

        A section missing from the page yields ``None`` (or an empty list
        for ``"ingredients"``) instead of raising.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    response = session.get(recipe_url)
    # Surface HTTP failures explicitly rather than returning a record of Nones.
    response.raise_for_status()
    # Explicit parser: results no longer depend on which parsers are installed.
    soup = BeautifulSoup(response.text, "html.parser")
    recipe_data = {}

    recipe_data["title"] = _text_of(soup.find("h1", {"class": "article-heading type--lion"}))

    recipe_data["sub_heading"] = _text_of(soup.find("p", {"class": "article-subheading"}))

    detail_list = soup.find("div", {"class": "mntl-recipe-details__content"})
    detail_items = (
        detail_list.find_all("div", {"class": "mntl-recipe-details__item"})
        if detail_list is not None
        else []
    )
    for item in detail_items:
        label = _text_of(item.find("div", {"class": "mntl-recipe-details__label"}), strip=True)
        if label is None:
            # Without a label there is no key to store the value under.
            continue
        value = _text_of(item.find("div", {"class": "mntl-recipe-details__value"}), strip=True)
        recipe_data[label.replace(":", "")] = value

    ul = soup.find("ul", {"class": "mntl-structured-ingredients__list"})
    recipe_data["ingredients"] = (
        [li.get_text().strip() for li in ul.find_all("li")] if ul is not None else []
    )

    recipe_data["directions"] = _text_of(
        soup.find("div", {"class": "recipe__steps-content"}), strip=True
    )

    return recipe_data

In [17]:
# Try the scraper on a different recipe page than the one explored above.
scrape_recipe("https://www.allrecipes.com/recipe/9011/simple-lemon-herb-chicken/")

{'title': 'Simple Lemon Herb Chicken',
 'sub_heading': 'This lemon-herb chicken is a simple, quick, and delicious dish. All you need are a few herbs, a lemon, and of course, the chicken! The amount of spices is completely up to you. You can add more or less according to your taste.',
 'Prep Time': '5 mins',
 'Cook Time': '10 mins',
 'Total Time': '15 mins',
 'Servings': '2',
 'ingredients': ['2 (5 ounce) skinless, boneless chicken breast halves',
  '1 medium lemon, juiced, divided',
  'salt and freshly ground black pepper to taste',
  '1 tablespoon olive oil',
  '1 pinch dried oregano',
  '2 sprigs fresh parsley, chopped, for garnish'],
 'directions': "Place chicken in a bowl; pour 1/2 of the lemon juice over chicken and season with salt.\n\n\n\n\n Heat olive oil in a medium skillet over medium-low heat. Place chicken into hot oil. Add remaining lemon juice and oregano; season with black pepper. Cook chicken until golden brown and the juices run clear, 5 to 10 minutes per side. An inst

In [18]:
# Scrape every recipe found by the crawler and attach its id and URL to the record.
# The loop variable is `recipe_id` rather than `id`: at notebook top level a
# variable named `id` would overwrite the builtin id() for the rest of the session.
parsed_recipes = []
for position, (recipe_id, recipe) in enumerate(recipes.items(), start=1):
    print("Parsing {}/{}: {}".format(position, len(recipes), recipe["url"]))
    recipe_data = scrape_recipe(recipe["url"])
    recipe_data["id"] = recipe_id
    recipe_data["url"] = recipe["url"]
    parsed_recipes.append(recipe_data)

Parsing 1/1015: https://www.allrecipes.com/recipe/20144/banana-banana-bread/
Parsing 2/1015: https://www.allrecipes.com/recipe/245686/spicy-lemon-ginger-switchel/
Parsing 3/1015: https://www.allrecipes.com/recipe/270939/air-fryer-chicken-nuggets/
Parsing 4/1015: https://www.allrecipes.com/recipe/238510/homemade-arepas/
Parsing 5/1015: https://www.allrecipes.com/recipe/8536477/italian-flourless-chocolate-torte-torta-caprese/
Parsing 6/1015: https://www.allrecipes.com/recipe/8532800/spotted-dick/
Parsing 7/1015: https://www.allrecipes.com/recipe/8516210/chicken-adovada/
Parsing 8/1015: https://www.allrecipes.com/recipe/158799/stout-braised-lamb-shanks/
Parsing 9/1015: https://www.allrecipes.com/recipe/8509102/chicken-al-pastor/
Parsing 10/1015: https://www.allrecipes.com/recipe/233531/quick-whole-wheat-chapati/
Parsing 11/1015: https://www.allrecipes.com/recipe/215231/empanadas-beef-turnovers/
Parsing 12/1015: https://www.allrecipes.com/recipe/9426/panettone-loaves/
Parsing 13/1015: http

In [19]:
# Inspect the scraped records (a list with one dict per recipe).
parsed_recipes

[{'title': 'Banana Banana Bread',
  'sub_heading': "This banana bread recipe creates the most delicious, moist loaf with loads of banana flavor. Why compromise the banana flavor? Friends and family love my recipe and say it's by far the best! It tastes wonderful toasted. Enjoy!",
  'Prep Time': '15 mins',
  'Cook Time': '1 hr',
  'Total Time': '1 hr 15 mins',
  'Servings': '12',
  'Yield': '1 (9x5-inch) loaf',
  'ingredients': ['2 cups all-purpose flour',
   '1 teaspoon baking soda',
   '¼ teaspoon salt',
   '¾ cup brown sugar',
   '½ cup butter',
   '2 large eggs, beaten',
   '2 ⅓ cups mashed overripe bananas'],
  'directions': 'Preheat the oven to 350 degrees F (175 degrees C). Lightly grease a 9x5-inch loaf pan.\n\n\n\n\n Combine flour, baking soda, and salt in a large bowl. Beat brown sugar and butter with an electric mixer in a separate large bowl until smooth. Stir in eggs and mashed bananas until well blended. Stir banana mixture into flour mixture until just combined. Pour batt

In [20]:
# Build one row per scraped record; the column set is the union of all record keys.
parsed_recipes_df = pd.DataFrame(parsed_recipes)
# Write the table to a JSON file in the working directory.
parsed_recipes_df.to_json("parsed_recipes.json")
parsed_recipes_df.head()

Unnamed: 0,title,sub_heading,Prep Time,Cook Time,Total Time,Servings,Yield,ingredients,directions,id,...,Stand Time,Additional Time,Rest Time,Chill Time,Release Pressure Time,Soak Time,Rise Time,Freeze Time,Fry Time,Grill Time
0,Banana Banana Bread,This banana bread recipe creates the most deli...,15 mins,1 hr,1 hr 15 mins,12,1 (9x5-inch) loaf,"[2 cups all-purpose flour, 1 teaspoon baking s...",Preheat the oven to 350 degrees F (175 degrees...,20144,...,,,,,,,,,,
1,Spicy Lemon Ginger Switchel,Switchel or haymaker's punch is an old-timey f...,10 mins,,10 mins,1,,"[1 cup water, 2 tablespoons unfiltered apple c...","Place water, apple cider vinegar, lemon juice,...",245686,...,,,,,,,,,,
2,Air Fryer Chicken Nuggets,These chicken nuggets come out perfectly crisp...,15 mins,15 mins,30 mins,8,,"[1 cup buttermilk, 2 pounds chicken tenderloin...",Mix buttermilk and chicken in a large bowl; se...,270939,...,,,,,,,,,,
3,Homemade Arepas,I first tasted arepas at a New York City hole-...,10 mins,20 mins,30 mins,8,8 arepas,"[2 ½ cups lukewarm water, 1 teaspoon salt, 2 c...",Stir water and salt together in a medium bowl;...,238510,...,,,,,,,,,,
4,Torta Caprese (Italian Flourless Chocolate Torte),This Italian flourless chocolate torte has a l...,20 mins,,2 hrs 15 mins,10,,"[6 ounces dark chocolate (70% cacao), chopped ...",Preheat the oven to 350 degrees F (175 degrees...,8536477,...,,,,,,,,,,


In [21]:
# Column overview: the non-null counts show how many recipes carry each field.
parsed_recipes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1015 entries, 0 to 1014
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   title                  1015 non-null   object
 1   sub_heading            1015 non-null   object
 2   Prep Time              993 non-null    object
 3   Cook Time              715 non-null    object
 4   Total Time             1008 non-null   object
 5   Servings               1011 non-null   object
 6   Yield                  519 non-null    object
 7   ingredients            1015 non-null   object
 8   directions             1015 non-null   object
 9   id                     1015 non-null   int64 
 10  url                    1015 non-null   object
 11  Bake Time              12 non-null     object
 12  Cool Time              15 non-null     object
 13  Active Time            6 non-null      object
 14  Marinate Time          4 non-null      object
 15  Stand Time           