In [18]:
import requests
from bs4 import BeautifulSoup

import json
import uuid

import time
import random

# Global sets to store unique ingredients and categories.
# Both are filled as a side effect of scrape_recipe_details() and are dumped
# to all_ingredients.json / all_categories.json at the end of main().
all_ingredients = set()
all_categories = set()

## Extracts all recipe URLs from a given page. ##
def get_recipes_from_page(url):
    """Return the absolute URLs of the recipe links found on the page at *url*.

    A link counts as a recipe link when its ``href`` contains ``"/recipes/"``.
    Every href is resolved against *url*, so relative (``/recipes/...``) and
    protocol-relative (``//host/recipes/...``) links come back as complete
    URLs that can be passed straight to ``requests.get``. Duplicates are
    dropped; the order of first appearance is preserved.

    Args:
        url: Address of the listing page to download.

    Returns:
        A list of unique absolute URL strings (possibly empty).

    Raises:
        requests.RequestException: If the page cannot be downloaded
            (including a timeout after 30 seconds).
    """
    # Local import so this function does not depend on the file's import block.
    from urllib.parse import urljoin

    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    hrefs = (a['href'] for a in soup.find_all('a', href=True) if "/recipes/" in a['href'])
    absolute_links = (urljoin(url, href) for href in hrefs)

    # dict.fromkeys de-duplicates while keeping insertion order.
    return list(dict.fromkeys(absolute_links))


## Builds the URL of every alphabetical index page. ##
def generate_urls(base_url):
    """Expand *base_url* into one URL per alphabetical index page.

    The first occurrence of the placeholder ``"123"`` in *base_url* is
    replaced in turn by each letter from ``'a'`` to ``'w'`` and finally by
    ``"xyz"``. Everything after the placeholder is kept intact, even if it
    contains another ``"123"``.

    Args:
        base_url: URL containing the ``"123"`` placeholder.

    Returns:
        A list of 24 URL strings, ordered ``a`` .. ``w`` then ``xyz``.

    Raises:
        IndexError: If *base_url* does not contain ``"123"``.
    """
    # maxsplit=1 so a later "123" stays part of the suffix instead of being
    # treated as another split point and dropped.
    parts = base_url.split("123", 1)
    prefix, suffix = parts[0], parts[1]

    page_keys = [chr(code) for code in range(ord('a'), ord('w') + 1)]  # 'a' to 'w'
    page_keys.append("xyz")  # x, y and z share a single index page
    return [prefix + key + suffix for key in page_keys]


# Returns the stripped text of a parsed element, or a fallback when it is absent.
def _text_or_default(element, default):
    return element.get_text(strip=True) if element is not None else default


# Returns the text of the <span> that follows a label element, or a fallback.
def _next_span_text(label, default):
    if label is None:
        return default
    return _text_or_default(label.find_next_sibling('span'), default)


## Scrapes detailed information about a recipe from its page. ##
def scrape_recipe_details(url):
    """Download one recipe page and return its details as a plain dict.

    The function sleeps for a random 5-10 seconds before the request, fetches
    the page once, and extracts the fields below. Any field whose element is
    missing from the page gets a ``'... not found'`` placeholder (or an empty
    list) instead of raising. As a side effect, the recipe's ingredients and
    categories are added to the module-level ``all_ingredients`` and
    ``all_categories`` sets.

    Args:
        url: Address of the recipe page. A protocol-relative address
            (``//host/path``) is fetched over https.

    Returns:
        A JSON-serializable dict with the keys ``'title'``, ``'level'``,
        ``'yield'``, ``'cooking time'``, ``'ingredients'``, ``'steps'``,
        ``'cook note'`` and ``'categories'``. All values are strings or
        lists of strings.

    Raises:
        requests.RequestException: If the page cannot be downloaded, times
            out after 30 seconds, or answers with an HTTP error status.
    """
    time.sleep(random.uniform(5.0, 10.0))  # Random delay between requests

    # requests rejects scheme-less URLs, and links taken verbatim from a page
    # may be protocol-relative.
    if url.startswith('//'):
        url = 'https:' + url

    # Single fetch; fail loudly on an error status rather than continuing
    # with undefined fields.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # Parse the HTML content of the page with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    title = _text_or_default(
        soup.find('span', class_="o-AssetTitle__a-HeadlineText"),
        'Title not found',
    )
    level = _next_span_text(
        soup.find('span', class_='o-RecipeInfo__a-Headline'),
        'Level not found',
    )
    servings = _next_span_text(
        soup.find('span', string='Yield:'),
        'Servings not found',
    )
    cooking_time = _text_or_default(
        soup.find('span', class_="o-RecipeInfo__a-Description m-RecipeInfo__a-Description--Total"),
        'Cooking time not found',
    )

    # Store ingredient text, not Tag objects: Tags cannot be written by
    # json.dump. The "Deselect All" checkbox label is not an ingredient.
    ingredient_texts = (
        tag.get_text(strip=True)
        for tag in soup.find_all('span', class_="o-Ingredients__a-Ingredient--CheckboxLabel")
    )
    ingredients_list = [text for text in ingredient_texts if text and text != 'Deselect All']

    directions = [
        step.get_text(strip=True)
        for step in soup.find_all('li', class_='o-Method__m-Step')
    ]
    cook_note = _text_or_default(
        soup.find('p', class_='o-ChefNotes__a-Description'),
        'Cook\'s note not found',
    )

    # Extract the categories
    categories = [
        tag.get_text(strip=True)
        for tag in soup.find_all('a', class_='o-Capsule__a-Tag a-Tag')
    ]

    recipe_info = {
        'title': title,
        'level': level,
        'yield': servings,
        'cooking time': cooking_time,
        'ingredients': ingredients_list,
        'steps': directions,
        'cook note': cook_note,
        'categories': categories
    }

    all_ingredients.update(recipe_info['ingredients'])  ## TODO: remove the measurments before adding to ingredients
    all_categories.update(recipe_info['categories'])

    return recipe_info


## Writes data to a JSON file. ##
def write_json(data, filename):
    """Serialize *data* as JSON, indented by 4 spaces, into *filename*.

    The file is opened for writing as UTF-8 text, replacing any existing
    content, and the encoded JSON is streamed into it chunk by chunk.
    """
    encoder = json.JSONEncoder(indent=4)
    with open(filename, mode='w', encoding='utf-8') as out_file:
        out_file.writelines(encoder.iterencode(data))


def main():
    """Crawl every alphabetical index page and save each recipe to disk.

    For each index URL produced by ``generate_urls``, the recipe links are
    collected with ``get_recipes_from_page`` and each recipe is scraped with
    ``scrape_recipe_details``. Every recipe is written to ``<uuid>.json``.
    After the crawl, three summary files are written: ``recipe_index.json``
    (uuid -> title), ``all_ingredients.json`` and ``all_categories.json``.

    A page or recipe whose download fails is reported and skipped, so one
    bad request does not discard the results collected so far.
    """
    base_url = "https://www.foodnetwork.com/recipes/food-network-kitchen/123"  # Starting with '123'
    urls = generate_urls(base_url)
    recipe_index = {}

    for url in urls:
        print(f"Processing {url}...")

        # Extract recipes from current page and scrape them.
        # get_recipes_from_page() downloads the page itself, so it is not
        # fetched separately here.
        try:
            recipes = get_recipes_from_page(url)
        except requests.RequestException as exc:
            print(f"Skipping {url}: {exc}")
            continue
        print(f"Found {len(recipes)} recipes.")

        # Iterate over each URL to access and extract information
        for recipe in recipes:
            try:
                details = scrape_recipe_details(recipe)
            except requests.RequestException as exc:
                print(f"Skipping {recipe}: {exc}")
                continue
            recipe_id = str(uuid.uuid4())
            recipe_index[recipe_id] = details['title']
            write_json(details, f"{recipe_id}.json")
        print(f"{url} has been processed.")

    print("All URLs have been processed.")

    # Writing the index file and ingredient/category accumulations.
    # Sets have no stable order, so sort them to keep the output files
    # reproducible between runs.
    write_json(recipe_index, "recipe_index.json")
    write_json(sorted(all_ingredients), "all_ingredients.json")
    write_json(sorted(all_categories), "all_categories.json")

# main() TODO: UNCOMMENT TO RUN

Title: 3-Ingredient Bundt Cake
Level: Easy
Yield: 8 to 10 servings
Cooking Time: 1 hr 40 min
Ingredients:
- Deselect All
- Nonstick cooking spray, for the pan
- One 15.25-ounce box cake mix (any flavor)
- 1 pint high-quality ice cream (any flavor), completely melted
- 3 large eggs
Directions:
Step 1: Preheat the oven to 350 degrees F. Thoroughly spray a 12-cup Bundt pan with the cooking spray, making sure to cover the entire inner surface.
Step 2: Whisk together the cake mix, ice cream and eggs in a large bowl until well combined, then pour into the prepared Bundt pan. Bake until a cake tester inserted in middle of cake ring comes out clean and the sides of the cake are beginning to pull away from the edge of the pan, 35 to 40 minutes. Cool the cake in the pan for 20 minutes, then place a wire rack over the pan and invert the cake onto the rack. Cool completely.
Cook's Note: Cook's note not found
Categories: Bundt Cake, Dairy Recipes, Dessert
