In [5]:
import os
import requests
from bs4 import BeautifulSoup
from litellm import completion

def extract_recipe_text(url):
    """Basic extraction: grabs all paragraphs. For real use, tune this per site."""
    headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"}
    response = requests.get(url, headers=headers)
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Try to extract only the recipe instructions
    # Fallback: get all text
    instructions = []
    for tag in soup.find_all(['ol', 'ul']):
        if 'instruction' in tag.get('class', []) or 'directions' in tag.get('class', []):
            for li in tag.find_all('li'):
                instructions.append(li.get_text(strip=True))
    if not instructions:
        # fallback: all paragraphs
        instructions = [p.get_text(strip=True) for p in soup.find_all('p')]
    return '\n'.join(instructions[:20])  # limit for token efficiency

def get_recipe_steps_json(url, openai_model="gpt-4o"):
    recipe_text = extract_recipe_text(url)
    prompt = f"""You are a helpful assistant. Given the following cooking recipe instructions, break them down into a JSON array of numbered steps, where each step is an object with 'step_number', 'instruction', and 'ingredients' (list) used in that step. Be as precise as possible. Here is the recipe:\n\n{recipe_text}\n\nReturn only the JSON."""
    messages = [{"role": "user", "content": prompt}]
    response = completion(
        model=openai_model,
        messages=messages,
        max_tokens=800,
        temperature=0.2
    )
    # Extract only the content
    content = response['choices'][0]['message']['content']
    return content



In [6]:
# Example usage:
'''if __name__ == "__main__":
os.environ["OPENAI_API_KEY"] = "sk-your-openai-key-here"
    url = "https://meaningfuleats.com/gluten-free-cheesecake/"
    steps_json = get_recipe_steps_json(url)
    print(steps_json)
    '''

url = "https://meaningfuleats.com/gluten-free-cheesecake/"

temp = extract_recipe_text(url)

In [7]:
temp

'This website is using a security service to protect itself from online attacks. The action you just performed triggered the security solution. There are several actions that could trigger this block including submitting a certain word or phrase, a SQL command or malformed data.\nYou can email the site owner to let them know you were blocked. Please include what you were doing when this page came up and the Cloudflare Ray ID found at the bottom of this page.\nCloudflare Ray ID:94379f449f26cb7a•Your IP:Click to reveal2607:fb91:3c7:9202:ec34:401e:9501:b11f•Performance & security byCloudflare'

In [20]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time

def extract_recipe_text_selenium(url):
    # Set up headless Chrome
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run headless
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--log-level=3")
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36")

    driver = webdriver.Chrome(options=chrome_options)  # You can specify executable_path if needed
    driver.get(url)
    time.sleep(3)  # Give time for page to load, increase if needed

    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')

    # Try to find instructions - you might want to adjust this selector for specific sites
    instructions = []
    for tag in soup.find_all(['ol', 'ul']):
        if 'instruction' in (tag.get('class') or []) or 'directions' in (tag.get('class') or []):
            for li in tag.find_all('li'):
                instructions.append(li.get_text(strip=True))
    if not instructions:
        # Fallback: just get all <li> in the first <ol> or <ul>
        first_list = soup.find(['ol', 'ul'])
        if first_list:
            instructions = [li.get_text(strip=True) for li in first_list.find_all('li')]
    driver.quit()
    return soup
#, "\n".join(instructions[:20])  # Adjust the slice as needed

# Example use:
if __name__ == "__main__":
    url = "https://meaningfuleats.com/gluten-free-cheesecake/"
    soup = extract_recipe_text_selenium(url)
    #print(text)

In [21]:
soup

<html lang="en-US"><head><script async="" src="https://imasdk.googleapis.com/js/sdkloader/ima3.js" type="text/javascript"></script><script async="" src="https://collector.brandmetrics.com/c.js?siteid=f9816ecc-b51b-4747-bc3e-1ea86a0677a2&amp;toploc=meaningfuleats.com&amp;rnd=7007767" type="text/javascript"></script><script src="https://cdn.brandmetrics.com/tag/aa466d868b2742ffa2cc31bb6341dc12/cafemedia.js" type="text/javascript"></script><script async="" src="https://s0.2mdn.net/instream/video/client.js" type="text/javascript"></script><script async="" src="https://s0.2mdn.net/instream/video/client.js" type="text/javascript"></script><script async="" defer="" src="https://launchpad.privacymanager.io/latest/launchpad.bundle.js"></script><script async="" src="//cdn.confiant-integrations.net/gptprebidnative/202504140911/wrap.js"></script><script async="" src="https://imasdk.googleapis.com/js/sdkloader/ima3.js" type="text/javascript"></script><script async="" src="https://imasdk.googleapis.

In [13]:
soup = BeautifulSoup(ht, 'html.parser')

In [24]:
soup

<html lang="en-US"><head><script async="" src="https://imasdk.googleapis.com/js/sdkloader/ima3.js" type="text/javascript"></script><script async="" src="https://collector.brandmetrics.com/c.js?siteid=f9816ecc-b51b-4747-bc3e-1ea86a0677a2&amp;toploc=meaningfuleats.com&amp;rnd=7007767" type="text/javascript"></script><script src="https://cdn.brandmetrics.com/tag/aa466d868b2742ffa2cc31bb6341dc12/cafemedia.js" type="text/javascript"></script><script async="" src="https://s0.2mdn.net/instream/video/client.js" type="text/javascript"></script><script async="" src="https://s0.2mdn.net/instream/video/client.js" type="text/javascript"></script><script async="" defer="" src="https://launchpad.privacymanager.io/latest/launchpad.bundle.js"></script><script async="" src="//cdn.confiant-integrations.net/gptprebidnative/202504140911/wrap.js"></script><script async="" src="https://imasdk.googleapis.com/js/sdkloader/ima3.js" type="text/javascript"></script><script async="" src="https://imasdk.googleapis.

In [27]:
import json
from bs4 import BeautifulSoup

def extract_recipe_jsonld(soup):
    #soup = BeautifulSoup(html, "html.parser")
      # Find all <script type="application/ld+json">
    scripts = soup.find_all("script", type="application/ld+json")
    for script in scripts:
        try:
            data = json.loads(script.string)
            # Check if it's a recipe (sometimes it's a list or dict)
            if isinstance(data, list):
                for entry in data:
                    if entry.get("@type") == "Recipe":
                        return entry
            elif isinstance(data, dict):
                if data.get("@type") == "Recipe":
                    return data
        except Exception as e:
            continue
    return None

# Usage example:
# html = ... # full HTML source of the page
recipe_data = extract_recipe_jsonld(soup)
if recipe_data:
    ingredients = recipe_data.get("recipeIngredient", [])
    instructions = [step["text"] for step in recipe_data["recipeInstructions"]]
    print("Ingredients:", ingredients)
    print("\nInstructions:")
    for i, step in enumerate(instructions, 1):
        print(f"{i}. {step}")

In [29]:
z

In [30]:
scripts

[<script class="yoast-schema-graph" type="application/ld+json">{"@context":"https://schema.org","@graph":[{"@type":"Article","@id":"https://meaningfuleats.com/gluten-free-cheesecake/#article","isPartOf":{"@id":"https://meaningfuleats.com/gluten-free-cheesecake/"},"author":{"name":"Erin Collins","@id":"https://meaningfuleats.com/#/schema/person/e270f0a2b3b28e086a876ffa94f8b8ed"},"headline":"Gluten-Free Vanilla Bean Cheesecake","datePublished":"2023-11-15T20:52:54+00:00","dateModified":"2023-11-15T20:52:58+00:00","wordCount":1805,"commentCount":64,"publisher":{"@id":"https://meaningfuleats.com/#organization"},"image":{"@id":"https://meaningfuleats.com/gluten-free-cheesecake/#primaryimage"},"thumbnailUrl":"https://meaningfuleats.com/wp-content/uploads/2022/06/gluten-free-cheesecake-recipe-1.jpg","articleSection":["Gluten-Free Desserts","Gluten-Free Holiday Recipes","Gluten-Free Thanksgiving Recipes","Popular"],"inLanguage":"en-US","potentialAction":[{"@type":"CommentAction","name":"Commen

In [34]:
from bs4 import BeautifulSoup
import json

#soup = BeautifulSoup(html, "html.parser")
script = soup.find("script", class_="yoast-schema-graph", type="application/ld+json")
data = json.loads(script.string)
print(json.dumps(data, indent=2))  # Pretty print for inspection

{
  "@context": "https://schema.org",
  "@graph": [
    {
      "@type": "Article",
      "@id": "https://meaningfuleats.com/gluten-free-cheesecake/#article",
      "isPartOf": {
        "@id": "https://meaningfuleats.com/gluten-free-cheesecake/"
      },
      "author": {
        "name": "Erin Collins",
        "@id": "https://meaningfuleats.com/#/schema/person/e270f0a2b3b28e086a876ffa94f8b8ed"
      },
      "headline": "Gluten-Free Vanilla Bean Cheesecake",
      "datePublished": "2023-11-15T20:52:54+00:00",
      "dateModified": "2023-11-15T20:52:58+00:00",
      "wordCount": 1805,
      "commentCount": 64,
      "publisher": {
        "@id": "https://meaningfuleats.com/#organization"
      },
      "image": {
        "@id": "https://meaningfuleats.com/gluten-free-cheesecake/#primaryimage"
      },
      "thumbnailUrl": "https://meaningfuleats.com/wp-content/uploads/2022/06/gluten-free-cheesecake-recipe-1.jpg",
      "articleSection": [
        "Gluten-Free Desserts",
        "Glut

In [46]:
def find_recipe_object(data):
    # If it's a dict with @type == "Recipe"
    if isinstance(data, dict) and data.get("@type") == "Recipe":
        return data
    # If it's a dict with @graph '''
  if isinstance(data, dict) and "@graph" in data:
        for entry in data["@graph"]:
            if entry.get("@type") == "Recipe":
                return entry        ''' '''
        # If it's a list of dicts
    if isinstance(data, list):
        for entry in data:
            if isinstance(entry, dict) and entry.get("@type") == "Recipe":
                return entry
        
    return None


IndentationError: unindent does not match any outer indentation level (<tokenize>, line 6)

In [47]:
rec = find_recipe_object(data)

In [49]:
rec

{'@type': 'Recipe',
 'name': 'Gluten-Free Vanilla Bean Cheesecake',
 'author': {'@id': 'https://meaningfuleats.com/#/schema/person/e270f0a2b3b28e086a876ffa94f8b8ed'},
 'description': 'This gluten-free cheesecake is a long-time reader and family favorite recipe! It&#039;s naturally gluten-free thanks to a delicious, walnut-brown sugar crust!This recipe originally comes from a family friend who was known for her delicious southern recipes. We&#039;ve been making it for years and I hope you love it too!',
 'datePublished': '2023-11-15T13:52:54+00:00',
 'image': ['https://meaningfuleats.com/wp-content/uploads/2022/06/gluten-free-cheesecake-recipe-1.jpg',
  'https://meaningfuleats.com/wp-content/uploads/2022/06/gluten-free-cheesecake-recipe-1-500x500.jpg',
  'https://meaningfuleats.com/wp-content/uploads/2022/06/gluten-free-cheesecake-recipe-1-500x375.jpg',
  'https://meaningfuleats.com/wp-content/uploads/2022/06/gluten-free-cheesecake-recipe-1-480x270.jpg'],
 'recipeYield': ['16', '16 serv

In [None]:
rec