In [2]:
import os
from dotenv import load_dotenv

# Load variables from a .env file (when one is found) into the process environment.
load_dotenv()

# Alternative: prompt for the key interactively. Left disabled; note that
# `getpass` is not imported in this cell, so it must be imported before enabling.
# os.environ["OPENAI_API_KEY"] = getpass.getpass()

# Read the tracing flag and API keys from the environment. os.getenv returns
# None for any variable that is not set. These three names are not referenced
# again in the cells shown here.
tracing_v2 = os.getenv("LANGCHAIN_TRACING_V2")
langchain_api_key = os.getenv("LANGCHAIN_API_KEY")
openai_api_key = os.getenv("OPENAI_API_KEY")

from langchain_openai import ChatOpenAI

# Notebook-level chat model instance (gpt-3.5-turbo).
model = ChatOpenAI(model="gpt-3.5-turbo")

# Preparing the data
In this prototype, I have exported all of my recipes from the Paprika app in HTML format. These files can all be found in the `recipe_data` folder. The first thing we want to do is parse this HTML content into JSON format, for now.

At a later date, we will want this data to be stored in a database.

In [18]:
import re
from bs4 import BeautifulSoup
import json

# --- Helpers -----------------------------------------------------------------
# Not every exported recipe has every field (author, photo, nutrition, ...), so
# all lookups go through helpers that return None instead of raising on a
# missing element.

def _node_text(node):
    """Return the stripped text of a BeautifulSoup node, or None if the node is missing."""
    return node.text.strip() if node is not None else None


def _node_attr(node, attr):
    """Return attribute `attr` of a BeautifulSoup node, or None if the node or attribute is missing."""
    return node.get(attr) if node is not None else None


def _parse_minutes(text):
    """Convert a duration such as "15 min", "1 hr" or "1 hr 15 min" to whole minutes.

    Returns None when `text` is empty/None or contains no number.
    """
    if not text:
        return None
    hours = re.search(r'(\d+(?:\.\d+)?)\s*h', text, re.IGNORECASE)
    minutes = re.search(r'(\d+(?:\.\d+)?)\s*m', text, re.IGNORECASE)
    if hours or minutes:
        total = 0.0
        if hours:
            total += float(hours.group(1)) * 60
        if minutes:
            total += float(minutes.group(1))
        return int(round(total))
    # No unit at all (e.g. "15"): treat the bare number as minutes.
    bare = re.search(r'\d+', text)
    return int(bare.group()) if bare else None


def _parse_servings(text):
    """Return the last integer found in a yield string such as "Servings: 4", or None."""
    if not text:
        return None
    numbers = re.findall(r'\d+', text)
    return int(numbers[-1]) if numbers else None


# A unit is only looked for when the line starts with a numeric quantity;
# otherwise the whole line is the item. (Previously "Salt and pepper" was split
# into unit="Salt", item="and pepper".)
ingredient_pattern = re.compile(
    r'(?:(?P<quantity>\d[\d/ .-]*)\s*(?P<unit>[^\d\s]+)?\s*)?(?P<item>.*)'
)


def _parse_ingredient(ingredient_text):
    """Split one ingredient line into quantity / unit / item.

    The word following the quantity is taken as the unit without checking it
    against a list of real units, so "1 large onion" still yields unit="large".
    """
    # Every part of the pattern is optional, so match() always succeeds.
    match = ingredient_pattern.match(ingredient_text)
    quantity = match.group('quantity')
    unit = match.group('unit')
    item_name = match.group('item').strip()
    quantity = quantity.strip() if quantity else None
    unit = unit.strip() if unit else None
    if unit and not item_name:
        # Two-token lines such as "2 eggs": the second token is the item, not a unit.
        item_name, unit = unit, None
    return {
        "text": ingredient_text,
        "quantity": quantity,
        "unit": unit,
        "item": item_name
    }


# --- Load and parse ----------------------------------------------------------
# Load the HTML content
html_file = './recipe_data/Recipes/Beef Stroganoff.html'
with open(html_file, 'r', encoding='utf-8') as file:
    html_content = file.read()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Extract the recipe details
recipe_name = _node_text(soup.find('h1', class_='name'))
prep_time = _parse_minutes(_node_text(soup.find('span', itemprop='prepTime')))
cook_time = _parse_minutes(_node_text(soup.find('span', itemprop='cookTime')))
servings = _parse_servings(_node_text(soup.find('span', itemprop='recipeYield')))
source_url = _node_attr(soup.find('a', itemprop='url'), 'href')
source_author = _node_text(soup.find('span', itemprop='author'))
if source_author and source_url:
    # Markdown-style link: [author](url)
    source = f"[{source_author}]({source_url})"
else:
    # Fall back to whichever half is available (or None when both are missing).
    source = source_url or source_author

# Extract image URL
photobox = soup.find('div', class_='photobox')
image_link = photobox.find('a') if photobox is not None else None
image_url = _node_attr(image_link, 'href')

# Extract ingredients (blank ingredient lines are skipped)
ingredients = []
for ingredient_node in soup.find_all('p', itemprop='recipeIngredient'):
    ingredient_text = ingredient_node.text.strip()
    if ingredient_text:
        ingredients.append(_parse_ingredient(ingredient_text))

# Extract directions, collapsing internal whitespace/newlines to single spaces
directions_container = soup.find('div', itemprop='recipeInstructions')
direction_nodes = directions_container.find_all('p', class_='line') if directions_container is not None else []
directions = [re.sub(r'\s+', ' ', node.text).strip() for node in direction_nodes]

# Extract nutrition information
nutrition_node = soup.find('div', itemprop='nutrition')
nutrition_text = re.sub(r'\s+', ' ', nutrition_node.text).strip() if nutrition_node is not None else ''
nutrition_items = re.split(r'(?<=\))\s*(?=[A-Z])|(?<=\d)\s*(?=[A-Z])', nutrition_text)  # Split by spaces before capital letters after numbers or parentheses

nutrition = {}
for nutrition_item in nutrition_items:
    if ':' in nutrition_item:
        key, value = nutrition_item.split(':', 1)
        key = key.strip()
        value = value.strip()
        if key.startswith("Vitamin A") and "IU" in value:
            # Normalise the key to "Vitamin A" and write the unit without a space ("IU").
            key = "Vitamin A"
            value = value.replace(" IU", "IU")
        nutrition[key] = value

# Construct the output JSON object
recipe_data = {
    "name": recipe_name,
    "prep_time": prep_time,
    "cook_time": cook_time,
    "servings": servings,
    "source": source,
    "image_url": image_url,
    "ingredients": ingredients,
    "directions": directions,
    "nutrition": nutrition
}

# Output the JSON object
json_output = json.dumps(recipe_data, indent=4)
print(json_output)

{
    "name": "Beef Stroganoff",
    "prep_time": 15,
    "cook_time": 15,
    "servings": 4,
    "source": "[recipetineats.com](https://www.recipetineats.com/beef-stroganoff/)",
    "image_url": "https://www.recipetineats.com/wp-content/uploads/2018/01/Beef-Stroganoff_2-1-1.jpg?resize=650,910",
    "ingredients": [
        {
            "text": "600 g scotch fillet steak (Note 1)",
            "quantity": "600",
            "unit": "g",
            "item": "scotch fillet steak (Note 1)"
        },
        {
            "text": "2 T vegetable oil, divided",
            "quantity": "2",
            "unit": "T",
            "item": "vegetable oil, divided"
        },
        {
            "text": "1 large onion, sliced",
            "quantity": "1",
            "unit": "large",
            "item": "onion, sliced"
        },
        {
            "text": "300 g mushrooms, sliced",
            "quantity": "300",
            "unit": "g",
            "item": "mushrooms, sliced"
        },
  

# Using LLM to parse content.
I have defined the JSON schema that I want each recipe to follow in `recipe_schema.json`. What if we could pass the model the HTML recipe together with the recipe schema and get the model to parse it for us?

In [69]:
import json
from bs4 import BeautifulSoup
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from jsonschema import validate

# Load a JSON schema from disk
def read_json_schema(file_path):
    """Read the UTF-8 encoded JSON file at `file_path` and return the decoded object."""
    with open(file_path, mode='r', encoding='utf-8') as schema_file:
        schema_text = schema_file.read()
    return json.loads(schema_text)

# Load an HTML file from disk
def read_html_file(file_path):
    """Return the full contents of the UTF-8 encoded file at `file_path` as a string."""
    with open(file_path, encoding='utf-8') as html_source:
        contents = html_source.read()
    return contents

# Function to extract the raw recipe text from the HTML
def extract_recipe_text(html):
    """Return the plain text of the recipe container in an HTML document.

    Args:
        html: HTML source as a string.

    Returns:
        The text of the first <div class="recipe"> element, with the text
        fragments joined by newlines. Tag attributes (for example link and
        image URLs in href/src) are not part of the returned text.

    Raises:
        ValueError: If the document contains no <div class="recipe"> element.
    """
    soup = BeautifulSoup(html, 'html.parser')
    recipe_div = soup.find("div", {"class": "recipe"})
    if recipe_div is None:
        # Fail with an explicit message instead of an AttributeError on None.
        raise ValueError('No <div class="recipe"> element found in the HTML document')
    return recipe_div.get_text(separator="\n")

# Function to convert recipe in html format to json using a model
def convert_recipe_html_to_json(html_file_path, schema_file_path, model_name="gpt-4o-mini", validate_output=False):
    """Convert one exported HTML recipe into structured data that follows a JSON schema.

    The recipe text is extracted from the HTML file, sent to an OpenAI chat
    model together with the schema, and the model is constrained to the schema
    via `with_structured_output`.

    Args:
        html_file_path: Path to the exported recipe HTML file.
        schema_file_path: Path to the JSON schema file describing the output.
        model_name: Name of the OpenAI chat model to use.
        validate_output: When True, check the model's result against the schema
            with `jsonschema.validate`; a mismatch raises
            `jsonschema.ValidationError`. Off by default.

    Returns:
        The structured output returned by the chain for this recipe.
    """
    # set system and user message:
    # NOTE(review): the prompt says the recipe is "in HTML format", but the user
    # message below carries the plain text produced by extract_recipe_text.
    system_message = """The recipe below is provided in HTML format. I want you to:
        1.	Extract the relevant information from the recipe into the JSON structure below, using the property descriptions for guidance.
        2.	Populate all required fields. If the information doesn’t exist, leave the field empty.
        3.	Where specific values are specified, select from those available values.
        4.	Provide the data in the requested formats.

    JSON Structure:
    {schema}
    """
    user_message = "{recipe_text}"

    # create prompt template
    prompt_template = ChatPromptTemplate.from_messages([("system", system_message), ("user", user_message)])

    # read the inputs first so file problems surface before any model setup
    schema = read_json_schema(schema_file_path)  # schema as a Python dict
    recipe_text = extract_recipe_text(read_html_file(html_file_path))

    # define model and chain (named `llm` so the notebook-level `model` is not shadowed)
    llm = ChatOpenAI(model=model_name)
    structured_llm = llm.with_structured_output(schema)  # ensuring structured output from model
    chain = prompt_template | structured_llm

    # Serialise the schema as real JSON for the prompt; interpolating the dict
    # directly would insert its Python repr (single quotes, True/None).
    schema_json = json.dumps(schema, indent=2, ensure_ascii=False)
    result = chain.invoke({"recipe_text": recipe_text, "schema": schema_json})

    if validate_output:
        validate(instance=result, schema=schema)

    return result


In [71]:
# Paths to one exported recipe and to the JSON schema file.
html_file_path = './recipe_data/Recipes/Beef Stroganoff.html'
schema_file_path = 'recipe_schema.json'

# Run the model-based conversion; as the last expression in the cell, the
# returned value is displayed as the cell output.
convert_recipe_html_to_json(html_file_path, schema_file_path)

{'name': 'Beef Stroganoff',
 'notes': None,
 'prepTime': 'PT15M',
 'tags': None,
 'recipeCuisine': 'American',
 'suitableForDiet': None,
 'cookTime': 'PT15M',
 'recipeYield': 4,
 'source': 'http://recipetineats.com',
 'recipeImage': None,
 'recipeIngredients': [{'raw': '600 g scotch fillet steak',
   'quantity': '600',
   'unit': 'gram'},
  {'raw': '2 T vegetable oil, divided', 'quantity': '2', 'unit': 'tablespoon'},
  {'raw': '1 large onion, sliced', 'quantity': None, 'unit': None},
  {'raw': '300 g mushrooms, sliced', 'quantity': '300', 'unit': 'gram'},
  {'raw': '40 g butter', 'quantity': '40', 'unit': 'gram'},
  {'raw': '2 T flour', 'quantity': '2', 'unit': 'tablespoon'},
  {'raw': '2 c beef broth, preferably salt reduced',
   'quantity': '2',
   'unit': 'cup'},
  {'raw': '1 T Dijon mustard', 'quantity': '1', 'unit': 'tablespoon'},
  {'raw': '150 ml sour cream', 'quantity': '150', 'unit': 'milliliter'},
  {'raw': 'Salt and pepper', 'quantity': None, 'unit': None},
  {'raw': '250-30

In [73]:
# Display the raw HTML of the exported recipe (the whole file is returned as a single string).
read_html_file(html_file_path)

'<!DOCTYPE html>\n<html>\n  <head>\n    <meta charset="UTF-8" />\n    <style type="text/css">\n      /* Shared styles */\n      body {\n        font-family: Helvetica, sans-serif;\n        font-size: 16px;\n        color: #34302e;\n        margin: 0.25in;\n      }\n      .name {\n        font-size: 18px;\n        font-family: Helvetica, sans-serif;\n        font-weight: normal;\n        margin: 0 0 10px 0;\n      }\n      .categories {\n        color: #605d5d;\n        font-size: 14px;\n        font-family: Helvetica, sans-serif;\n        font-style: italic;\n      }\n      .rating {\n        color: #d10505;\n        font-size: 14px;\n      }\n      .metadata {\n        font-size: 14px;\n      }\n      .infobox p {\n        margin: 0;\n        line-height: 150%;\n      }\n      .subhead {\n        color: #d10505;\n        font-weight: bold;\n        font-size: 14px;\n        text-transform: uppercase;\n        margin: 10px 0;\n      }\n\n      .ingredients p {\n        margin: 4px 0;\n