# Setup environment

In [5]:
import os
from dotenv import load_dotenv
import json
import os
from bs4 import BeautifulSoup
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from jsonschema import validate
from datetime import datetime
import markdown


load_dotenv()

tracing_v2 = os.getenv("LANGCHAIN_TRACING_V2")
langchain_api_key = os.getenv("LANGCHAIN_API_KEY")
openai_api_key = os.getenv("OPENAI_API_KEY")

from langchain_openai import ChatOpenAI

# Preparing the data
In this prototype, I have exported all my recipes from the Paprika app in HTML format. These files can all be found in the `recipe_data` folder. The first thing we want to do is parse this HTML content into JSON.

At a later date, we will want this data to be stored in a database.

## Using LLM to parse content.
I have defined the JSON schema that each recipe should conform to in `recipe_schema.json`.

User story:
- I want to convert a recipe that is in HTML format into a JSON object that conforms to `recipe_schema.json`, processing and updating the raw recipe as necessary so that my recipe formats are consistent.

In [6]:
# Function to read the JSON schema
def read_json(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return its content."""
    with open(file_path, mode='r', encoding='utf-8') as json_file:
        raw_text = json_file.read()
    return json.loads(raw_text)

# Read the HTML file content
def read_html_file(file_path):
    """Return the full text of the UTF-8 encoded file at ``file_path``."""
    with open(file_path, mode='r', encoding='utf-8') as source:
        contents = source.read()
    return contents

# Function to extract the raw recipe text from the HTML
def extract_recipe_text(html):
    """Return the text of the recipe container found in an HTML document.

    The markup is parsed with BeautifulSoup's ``html.parser`` and the first
    ``<div class="recipe">`` element is looked up; its text nodes are joined
    with newlines.

    Args:
        html: HTML markup to search.

    Returns:
        The text content of the recipe ``div``, text nodes separated by "\\n".

    Raises:
        ValueError: If the document contains no ``<div class="recipe">``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    recipe_div = soup.find("div", {"class": "recipe"})
    if recipe_div is None:
        # find() yields None when nothing matches; report that explicitly
        # instead of failing with an AttributeError on get_text() below.
        raise ValueError('No <div class="recipe"> element found in the HTML content')
    return recipe_div.get_text(separator="\n")

# Function to save JSON data to a file
def save_json_to_file(data, output_path):
    """Write ``data`` to ``output_path`` as UTF-8 JSON with 4-space indentation.

    Non-ASCII characters are written as-is rather than as ``\\uXXXX`` escapes.
    """
    with open(output_path, mode='w', encoding='utf-8') as target:
        json.dump(obj=data, fp=target, indent=4, ensure_ascii=False)

# Function to convert ISO8601 time into readable format.
def convert_time(time_str):
    """Convert an ISO 8601 duration such as ``PT1H30M`` into readable text.

    Days, hours, minutes and seconds are supported (``P1DT2H`` becomes
    ``"1 day 2 hours"``); units are singular when the amount is 1 and
    components equal to zero are left out.

    Args:
        time_str: The duration string. ``None`` and blank strings are accepted.

    Returns:
        The readable duration; ``""`` for ``None``/blank input; the input
        text itself (stripped) when it is not a duration this function
        understands, so the information is shown rather than silently lost.
    """
    import re  # local import: the notebook's import cell does not provide `re`

    text = "" if time_str is None else str(time_str).strip()
    if not text:
        return ""

    match = re.fullmatch(
        r"P(?:(?P<days>\d+)D)?"
        r"(?:T(?:(?P<hours>\d+)H)?(?:(?P<minutes>\d+)M)?(?:(?P<seconds>\d+)S)?)?",
        text.upper(),
    )
    # A bare "P"/"PT" matches the pattern but carries no component at all.
    if match is None or not any(match.groupdict().values()):
        return text

    parts = []
    for unit in ("days", "hours", "minutes", "seconds"):
        amount = int(match.group(unit) or 0)
        if amount:
            # Drop the trailing "s" for a singular amount ("1 hour").
            parts.append(f"{amount} {unit[:-1] if amount == 1 else unit}")

    # Every component was zero (e.g. "PT0M"): still report a duration.
    return " ".join(parts) if parts else "0 minutes"

# Function to convert a recipe in JSON format to a temporary readable html format.
def convert_recipe_json_to_html(input_file_path, output_dir):
    """Render a recipe JSON file as a standalone, readable HTML page.

    The recipe is loaded with ``read_json``, its times are converted with
    ``convert_time`` and every instruction step is rendered from Markdown.
    The page is written as UTF-8 to ``output_dir`` under the input file's
    base name with an ``.html`` extension, and the resulting path is printed.

    Args:
        input_file_path: Path of the recipe JSON file. The keys read are
            ``name``, ``notes``, ``prepTime``, ``cookTime``,
            ``recipeIngredients`` (items with ``quantity``, ``unit`` and
            ``ingredient``) and ``recipeInstructions`` (items with
            ``instruction``). Missing or null values render as empty text.
        output_dir: Directory receiving the HTML file; created when absent.
    """
    from html import escape  # local import: not part of the notebook's import cell

    def as_text(value):
        # HTML-safe text for a JSON value; null/missing becomes "".
        return "" if value is None else escape(str(value))

    # Load JSON data
    data = read_json(input_file_path)

    title = as_text(data.get("name"))
    notes = as_text(data.get("notes"))

    # Extract and convert times; `or ""` keeps convert_time from receiving None.
    prep_time = as_text(convert_time(data.get("prepTime") or ""))
    cook_time = as_text(convert_time(data.get("cookTime") or ""))

    # One <li> per ingredient; empty fields are skipped so a missing unit
    # prints neither "None" nor a gap.
    ingredient_items = []
    for ingredient in data.get("recipeIngredients") or []:
        fields = [ingredient.get(key) for key in ("quantity", "unit", "ingredient")]
        label = " ".join(as_text(field) for field in fields if field not in (None, ""))
        ingredient_items.append(f"<li>{label}</li>")
    ingredients_html = "".join(ingredient_items)

    # Convert Markdown to HTML; two trailing spaces turn a newline into <br>.
    # NOTE(review): instruction text goes through Markdown un-escaped, and
    # Python-Markdown leaves raw HTML in place — sanitise if recipes ever come
    # from an untrusted source.
    instructions_html = "".join(
        "<li>" + markdown.markdown((step.get("instruction") or "").replace("\n", "  \n")) + "</li>"
        for step in data.get("recipeInstructions") or []
    )

    # Generate HTML content
    html_content = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="utf-8">
        <title>{title}</title>
        <style>
            body {{
                font-family: Arial, sans-serif;
                line-height: 1.6;
            }}
            .container {{
                max-width: 800px;
                margin: 0 auto;
                padding: 20px;
            }}
            h1 {{
                text-align: center;
                margin-bottom: 20px;
            }}
            .notes {{
                font-style: italic;
                margin-bottom: 20px;
            }}
            .time {{
                margin-bottom: 20px;
            }}
            .ingredients, .instructions {{
                margin-bottom: 20px;
            }}
            .ingredients ul, .instructions ul {{
                list-style-type: none;
                padding: 0;
            }}
            .ingredients li, .instructions li {{
                margin-bottom: 10px;
            }}
        </style>
    </head>
    <body>
        <div class="container">
            <h1>{title}</h1>
            <div class="notes">{notes}</div>
            <div class="time">
                <strong>Prep Time:</strong> {prep_time}<br>
                <strong>Cook Time:</strong> {cook_time}
            </div>
            <div class="ingredients">
                <h2>Ingredients</h2>
                <ul>
    {ingredients_html}
                </ul>
            </div>
            <div class="instructions">
                <h2>Instructions</h2>
                <ul>
    {instructions_html}
                </ul>
            </div>
        </div>
    </body>
    </html>
    """

    # Swap only the real extension; str.replace would also rewrite a ".json"
    # occurring in the middle of the file name.
    stem, _ = os.path.splitext(os.path.basename(input_file_path))
    output_file_path = os.path.join(output_dir, stem + ".html")

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Explicit UTF-8 (matching the charset meta tag) so non-ASCII recipe text
    # does not depend on the platform's default encoding.
    with open(output_file_path, "w", encoding="utf-8") as file:
        file.write(html_content)

    print(f"HTML file has been created: {output_file_path}")

# Function to convert recipe in html format to json and a corresponding html using a model.
def convert_recipe_html(html_file_path, schema_file_path, json_output_dir, html_output_dir, model_name="gpt-4o-mini"):
    """Convert one HTML recipe into schema-shaped JSON plus a readable HTML page.

    The recipe text is pulled from the HTML file with ``extract_recipe_text``
    and sent to a chat model configured for structured output against the
    JSON schema. The result is saved as ``<base name>.json`` in
    ``json_output_dir`` and then rendered with ``convert_recipe_json_to_html``.

    Args:
        html_file_path: Path of the source recipe HTML file.
        schema_file_path: Path of the JSON schema the output must follow.
        json_output_dir: Directory for the generated JSON; created when absent.
        html_output_dir: Directory passed on to ``convert_recipe_json_to_html``.
        model_name: Chat model identifier passed to ``ChatOpenAI``.
    """
    # set system and user message:
    system_message = """The recipe below is provided in HTML format. I want you to:
        1. Extract the relevant information from the recipe into the JSON structure below, using the property descriptions for guidance.
        2. Populate all required fields. If the information doesn’t exist, leave the field empty.
        3. Where specific values are specified, select from those available values.
        4. Provide the data in the requested formats.

    JSON Structure:
    {schema}
    """
    user_message = "{recipe_text}"

    # create prompt template
    prompt_template = ChatPromptTemplate.from_messages([("system", system_message), ("user", user_message)])

    # define model and chain
    model = ChatOpenAI(model=model_name)
    schema = read_json(schema_file_path)  # extract json dict from schema
    structured_model = model.with_structured_output(schema)  # ensuring structured output from model
    chain = prompt_template | structured_model

    # prep recipe and invoke chain.
    recipe_text = extract_recipe_text(read_html_file(html_file_path))
    # Serialise the schema so the prompt shows real JSON (double quotes,
    # true/false/null) rather than the repr of a Python dict.
    schema_text = json.dumps(schema, ensure_ascii=False, indent=2)
    recipe_json = chain.invoke({"recipe_text": recipe_text, "schema": schema_text})

    # create json file; swap only the real extension, since str.replace would
    # also rewrite a ".html" occurring in the middle of the file name.
    stem, _ = os.path.splitext(os.path.basename(html_file_path))
    json_output_file_path = os.path.join(json_output_dir, stem + ".json")
    if json_output_dir:
        os.makedirs(json_output_dir, exist_ok=True)
    save_json_to_file(recipe_json, json_output_file_path)
    print(f"Recipe JSON saved to {json_output_file_path}")

    # create html file
    convert_recipe_json_to_html(json_output_file_path, html_output_dir)

# Function to process an array of HTML file paths
def process_recipes_from_list(html_file_paths, schema_file_path, json_output_dir, html_output_dir):
    """Run ``convert_recipe_html`` on each path in ``html_file_paths``, in order.

    The schema path and both output directories are shared by every call.
    """
    shared_args = (schema_file_path, json_output_dir, html_output_dir)
    for recipe_path in html_file_paths:
        convert_recipe_html(recipe_path, *shared_args)


Testing the function.

In [19]:
# Recipe HTML files to convert in this test run.
html_file_paths = [
    './recipe_data/Recipes/recipes_original_html/Beef Stroganoff.html'
]

# JSON schema handed to the model, and the folders receiving the generated files.
schema_file_path = 'schemas/recipe_schema_v2.json'
json_output_dir = './recipe_data/Recipes/recipes_json'
html_output_dir = './recipe_data/Recipes/recipes_html'

# Convert every listed recipe to JSON and then to a readable HTML page.
process_recipes_from_list(html_file_paths, schema_file_path, json_output_dir, html_output_dir)
# NOTE(review): the disabled single-recipe call below is stale — index 6 is out
# of range for the one-element list above, and it passes three arguments where
# convert_recipe_html takes four (and `output_dir` is not defined in this cell).
# convert_recipe_html(html_file_paths[6], schema_file_path, output_dir)


Recipe JSON saved to ./recipe_data/Recipes/recipes_json/Beef Stroganoff.json
HTML file has been created: ./recipe_data/Recipes/recipes_html/Beef Stroganoff.html


# Creating meal prep plans
Now that we have processed a number of recipes, the next use case we want to try to tackle is combining these recipes into a consolidated set of ingredients and instructions so you can prep them in bulk.

In [12]:
# Function to convert a meal prep plan in JSON format to a temporary readable html format.

def convert_mealprep_json_to_html(input_file_path, output_dir):
    """Render a meal prep plan JSON file as a standalone, readable HTML page.

    The plan is loaded with ``read_json``, its times are converted with
    ``convert_time`` and the instructions text is rendered from Markdown.
    The page is written as UTF-8 to ``output_dir`` under the input file's
    base name with an ``.html`` extension, and the resulting path is printed.

    Args:
        input_file_path: Path of the meal prep JSON file. The keys read are
            ``name``, ``notes``, ``prepTime``, ``cookTime``,
            ``recipeIngredients`` (items with ``quantity``, ``unit`` and
            ``ingredient``) and ``recipeInstructions`` (handled as a single
            Markdown string). Missing or null values render as empty text.
        output_dir: Directory receiving the HTML file; created when absent.
    """
    from html import escape  # local import: not part of the notebook's import cell

    def as_text(value):
        # HTML-safe text for a JSON value; null/missing becomes "".
        return "" if value is None else escape(str(value))

    # Load JSON data
    data = read_json(input_file_path)

    title = as_text(data.get("name"))
    notes = as_text(data.get("notes"))

    # Extract and convert times; `or ""` keeps convert_time from receiving None.
    prep_time = as_text(convert_time(data.get("prepTime") or ""))
    cook_time = as_text(convert_time(data.get("cookTime") or ""))

    # One <li> per ingredient; empty fields are skipped so a missing unit
    # prints neither "None" nor a gap.
    ingredient_items = []
    for ingredient in data.get("recipeIngredients") or []:
        fields = [ingredient.get(key) for key in ("quantity", "unit", "ingredient")]
        label = " ".join(as_text(field) for field in fields if field not in (None, ""))
        ingredient_items.append(f"<li>{label}</li>")
    ingredients_html = "".join(ingredient_items)

    # Convert Markdown to HTML; two trailing spaces turn a newline into <br>.
    # NOTE(review): instruction text goes through Markdown un-escaped, and
    # Python-Markdown leaves raw HTML in place — sanitise if plans ever come
    # from an untrusted source.
    instructions_html = markdown.markdown((data.get("recipeInstructions") or "").replace("\n", "  \n"))

    # Generate HTML content
    html_content = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="utf-8">
        <title>{title}</title>
        <style>
            body {{
                font-family: Arial, sans-serif;
                line-height: 1.6;
            }}
            .container {{
                max-width: 800px;
                margin: 0 auto;
                padding: 20px;
            }}
            h1 {{
                text-align: center;
                margin-bottom: 20px;
            }}
            .notes {{
                font-style: italic;
                margin-bottom: 20px;
            }}
            .time {{
                margin-bottom: 20px;
            }}
            .ingredients, .instructions {{
                margin-bottom: 20px;
            }}
            .ingredients ul, .instructions ul {{
                list-style-type: none;
                padding: 0;
            }}
            .ingredients li, .instructions li {{
                margin-bottom: 10px;
            }}
        </style>
    </head>
    <body>
        <div class="container">
            <h1>{title}</h1>
            <div class="notes">{notes}</div>
            <div class="time">
                <strong>Prep Time:</strong> {prep_time}<br>
                <strong>Cook Time:</strong> {cook_time}
            </div>
            <div class="ingredients">
                <h2>Ingredients</h2>
                <ul>
    {ingredients_html}
                </ul>
            </div>
            <div class="instructions">
                <h2>Instructions</h2>
                <div>
    {instructions_html}
                </div>
            </div>
        </div>
    </body>
    </html>
    """

    # Swap only the real extension; str.replace would also rewrite a ".json"
    # occurring in the middle of the file name.
    stem, _ = os.path.splitext(os.path.basename(input_file_path))
    output_file_path = os.path.join(output_dir, stem + ".html")

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Explicit UTF-8 (matching the charset meta tag) so non-ASCII text does
    # not depend on the platform's default encoding.
    with open(output_file_path, "w", encoding="utf-8") as file:
        file.write(html_content)

    print(f"HTML file has been created: {output_file_path}")

def convert_recipes_to_meal_prep(recipe_file_path_1, recipe_file_path_2, recipe_file_path_3, schema_file_path, json_output_dir, html_output_dir, model_name="gpt-4o"):
    """Consolidate three recipe JSON files into one meal prep plan.

    The three recipes and the meal prep schema are sent to a chat model
    configured for structured output. The result is saved as
    ``meal_plan_<timestamp>.json`` in ``json_output_dir`` and then rendered
    with ``convert_mealprep_json_to_html``.

    Args:
        recipe_file_path_1: Path of the first recipe JSON file.
        recipe_file_path_2: Path of the second recipe JSON file.
        recipe_file_path_3: Path of the third recipe JSON file.
        schema_file_path: Path of the meal prep JSON schema.
        json_output_dir: Directory for the generated JSON; created when absent.
        html_output_dir: Directory passed on to ``convert_mealprep_json_to_html``.
        model_name: Chat model identifier passed to ``ChatOpenAI``.
    """
    # set system and user message:
    system_message = """The recipes below are provided in JSON format. I want you to:
        1. Consolidate the ingredients and instructions from these recipes into a single set of ingredients and instructions. This consolidation of recipes is called a meal prep plan. 
        2. Use the property descriptions in the provided JSON schema for guidance on how to populate each property.
        3. Populate all fields in the JSON structure.
        4. Provide the consolidated meal prep data in the JSON schema format.

    JSON Schema:
    {schema}
    """
    user_message = """
    Recipe 1:
    {recipe_text_1}
    Recipe 2:
    {recipe_text_2}
    Recipe 3:
    {recipe_text_3}
    """

    # create prompt template
    prompt_template = ChatPromptTemplate.from_messages([("system", system_message), ("user", user_message)])

    # define model and chain
    model = ChatOpenAI(model=model_name)
    schema = read_json(schema_file_path)  # extract json dict from schema
    structured_model = model.with_structured_output(schema)  # ensuring structured output from model
    chain = prompt_template | structured_model

    def as_json_text(obj):
        # The prompt promises JSON, so serialise explicitly; interpolating the
        # dict directly would insert its Python repr (single quotes, True/None).
        return json.dumps(obj, ensure_ascii=False, indent=2)

    # prep recipes and invoke chain.
    recipe_text_1 = as_json_text(read_json(recipe_file_path_1))
    recipe_text_2 = as_json_text(read_json(recipe_file_path_2))
    recipe_text_3 = as_json_text(read_json(recipe_file_path_3))

    meal_prep_json = chain.invoke({
        "recipe_text_1": recipe_text_1,
        "recipe_text_2": recipe_text_2,
        "recipe_text_3": recipe_text_3,
        "schema": as_json_text(schema),
    })

    # save output to json file, named after the current local date and time.
    current_datetime = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    json_output_file_path = os.path.join(json_output_dir, f'meal_plan_{current_datetime}.json')
    if json_output_dir:
        os.makedirs(json_output_dir, exist_ok=True)
    save_json_to_file(meal_prep_json, json_output_file_path)
    print(f"Recipe JSON saved to {json_output_file_path}")

    # create html file
    convert_mealprep_json_to_html(json_output_file_path, html_output_dir)


In [16]:
# Recipe JSON files to consolidate into a single meal prep plan.
recipe_file_paths = [
  './recipe_data/Recipes/recipes_json/The BEST Japanese Fried Chicken.json',
  './recipe_data/Recipes/recipes_json/Slow-Roasted Crispy Pork Belly.json',
  './recipe_data/Recipes/recipes_json/Yotam Ottolenghi’s Cauliflower Salad.json'
]

# convert_recipes_to_meal_prep takes the three recipes as separate arguments.
recipe_file_path_1 = recipe_file_paths[0]
recipe_file_path_2 = recipe_file_paths[1]
recipe_file_path_3 = recipe_file_paths[2]
# Meal prep schema and output folders; these rebind the notebook-level names
# that the recipe conversion test cell also assigns.
schema_file_path = './schemas/meal_prep_schema_v1.json'
json_output_dir = './recipe_data/meal_prep/meal_prep_json'
html_output_dir = './recipe_data/meal_prep/meal_prep_html'

# Build the plan: writes a timestamped JSON file and a matching HTML page.
convert_recipes_to_meal_prep(recipe_file_path_1, recipe_file_path_2, recipe_file_path_3, schema_file_path, json_output_dir, html_output_dir)


Recipe JSON saved to ./recipe_data/meal_prep/meal_prep_json/meal_plan_2024-08-03_19-22-36.json
HTML file has been created: ./recipe_data/meal_prep/meal_prep_html/meal_plan_2024-08-03_19-22-36.html


# Viewing the recipes
Now that we are able to convert the recipes into a structured JSON format, we need a quick and easy way to view the content in HTML.

# Trying to extract structured data from existing recipe URLs

In [20]:
import requests
from bs4 import BeautifulSoup
import json

# Step 1: Fetch the webpage content
url = "https://www.recipetineats.com/beef-stroganoff/"
# requests applies no timeout by default, so an unresponsive server would
# block this cell indefinitely.
response = requests.get(url, timeout=30)
# Fail on HTTP errors (4xx/5xx) instead of parsing an error page.
response.raise_for_status()
web_content = response.content

# Step 2: Parse the HTML content
soup = BeautifulSoup(web_content, 'html.parser')

# Step 3: Extract the structured data (first JSON-LD script tag only)
script_tag = soup.find('script', type='application/ld+json')
# .string is None when the tag does not hold exactly one text node, and
# json.loads(None) would raise a TypeError.
if script_tag is not None and script_tag.string:
    structured_data = json.loads(script_tag.string)
    print(json.dumps(structured_data, indent=2))
else:
    print("No structured data found.")

{
  "@context": "https://schema.org",
  "@graph": [
    {
      "@type": "Article",
      "@id": "https://www.recipetineats.com/beef-stroganoff/#article",
      "isPartOf": {
        "@id": "https://www.recipetineats.com/beef-stroganoff/"
      },
      "author": {
        "name": "Nagi",
        "@id": "https://www.recipetineats.com/#/schema/person/1684e6a75e9f91ae2e33ca2de95b47e2"
      },
      "headline": "Beef Stroganoff",
      "datePublished": "2019-02-27T23:35:05+00:00",
      "dateModified": "2020-02-20T18:31:34+00:00",
      "wordCount": 1368,
      "commentCount": 1126,
      "publisher": {
        "@id": "https://www.recipetineats.com/#organization"
      },
      "image": {
        "@id": "https://www.recipetineats.com/beef-stroganoff/#primaryimage"
      },
      "thumbnailUrl": "https://www.recipetineats.com/tachyon/2018/01/Beef-Stroganoff_2-1-1.jpg",
      "articleSection": [
        "Beef",
        "Beef Recipes",
        "Gluten Free",
        "Main Dishes",
        "