# html-2-json.ipynb

## Installation 

See [README.md](README.md) for installation instructions.

In [1]:
!pip install -U beautifulsoup4



## Common Definitions

In [2]:
import json
from bs4 import BeautifulSoup
import re

# Define constants
# Name of the HTML attribute that marks an element as part of the email template.
EMAIL_TAG = "email-tag"
# HTML template that is parsed for EMAIL_TAG-marked elements.
HTML_TEMPLATE_FILE_PATH = "email-template.html"
# Same path with its final extension replaced by ".json" ("email-template.json").
JSON_TEMPLATE_FILE_PATH = HTML_TEMPLATE_FILE_PATH.rsplit(".", 1)[0] + ".json"
# JSON file read by the merge step; maps each email key to its placeholder values.
JSON_INPUT_FILE_PATH = "email-data-input.json"
# JSON file the merged emails are written to.
JSON_OUTPUT_FILE_PATH = "email-data-output.json"

## Convert HTML Template to JSON Template

In [3]:
def extract_nested_tags(tag):
    """Convert an element's EMAIL_TAG-marked children into a nested structure.

    Only the direct children of ``tag`` are inspected (``recursive=False``).
    Each marked child contributes one entry keyed by the value of its
    EMAIL_TAG attribute; when two children share a value, the later one
    replaces the earlier one.

    Args:
        tag: A BeautifulSoup element.

    Returns:
        A dict mapping child names to their converted content when at least
        one marked direct child exists; otherwise the element's inner markup
        as a whitespace-stripped string.
    """
    marked_children = tag.find_all(attrs={EMAIL_TAG: True}, recursive=False)
    nested = {
        element.get(EMAIL_TAG): extract_nested_tags(element)
        for element in marked_children
    }

    if not nested:
        # Leaf element: keep its inner HTML verbatim (minus surrounding whitespace).
        inner_markup = tag.encode_contents().decode("utf-8")
        return inner_markup.strip()

    return nested


def extract_email_tags_to_json(file_path):
    """Parse an HTML template file and return its email structure as JSON text.

    The file is read as UTF-8 and parsed with the "html.parser" backend. The
    first element whose EMAIL_TAG attribute equals "email" is used as the root.

    Args:
        file_path: Path of the HTML template to read.

    Returns:
        A JSON string (4-space indent) with a single top-level key taken from
        the root element's EMAIL_TAG attribute, or the JSON text for an empty
        object when no root element is found.
    """
    with open(file_path, "r", encoding="utf-8") as template_file:
        markup = template_file.read()

    document = BeautifulSoup(markup, "html.parser")
    root_tag = document.find(attrs={EMAIL_TAG: "email"})

    # No root element: emit an empty JSON object.
    if not root_tag:
        return json.dumps({}, indent=4)

    root_name = root_tag.get(EMAIL_TAG)
    structure = {root_name: extract_nested_tags(root_tag)}
    return json.dumps(structure, indent=4)


def save_json_to_file(json_output, file_path):
    """Write already-serialized JSON text to a file.

    Args:
        json_output: The text to write; it is written as-is, without
            re-serializing.
        file_path: Destination path. The file is opened in "w" mode with
            UTF-8 encoding, so any existing content is replaced.
    """
    destination = open(file_path, "w", encoding="utf-8")
    with destination:
        destination.write(json_output)

# Convert the HTML template to JSON text, persist it next to the template,
# and echo it so the generated structure is visible in the cell output.
json_output = extract_email_tags_to_json(HTML_TEMPLATE_FILE_PATH)
save_json_to_file(json_output, JSON_TEMPLATE_FILE_PATH)
print(json_output)


{
    "email": {
        "subject": "<b>Meeting Tomorrow</b>",
        "email-logo": "<img alt=\"Logo\" src=\"./email-logo-01.png\" style=\"width: 20%;\"/>",
        "from": "[[email_from]]",
        "to": "[[email_to]]",
        "cc": "[[email_cc]]",
        "bcc": "[[email_bcc]]",
        "body": "Let's meet tomorrow at 10am.<br/>\n<i style=\"font-size: 14pt; color: blue;\">Please read the report [[email_report_name]].</i><br/>\n        and remember:\n        <ul>\n<li>Drink H<sub>2</sub>O</li>\n<li>2<sup>8</sup> = 256</li>\n</ul>"
    }
}


## Extract Variables from JSON Template [OPTIONAL]

In [4]:
def find_variables(json_data):
    """Collect the names of all ``[[name]]`` placeholders in a JSON structure.

    Args:
        json_data: A parsed JSON value. Dicts and lists are traversed
            recursively, strings are scanned for placeholders, and any other
            value (numbers, booleans, None) is ignored.

    Returns:
        A set with the text found between ``[[`` and ``]]`` for every
        placeholder occurrence; empty when there are none.
    """
    variables = set()
    if isinstance(json_data, dict):
        # Only values can hold placeholders; keys are structural names.
        for value in json_data.values():
            variables.update(find_variables(value))
    elif isinstance(json_data, list):
        # JSON arrays may hold strings or further nested containers.
        for item in json_data:
            variables.update(find_variables(item))
    elif isinstance(json_data, str):
        # Non-greedy match so adjacent placeholders are captured separately.
        variables.update(re.findall(r"\[\[(.*?)\]\]", json_data))
    return variables


def print_variables(variables):
    """Print a header line followed by one variable name per line.

    Args:
        variables: Iterable of variable names (typically the set returned by
            ``find_variables``).

    The names are printed in sorted order: a set of strings has no stable
    iteration order across interpreter runs, so sorting keeps the cell output
    reproducible after a kernel restart.
    """
    print("Variables found:")
    for var in sorted(variables, key=str):
        print(var)

# Load the JSON template written by the conversion step.
with open(JSON_TEMPLATE_FILE_PATH, "r", encoding="utf-8") as f:
    json_data = json.load(f)

# Collect every [[placeholder]] name used in the template and list them.
variables = find_variables(json_data)
print_variables(variables)


Variables found:
email_from
email_report_name
email_to
email_cc
email_bcc


## JSON Merge

In [5]:
def read_json_file(file_path):
    """Read a UTF-8 encoded JSON file and return the parsed content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python object produced by parsing the file's JSON text.
    """
    with open(file_path, "r", encoding="utf-8") as source:
        raw_text = source.read()
    return json.loads(raw_text)


def replace_variables(template, values):
    """Substitute ``[[name]]`` placeholders throughout a JSON-like structure.

    Args:
        template: A parsed JSON value. Dicts and lists are rebuilt with their
            contents processed recursively; strings have their placeholders
            substituted; any other value is returned unchanged.
        values: Mapping of placeholder name to replacement value.

    Returns:
        A new structure of the same shape as ``template`` with placeholders
        replaced. A placeholder whose name is missing from ``values`` (or
        whose value is None) is left in the text untouched. Non-string
        values such as numbers are converted with ``str()``.
    """
    if isinstance(template, dict):
        return {
            key: replace_variables(value, values) for key, value in template.items()
        }
    if isinstance(template, list):
        return [replace_variables(item, values) for item in template]
    if isinstance(template, str):

        def substitute(match):
            replacement = values.get(match.group(1))
            if replacement is None:
                # Unknown (or null) variable: keep the placeholder visible.
                return match.group(0)
            # re.sub requires the callback to return a str; JSON input may
            # legitimately supply numbers or booleans.
            return replacement if isinstance(replacement, str) else str(replacement)

        return re.sub(r"\[\[(.*?)\]\]", substitute, template)
    return template


def merge_json_emails(template_json, values_json):
    """Build one filled-in email per entry of ``values_json``.

    Args:
        template_json: Parsed JSON template; its "email" entry is the
            structure that gets instantiated for every email.
        values_json: Mapping of email key to that email's placeholder values.

    Returns:
        A dict with the same keys as ``values_json``, each mapped to the
        template's "email" structure with placeholders substituted from the
        corresponding values.
    """
    # The "email" lookup stays inside the comprehension so it is only
    # evaluated when there is at least one entry to merge.
    return {
        email_key: replace_variables(template_json["email"], email_values)
        for email_key, email_values in values_json.items()
    }


# Main code
# Load the JSON template and the per-email placeholder values.
json_template = read_json_file(JSON_TEMPLATE_FILE_PATH)
data_input_json = read_json_file(JSON_INPUT_FILE_PATH)

# Produce one filled-in email per key of the input file.
merged_json = merge_json_emails(json_template, data_input_json)

# Show the merged result in the cell output.
print(json.dumps(merged_json, indent=4))

# Persist the same serialized result to the output file.
save_json_to_file(json.dumps(merged_json, indent=4), JSON_OUTPUT_FILE_PATH)

{
    "email_001": {
        "subject": "<b>Meeting Tomorrow</b>",
        "email-logo": "<img alt=\"Logo\" src=\"./email-logo-01.png\" style=\"width: 20%;\"/>",
        "from": "joe.001@example.com",
        "to": "tom.001@example.com",
        "cc": "charlie.001@example.com",
        "bcc": "ben.001@example.com",
        "body": "Let's meet tomorrow at 10am.<br/>\n<i style=\"font-size: 14pt; color: blue;\">Please read the report report-2023-06-08.xlsx.</i><br/>\n        and remember:\n        <ul>\n<li>Drink H<sub>2</sub>O</li>\n<li>2<sup>8</sup> = 256</li>\n</ul>"
    },
    "email_002": {
        "subject": "<b>Meeting Tomorrow</b>",
        "email-logo": "<img alt=\"Logo\" src=\"./email-logo-01.png\" style=\"width: 20%;\"/>",
        "from": "joe.002@example.com",
        "to": "tom.002@example.com",
        "cc": "charlie.002@example.com",
        "bcc": "ben.002@example.com",
        "body": "Let's meet tomorrow at 10am.<br/>\n<i style=\"font-size: 14pt; color: blue;\">Please re