# html-2-json.ipynb

## Introduction

- HTML to JSON: extract custom tags from an HTML file and save them as a **JSON template** **[INTERMEDIATE STEP]**
- **[ACTUAL REQUIREMENT]** Generate a JSON **[Payload]** file from a JSON template and a JSON data file
- JSON to HTML: render variables based on an HTML template and a JSON payload **[DISPLAY/TEST PURPOSES]**

## Installation 

See [README.md](README.md) for installation instructions.

In [13]:
#!pip install beautifulsoup4

import subprocess
import re
import sys

# Run pip through the interpreter that backs this kernel (`python -m pip`) so
# the package is installed into the environment the notebook imports from,
# rather than whichever `pip` executable happens to be first on PATH.
result = subprocess.run(
    [sys.executable, '-m', 'pip', 'install', 'beautifulsoup4'],
    capture_output=True,
    text=True,
)

# Use regex to replace local path patterns
# This will replace paths like "C:\Users\..." or "/Users/..." with "..."
local_path_pattern = r'([A-Za-z]:\\[^ \n]+)|(/[^ \n]+)'
cleaned_output = re.sub(local_path_pattern, '...', result.stdout)

print(cleaned_output)

# pip writes its errors to stderr; surface them (with the same path scrubbing)
# so a failed install is visible instead of printing an empty result.
if result.returncode != 0:
    print(f"pip exited with status {result.returncode}:")
    print(re.sub(local_path_pattern, '...', result.stderr))




## Common Definitions

In [2]:
import json
from bs4 import BeautifulSoup
import re

# Define constants
EMAIL_TAG = "email-tag"  # HTML attribute name used to mark template elements
OUTPUT_PATH = "output"   # directory prefix for generated files

HTML_TEMPLATE_INPUT_FILE_PATH = "email-template-input.html"
HTML_TEMPLATE_OUTPUT_FILE_PATH = "email-template-output.html"

JSON_TEMPLATE_INPUT_FILE_PATH = f"{OUTPUT_PATH}/email-template-input.json"
JSON_INPUT_FILE_PATH = "email-data-input.json"
JSON_OUTPUT_FILE_PATH = f"{OUTPUT_PATH}/email-data-output.json"

# Template variables are written as {{name}}.
VAR_OPEN_CHAR = "{"
VAR_CLOSE_CHAR = "}"
# Regex-escaped delimiters. re.escape produces the same `\{\{` / `\}\}` text the
# previous f-strings produced, but without relying on the invalid `\{` string
# escape (which triggers a warning on newer Python versions).
VAR_OPEN_SEQUENCE = re.escape(VAR_OPEN_CHAR * 2)
VAR_CLOSE_SEQUENCE = re.escape(VAR_CLOSE_CHAR * 2)
# Non-greedy capture of whatever sits between the delimiters.
TEMPLATE_VAR_PATTERN = rf"{VAR_OPEN_SEQUENCE}(.*?){VAR_CLOSE_SEQUENCE}"

## Convert HTML Template to JSON Template [OPTIONAL]

In [3]:
def extract_nested_tags(tag):
    """Recursively convert ``tag`` into a dict of its marked children.

    Direct children carrying the ``EMAIL_TAG`` attribute become dictionary
    entries keyed by that attribute's value. A tag without such children is
    returned as its inner markup (UTF-8 decoded, surrounding whitespace
    stripped).
    """
    nested = {
        element.get(EMAIL_TAG): extract_nested_tags(element)
        for element in tag.find_all(attrs={EMAIL_TAG: True}, recursive=False)
    }

    if not nested:
        # Leaf element: emit the raw inner HTML instead of a mapping.
        inner_markup = tag.encode_contents().decode("utf-8")
        return str(inner_markup).strip()

    return nested


def extract_email_tags_to_json(file_path):
    """Parse the HTML file at ``file_path`` and return its tag tree as JSON text.

    The root is the first element whose ``EMAIL_TAG`` attribute equals
    ``"email"``. When no such element exists an empty JSON object is returned.
    The result is a string serialized with a 4-space indent.
    """
    with open(file_path, "r", encoding="utf-8") as html_file:
        markup = html_file.read()

    document = BeautifulSoup(markup, "html.parser")
    email_root = document.find(attrs={EMAIL_TAG: "email"})

    extracted = {}
    if email_root:
        extracted[email_root.get(EMAIL_TAG)] = extract_nested_tags(email_root)
    return json.dumps(extracted, indent=4)


def save_json_to_file(json_output, file_path):
    """Write an already-serialized JSON string to ``file_path`` as UTF-8.

    Args:
        json_output: Text to write; it is written as-is, not re-serialized.
        file_path: Destination path. Missing parent directories are created,
            so writing into a not-yet-existing output folder does not fail.
    """
    import os  # local import keeps this helper self-contained

    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, "w", encoding="utf-8") as json_file:
        json_file.write(json_output)

# Build the JSON template from the input HTML template, persist it to
# JSON_TEMPLATE_INPUT_FILE_PATH, and echo it in the cell output.
json_output = extract_email_tags_to_json(HTML_TEMPLATE_INPUT_FILE_PATH)
save_json_to_file(json_output, JSON_TEMPLATE_INPUT_FILE_PATH)
print(json_output)


{
    "email": {
        "email_subject": "<b>Meeting Tomorrow</b>",
        "email_logo": "<img alt=\"Logo\" src=\"../images/{{email_logo}}\" style=\"width: 20%;\"/>",
        "email_from": "{{email_from}}",
        "email_to": "{{email_to}}",
        "email_cc": "{{email_cc}}",
        "email_bcc": "{{email_bcc}}",
        "email_body": "Let's meet tomorrow at 10am.<br/>\n<i style=\"font-size: 14pt; color: blue;\">\n            Please read the report {{email_report_name}}.</i><br/>\n        and remember:\n        <ul>\n<li>Drink H<sub>2</sub>O</li>\n<li>2<sup>8</sup> = 256</li>\n</ul>"
    }
}


## Extract Variables from JSON Template [OPTIONAL]

In [4]:

def find_variables(json_data):
    """Return the set of template variable names used anywhere in ``json_data``.

    Nested dictionaries are searched recursively; string values are scanned
    with ``TEMPLATE_VAR_PATTERN``. Values of any other type are ignored.
    """
    found = set()
    for node in json_data.values():
        if isinstance(node, dict):
            found |= find_variables(node)
        elif isinstance(node, str):
            found.update(re.findall(TEMPLATE_VAR_PATTERN, node))
    return found


def print_variables(variables):
    """Print a header followed by one ``<placeholder> => <name>`` line per variable."""
    opener = VAR_OPEN_CHAR + VAR_OPEN_CHAR
    closer = VAR_CLOSE_CHAR + VAR_CLOSE_CHAR

    print("Variables found:")
    for name in variables:
        print(f"{opener}{name}{closer} => {name}")

# Load the JSON template from JSON_TEMPLATE_INPUT_FILE_PATH.
with open(JSON_TEMPLATE_INPUT_FILE_PATH, "r", encoding="utf-8") as f:
    json_data = json.load(f)

# Collect every template variable referenced in the template and list them.
variables = find_variables(json_data)
print_variables(variables)

Variables found:
{{email_report_name}} => email_report_name
{{email_from}} => email_from
{{email_cc}} => email_cc
{{email_to}} => email_to
{{email_logo}} => email_logo
{{email_bcc}} => email_bcc


## JSON Merge

In [5]:
def read_json_file(file_path):
    """Load the UTF-8 encoded JSON document at ``file_path`` and return the parsed value."""
    with open(file_path, mode="r", encoding="utf-8") as source:
        parsed = json.load(source)
    return parsed


def replace_variables(template, values):
    """Return a copy of ``template`` with template variables substituted.

    Args:
        template: A JSON-like structure. Dictionaries and lists are traversed
            recursively, strings are scanned with ``TEMPLATE_VAR_PATTERN``,
            and any other value is returned unchanged.
        values: Mapping of variable name to replacement value.

    Returns:
        A structure of the same shape as ``template``. Variables missing from
        ``values`` are left in place as their original placeholder text.
        Non-string replacement values (e.g. JSON numbers) are converted with
        ``str()``, because ``re.sub`` requires the callback to return a string.
    """
    if isinstance(template, dict):
        return {
            key: replace_variables(value, values) for key, value in template.items()
        }
    if isinstance(template, list):
        return [replace_variables(item, values) for item in template]
    if isinstance(template, str):

        def _substitute(match):
            name = match.group(1)
            if name not in values:
                # Unknown variable: keep the placeholder untouched.
                return match.group(0)
            replacement = values[name]
            return replacement if isinstance(replacement, str) else str(replacement)

        return re.sub(TEMPLATE_VAR_PATTERN, _substitute, template)
    return template


def merge_json_emails(template_json, values_json):
    """Render the ``"email"`` template once per entry of ``values_json``.

    Returns a dict keyed like ``values_json`` whose values are the template
    with that entry's variables substituted via ``replace_variables``.
    """
    return {
        email_id: replace_variables(template_json["email"], substitutions)
        for email_id, substitutions in values_json.items()
    }


# Main code
# Load the JSON template and the per-email data values.
json_template = read_json_file(JSON_TEMPLATE_INPUT_FILE_PATH)
data_input_json = read_json_file(JSON_INPUT_FILE_PATH)

# Produce one rendered template per entry in the data file.
merged_json = merge_json_emails(json_template, data_input_json)

# Echo the merged payload in the cell output.
print(json.dumps(merged_json, indent=4))

# Persist the same payload to JSON_OUTPUT_FILE_PATH.
save_json_to_file(json.dumps(merged_json, indent=4), JSON_OUTPUT_FILE_PATH)

{
    "email_001": {
        "email_subject": "<b>Meeting Tomorrow</b>",
        "email_logo": "<img alt=\"Logo\" src=\"../images/email-logo-01.png\" style=\"width: 20%;\"/>",
        "email_from": "joe.001@example.com",
        "email_to": "tom.001@example.com",
        "email_cc": "charlie.001@example.com",
        "email_bcc": "ben.001@example.com",
        "email_body": "Let's meet tomorrow at 10am.<br/>\n<i style=\"font-size: 14pt; color: blue;\">\n            Please read the report report-2023-06-08.xlsx.</i><br/>\n        and remember:\n        <ul>\n<li>Drink H<sub>2</sub>O</li>\n<li>2<sup>8</sup> = 256</li>\n</ul>"
    },
    "email_002": {
        "email_subject": "<b>Meeting Tomorrow</b>",
        "email_logo": "<img alt=\"Logo\" src=\"../images/email-logo-02.png\" style=\"width: 20%;\"/>",
        "email_from": "joe.002@example.com",
        "email_to": "tom.002@example.com",
        "email_cc": "charlie.002@example.com",
        "email_bcc": "ben.002@example.com",
        

## Generate HTML files based on Output HTML Template and JSON Data

In [6]:
def read_file(file_path):
    """Return the full text content of ``file_path``, decoded as UTF-8."""
    with open(file_path, mode="r", encoding="utf-8") as source:
        text = source.read()
    return text

def write_to_file(file_path, content):
    """Write ``content`` to ``file_path`` as UTF-8 text, replacing any existing file."""
    with open(file_path, mode="w", encoding="utf-8") as target:
        target.write(content)

def read_json_file(file_path):
    """Parse the UTF-8 JSON file at ``file_path`` and return the resulting object.

    Note: this re-declares the helper of the same name defined in the
    "JSON Merge" cell.
    """
    with open(file_path, mode='r', encoding='utf-8') as source:
        parsed = json.load(source)
    return parsed

def replace_variables_in_html(html_template, variables):
    """Substitute each ``{{name}}`` placeholder in ``html_template``.

    ``variables`` maps variable names to replacement text. Replacements are
    applied one after another, in the mapping's iteration order, using plain
    string replacement.
    """
    rendered = html_template
    for name, replacement in variables.items():
        placeholder = f"{VAR_OPEN_CHAR}{VAR_OPEN_CHAR}{name}{VAR_CLOSE_CHAR}{VAR_CLOSE_CHAR}"
        rendered = rendered.replace(placeholder, replacement)
    return rendered

def generate_html_files_from_json(html_template, json_data):
    """Render ``html_template`` for every entry of ``json_data`` and save it.

    Each entry is written to ``<OUTPUT_PATH>/<entry key>.html``.
    """
    for email_id, substitutions in json_data.items():
        rendered_html = replace_variables_in_html(html_template, substitutions)
        write_to_file(f"{OUTPUT_PATH}/{email_id}.html", rendered_html)

# Load the output HTML template and the merged JSON payload.
# NOTE: this rebinds `json_data`, which an earlier cell used for the JSON template.
html_template = read_file(HTML_TEMPLATE_OUTPUT_FILE_PATH)
json_data = read_json_file(JSON_OUTPUT_FILE_PATH)

# Write one HTML file per entry of the payload.
generate_html_files_from_json(html_template, json_data)