# html-2-json.ipynb

In [None]:
!pip install -U beautifulsoup4

In [11]:
import json
from bs4 import BeautifulSoup

# Define constants
# Name of the HTML attribute whose presence marks an element for extraction;
# the attribute's value becomes the key in the generated JSON.
EMAIL_TAG = "email-tag"
# Path of the HTML document that is read and parsed.
HTML_FILE_PATH = "email.html"
# Output path: the input path up to its last '.', with '.json' appended.
JSON_FILE_PATH = HTML_FILE_PATH.rsplit('.', 1)[0] + '.json'

# Function to extract nested tags
def extract_nested_tags(tag):
    """Recursively convert a marked element into a nested dict or a string.

    Only the *direct* children of ``tag`` that carry the ``EMAIL_TAG``
    attribute are considered. Each such child contributes one entry, keyed
    by the value of its ``EMAIL_TAG`` attribute; if two siblings share the
    same value, the later one replaces the earlier one.

    Args:
        tag: A BeautifulSoup element to inspect.

    Returns:
        A dict mapping each marked child's attribute value to its own
        extracted content, or -- when ``tag`` has no marked direct
        children -- the element's inner HTML as a whitespace-stripped string.
    """
    marked_children = tag.find_all(attrs={EMAIL_TAG: True}, recursive=False)
    nested = {
        node.get(EMAIL_TAG): extract_nested_tags(node)
        for node in marked_children
    }

    # Leaf element: nothing marked directly beneath it, so emit its markup.
    if not nested:
        inner_html = tag.encode_contents().decode('utf-8')
        return inner_html.strip()

    return nested

# Function to extract email tags to JSON
def extract_email_tags_to_json(file_path):
    """Parse an HTML file and serialize its marked elements as JSON text.

    The file is read as UTF-8 and parsed with the ``html.parser`` backend.
    The first element whose ``EMAIL_TAG`` attribute equals ``"email"`` is
    used as the root of the extracted structure.

    Args:
        file_path: Path of the HTML file to read.

    Returns:
        A JSON string indented by 4 spaces. It contains a single top-level
        key (the root's attribute value) mapping to the extracted content,
        or an empty JSON object when no root element is found.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        markup = source.read()

    document = BeautifulSoup(markup, 'html.parser')
    root = document.find(attrs={EMAIL_TAG: "email"})

    # No root element: produce an empty JSON object.
    if not root:
        return json.dumps({}, indent=4)

    extracted = {root.get(EMAIL_TAG): extract_nested_tags(root)}
    return json.dumps(extracted, indent=4)

# Function to save JSON to file
def save_json_to_file(json_output, file_path):
    """Write a JSON string to a file, replacing any existing content.

    Args:
        json_output: The already-serialized JSON text to write.
        file_path: Destination path; the file is written as UTF-8.
    """
    with open(file_path, mode='w', encoding='utf-8') as destination:
        destination.write(json_output)

# Main code to run the functions:
# convert the HTML file to JSON text, write that text next to the input
# (same base name, '.json' extension), then echo it as the cell output.
json_output = extract_email_tags_to_json(HTML_FILE_PATH)
save_json_to_file(json_output, JSON_FILE_PATH)
print(json_output)


{
    "email": {
        "subject": "<b>Meeting Tomorrow</b>",
        "email-logo": "<img alt=\"Logo\" src=\"./email-logo-01.png\" style=\"width: 20%;\"/>",
        "from": "jane.doe@example.com",
        "to": "john.doe@example.com",
        "cc": "manager@example.com",
        "bcc": "hr@example.com",
        "body": "<br/>Let's meet tomorrow at 10am.<br/>\n<i style=\"font-size: 14pt; color: blue;\">Please read the presentation from last week.</i>"
    }
}
