In [1]:
from bs4 import BeautifulSoup
import os

def parse_local_html(file_path):
    """
    Reads a local HTML file and parses it using BeautifulSoup.

    Parameters:
        file_path (str): Path to the local HTML file.

    Returns:
        BeautifulSoup object: Parsed HTML ready for analysis.
    """
    # Open the HTML file
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()
    
    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    return soup

# Example usage
file_path = os.path.join("datasets", 'soc_codes.html')
soup = parse_local_html(file_path)
soup.prettify()

"<ul>\n <li>\n  11-1000\xa0 Top Executives\n  <ul>\n   <li>\n    11-1010\xa0 Chief Executives\n    <ul>\n     <li>\n      11-1011 Chief Executives\n     </li>\n    </ul>\n   </li>\n   <li>\n    11-1020\xa0 General and Operations Managers\n    <ul>\n     <li>\n      11-1021 General and Operations Managers\n     </li>\n    </ul>\n   </li>\n   <li>\n    11-1030\xa0 Legislators\n    <ul>\n     <li>\n      11-1031 Legislators\n     </li>\n    </ul>\n   </li>\n  </ul>\n </li>\n <li>\n  11-2000\xa0 Advertising, Marketing, Promotions, Public Relations, and Sales\n    Managers\n  <ul>\n   <li>\n    11-2010\xa0 Advertising and Promotions Managers\n    <ul>\n     <li>\n      11-2011 Advertising and Promotions Managers\n     </li>\n    </ul>\n   </li>\n   <li>\n    11-2020\xa0 Marketing and Sales Managers\n    <ul>\n     <li>\n      11-2021 Marketing Managers\n     </li>\n     <li>\n      11-2022 Sales Managers\n     </li>\n    </ul>\n   </li>\n   <li>\n    11-2030\xa0 Public Relations and Fundrai

In [2]:
classifications = {
    "11-0000": "Management Occupations",
    "13-0000": "Business and Financial Operations Occupations",
    "15-0000": "Computer and Mathematical Occupations",
    "17-0000": "Architecture and Engineering Occupations",
    "19-0000": "Life, Physical, and Social Science Occupations",
    "21-0000": "Community and Social Service Occupations",
    "23-0000": "Legal Occupations",
    "25-0000": "Educational Instruction and Library Occupations",
    "27-0000": "Arts, Design, Entertainment, Sports, and Media Occupations",
    "29-0000": "Healthcare Practitioners and Technical Occupations",
    "31-0000": "Healthcare Support Occupations",
    "33-0000": "Protective Service Occupations",
    "35-0000": "Food Preparation and Serving Related Occupations",
    "37-0000": "Building and Grounds Cleaning and Maintenance Occupations",
    "39-0000": "Personal Care and Service Occupations",
    "41-0000": "Sales and Related Occupations",
    "43-0000": "Office and Administrative Support Occupations",
    "45-0000": "Farming, Fishing, and Forestry Occupations",
    "47-0000": "Construction and Extraction Occupations",
    "49-0000": "Installation, Maintenance, and Repair Occupations",
    "51-0000": "Production Occupations",
    "53-0000": "Transportation and Material Moving Occupations",
    "55-0000": "Military Specific Occupations"
}
cat1_list = list(classifications.values())

In [3]:
output = {}

for index, ul1 in enumerate(soup.find_all('ul', recursive=False)):
    cat1 = cat1_list[index]
    for li1 in ul1.find_all('li', recursive=False):
        text2 = li1.find(text=True, recursive=False).strip()
        cat2 = " ".join(text2.split()[1:]).strip()
        code = text2.split()[0].strip()
        output[code] = [cat1, cat2, "Not assigned"]
        for li2 in li1.find("ul").find_all("li", recursive=False):
            text3 = li2.find(text=True, recursive=False).strip()
            cat3 = " ".join(text3.split()[1:]).strip()
            code = text3.split()[0].strip()
            output[code] = [cat1, cat2, cat3]
            codes = [x.find(text=True, recursive=False).split()[0].strip() for x in li2.find("ul").find_all('li', recursive=False)]
            for code in codes:
                output[code] = [cat1, cat2, cat3]

output

  text2 = li1.find(text=True, recursive=False).strip()
  text3 = li2.find(text=True, recursive=False).strip()
  codes = [x.find(text=True, recursive=False).split()[0].strip() for x in li2.find("ul").find_all('li', recursive=False)]


{'11-1000': ['Management Occupations', 'Top Executives', 'Not assigned'],
 '11-1010': ['Management Occupations', 'Top Executives', 'Chief Executives'],
 '11-1011': ['Management Occupations', 'Top Executives', 'Chief Executives'],
 '11-1020': ['Management Occupations',
  'Top Executives',
  'General and Operations Managers'],
 '11-1021': ['Management Occupations',
  'Top Executives',
  'General and Operations Managers'],
 '11-1030': ['Management Occupations', 'Top Executives', 'Legislators'],
 '11-1031': ['Management Occupations', 'Top Executives', 'Legislators'],
 '11-2000': ['Management Occupations',
  'Advertising, Marketing, Promotions, Public Relations, and Sales Managers',
  'Not assigned'],
 '11-2010': ['Management Occupations',
  'Advertising, Marketing, Promotions, Public Relations, and Sales Managers',
  'Advertising and Promotions Managers'],
 '11-2011': ['Management Occupations',
  'Advertising, Marketing, Promotions, Public Relations, and Sales Managers',
  'Advertising and

In [4]:
output["00-0000"] = ["Not assigned"] * 3 
output["99-9999"] = ["Insufficient info"] * 3
output["45-2092"] = ['Farming, Fishing, and Forestry Occupations','Agricultural Workers', 'Miscellaneous Agricultural Workers']
output["45-2093"] = ['Farming, Fishing, and Forestry Occupations','Agricultural Workers', 'Miscellaneous Agricultural Workers']
output["45-2099"] = ['Farming, Fishing, and Forestry Occupations','Agricultural Workers', 'Miscellaneous Agricultural Workers']
output["25-2010"] = ['Educational Instruction and Library Occupations','Preschool, Elementary, Middle, Secondary, and Special Education Teachers','Preschool and Kindergarten Teachers']

In [5]:
import pickle

with open(os.path.join("datasets", 'code_to_description.pkl'), 'wb') as file:
    pickle.dump(output, file)