In [71]:
import requests as re
from bs4 import BeautifulSoup

In [72]:
BASE_URL = 'https://www.dol.gov'

__Get HTML__

In [73]:
html = get_html(f"{BASE_URL}/agencies/whd/minimum-wage/state")
soup = BeautifulSoup(html, "html.parser")

In [74]:
cat = soup.find('div',{"id": "states"})
state = cat.find_all('div')

In [88]:
def get_html(url):
    """Fetch HTML content from a given URL."""
    response = re.get(url)
    response.raise_for_status()  # Raise an error for bad responses
    return response.text

def parse_html(html):
    """Parse HTML and extract relevant data."""
    data = []
    for link in html.find_all('a', href=True):
        data.append({
            'text': link.get_text(strip=True),
            'url': BASE_URL + link['href'] if link['href'].startswith('/') else link['href']
        })
    
    return data

def replace_href(text, link):
    """Replace hrefs in the data with full URLs."""
    if not link:
        return
    url = link[0]['url']
    href_html = get_html(url)
    href_soup = BeautifulSoup(href_html, "html.parser")
    if '#' in url:
        div = soup.find('div', {'id': url[url.index('#') + 1:]})
        text_to_replace = div.sup.text.strip()
        text = text.replace(text_to_replace, href_soup.get_text(strip=True))
        return text
    

In [76]:
footnote = re.get("https://www.dol.gov/agencies/whd/minimum-wage/state#footnote")
soup2 = BeautifulSoup(footnote.text, "html.parser")
soup2.find('div', {"id": "footnote"})

<div id="footnote">
<p><sup>1 </sup>The overtime premium rate is one and one-half times the employee's regular rate, unless otherwise specified.</p>
</div>

In [89]:
countries = {}

for state_name in state:
    link = parse_html(state_name)
    countries[state_name.h2.text.capitalize()] = {
        'acronym': state_name.get('id').lower(),
        'text': ' '.join(s.text.replace('\n', ' ') for s in state_name.h2.find_next_siblings()),
        'href': link
    }
    countries[state_name.h2.text.capitalize()]['text'] = replace_href(countries[state_name.h2.text.capitalize()]['text'], link)

    

    

In [90]:
countries

{'Alabama': {'acronym': 'al', 'text': None, 'href': []},
 'Alaska': {'acronym': 'ak',
  'text': None,
  'href': [{'text': '1',
    'url': 'https://www.dol.gov/agencies/whd/minimum-wage/state#footnote'}]},
 'American samoa': {'acronym': 'as',
  'text': None,
  'href': [{'text': 'special minimum wage rates',
    'url': 'https://www.dol.gov/agencies/whd/state/minimum-wage/american-samoa'}]},
 'Arizona': {'acronym': 'az', 'text': None, 'href': []},
 'Arkansas': {'acronym': 'ar',
  'text': None,
  'href': [{'text': '1',
    'url': 'https://www.dol.gov/agencies/whd/minimum-wage/state#footnote'}]},
 'California': {'acronym': 'ca',
  'text': None,
  'href': [{'text': '1',
    'url': 'https://www.dol.gov/agencies/whd/minimum-wage/state#footnote'}]},
 'Colorado': {'acronym': 'co',
  'text': None,
  'href': [{'text': '1',
    'url': 'https://www.dol.gov/agencies/whd/minimum-wage/state#footnote'}]},
 'Connecticut': {'acronym': 'ct',
  'text': None,
  'href': [{'text': '1',
    'url': 'https://www.