In [1]:
import requests
from bs4 import BeautifulSoup
import re

__1968 - 1981__

In [2]:
import pandas as pd


In [None]:
page = requests.get("https://www.dol.gov/agencies/whd/state/minimum-wage/history")
soup = BeautifulSoup(page.content, 'html.parser')
tabelas = soup.find_all('table')

df_wage = []
for i, tabela in enumerate(tabelas):
    linhas = tabela.find_all('tr')
    cabecalho = linhas[0]
    anos = [th.text for th in cabecalho.find_all('th')[1:]]
    estados = []
    for estado in linhas[1:]:
        estados.append([td.text for td in estado.find_all('td')])
    df = pd.DataFrame(estados, columns=['state'] + anos)
    df_wage.append(df)
df = pd.concat(df_wage, ignore_index=True)

footnotes = soup.find('div', id='content')
footnotes.find_all('p')
list_footnotes = []
for p in footnotes.find_all('p'):
    if re.match(r'^[\[\(].[\]\)]', p.text):
        id_footnote = p.text.strip().split(' ')[0]
        text_footnote = ' '.join(p.text.strip().split(' ')[1:]).replace('- ','')
        list_footnotes.append((id_footnote, text_footnote))
footnotes_dict = {id_: text for id_, text in list_footnotes}
columns_to_adjust = [col for col in df.columns if  not col.isnumeric() and col != 'Estado']
footnote_year_bridge = {}
for key, _ in footnotes_dict.items():
    for col in columns_to_adjust:
        if key in col:
            footnote_year_bridge[col.replace(key, '').strip()] = key
            df = df.rename(columns={col: col.replace(key, '').strip()})

df_melted = df.melt(id_vars=['state'], var_name='year', value_name='minimal_wage').dropna()
df_melted['year'] = df_melted['year'].astype(int)
df_melted['minimal_wage'] = df_melted['minimal_wage'].str.replace('$', '')
df_melted['id'] = df_melted.index + 1

df_melted['minimal_wage'] = df_melted['minimal_wage'].str.replace(r'[\[\(].*?[\]\)]', '', regex=True)
df_melted['minimal_wage'] = df_melted['minimal_wage'].mask(
    df_melted['minimal_wage'].isin(['...', 'NA']), 
    pd.NA
)
if 'notes' not in df_melted.columns:
    df_melted['notes'] = pd.NA

if 'frequency' not in df_melted.columns:
    df_melted['frequency'] = pd.NA


def add_leading_zero(value):
    value = value.strip()
    if value.startswith('.'):
        return '0' + value
    return value

# Fun√ß√£o para processar valores com m√∫ltiplas taxas
def process_multiple_rates(row):
    wage = row['minimal_wage']
    
    if pd.notna(wage) and isinstance(wage, str):
        original_wage = wage
        
        # 1. Detectar e remover frequency markers
        frequency = None
        if '/day' in wage:
            frequency = 2
            wage = wage.replace('/day', '').strip()
        elif '/wk' in wage:
            frequency = 3
            wage = wage.replace('/wk', '').strip()
        
        # 2. Detectar m√∫ltiplos valores com regex mais robusto
        # Padr√£o: captura valores monet√°rios separados por -, &, /, ou espa√ßos
        pattern = r'\$?\d+\.?\d*'
        matches = re.findall(pattern, wage)
        
        if len(matches) >= 2:
            first_value = add_leading_zero(matches[0])
            second_value = add_leading_zero(matches[1])
            
            row['minimal_wage'] = first_value
            note = f"Or can be {second_value}, this reflects which rates differ by industry, occupation or other factors, as established under a wage-board type law"
            row['notes'] = note
        elif len(matches) == 1:
            row['minimal_wage'] = add_leading_zero(matches[0])
        else:
            row['minimal_wage'] = add_leading_zero(wage)
        
        # 3. Atualizar frequency
        if frequency is not None:
            row['frequency'] = frequency
    
    # Garantir valor padr√£o para frequency
    if pd.isna(row['frequency']):
        row['frequency'] = 1
    
    return row
# Aplicar a fun√ß√£o
df_melted = df_melted.apply(process_multiple_rates, axis=1)
df_melted['minimal_wage'] = df_melted['minimal_wage'].astype(str).str.extract(r'([\d.]+)', expand=False)

def notes_for_null_wage(row):
    if pd.isna(row['minimal_wage']) and pd.isna(row['notes']):
        return "This state utilizes the federal minimum wage"
    return row['notes']

df_melted['notes'] = df_melted.apply(notes_for_null_wage, axis=1)  
df_melted['minimal_wage'] = pd.to_numeric(df_melted['minimal_wage'], errors='coerce')
df_final = df_melted[['id', 'state', 'year', 'minimal_wage', 'frequency','notes']]


In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

def extrair_tabela_tipped_minimum_wage(year):
    url = f'https://www.dol.gov/agencies/whd/state/minimum-wage/tipped/{year}'
    tip_test = requests.get(url)
    if tip_test.status_code != 200:
        print(f"‚ùå Falha ao obter dados de {year} (status {tip_test.status_code})")
        return pd.DataFrame()
    
    tip_soup = BeautifulSoup(tip_test.content, 'html.parser')

    # 1. EXTRAIR FOOTNOTES
    footnotes_dict = {}
    for a_tag in tip_soup.find_all('a', attrs={'name': lambda x: x and x.startswith('foot')}):
        name = a_tag.get('name')
        parent_p = a_tag.find_parent('p')
        if parent_p:
            footnote_num = a_tag.get_text(strip=True)
            texto_completo = ' '.join(parent_p.get_text().split())
            texto_nota = texto_completo.replace(footnote_num, '', 1).strip()
            footnotes_dict[name] = texto_nota

    # 2. PROCESSAR TABELA
    tip_table = tip_soup.find('table')
    if not tip_table:
        print(f"‚ö†Ô∏è Nenhuma tabela encontrada em {year}")
        return pd.DataFrame()

    tip_linhas = tip_table.find_all('tr')[1:]
    header_order = ['jurisdiction', 'combinedrate', 'tipcredit', 'cashwage', 'definition']

    def processar_celula_valor(td_element, column_name):
        if not td_element:
            return None, None, []
        footnote_refs = []
        for link in td_element.find_all('a', href=True):
            href = link.get('href')
            if href:
                match = re.search(r'#(foot\d+)', href)
                if match:
                    footnote_refs.append(match.group(1))
        td_html = str(td_element)
        soup_copy = BeautifulSoup(td_html, 'html.parser')
        for link in soup_copy.find_all('a'):
            link.decompose()
        valor = ' '.join(soup_copy.get_text().split())
        footnote_texts = []
        for ref in footnote_refs:
            if ref in footnotes_dict:
                footnote_texts.append(f"[{column_name}] {footnotes_dict[ref]}")
        footnote_text = ' ; '.join(footnote_texts) if footnote_texts else None
        return valor if valor else None, footnote_text, footnote_refs

    def processar_jurisdiction(td_element):
        if not td_element:
            return None, None, None
        footnote_refs = []
        for link in td_element.find_all('a', href=True):
            href = link.get('href')
            if href:
                match = re.search(r'#(foot\d+)', href)
                if match:
                    footnote_refs.append(match.group(1))
        td_html = str(td_element)
        soup_copy = BeautifulSoup(td_html, 'html.parser')
        for link in soup_copy.find_all('a'):
            link.decompose()
        strong_tag = soup_copy.find('strong')
        if strong_tag:
            texto = ' '.join(strong_tag.get_text().split())
            nome_limpo = re.sub(r'[^a-zA-Z0-9\s]', '', texto)
        else:
            nome_limpo = soup_copy.get_text(strip=True)
        extra_text = soup_copy.get_text().replace(nome_limpo, '').strip()
        footnote_texts = [footnotes_dict[ref] for ref in footnote_refs if ref in footnotes_dict]
        footnote_text = ' ; '.join(footnote_texts) if footnote_texts else None
        return nome_limpo, footnote_text, extra_text

    dados_tabela = []
    ultima_jurisdiction = None
    ultima_footnote = None
    for tr in tip_linhas:
        row_data = {}
        tds = tr.find_all('td')
        if tds and tds[0].get('colspan'):
            continue
        td_jurisdiction = tr.find('td', headers='jurisdiction')
        todas_notas = []
        if td_jurisdiction and td_jurisdiction.find('strong'):
            jurisdiction_limpa, footnote_text, extra_text = processar_jurisdiction(td_jurisdiction)
            ultima_jurisdiction = jurisdiction_limpa
            ultima_footnote = footnote_text
            row_data['jurisdiction'] = jurisdiction_limpa
            if footnote_text:
                todas_notas.append(footnote_text)
            if extra_text:
                todas_notas.append(extra_text)
        else:
            if ultima_jurisdiction:
                row_data['jurisdiction'] = ultima_jurisdiction
                if ultima_footnote:
                    todas_notas.append(ultima_footnote)
        for td in tds:
            header_name = td.get('headers')[0] if td.get('headers') else None
            if not header_name:
                header_name = header_order[tds.index(td)] if len(tds) == 4 else header_order[tds.index(td) - 1]
            valor_limpo, footnote_text, _ = processar_celula_valor(td, header_name)
            if header_name != 'jurisdiction':
                row_data[header_name] = valor_limpo
            if footnote_text:
                todas_notas.append(footnote_text)
        if todas_notas:
            row_data['notes'] = ' ; '.join(todas_notas)
        if row_data and any(v for k, v in row_data.items() if k not in ['jurisdiction', 'notes']):
            row_data['year'] = year
            dados_tabela.append(row_data)

    df_tips = pd.DataFrame(dados_tabela)
    return df_tips


# === LOOP PELOS ANOS 2003‚Äì2024 ===
dfs = []
for year in range(2024, 2025):
    df_year = extrair_tabela_tipped_minimum_wage(year)
    if not df_year.empty:
        dfs.append(df_year)

# Concatenar todos os DataFrames
df_tips = pd.concat(dfs, ignore_index=True)

# Mostrar resultado final
print(f"\n‚úÖ Total de registros extra√≠dos: {len(df_tips)}")

def process_tip_wages(row):
    """
    Processa valores de sal√°rio tipped, lidando com:
    - M√∫ltiplos valores separados (-, &, /, espa√ßos)
    - Textos descritivos movidos para notes
    - Porcentagens e valores especiais
    """
    
    def is_monetary_value(value):
        """Verifica se √© um valor monet√°rio v√°lido"""
        if pd.isna(value) or not isinstance(value, str):
            return False
        # Remove espa√ßos e verifica se tem formato de dinheiro
        clean = value.strip()
        # Padr√£o: pode ter $ e n√∫meros com ponto decimal
        return bool(re.match(r'^\$?\d+\.?\d*$', clean))
    
    def is_percentage(value):
        """Verifica se √© uma porcentagem"""
        if pd.isna(value) or not isinstance(value, str):
            return False
        return '%' in value or value.lower() in ['50%', 'to 50%']
    
    def extract_multiple_values(value):
        """Extrai m√∫ltiplos valores monet√°rios de uma string"""
        if pd.isna(value) or not isinstance(value, str):
            return None
        
        # Procurar por m√∫ltiplos valores monet√°rios
        pattern = r'\$?\d+\.?\d*'
        matches = re.findall(pattern, value)
        
        # Filtrar apenas valores que parecem dinheiro (com ou sem $)
        valid_matches = [m for m in matches if re.match(r'^\$?\d+\.\d+$', m)]
        
        return valid_matches if len(valid_matches) > 1 else None
    
    def move_text_to_notes(column_name, value, row):
        """Move texto descritivo para notes"""
        if pd.isna(value) or not isinstance(value, str):
            return value, row
        
        # Se n√£o √© valor monet√°rio nem porcentagem, √© texto descritivo
        if not is_monetary_value(value) and not is_percentage(value):
            # Adicionar √† nota
            note_text = f"[{column_name}] {value}"
            
            if pd.notna(row.get('notes')) and row['notes'] != 'Missing value':
                row['notes'] = f"{row['notes']} ; {note_text}"
            else:
                row['notes'] = note_text
            
            return None, row  # Limpar o valor original
        
        return value, row
    
    # Processar cada coluna de valor
    for col in ['combinedrate', 'tipcredit', 'cashwage']:
        if col not in row:
            continue
            
        value = row[col]
        
        if pd.isna(value) or value == 'Missing value':
            continue
        
        # 1. Verificar se tem m√∫ltiplos valores
        multiple_values = extract_multiple_values(value)
        
        if multiple_values:
            # Tem m√∫ltiplos valores - usar o primeiro e criar nota
            first_value = multiple_values[0]
            if not first_value.startswith('$'):
                first_value = f'${first_value}'
            
            row[col] = first_value
            
            # Criar nota com os valores alternativos
            other_values = ', '.join(multiple_values[1:])
            note_text = f"[{col}] Alternative rate(s): {other_values}"
            
            if pd.notna(row.get('notes')) and row['notes'] != 'Missing value':
                row['notes'] = f"{row['notes']} ; {note_text}"
            else:
                row['notes'] = note_text
        
        # 2. Se n√£o √© valor monet√°rio nem porcentagem, mover para notes
        else:
            value, row = move_text_to_notes(col, value, row)
            row[col] = value
    
    return row

# Aplicar a fun√ß√£o
df_tips = df_tips.apply(process_tip_wages, axis=1)
df_tips[['combinedrate', 'tipcredit', 'cashwage']] = df_tips[['combinedrate', 'tipcredit', 'cashwage']].apply(lambda x: x.str.replace('$', '', regex=False))

def convert_with_context(value, column_name, row):
    """Converte e adiciona tipo na coluna + nota quando necess√°rio"""
    if pd.isna(value):
        return None, None, row
    
    if not isinstance(value, str):
        return float(value) if isinstance(value, (int, float)) else None, 'exact', row
    
    original = value.strip()
    value = original.replace('$', '')
    
    if value.lower() in ['not specified', 'missing value', '']:
        return None, None, row
    
    # Porcentagem
    if '%' in value:
        match = re.search(r'(\d+\.?\d*)\s*%', value)
        if match:
            note = f"[{column_name}] Original value: {original}"
            if pd.notna(row.get('notes')) and row['notes'] != 'Missing value':
                row['notes'] = f"{row['notes']} ; {note}"
            else:
                row['notes'] = note
            return float(match.group(1)), 'percentage', row
    
    # Range
    range_patterns = {
        'up to': r'up to\s+(\d+\.?\d*)',
        'more than': r'more than\s+(\d+\.?\d*)',
        'at least': r'at least\s+(\d+\.?\d*)'
    }
    
    for range_type, pattern in range_patterns.items():
        match = re.search(pattern, value, re.IGNORECASE)
        if match:
            note = f"[{column_name}] {range_type.capitalize()} {match.group(1)}"
            if pd.notna(row.get('notes')) and row['notes'] != 'Missing value':
                row['notes'] = f"{row['notes']} ; {note}"
            else:
                row['notes'] = note
            return float(match.group(1)), 'range', row
    
    # Exato
    try:
        return float(value), 'exact', row
    except ValueError:
        return None, None, row

# Aplicar
def process_with_types(row):
    for col in ['combinedrate', 'tipcredit', 'cashwage']:
        if col in row:
            value, value_type, row = convert_with_context(row[col], col, row)
            row[col] = value
            row[f'{col}_type'] = value_type
    return row

df_tips = df_tips.apply(process_with_types, axis=1)
df_tips


‚úÖ Total de registros extra√≠dos: 54


Unnamed: 0,jurisdiction,combinedrate,tipcredit,cashwage,definition,notes,year,combinedrate_type,tipcredit_type,cashwage_type
0,FEDERAL,7.25,5.12,2.13,More than $30,: Fair Labor Standards Act (FLSA),2024,exact,exact,exact
1,Minnesota,,,,Large employer: annual gross revenue of at lea...,Minnesota. A large employer means an enterpris...,2024,,,
2,Minnesota,,,,Small employer: annual gross revenue of less t...,Minnesota. A large employer means an enterpris...,2024,,,
3,Montana,,,,"Business with gross annual sales over $110,000",,2024,,,
4,Montana,,,,Business not covered by the Fair Labor Standar...,,2024,,,
5,Arizona,14.35,3.0,11.35,Not specified,,2024,exact,exact,exact
6,Arkansas,11.0,8.37,2.63,Not specified,,2024,exact,exact,exact
7,Colorado,3.02,,11.4,More than $30,,2024,exact,,exact
8,Connecticut,,9.31,,"Hotel, restaurant",Connecticut. The Connecticut minimum wage is a...,2024,,exact,
9,Connecticut,,7.46,,Bartenders who customarily receive tips,Connecticut. The Connecticut minimum wage is a...,2024,,exact,


In [5]:
"""
Scraper para dados de Youth Employment Rules (Age Certificates)
"""
import requests
import pandas as pd
import re
from bs4 import BeautifulSoup
from typing import Dict, List
import sys
sys.path.append('../..')


class YouthEmploymentScraper:
    """Classe para extrair dados de certificados de idade/emprego juvenil"""
    
    def __init__(self, url: str = "https://www.dol.gov/agencies/whd/state/age-certificates"):
        self.url = url
        self.soup = None
        self.footnotes_dict = {}
        self.year = 2024  # Padr√£o, pode ser extra√≠do do conte√∫do
    
    def fetch_page(self) -> bool:
        """Busca a p√°gina HTML"""
        try:
            response = requests.get(self.url)
            response.raise_for_status()
            self.soup = BeautifulSoup(response.content, 'html.parser')
            return True
        except requests.RequestException as e:
            print(f"‚ùå Erro ao buscar p√°gina: {e}")
            return False
    
    def extract_footnotes(self) -> Dict[str, str]:
        """Extrai footnotes da p√°gina"""
        if not self.soup:
            return {}
        
        footnotes = {}
        
        # Buscar por links de footnote na p√°gina
        for a_tag in self.soup.find_all('a', href=lambda x: x and x.startswith('#')):
            footnote_id = a_tag.get_text(strip=True)
            if footnote_id and re.match(r'^\d+$', footnote_id):
                # Buscar o texto correspondente
                target_id = a_tag.get('href').replace('#', '')
                target = self.soup.find(attrs={'id': target_id})
                if target:
                    footnote_text = target.get_text(strip=True)
                    footnotes[footnote_id] = footnote_text
        
        # Se n√£o encontrou pelos IDs, buscar no texto das footnotes
        if not footnotes:
            # Procurar se√ß√£o "Footnotes:"
            text = self.soup.get_text()
            if 'Footnotes:' in text:
                footnote_section = text.split('Footnotes:')[1]
                # Padr√£o: [n√∫mero] texto at√© pr√≥ximo [n√∫mero]
                pattern = r'\[(\d+)\]\s*([^\[]+)'
                matches = re.findall(pattern, footnote_section)
                for num, text in matches:
                    footnotes[num] = text.strip()
        
        self.footnotes_dict = footnotes
        return footnotes
    
    def extract_table_data(self) -> List[Dict]:
        """Extrai dados da tabela"""
        if not self.soup:
            return []
        
        table = self.soup.find('table')
        if not table:
            print("‚ö†Ô∏è Tabela n√£o encontrada")
            return []
        
        rows = table.find_all('tr')[1:]  # Pular cabe√ßalho
        data = []
        print(rows)
        for row in rows:
            cells = row.find_all(['td', 'th'])
            
            if len(cells) < 2:
                continue
            
            # Primeira c√©lula: Estado
            state_cell = cells[0]
            state_link = state_cell.find('a')
            
            if not state_link:
                continue
            
            state_name = state_link.get_text(strip=True)
            
            # Extrair footnotes do estado
            state_footnotes = []
            for link in state_cell.find_all('a', href=lambda x: x and x.startswith('#')):
                footnote_ref = link.get_text(strip=True)
                if re.match(r'^\d+$', footnote_ref):
                    state_footnotes.append(footnote_ref)
            
            # Processar colunas de certificados
            # Estrutura esperada: State | Employment Cert | Age Cert | Idades...
            
            row_data = {
                'state': state_name,
                'year': self.year,
                'footnote_refs': state_footnotes
            }
            
            # C√©lulas restantes
            cert_index = 1
            age_ranges = []
            
            for i, cell in enumerate(cells[1:], start=1):
                cell_text = cell.get_text(strip=True)
                
                # Identificar tipo de c√©lula pela posi√ß√£o ou conte√∫do
                if i == 1:  # Employment certificate
                    row_data['has_employment_cert'] = bool(cell_text and cell_text != 'No provision')
                    row_data['employment_cert_type'] = cell_text if cell_text else None
                
                elif i == 2:  # Age certification
                    row_data['has_age_cert'] = bool(cell_text and cell_text != 'No provision')
                    row_data['age_cert_type'] = cell_text if cell_text else None
                
                else:  # Idades
                    if cell_text:
                        age_ranges.append(cell_text)
            
            # Processar faixas et√°rias
            if age_ranges:
                row_data['age_ranges'] = ' ; '.join(age_ranges)
                
                # Tentar extrair idade m√≠nima e m√°xima
                ages = []
                for age_text in age_ranges:
                    # Extrair n√∫meros
                    nums = re.findall(r'\d+', age_text)
                    ages.extend([int(n) for n in nums])
                
                if ages:
                    row_data['age_min'] = min(ages)
                    row_data['age_max'] = max(ages)
            
            # Determinar se √© obrigat√≥rio (M), pedido (R) ou pr√°tica (P)
            all_text = ' '.join([c.get_text() for c in cells])
            if '(M)' in all_text or 'Mandated' in all_text:
                row_data['is_mandatory'] = True
            elif '(R)' in all_text or 'Request' in all_text:
                row_data['is_mandatory'] = False
            elif '(P)' in all_text or 'Practice' in all_text:
                row_data['is_mandatory'] = False
            else:
                row_data['is_mandatory'] = None
            
            data.append(row_data)
        
        return data
    
    def process_footnotes(self, data: List[Dict]) -> List[Dict]:
        """Adiciona texto dos footnotes aos registros"""
        for row in data:
            footnote_texts = []
            for ref in row.get('footnote_refs', []):
                if ref in self.footnotes_dict:
                    footnote_texts.append(self.footnotes_dict[ref])
            
            if footnote_texts:
                row['rule_description'] = ' | '.join(footnote_texts)
            else:
                row['rule_description'] = None
            
            # Remover footnote_refs (j√° processado)
            if 'footnote_refs' in row:
                del row['footnote_refs']
        
        return data
    
    def scrape(self) -> pd.DataFrame:
        """Executa o scraping completo"""
        print("üîç Iniciando scraping de Youth Employment Rules...")
        
        if not self.fetch_page():
            return pd.DataFrame()
        
        # Extrair ano
        self.year = 2024
        print(f"   üìÖ Ano dos dados: {self.year}")
        
        # Extrair footnotes
        print("üìù Extraindo footnotes...")
        self.extract_footnotes()
        print(f"   ‚úì {len(self.footnotes_dict)} footnotes encontrados")
        
        # Extrair tabela
        print("üìä Extraindo dados da tabela...")
        data = self.extract_table_data()
        print(f"   ‚úì {len(data)} estados/territ√≥rios encontrados")
        
        if not data:
            return pd.DataFrame()
        
        # Processar footnotes
        print("üîó Vinculando footnotes...")
        data = self.process_footnotes(data)
        
        # Criar DataFrame
        df = pd.DataFrame(data)
        
        # Adicionar colunas faltantes
        if 'age_min' not in df.columns:
            df['age_min'] = None
        if 'age_max' not in df.columns:
            df['age_max'] = None
        if 'is_mandatory' not in df.columns:
            df['is_mandatory'] = None
        
        # Determinar certificate_type baseado nas colunas
        def determine_cert_type(row):
            if row.get('has_employment_cert'):
                return 'Employment Certificate'
            elif row.get('has_age_cert'):
                return 'Age Certificate'
            else:
                return 'No Certificate Required'
        
        df['certificate_type'] = df.apply(determine_cert_type, axis=1)
        
        # Determinar issuing_authority (simplificado)
        df['issuing_authority'] = 'State Department of Labor'  # Padr√£o, pode ser refinado
        
        print(f"‚úÖ Scraping conclu√≠do: {len(df)} registros")
        
        return df


def main():
    """Fun√ß√£o principal para teste"""
    scraper = YouthEmploymentScraper()
    df = scraper.scrape()
    
    if not df.empty:
        print("\nüìã Preview dos dados (primeiras 10 linhas):")
        cols_to_show = ['state', 'year', 'certificate_type', 'is_mandatory', 
                        'age_min', 'age_max', 'issuing_authority']
        available_cols = [c for c in cols_to_show if c in df.columns]
        print(df[available_cols].head(10).to_string())
        
        print(f"\nüìä Resumo:")
        print(f"   Total de registros: {len(df)}")
        print(f"   Colunas: {', '.join(df.columns)}")
        
        if 'certificate_type' in df.columns:
            print(f"\nüìÑ Tipos de certificado:")
            print(df['certificate_type'].value_counts().to_string())
    display(df)
    return df


if __name__ == "__main__":
    main()

üîç Iniciando scraping de Youth Employment Rules...
   üìÖ Ano dos dados: 2024
üìù Extraindo footnotes...
   ‚úì 24 footnotes encontrados
üìä Extraindo dados da tabela...
[<tr>
<th colspan="3" id="empcert" style="text-align: center;">
<strong>Employment</strong><br>
<strong>certificate</strong>
</br></th>
<th colspan="3" id="agecert" style="text-align: center;"><strong>Age certification</strong> ¬†</th>
</tr>, <tr>
<th id="minorage1" rowspan="2">
<strong>For minors of age indicated</strong>
<a href="#2">
<strong><sup>2</sup></strong>
</a>
</th>
<th colspan="2" id="issued" style="text-align: center;"><strong>Issued by:</strong></th>
<th id="minorage2" rowspan="2">
<strong>For minors of age indicated</strong>
<a href="#2">
<strong><sup>2</sup></strong>
</a>
</th>
<th colspan="2" id="issued2" style="text-align: center;"><strong>Issued by:</strong></th>
</tr>, <tr>
<th id="labor1"><strong>Labor Department</strong></th>
<th id="school1"><strong>School</strong></th>
<th id="labor2"><stro

Unnamed: 0,state,year,has_employment_cert,employment_cert_type,has_age_cert,age_cert_type,age_ranges,is_mandatory,rule_description,age_min,age_max,certificate_type,issuing_authority
0,2,2024,True,Issued by:,True,For minors of age indicated2,Issued by:,,"2Under the columns ""For minors of age indicate...",,,Employment Certificate,State Department of Labor
1,Alabama,2024,True,Under 18 (M)18 in mines3,True,X,X ; Not issued,True,,,,Employment Certificate,State Department of Labor
2,Alaska,2024,True,Under 1716 and 17 if employer licensed to sell...,True,X4,Not issued,True,,,,Employment Certificate,State Department of Labor
3,Arizona,2024,True,Not issued,False,,Not issued,,,,,Employment Certificate,State Department of Labor
4,Arkansas,2024,True,Not issued,False,,Not issued,,,,,Employment Certificate,State Department of Labor
5,California,2024,True,Under 18 for minors enrolled in school (M),True,X (for entertainment industry),X ; Not issued5,True,,5.0,5.0,Employment Certificate,State Department of Labor
6,Colorado,2024,True,Under 16 during school hours (M),False,,X ; Under 18 (R) Not issued to minors under 16...,True,,6.0,18.0,Employment Certificate,State Department of Labor
7,Connecticut,2024,True,Under 18 (M)7,False,,X ; Not issued,True,,,,Employment Certificate,State Department of Labor
8,Delaware,2024,True,Under 18 (M),True,X,X ; No provision,True,,,,Employment Certificate,State Department of Labor
9,District of Columbia,2024,True,Under 18 (M),False,,X ; No provision,True,,,,Employment Certificate,State Department of Labor


In [None]:
import requests
from bs4 import BeautifulSoup

URL = "https://www.dol.gov/agencies/whd/state/age-certificates"

response = requests.get(URL)
response.raise_for_status()  # ‚úÖ garante que falhas HTTP sejam detectadas

soup = BeautifulSoup(response.text, "html.parser")
table = soup.find("table")

if not table:
    raise ValueError("‚ùå Tabela n√£o encontrada na p√°gina.")

rows = table.find_all("tr")[4:] 

# ==============================================================
# Fun√ß√µes utilit√°rias
# ==============================================================

def detect_requirement_level(text: str):
    """
    Retorna a lista de n√≠veis de requisito encontrados no texto.
    (M)=Menor, (R)=Requerido, (P)=Parcial
    """
    levels = []
    mapping = {'(M)': 1, '(R)': 2, '(P)': 3}
    for mark, level in mapping.items():
        if mark in text:
            levels.append(level)
    return levels


def extract_text(td):
    """Extrai texto limpo de uma c√©lula <td>."""
    return '; '.join(part.strip() for part in td.stripped_strings)


def detect_footnote(values):
    """
    Detecta notas de rodap√© (links) dentro das c√©lulas.
    Remove os links do HTML e retorna uma lista de refer√™ncias.
    """
    links = []
    for idx, td in enumerate(values):
        anchors = td.find_all("a", href=True)
        if anchors:
            for link in anchors:
                href = link.text.strip()
                link.decompose()  # remove do conte√∫do
                links.append({
                    "href": href,
                    "index": idx,
                    "clean_td": td
                })
    return links or None


def parse_state_row(state_row):
    """
    L√™ uma linha <tr> da tabela e retorna dicion√°rios
    com as regras de certificado de emprego e de idade.
    """
    jurisdiction = state_row.th.strong.get_text(strip=True)
    values = state_row.find_all("td")

    # Detecta e limpa footnotes
    clean_texts = detect_footnote(values)
    if clean_texts:
        for ref in clean_texts:
            values[ref["index"]] = ref["clean_td"]

    # Evita erro caso tenha menos colunas
    if len(values) < 6:
        print(f"‚ö†Ô∏è Linha ignorada ({jurisdiction}): colunas insuficientes")
        return None

    # Limpa textos
    v = [extract_text(td) for td in values]

    # Emprego
    employment = {
        "state": jurisdiction,
        "certificate_type": "employment",
        "rule_description": v[0],
        "is_labor": "1" if "X" in v[1] else "0",
        "is_school": "1" if "X" in v[2] else "0",
        "requirement_level": detect_requirement_level(v[0]),
        "footnotes": clean_text['href']
    }

    # Idade
    age = {
        "state": jurisdiction,
        "certificate_type": "age",
        "rule_description": v[3],
        "is_labor": "1" if "X" in v[4] else "0",
        "is_school": "1" if "X" in v[5] else "0",
        "requirement_level": detect_requirement_level(v[3])
    }

    return employment, age

youth_employment = []

for row in rows:
    result = parse_state_row(row)
    if result:
        employment, age = result
        youth_employment.extend([employment, age])
for item in youth_employment:
    print(item)


{'state': 'Alabama', 'certificate_type': 'employment', 'rule_description': 'Under 18 (M); 18 in mines', 'is_labor': '1', 'is_school': '1', 'requirement_level': [1]}
{'state': 'Alabama', 'certificate_type': 'age', 'rule_description': 'Not issued', 'is_labor': '0', 'is_school': '0', 'requirement_level': []}
{'state': 'Alaska', 'certificate_type': 'employment', 'rule_description': 'Under 17; 16 and 17 if employer licensed to sell alcohol (M)', 'is_labor': '1', 'is_school': '0', 'requirement_level': [1]}
{'state': 'Alaska', 'certificate_type': 'age', 'rule_description': 'Not issued', 'is_labor': '0', 'is_school': '0', 'requirement_level': []}
{'state': 'Arizona', 'certificate_type': 'employment', 'rule_description': 'Not issued', 'is_labor': '0', 'is_school': '0', 'requirement_level': []}
{'state': 'Arizona', 'certificate_type': 'age', 'rule_description': 'Not issued', 'is_labor': '0', 'is_school': '0', 'requirement_level': []}
{'state': 'Arkansas', 'certificate_type': 'employment', 'rule_