In [None]:
import requests
from bs4 import BeautifulSoup
import re

__1968 - 1981__

In [None]:
import pandas as pd


In [None]:
page = requests.get("https://www.dol.gov/agencies/whd/state/minimum-wage/history")
soup = BeautifulSoup(page.content, 'html.parser')
tabelas = soup.find_all('table')

df_wage = []
for i, tabela in enumerate(tabelas):
    linhas = tabela.find_all('tr')
    cabecalho = linhas[0]
    anos = [th.text for th in cabecalho.find_all('th')[1:]]
    estados = []
    for estado in linhas[1:]:
        estados.append([td.text for td in estado.find_all('td')])
    df = pd.DataFrame(estados, columns=['state'] + anos)
    df_wage.append(df)
df = pd.concat(df_wage, ignore_index=True)

footnotes = soup.find('div', id='content')
footnotes.find_all('p')
list_footnotes = []
for p in footnotes.find_all('p'):
    if re.match(r'^[\[\(].[\]\)]', p.text):
        id_footnote = p.text.strip().split(' ')[0]
        text_footnote = ' '.join(p.text.strip().split(' ')[1:]).replace('- ','')
        list_footnotes.append((id_footnote, text_footnote))
footnotes_dict = {id_: text for id_, text in list_footnotes}
columns_to_adjust = [col for col in df.columns if  not col.isnumeric() and col != 'Estado']
footnote_year_bridge = {}
for key, _ in footnotes_dict.items():
    for col in columns_to_adjust:
        if key in col:
            footnote_year_bridge[col.replace(key, '').strip()] = key
            df = df.rename(columns={col: col.replace(key, '').strip()})

df_melted = df.melt(id_vars=['state'], var_name='year', value_name='minimal_wage').dropna()
df_melted['year'] = df_melted['year'].astype(int)
df_melted['minimal_wage'] = df_melted['minimal_wage'].str.replace('$', '')
df_melted['id'] = df_melted.index + 1

df_melted['minimal_wage'] = df_melted['minimal_wage'].str.replace(r'[\[\(].*?[\]\)]', '', regex=True)
df_melted['minimal_wage'] = df_melted['minimal_wage'].mask(
    df_melted['minimal_wage'].isin(['...', 'NA']), 
    pd.NA
)
if 'notes' not in df_melted.columns:
    df_melted['notes'] = pd.NA

if 'frequency' not in df_melted.columns:
    df_melted['frequency'] = pd.NA


def add_leading_zero(value):
    value = value.strip()
    if value.startswith('.'):
        return '0' + value
    return value

# Função para processar valores com múltiplas taxas
def process_multiple_rates(row):
    wage = row['minimal_wage']
    
    if pd.notna(wage) and isinstance(wage, str):
        original_wage = wage
        
        # 1. Detectar e remover frequency markers
        frequency = None
        if '/day' in wage:
            frequency = 2
            wage = wage.replace('/day', '').strip()
        elif '/wk' in wage:
            frequency = 3
            wage = wage.replace('/wk', '').strip()
        
        # 2. Detectar múltiplos valores com regex mais robusto
        # Padrão: captura valores monetários separados por -, &, /, ou espaços
        pattern = r'\$?\d+\.?\d*'
        matches = re.findall(pattern, wage)
        
        if len(matches) >= 2:
            first_value = add_leading_zero(matches[0])
            second_value = add_leading_zero(matches[1])
            
            row['minimal_wage'] = first_value
            note = f"Or can be {second_value}, this reflects which rates differ by industry, occupation or other factors, as established under a wage-board type law"
            row['notes'] = note
        elif len(matches) == 1:
            row['minimal_wage'] = add_leading_zero(matches[0])
        else:
            row['minimal_wage'] = add_leading_zero(wage)
        
        # 3. Atualizar frequency
        if frequency is not None:
            row['frequency'] = frequency
    
    # Garantir valor padrão para frequency
    if pd.isna(row['frequency']):
        row['frequency'] = 1
    
    return row
# Aplicar a função
df_melted = df_melted.apply(process_multiple_rates, axis=1)
df_melted['minimal_wage'] = df_melted['minimal_wage'].astype(str).str.extract(r'([\d.]+)', expand=False)

def notes_for_null_wage(row):
    if pd.isna(row['minimal_wage']) and pd.isna(row['notes']):
        return "This state utilizes the federal minimum wage"
    return row['notes']

df_melted['notes'] = df_melted.apply(notes_for_null_wage, axis=1)  
df_melted['minimal_wage'] = pd.to_numeric(df_melted['minimal_wage'], errors='coerce')
df_final = df_melted[['id', 'state', 'year', 'minimal_wage', 'frequency','notes']]


In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

def extrair_tabela_tipped_minimum_wage(year):
    url = f'https://www.dol.gov/agencies/whd/state/minimum-wage/tipped/{year}'
    tip_test = requests.get(url)
    if tip_test.status_code != 200:
        print(f"❌ Falha ao obter dados de {year} (status {tip_test.status_code})")
        return pd.DataFrame()
    
    tip_soup = BeautifulSoup(tip_test.content, 'html.parser')

    # 1. EXTRAIR FOOTNOTES
    footnotes_dict = {}
    for a_tag in tip_soup.find_all('a', attrs={'name': lambda x: x and x.startswith('foot')}):
        name = a_tag.get('name')
        parent_p = a_tag.find_parent('p')
        if parent_p:
            footnote_num = a_tag.get_text(strip=True)
            texto_completo = ' '.join(parent_p.get_text().split())
            texto_nota = texto_completo.replace(footnote_num, '', 1).strip()
            footnotes_dict[name] = texto_nota

    # 2. PROCESSAR TABELA
    tip_table = tip_soup.find('table')
    if not tip_table:
        print(f"⚠️ Nenhuma tabela encontrada em {year}")
        return pd.DataFrame()

    tip_linhas = tip_table.find_all('tr')[1:]
    header_order = ['jurisdiction', 'combinedrate', 'tipcredit', 'cashwage', 'definition']

    def processar_celula_valor(td_element, column_name):
        if not td_element:
            return None, None, []
        footnote_refs = []
        for link in td_element.find_all('a', href=True):
            href = link.get('href')
            if href:
                match = re.search(r'#(foot\d+)', href)
                if match:
                    footnote_refs.append(match.group(1))
        td_html = str(td_element)
        soup_copy = BeautifulSoup(td_html, 'html.parser')
        for link in soup_copy.find_all('a'):
            link.decompose()
        valor = ' '.join(soup_copy.get_text().split())
        footnote_texts = []
        for ref in footnote_refs:
            if ref in footnotes_dict:
                footnote_texts.append(f"[{column_name}] {footnotes_dict[ref]}")
        footnote_text = ' ; '.join(footnote_texts) if footnote_texts else None
        return valor if valor else None, footnote_text, footnote_refs

    def processar_jurisdiction(td_element):
        if not td_element:
            return None, None, None
        footnote_refs = []
        for link in td_element.find_all('a', href=True):
            href = link.get('href')
            if href:
                match = re.search(r'#(foot\d+)', href)
                if match:
                    footnote_refs.append(match.group(1))
        td_html = str(td_element)
        soup_copy = BeautifulSoup(td_html, 'html.parser')
        for link in soup_copy.find_all('a'):
            link.decompose()
        strong_tag = soup_copy.find('strong')
        if strong_tag:
            texto = ' '.join(strong_tag.get_text().split())
            nome_limpo = re.sub(r'[^a-zA-Z0-9\s]', '', texto)
        else:
            nome_limpo = soup_copy.get_text(strip=True)
        extra_text = soup_copy.get_text().replace(nome_limpo, '').strip()
        footnote_texts = [footnotes_dict[ref] for ref in footnote_refs if ref in footnotes_dict]
        footnote_text = ' ; '.join(footnote_texts) if footnote_texts else None
        return nome_limpo, footnote_text, extra_text

    dados_tabela = []
    ultima_jurisdiction = None
    ultima_footnote = None
    for tr in tip_linhas:
        row_data = {}
        tds = tr.find_all('td')
        if tds and tds[0].get('colspan'):
            continue
        td_jurisdiction = tr.find('td', headers='jurisdiction')
        todas_notas = []
        if td_jurisdiction and td_jurisdiction.find('strong'):
            jurisdiction_limpa, footnote_text, extra_text = processar_jurisdiction(td_jurisdiction)
            ultima_jurisdiction = jurisdiction_limpa
            ultima_footnote = footnote_text
            row_data['jurisdiction'] = jurisdiction_limpa
            if footnote_text:
                todas_notas.append(footnote_text)
            if extra_text:
                todas_notas.append(extra_text)
        else:
            if ultima_jurisdiction:
                row_data['jurisdiction'] = ultima_jurisdiction
                if ultima_footnote:
                    todas_notas.append(ultima_footnote)
        for td in tds:
            header_name = td.get('headers')[0] if td.get('headers') else None
            if not header_name:
                header_name = header_order[tds.index(td)] if len(tds) == 4 else header_order[tds.index(td) - 1]
            valor_limpo, footnote_text, _ = processar_celula_valor(td, header_name)
            if header_name != 'jurisdiction':
                row_data[header_name] = valor_limpo
            if footnote_text:
                todas_notas.append(footnote_text)
        if todas_notas:
            row_data['notes'] = ' ; '.join(todas_notas)
        if row_data and any(v for k, v in row_data.items() if k not in ['jurisdiction', 'notes']):
            row_data['year'] = year
            dados_tabela.append(row_data)

    df_tips = pd.DataFrame(dados_tabela)
    return df_tips


# === LOOP PELOS ANOS 2003–2024 ===
dfs = []
for year in range(2024, 2025):
    df_year = extrair_tabela_tipped_minimum_wage(year)
    if not df_year.empty:
        dfs.append(df_year)

# Concatenar todos os DataFrames
df_tips = pd.concat(dfs, ignore_index=True)

# Mostrar resultado final
print(f"\n✅ Total de registros extraídos: {len(df_tips)}")

def process_tip_wages(row):
    """
    Processa valores de salário tipped, lidando com:
    - Múltiplos valores separados (-, &, /, espaços)
    - Textos descritivos movidos para notes
    - Porcentagens e valores especiais
    """
    
    def is_monetary_value(value):
        """Verifica se é um valor monetário válido"""
        if pd.isna(value) or not isinstance(value, str):
            return False
        # Remove espaços e verifica se tem formato de dinheiro
        clean = value.strip()
        # Padrão: pode ter $ e números com ponto decimal
        return bool(re.match(r'^\$?\d+\.?\d*$', clean))
    
    def is_percentage(value):
        """Verifica se é uma porcentagem"""
        if pd.isna(value) or not isinstance(value, str):
            return False
        return '%' in value or value.lower() in ['50%', 'to 50%']
    
    def extract_multiple_values(value):
        """Extrai múltiplos valores monetários de uma string"""
        if pd.isna(value) or not isinstance(value, str):
            return None
        
        # Procurar por múltiplos valores monetários
        pattern = r'\$?\d+\.?\d*'
        matches = re.findall(pattern, value)
        
        # Filtrar apenas valores que parecem dinheiro (com ou sem $)
        valid_matches = [m for m in matches if re.match(r'^\$?\d+\.\d+$', m)]
        
        return valid_matches if len(valid_matches) > 1 else None
    
    def move_text_to_notes(column_name, value, row):
        """Move texto descritivo para notes"""
        if pd.isna(value) or not isinstance(value, str):
            return value, row
        
        # Se não é valor monetário nem porcentagem, é texto descritivo
        if not is_monetary_value(value) and not is_percentage(value):
            # Adicionar à nota
            note_text = f"[{column_name}] {value}"
            
            if pd.notna(row.get('notes')) and row['notes'] != 'Missing value':
                row['notes'] = f"{row['notes']} ; {note_text}"
            else:
                row['notes'] = note_text
            
            return None, row  # Limpar o valor original
        
        return value, row
    
    # Processar cada coluna de valor
    for col in ['combinedrate', 'tipcredit', 'cashwage']:
        if col not in row:
            continue
            
        value = row[col]
        
        if pd.isna(value) or value == 'Missing value':
            continue
        
        # 1. Verificar se tem múltiplos valores
        multiple_values = extract_multiple_values(value)
        
        if multiple_values:
            # Tem múltiplos valores - usar o primeiro e criar nota
            first_value = multiple_values[0]
            if not first_value.startswith('$'):
                first_value = f'${first_value}'
            
            row[col] = first_value
            
            # Criar nota com os valores alternativos
            other_values = ', '.join(multiple_values[1:])
            note_text = f"[{col}] Alternative rate(s): {other_values}"
            
            if pd.notna(row.get('notes')) and row['notes'] != 'Missing value':
                row['notes'] = f"{row['notes']} ; {note_text}"
            else:
                row['notes'] = note_text
        
        # 2. Se não é valor monetário nem porcentagem, mover para notes
        else:
            value, row = move_text_to_notes(col, value, row)
            row[col] = value
    
    return row

# Aplicar a função
df_tips = df_tips.apply(process_tip_wages, axis=1)
df_tips[['combinedrate', 'tipcredit', 'cashwage']] = df_tips[['combinedrate', 'tipcredit', 'cashwage']].apply(lambda x: x.str.replace('$', '', regex=False))

def convert_with_context(value, column_name, row):
    """Converte e adiciona tipo na coluna + nota quando necessário"""
    if pd.isna(value):
        return None, None, row
    
    if not isinstance(value, str):
        return float(value) if isinstance(value, (int, float)) else None, 'exact', row
    
    original = value.strip()
    value = original.replace('$', '')
    
    if value.lower() in ['not specified', 'missing value', '']:
        return None, None, row
    
    # Porcentagem
    if '%' in value:
        match = re.search(r'(\d+\.?\d*)\s*%', value)
        if match:
            note = f"[{column_name}] Original value: {original}"
            if pd.notna(row.get('notes')) and row['notes'] != 'Missing value':
                row['notes'] = f"{row['notes']} ; {note}"
            else:
                row['notes'] = note
            return float(match.group(1)), 'percentage', row
    
    # Range
    range_patterns = {
        'up to': r'up to\s+(\d+\.?\d*)',
        'more than': r'more than\s+(\d+\.?\d*)',
        'at least': r'at least\s+(\d+\.?\d*)'
    }
    
    for range_type, pattern in range_patterns.items():
        match = re.search(pattern, value, re.IGNORECASE)
        if match:
            note = f"[{column_name}] {range_type.capitalize()} {match.group(1)}"
            if pd.notna(row.get('notes')) and row['notes'] != 'Missing value':
                row['notes'] = f"{row['notes']} ; {note}"
            else:
                row['notes'] = note
            return float(match.group(1)), 'range', row
    
    # Exato
    try:
        return float(value), 'exact', row
    except ValueError:
        return None, None, row

# Aplicar
def process_with_types(row):
    for col in ['combinedrate', 'tipcredit', 'cashwage']:
        if col in row:
            value, value_type, row = convert_with_context(row[col], col, row)
            row[col] = value
            row[f'{col}_type'] = value_type
    return row

df_tips = df_tips.apply(process_with_types, axis=1)
df_tips


✅ Total de registros extraídos: 54


Unnamed: 0,jurisdiction,combinedrate,tipcredit,cashwage,definition,notes,year,combinedrate_type,tipcredit_type,cashwage_type
0,FEDERAL,7.25,5.12,2.13,More than $30,: Fair Labor Standards Act (FLSA),2024,exact,exact,exact
1,Minnesota,,,,Large employer: annual gross revenue of at lea...,Minnesota. A large employer means an enterpris...,2024,,,
2,Minnesota,,,,Small employer: annual gross revenue of less t...,Minnesota. A large employer means an enterpris...,2024,,,
3,Montana,,,,"Business with gross annual sales over $110,000",,2024,,,
4,Montana,,,,Business not covered by the Fair Labor Standar...,,2024,,,
5,Arizona,14.35,3.0,11.35,Not specified,,2024,exact,exact,exact
6,Arkansas,11.0,8.37,2.63,Not specified,,2024,exact,exact,exact
7,Colorado,3.02,,11.4,More than $30,,2024,exact,,exact
8,Connecticut,,9.31,,"Hotel, restaurant",Connecticut. The Connecticut minimum wage is a...,2024,,exact,
9,Connecticut,,7.46,,Bartenders who customarily receive tips,Connecticut. The Connecticut minimum wage is a...,2024,,exact,


In [20]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

URL = "https://www.dol.gov/agencies/whd/state/age-certificates"

response = requests.get(URL)
response.raise_for_status()

soup = BeautifulSoup(response.text, "html.parser")
table = soup.find("table")

if not table:
    raise ValueError("Tabela não encontrada na página.")

rows = table.find_all("tr")[4:]


def detect_requirement_level(text: str):
    """Identifica o(s) nível(is) de requisito no texto."""
    mapping = {'(M)': 1, '(R)': 2, '(P)': 3}
    return [level for mark, level in mapping.items() if mark in text]


def extract_text(td):
    """Extrai texto limpo de uma célula <td>."""
    return '; '.join(part.strip() for part in td.stripped_strings)

def remove_requirement_marks(text: str):
    """Remove marcas de requisito do texto."""
    return re.sub(r'\s*\(M\)|\s*\(R\)|\s*\(P\)', '', text)

def detect_footnote(values):
    """Remove links das células e retorna as referências encontradas."""
    links = []
    for idx, td in enumerate(values):
        anchors = td.find_all("a", href=True)
        if anchors:
            for link in anchors:
                href = link.text.strip()
                link.decompose()
                links.append({
                    "href": href,
                    "index": idx,
                    "clean_td": td
                })
    return links or None


def parse_state_row(state_row):
    """Extrai informações estruturadas de uma linha da tabela."""
    jurisdiction = state_row.th.strong.get_text(strip=True)
    values = state_row.find_all("td")
    
    if len(values) < 6:
        return None

    clean_texts = detect_footnote(values)
    if clean_texts:
        for ref in clean_texts:
            values[ref["index"]] = ref["clean_td"]

    v = [extract_text(td) for td in values]

    employment = {
        "state": jurisdiction,
        "certificate_type": "employment",
        "rule_description": remove_requirement_marks(v[0]),
        "is_labor": "1" if "X" in v[1] else "0",
        "is_school": "1" if "X" in v[2] else "0",
        "requirement_level": detect_requirement_level(v[0]),
        "notes": f"Labor: {v[1].replace('X', '').strip() or 'Não'}; School: {v[2].replace('X', '').strip() or 'Não'}",
        "footnotes": [clean_texts[i]['href'] for i in range(len(clean_texts)) if clean_texts[i]['index'] <= 2] if clean_texts else None
    }

    age = {
        "state": jurisdiction,
        "certificate_type": "age",
        "rule_description": remove_requirement_marks(v[3]),
        "is_labor": "1" if "X" in v[4] else "0",
        "is_school": "1" if "X" in v[5] else "0",
        "requirement_level": detect_requirement_level(v[3]),
        "notes": f"Labor: {v[4].replace('X', '').strip() or 'Não'}; School: {v[5].replace('X', '').strip() or 'Não'}",
        "footnotes": [clean_texts[i]['href'] for i in range(len(clean_texts)) if clean_texts[i]['index'] >= 3] if clean_texts else None
    }
    return employment, age


youth_employment = []

for row in rows:
    parsed = parse_state_row(row)
    if parsed:
        youth_employment.extend(parsed)
df_youth_employment = pd.DataFrame(youth_employment)
df_youth_employment

Unnamed: 0,state,certificate_type,rule_description,is_labor,is_school,requirement_level,notes,footnotes
0,Alabama,employment,Under 18; 18 in mines,1,1,[1],Labor: Não; School: Não,[3]
1,Alabama,age,Not issued,0,0,[],Labor: Não; School: Não,[]
2,Alaska,employment,Under 17; 16 and 17 if employer licensed to se...,1,0,[1],Labor: Não; School: Não,[4]
3,Alaska,age,Not issued,0,0,[],Labor: Não; School: Não,[]
4,Arizona,employment,Not issued,0,0,[],Labor: Não; School: Não,
...,...,...,...,...,...,...,...,...
103,West Virginia,age,16 and 17,0,1,[2],Labor: Não; School: Não,[23]
104,Wisconsin,employment,Under 18,1,1,[1],Labor: ; Through permit officers; School: Não,[24]
105,Wisconsin,age,18 and over,1,0,[2],Labor: ; Through permit officers; School: Não,[24]
106,Wyoming,employment,Not issued,0,0,[],Labor: Não; School: Não,[]


In [65]:
"""
Scraper para dados de tipped minimum wage
"""
import requests
import pandas as pd
import re
from bs4 import BeautifulSoup
from typing import Dict, List, Tuple
import sys
sys.path.append('..')
from config import BASE_URL_TIPPED_WAGE, TIPPED_WAGE_START_YEAR, TIPPED_WAGE_END_YEAR, REQUEST_TIMEOUT


class TippedWageScraper:
    """Classe para extrair dados de tipped minimum wage"""
    
    def __init__(self, base_url: str = BASE_URL_TIPPED_WAGE):
        self.base_url = base_url
        self.header_order = ['jurisdiction', 'combinedrate', 'tipcredit', 'cashwage', 'definition']
        self.footnotes_dict = {}
    
    def extract_footnotes(self, soup: BeautifulSoup) -> Dict[str, str]:
        """Extrai footnotes de uma página"""
        footnotes_dict = {}
        
        for a_tag in soup.find_all('a', attrs={'name': lambda x: x and x.startswith('foot')}):
            name = a_tag.get('name')
            parent_p = a_tag.find_parent('p')
            
            if parent_p:
                footnote_num = a_tag.get_text(strip=True)
                texto_completo = ' '.join(parent_p.get_text().split())
                texto_nota = texto_completo.replace(footnote_num, '', 1).strip()
                footnotes_dict[name] = texto_nota
        
        return footnotes_dict
    
    def processar_celula_valor(self, td_element, column_name: str, footnotes_dict: Dict) -> tuple:
        """Extrai valor limpo e footnotes de uma célula"""
        if not td_element:
            return None, [], []
        
        # Procurar links de footnote
        footnote_refs = []
        for link in td_element.find_all('a', href=True):
            href = link.get('href')
            if href:
                match = re.search(r'#(foot\d+)', href)
                if match:
                    footnote_refs.append(match.group(1))
        
        # Criar cópia e remover links
        td_html = str(td_element)
        soup_copy = BeautifulSoup(td_html, 'html.parser')
        
        for link in soup_copy.find_all('a'):
            link.decompose()
        # Extrair valor limpo
        valor = ' '.join(soup_copy.get_text().split())
        if soup_copy.find('strong'):
            valor = None
        return valor if valor else None, footnote_refs

    def processar_jurisdiction(self, td_element, footnotes_dict: Dict) -> Tuple[str, list, str]:
        """Extrai o nome limpo da jurisdiction e seus footnotes"""
        if not td_element:
            return None, [], ""
        
        # Extrair footnotes a partir dos links
        footnote_refs = []
        for link in td_element.find_all('a', href=True):
            href = link.get('href')
            if href:
                match = re.search(r'#(foot\d+)', href)
                if match:
                    footnote_refs.append(match.group(1))

        # Copiar HTML e remover os links e strongs para não afetar o texto final
        soup_copy = BeautifulSoup(str(td_element), 'html.parser')

        # Extrair nome da jurisdição (normalmente no primeiro <strong>)
        first_strong = soup_copy.find('strong')
        if first_strong:
            nome_limpo = re.sub(r'[^a-zA-Z0-9\s]', '', first_strong.get_text(strip=True))
        else:
            nome_limpo = soup_copy.get_text(strip=True)
        

        # Remover <strong> e <a> completamente para isolar o texto explicativo
        for tag in soup_copy.find_all(['strong', 'a']):
            tag.decompose()
        if nome_limpo == 'Ohio':
            print(f"Jurisdiction raw: {td_element}")
            print(f"Jurisdiction processed: {soup_copy}")
            print(f"Other text: {' '.join(soup_copy.get_text(strip=True).split())}")
        # Agora pegar só o texto restante
        other_extra_text = ' '.join(soup_copy.get_text(strip=True).split())

        return nome_limpo, footnote_refs, other_extra_text



    def extract_table_for_year(self, year: int) -> pd.DataFrame:
        """Extrai tabela de um ano específico"""
        url = f'{self.base_url}/{year}'
        
        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException:
            return pd.DataFrame()
        
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Extrair footnotes
        footnotes = self.extract_footnotes(soup)
        print(footnotes)
        self.footnotes_dict[year] = footnotes
        # Processar tabela
        tip_table = soup.find('table')
        if not tip_table:
            return pd.DataFrame()
        
        tip_linhas = tip_table.find_all('tr')[1:]
        
        dados_tabela = []
        ultima_jurisdiction = None
        ultima_footnote = None
        ultima_footnote_refs = []
        
        for tr in tip_linhas:
            row_data = {}
            tds = tr.find_all('td')
            if len(tds) > len(self.header_order):
                tds.pop(1)
            # Pular linhas com colspan
            if tds and tds[0].get('colspan'):
                continue
            
            td_jurisdiction = tr.find('td', headers='jurisdiction')
            todas_notas = []
            all_footnote_refs = []
            all_footnote_texts = []
            
            if td_jurisdiction and td_jurisdiction.find('strong'):
                jurisdiction_limpa, footnote_refs, note_text = self.processar_jurisdiction(
                    td_jurisdiction, self.footnotes_dict
                )
                if jurisdiction_limpa == 'Ohio':
                    print(f"Jurisdiction raw: {td_jurisdiction}")
                    print(f"Jurisdiction processed: {jurisdiction_limpa}")
                    print(f"Footnote refs: {footnote_refs}")
                    print(f"Note text: {note_text}")
                ultima_jurisdiction = jurisdiction_limpa
                ultima_footnote_refs = footnote_refs
                if note_text:
                    if jurisdiction_limpa == 'Ohio':
                        print(f"Note text for Ohio: {note_text}")
                    row_data['notes'] = note_text
                row_data['jurisdiction'] = jurisdiction_limpa
                
                if footnote_refs:
                    all_footnote_refs.extend(footnote_refs)
            else:
                if ultima_jurisdiction:
                    row_data['jurisdiction'] = ultima_jurisdiction
                    if ultima_footnote_refs:
                        all_footnote_refs.extend(ultima_footnote_refs)
            
            # Processar valores das colunas
            for i, td in enumerate(tds):
                header_name = td.get('headers')[0] if td.get('headers') else None
                if not header_name:
                    header_name = self.header_order[i]
  
                
                valor_limpo, footnote_refs = self.processar_celula_valor(
                    td, header_name, self.footnotes_dict
                )
                if valor_limpo:
                    if header_name != 'jurisdiction':
                        row_data[header_name] = valor_limpo
                    else:
                        row_data['notes'] = valor_limpo
                    if footnote_refs:
                        all_footnote_refs.extend(footnote_refs)
                
            
            if all_footnote_refs:
                row_data['footnotes'] = list(set(all_footnote_refs))  # Remove duplicatas
            
            if row_data and any(v for k, v in row_data.items() if k not in ['jurisdiction', 'notes', 'footnotes']):
                row_data['year'] = year
                dados_tabela.append(row_data)

        return pd.DataFrame(dados_tabela)
    
    def scrape(self, start_year: int = TIPPED_WAGE_START_YEAR, 
               end_year: int = TIPPED_WAGE_END_YEAR) -> pd.DataFrame:
        """Executa o scraping para todos os anos"""
        
        dfs = []
        for year in range(start_year, end_year + 1):
            df_year = self.extract_table_for_year(year)
            if not df_year.empty:
                dfs.append(df_year)
                print(f"✓ {len(df_year)} registros")
            else:
                print("✗")
        
        if not dfs:
            print("❌ Nenhum dado foi extraído")
            return pd.DataFrame()
        
        df_final = pd.concat(dfs, ignore_index=True)
        
        return df_final


def main():
    """Função principal para teste"""
    scraper = TippedWageScraper()
    df = scraper.scrape(start_year=2003, end_year=2003)  # Teste com poucos anos
    print("\n📋 Preview dos dados:")
    print(scraper.footnotes_dict)
    return df


if __name__ == "__main__":
    main()

{'foot1': 'Other additional deductions are permitted, for example for meals and lodging, except as noted in footnote 8 .', 'foot2': 'Alaska . Beginning January 1, 2004, and annually thereafter, the rate will be adjusted for inflation using either the Consumer Price Index for all urban consumers for Anchorage, Alaska, or $1 more than the Federal minimum wage, whichever is greater.', 'foot3': 'Minnesota . A large employer is an enterprise with annual receipts of $500,000 or more; a small employer, less than $500,000.', 'foot4': 'Oregon . Beginning January 1, 2004, and annually thereafter, the rate will be adjusted for inflation by a calculation using the U.S. City Average Consumer Price Index for All Urban Consumers for All Items. The wage amount established will be rounded to the nearest five cents.', 'foot5': 'Washington . Beginning January 1, 2001, and annually thereafter, the rate will be adjusted for inflation by a calculation using the Consumer Price Index for urban wage earners an

In [60]:
"""
Processador de dados de tipped minimum wage
"""
import pandas as pd
import re
import sys
sys.path.append('..')
from utils import is_monetary_value, is_percentage, extract_multiple_values, append_note, consolidate_notes_simple 

class TippedWageProcessor:
    """Classe para processar dados de tipped minimum wage"""
    
    def __init__(self, df: pd.DataFrame):
        self.df = df.copy()
    
    def move_text_to_notes(self, column_name: str, value, row):
        """Move texto descritivo para notes"""
        if pd.isna(value) or not isinstance(value, str):
            return value, row
        
        # Se não é valor monetário nem porcentagem, é texto descritivo
        if not is_monetary_value(value) and not is_percentage(value):
            note_text = f"[{column_name}] {value}"
            row['notes'] = append_note(row.get('notes'), note_text)
            return None, row
        
        return value, row
    
    def process_tip_wages(self, row):
        """Processa valores de salário tipped"""
        for col in ['combinedrate', 'tipcredit', 'cashwage']:
            if col not in row:
                continue
            
            value = row[col]
            
            if pd.isna(value) or value == 'Missing value':
                continue
            
            # 1. Verificar se tem múltiplos valores
            multiple_values = extract_multiple_values(value)
            
            if multiple_values:
                first_value = multiple_values[0]
                if not first_value.startswith('$'):
                    first_value = f'${first_value}'
                
                row[col] = first_value
                
                other_values = ', '.join(multiple_values[1:])
                note_text = f"[{col}] Alternative rate(s): {other_values}"
                row['notes'] = append_note(row.get('notes'), note_text)
            
            # 2. Se não é valor monetário nem porcentagem, mover para notes
            else:
                value, row = self.move_text_to_notes(col, value, row)
                row[col] = value
        
        return row
    
    def convert_with_context(self, value, column_name: str, row):
        """Converte valores para float mantendo contexto"""
        if pd.isna(value):
            return None, None, row
        
        if not isinstance(value, str):
            return float(value) if isinstance(value, (int, float)) else None, 'exact', row
        
        original = value.strip()
        value = original.replace('$', '')
        
        if value.lower() in ['not specified', 'missing value', '']:
            return None, None, row
        
        # Porcentagem
        if '%' in value:
            match = re.search(r'(\d+\.?\d*)\s*%', value)
            if match:
                note = f"[{column_name}] Original value: {original}"
                row['notes'] = append_note(row.get('notes'), note)
                return float(match.group(1)), 'percentage', row
        
        # Range (up to, more than, at least)
        range_patterns = {
            'up to': r'up to\s+(\d+\.?\d*)',
            'more than': r'more than\s+(\d+\.?\d*)',
            'at least': r'at least\s+(\d+\.?\d*)',
            'to': r'to\s+(\d+\.?\d*)'
        }
        
        for range_type, pattern in range_patterns.items():
            match = re.search(pattern, value, re.IGNORECASE)
            if match:
                note = f"[{column_name}] {range_type.capitalize()} {match.group(1)}"
                row['notes'] = append_note(row.get('notes'), note)
                return float(match.group(1)), 'range', row
        
        # Valor exato
        try:
            return float(value), 'exact', row
        except ValueError:
            return None, None, row
    
    def process_with_types(self, row):
        """Processa valores e adiciona tipo"""
        for col in ['combinedrate', 'tipcredit', 'cashwage']:
            if col in row:
                value, value_type, row = self.convert_with_context(row[col], col, row)
                row[col] = value
                row[f'{col}_type'] = value_type
        return row
    
    def process(self) -> pd.DataFrame:
        """Executa o pipeline completo de processamento"""
        
        df = self.df.copy()
        
        df = df.apply(self.process_tip_wages, axis=1)
        for col in ['combinedrate', 'tipcredit', 'cashwage']:
            if col in df.columns:
                df[col] = df[col].apply(lambda x: x.str.replace('$', '', regex=False) if hasattr(x, 'str') else x)
        df = df.apply(self.process_with_types, axis=1)
        df['notes'] = df.apply(lambda row: consolidate_notes_simple(row['notes'], row['definition']), axis=1)
        return df


def main():
    """Função principal para teste"""
    # Criar dados de exemplo
    objects = TippedWageScraper() 
    df = objects.scrape()

    print(df)
    
    processor = TippedWageProcessor(df)
    df_processed = processor.process()
    display(df_processed)
    return df_processed


if __name__ == "__main__":
    main()

✓ 71 registros
✓ 71 registros
✓ 72 registros
✓ 72 registros
✓ 69 registros
✓ 69 registros
✓ 67 registros
✓ 65 registros
✓ 58 registros
✓ 58 registros
✓ 55 registros
✓ 62 registros
✓ 63 registros
✓ 63 registros
✓ 59 registros
Jurisdiction raw: <td headers="jurisdiction"><strong>Ohio </strong><a href="/agencies/whd/state/minimum-wage/tipped/2018#foot5"><strong>5 </strong></a><br/><br/>Employers with annual gross receipts of $305,000 or more</td>
Jurisdiction processed: <td headers="jurisdiction"><br/><br/>Employers with annual gross receipts of $305,000 or more</td>
Other text: Employers with annual gross receipts of $305,000 or more
Jurisdiction raw: <td headers="jurisdiction"><strong>Ohio </strong><a href="/agencies/whd/state/minimum-wage/tipped/2018#foot5"><strong>5 </strong></a><br/><br/>Employers with annual gross receipts of $305,000 or more</td>
Jurisdiction processed: Ohio
Footnote refs: ['foot5']
Note text: Employers with annual gross receipts of $305,000 or more
Note text for O

Unnamed: 0,notes,jurisdiction,combinedrate,tipcredit,cashwage,definition,year,footnotes,combinedrate_type,tipcredit_type,cashwage_type
0,Fair Labor Standards Act (FLSA) ; [DEFINITION]...,FEDERAL,5.15,3.02,2.13,More than $30,2003,,exact,exact,exact
1,,Alaska,,,7.15,,2003,[foot2],,,exact
2,,California,,,6.75,,2003,,,,exact
3,,Guam,,,5.15,,2003,,,,exact
4,Large employer,Minnesota,,,5.15,,2003,[foot3],,,exact
...,...,...,...,...,...,...,...,...,...,...,...
1418,[DEFINITION] More than $30,Tennessee,7.25,5.12,2.13,More than $30,2024,[foot9],exact,exact,exact
1419,[DEFINITION] More than $20,Texas,7.25,5.12,2.13,More than $20,2024,[foot16],exact,exact,exact
1420,[DEFINITION] More than $30,Utah,7.25,5.12,2.13,More than $30,2024,[foot16],exact,exact,exact
1421,[DEFINITION] More than $30,Virginia,12.00,9.87,2.13,More than $30,2024,,exact,exact,exact
