# In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Page fetched by both extract_footnotes_from_page() and scrape_all_tipped_wages().
URL = 'https://www.dol.gov/agencies/whd/state/minimum-wage/tipped'

# Footnote number (kept as a string, e.g. '3') -> footnote text.
# Filled in by extract_footnotes_from_page(); read by extract_footnote_numbers()
# and get_law_text().
FOOTNOTE_MAP = {}

def _parse_footnotes(page_text, max_chars=10000):
    """Parse the numbered footnotes found after the 'FOOTNOTES' marker.

    Only the first ``max_chars`` characters after the marker are scanned.
    A line of the form ``<number> <text>`` starts a new footnote, any other
    non-blank line is appended to the footnote currently being read, and a
    line starting with 'Prepared By' ends the section.

    Returns a dict mapping the footnote number (as a string) to its text;
    the dict is empty when the marker is not present.
    """
    start = page_text.find('FOOTNOTES')
    if start == -1:
        return {}

    footnotes = {}
    current_number = None
    current_parts = []

    for raw_line in page_text[start:start + max_chars].split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith('Prepared By'):
            # Everything from here on is page footer, not footnote text.
            break

        match = re.match(r'^(\d+)\s+(.+)', line)
        if match:
            # A new numbered line closes the footnote being accumulated.
            if current_number and current_parts:
                footnotes[current_number] = ' '.join(current_parts).strip()
            current_number = match.group(1)
            current_parts = [match.group(2)]
        elif current_number:
            # Continuation line of the current footnote.
            current_parts.append(line)

    if current_number and current_parts:
        footnotes[current_number] = ' '.join(current_parts).strip()

    return footnotes


def extract_footnotes_from_page():
    """Download URL and load its footnotes into the module-level FOOTNOTE_MAP.

    On a successful fetch FOOTNOTE_MAP is replaced in place with the
    footnotes parsed from the page (so a re-run never keeps stale entries),
    and the count plus a preview of the first three entries is printed.

    Network/HTTP failures are reported on stdout and leave FOOTNOTE_MAP
    untouched; the function never raises for them. Returns None.
    """
    try:
        # A timeout keeps a stalled connection from hanging the script forever.
        response = requests.get(URL, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"‚ùå Erro ao extrair footnotes: {e}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    parsed = _parse_footnotes(soup.get_text())

    # Mutate in place so every reference to FOOTNOTE_MAP sees the new content.
    FOOTNOTE_MAP.clear()
    FOOTNOTE_MAP.update(parsed)

    print(f"‚úÖ Encontradas {len(FOOTNOTE_MAP)} footnotes")
    for num, text in list(FOOTNOTE_MAP.items())[:3]:
        print(f"   [{num}] {text[:60]}...")

def extract_footnote_numbers(text):
    """Return the digit runs in ``text`` that are known footnote numbers.

    Every maximal run of digits in ``text`` is considered, in order of
    appearance; only those present as keys of FOOTNOTE_MAP are kept.
    Non-string input yields an empty list.
    """
    found = []
    if isinstance(text, str):
        for digits in re.findall(r'(\d+)', text):
            if digits in FOOTNOTE_MAP:
                found.append(digits)
    return found

def clean_jurisdiction_name(text):
    """Strip trailing footnote markers from a jurisdiction name.

    Removes any trailing run made of digits, ``*``, ``:``, commas and
    whitespace, so markers such as ``"1"``, ``"1,2"``, ``"*"`` or ``"3 "``
    are dropped as a whole, then trims surrounding whitespace.

    Non-string input is returned unchanged.
    """
    if not isinstance(text, str):
        return text
    # Whitespace and commas are part of the class so that a marker followed
    # by a space, or several comma-separated markers, are removed entirely
    # instead of leaving fragments like "Name1" or "Name1,".
    return re.sub(r'[\d*:,\s]+$', '', text).strip()

def get_law_text(text):
    """Return the footnote texts referenced by ``text`` as one string.

    Each footnote number reported by extract_footnote_numbers() that is a
    key of FOOTNOTE_MAP is rendered as ``"[<number>] <footnote text>"``;
    the entries are joined with ``" | "``. An empty string is returned
    when no footnote is referenced.
    """
    entries = [
        f"[{num}] {FOOTNOTE_MAP[num]}"
        for num in extract_footnote_numbers(text)
        if num in FOOTNOTE_MAP
    ]
    # Joining an empty list already gives '', covering the no-footnote case.
    return ' | '.join(entries)

def is_subcategory_row(text):
    """Tell whether a first-cell label looks like a subcategory row.

    The check is a case-insensitive substring search for any of a fixed
    list of keywords. Non-string input is never a subcategory.
    """
    if not isinstance(text, str):
        return False

    keywords = (
        'hotel', 'restaurant', 'bartender', 'business', 'employer',
        'company', 'establishment', 'enterprise', 'annual sales',
        'gross sales', 'covered by', 'not covered by', 'employees',
        'full-time', 'part-time', 'opportunity', 'seasonal',
    )

    lowered = text.lower()
    for keyword in keywords:
        if keyword in lowered:
            return True
    return False

def detect_regional_variations(text):
    """Find ``<amount> (<label>)`` pairs in a cell's text.

    An amount is a number that starts with a digit, may contain thousands
    commas and a decimal part, and may be preceded by ``$``. It must be
    followed (optionally after whitespace) by a non-empty parenthetical.

    Returns a list of ``{'value': <amount without '$'>, 'region': <label>}``
    dicts in order of appearance, one per occurrence. Non-string input
    yields an empty list.
    """
    if not isinstance(text, str):
        return []

    # One pattern with an optional '$' reports each occurrence exactly once
    # (two overlapping patterns would report every '$' amount twice), and
    # requiring a leading digit stops ", (note)" from counting as an amount.
    pattern = r'\$?(\d[\d,]*\.?\d*)\s*\(([^)]+)\)'

    return [
        {'value': value, 'region': region}
        for value, region in re.findall(pattern, text)
    ]

# A trailing run of digits/asterisks glued to a letter (e.g. "Not specified2"),
# or a bare trailing run of asterisks (e.g. "$7.25*").
_FOOTNOTE_MARKER_RE = re.compile(r'(?<=[A-Za-z])[\d*]+$|\*+$')


def _strip_footnote_markers(value):
    """Remove a trailing footnote marker from a cell without touching amounts."""
    if not isinstance(value, str):
        return value
    # Digits are only dropped when they directly follow a letter: stripping
    # every trailing digit would turn "$7.25" into "$7." and
    # "More than $30" into "More than $".
    return _FOOTNOTE_MARKER_RE.sub('', value).strip()


def _cell_texts(row):
    """Return the stripped text of every <td>/<th> cell of a table row."""
    return [cell.get_text(strip=True) for cell in row.find_all(['td', 'th'])]


def _build_record(jurisdiction, law, observation, value_headers, cells, fallback=None):
    """Build one output row from ``cells`` (index 0 is the row label).

    When ``fallback`` is given, blank or missing cells take the value found
    at the same position in ``fallback``; otherwise missing cells become ''.
    """
    record = {
        'Jurisdiction': jurisdiction,
        'Law': law,
        'Observation': observation,
    }
    for idx, header in enumerate(value_headers, start=1):
        if idx < len(cells) and (fallback is None or cells[idx]):
            record[header] = cells[idx]
        elif fallback is not None and idx < len(fallback):
            record[header] = fallback[idx]
        else:
            record[header] = ''
    return record


def scrape_all_tipped_wages():
    """Scrape the first table at URL into a DataFrame, one row per wage rule.

    A row with at least three cells and a non-empty first cell starts a
    jurisdiction. For each jurisdiction the function emits:

    * one row per distinct parenthetical label found by
      detect_regional_variations() in the jurisdiction's cells, with the
      matching amount written into the column it was found in;
    * one row per following subcategory row (see is_subcategory_row()),
      whose blank cells inherit the jurisdiction row's values;
    * otherwise a single plain row.

    Columns are 'Jurisdiction', 'Law' (footnote text from get_law_text()),
    'Observation' (region label, subcategory label or ''), followed by the
    table's own value columns. Rows are sorted by 'Jurisdiction', keeping
    page order within a jurisdiction. Progress is printed to stdout.

    Raises:
        requests.RequestException: the page could not be fetched.
        ValueError: the page contains no <table>.
    """
    # A timeout keeps a stalled connection from hanging the script forever.
    response = requests.get(URL, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    table = soup.find('table')
    if not table:
        raise ValueError('‚ùå Tabela n√£o encontrada na p√°gina')

    table_rows = table.find_all('tr')

    # Header names come from the first row only: body rows may also contain
    # <th> cells, which must not be mistaken for column headers.
    header_cells = []
    if table_rows:
        header_cells = table_rows[0].find_all('th') or table_rows[0].find_all('td')
    headers = [cell.get_text(strip=True) for cell in header_cells]
    headers.extend(['Law', 'Observation'])

    print(f"üìä Headers encontrados: {headers}")

    # Table columns that carry values (everything but the row label and the
    # two synthetic columns appended above).
    value_headers = headers[1:-2]
    value_header_set = set(value_headers)

    body = [_cell_texts(row) for row in table_rows[1:]]
    data = []

    i = 0
    while i < len(body):
        row_texts = body[i]
        if len(row_texts) < 3 or not row_texts[0]:
            i += 1
            continue

        jurisdiction_raw = row_texts[0]
        current_jurisdiction = clean_jurisdiction_name(jurisdiction_raw)
        current_law = get_law_text(jurisdiction_raw)

        print(f"\nüèõÔ∏è  Processando: {current_jurisdiction}")
        if current_law:
            print(f"   üìã Lei encontrada: {current_law[:100]}...")

        # Collect "<amount> (<label>)" pairs from every cell of the row.
        regional_variations = []
        for col_idx, cell_text in enumerate(row_texts):
            variations = detect_regional_variations(cell_text)
            if not variations:
                continue
            column = headers[col_idx] if col_idx < len(headers) - 2 else 'Unknown'
            print(f"   üåç Varia√ß√µes regionais encontradas em '{column}': {len(variations)}")
            for var in variations:
                regional_variations.append({
                    'column': column,
                    'value': var['value'],
                    'region': var['region'],
                })

        # dict.fromkeys de-duplicates the labels while keeping page order.
        for region in dict.fromkeys(var['region'] for var in regional_variations):
            record = _build_record(
                current_jurisdiction, current_law, region, value_headers, row_texts
            )
            for var in regional_variations:
                # Only value columns are overridden, so a match found in the
                # first cell can never replace the jurisdiction name.
                if var['region'] == region and var['column'] in value_header_set:
                    record[var['column']] = f"${var['value']}"
            data.append(record)

        # Look ahead for subcategory rows belonging to this jurisdiction; the
        # scan stops at the next row that looks like a jurisdiction.
        subcategories = []
        j = i + 1
        while j < len(body):
            sub_texts = body[j]
            label = sub_texts[0] if sub_texts else ''
            is_sub = bool(label) and is_subcategory_row(label)

            if len(sub_texts) >= 3 and label and not is_sub:
                break
            if is_sub:
                subcategories.append(sub_texts)
                print(f"   üë• Subcategoria: {label}")
            j += 1

        if subcategories:
            for sub_texts in subcategories:
                data.append(_build_record(
                    current_jurisdiction, current_law, sub_texts[0],
                    value_headers, sub_texts, fallback=row_texts,
                ))
        elif not regional_variations:
            data.append(_build_record(
                current_jurisdiction, current_law, '', value_headers, row_texts
            ))

        # Resume after the rows consumed by the look-ahead.
        i = j

    if not data:
        # Nothing scraped: return an empty frame with the expected columns
        # instead of failing on the sort below.
        columns = list(dict.fromkeys(['Jurisdiction', 'Law', 'Observation', *value_headers]))
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(data)

    for col in df.columns:
        if col not in ['Law', 'Observation']:
            df[col] = df[col].map(_strip_footnote_markers)

    # A stable sort keeps each jurisdiction's rows in page order.
    df = df.sort_values(by='Jurisdiction', kind='mergesort').reset_index(drop=True)
    return df

if __name__ == '__main__':
    # Footnotes are loaded first: the scraper resolves footnote markers
    # through FOOTNOTE_MAP when building the 'Law' column.
    extract_footnotes_from_page()

    df_tips = scrape_all_tipped_wages()

    # Print a short preview of the first ten rows.
    examples = df_tips.head(10)
    for _, row in examples.iterrows():
        print(f"\nüèõÔ∏è  {row['Jurisdiction']}")

        law_text = row['Law']
        if law_text:
            print(f"   üìã Lei: {law_text[:80]}...")

        observation = row['Observation']
        if observation:
            print(f"   üìù Observa√ß√£o: {observation}")

        basic_wage = row.get('Basic Combined Cash & Tip Minimum Wage Rate', 'N/A')
        print(f"   üí∞ Sal√°rio m√≠n.: {basic_wage}")

    # Save the full table next to the script.
    filename = 'tipped_minimum_wage_complete.csv'
    df_tips.to_csv(filename, index=False, encoding='utf-8')