In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import time
import json
from pathlib import Path
import re


def parse_complete_box_score(soup, box_score_url):
    """Extract all game data from a box score page.

    Args:
        soup: Parsed box score page (a BeautifulSoup document).
        box_score_url: URL or path of the page; copied into the result.

    Returns:
        A flat dict.  It always holds ``box_score_url`` and
        ``parsing_errors`` (a list of messages, empty on success).  When the
        statistics table is found it also holds:

        * ``team1`` / ``team2`` - names from the stats table header;
        * ``team{1,2}_<stat>`` - one string per top-level stat row, where
          ``<stat>`` is the row label lower-cased, spaces replaced by ``_``
          and colons removed;
        * ``team{1,2}_third_down_raw|pct|conversions|att``;
        * ``team{1,2}_fumbles`` / ``_fumbles_lost``;
        * ``team{1,2}_interceptions`` / ``_interception_return_yards``;
        * ``team{1,2}_final_score``;
        * ``team{1,2}_record_after`` / ``_record_before``.

        Every extracted value is kept as a string.

    The function never raises: any unexpected exception is recorded in
    ``parsing_errors`` and whatever was parsed so far is returned.
    """
    game_data = {
        'box_score_url': box_score_url,
        'parsing_errors': [],
    }

    try:
        stats_table = _find_stats_table(soup)
        if stats_table is None:
            game_data['parsing_errors'].append('Stats table not found')
            return game_data

        _read_stats_table(stats_table, game_data)
        _split_third_down(game_data)
        _split_count_stat(game_data, 'fumbles_number-lost',
                          'fumbles', 'fumbles_lost')
        _split_count_stat(game_data, 'interceptions_number-yards',
                          'interceptions', 'interception_return_yards')
        _read_final_scores(soup, game_data)
        _read_team_records(soup, game_data)
    except Exception as e:
        # Top-level boundary: a malformed page must not abort a scraping run.
        game_data['parsing_errors'].append(f'Error: {str(e)}')

    return game_data


# Row labels that only break down the stat above them; they are not stored.
_SUBCATEGORY_LABELS = frozenset({
    'Passing', 'Rushing', 'Penalty', 'Average',
    'Completions-Attempts', 'Rushing Attempts',
    'Total Offensive Plays',
})

# A score is a standalone number; three digits are allowed so that a rare
# 100+ point game is still captured.
_SCORE_RE = re.compile(r'\b(\d{1,3})\b')
# Percentage with an optional decimal part ("33%" or "33.3%").
_PERCENT_RE = re.compile(r'(\d+(?:\.\d+)?)%')
# "X of Y" conversions/attempts, anywhere in the text.
_X_OF_Y_RE = re.compile(r'(\d+)\s+of\s+(\d+)')
# Two "W-L" values in parentheses, e.g. "(0-1, 0-0)".
_RECORD_RE = re.compile(r'\((\d+-\d+),\s*(\d+-\d+)\)')


def _find_stats_table(soup):
    """Return the 'all-center' table containing FIRST DOWNS, or None."""
    found = None
    for table in soup.find_all('table', class_='all-center'):
        if 'FIRST DOWNS' in table.get_text():
            # No break: when several tables match, the last one wins.
            found = table
    return found


def _read_stats_table(stats_table, game_data):
    """Store the team names and every top-level stat row in game_data."""
    header_row = stats_table.find('tr')
    header_cells = header_row.find_all('th') if header_row is not None else []
    if len(header_cells) >= 3:
        # Header layout is [team1, <label column>, team2].
        game_data['team1'] = header_cells[0].text.strip()
        game_data['team2'] = header_cells[2].text.strip()
    else:
        game_data['parsing_errors'].append(
            'Team names not found in stats table header')

    skip_next = False
    for row in stats_table.find_all('tr')[1:]:
        if skip_next:
            skip_next = False
            continue

        cells = row.find_all('td')
        if len(cells) != 3:
            continue

        # Keep newlines: the third down cell spreads its data over lines.
        value1_raw, stat_name_raw, value2_raw = (
            cell.get_text('\n', strip=True) for cell in cells
        )

        # A multi-line label means the cell bundles several stats; skipped.
        if '\n' in stat_name_raw:
            continue

        stat_name = stat_name_raw.strip()
        if stat_name in _SUBCATEGORY_LABELS:
            continue

        # The row right after FIRST DOWNS holds its subcategories.
        if stat_name == 'FIRST DOWNS':
            skip_next = True

        clean_stat_name = stat_name.lower().replace(' ', '_').replace(':', '')
        # For the stored value only the first line of the cell is used.
        game_data[f'team1_{clean_stat_name}'] = value1_raw.split('\n')[0].strip()
        game_data[f'team2_{clean_stat_name}'] = value2_raw.split('\n')[0].strip()

        if 'THIRD DOWN EFFICIENCY' in stat_name:
            # The full text is needed later for the "X of Y" part.
            game_data['team1_third_down_raw'] = value1_raw
            game_data['team2_third_down_raw'] = value2_raw


def _split_third_down(game_data):
    """Derive third down pct / conversions / attempts from the raw text."""
    for team in ('team1', 'team2'):
        raw_text = game_data.get(f'{team}_third_down_raw')
        if raw_text is None:
            continue

        pct_match = _PERCENT_RE.search(raw_text)
        if pct_match:
            game_data[f'{team}_third_down_pct'] = pct_match.group(1)

        attempts_match = _X_OF_Y_RE.search(raw_text)
        if attempts_match:
            game_data[f'{team}_third_down_conversions'] = attempts_match.group(1)
            game_data[f'{team}_third_down_att'] = attempts_match.group(2)


def _split_count_stat(game_data, source_suffix, first_suffix, second_suffix):
    """Split a "count-value" stat ("1    -0", "3-92") into two entries."""
    for team in ('team1', 'team2'):
        text = game_data.get(f'{team}_{source_suffix}')
        if text is None:
            continue
        # The page pads these values with spaces: "1    -0" means "1-0".
        compact = re.sub(r'\s+', '', text)
        if '-' not in compact:
            continue
        # Split on the first dash only so a negative second value
        # ("1--5" = one interception, -5 return yards) keeps its sign.
        first, second = compact.split('-', 1)
        game_data[f'{team}_{first_suffix}'] = first
        game_data[f'{team}_{second_suffix}'] = second


def _read_final_scores(soup, game_data):
    """Find each team's final score in the first table naming both teams."""
    team1_name = game_data.get('team1', '')
    team2_name = game_data.get('team2', '')
    if not team1_name or not team2_name:
        # An empty name is a substring of every cell and would attach
        # unrelated numbers to team1, so there is nothing reliable to do.
        return

    for table in soup.find_all('table'):
        table_text = table.get_text()
        if not (team1_name in table_text and team2_name in table_text
                and _SCORE_RE.search(table_text)):
            continue

        for row in table.find_all('tr'):
            for cell in row.find_all(['td', 'th']):
                cell_text = cell.get_text().strip()
                if team1_name in cell_text:
                    score_key = 'team1_final_score'
                elif team2_name in cell_text:
                    score_key = 'team2_final_score'
                else:
                    continue
                # The score is expected in the same cell as the team name.
                score_match = _SCORE_RE.search(cell_text)
                if score_match:
                    game_data[score_key] = score_match.group(1)

        if 'team1_final_score' in game_data and 'team2_final_score' in game_data:
            break


def _read_team_records(soup, game_data):
    """Store the first two "(W-L, W-L)" patterns found on the page."""
    record_patterns = _RECORD_RE.findall(soup.get_text())

    if len(record_patterns) >= 2:
        # The first pattern is taken as team1, the second as team2.
        # NOTE(review): the after/before labelling is inherited from the
        # original code; the pair might instead be overall/conference
        # records - confirm against the site.
        team1_records, team2_records = record_patterns[0], record_patterns[1]
        game_data['team1_record_after'] = team1_records[0]
        game_data['team1_record_before'] = team1_records[1]
        game_data['team2_record_after'] = team2_records[0]
        game_data['team2_record_before'] = team2_records[1]
    elif len(record_patterns) == 1:
        game_data['parsing_errors'].append('Only found one team record pattern')

In [2]:
import requests
from bs4 import BeautifulSoup

def inspect_box_score_html(box_score_path, timeout=30):
    """Print a diagnostic dump of a box score page's HTML structure.

    Fetches the page from d3football.com and prints, in five numbered
    sections: tables matching candidate CSS classes, every table containing
    digits, team record patterns, third down patterns, and the first rows of
    the statistics table.  Intended for working out why the parser misses
    an element.

    Args:
        box_score_path: Site-relative path of the box score page.
        timeout: Seconds to wait for the server before giving up; without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The parsed page as a BeautifulSoup object.

    Raises:
        requests.RequestException: if the page cannot be fetched (including
            a timeout).  The HTTP status code itself is not checked.
    """
    # Imported locally so the function works on its own in a notebook cell.
    import re

    print(f"INSPECTING HTML STRUCTURE FOR: {box_score_path}")
    print("="*70)

    base_url = "https://www.d3football.com"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    }

    response = requests.get(base_url + box_score_path, headers=headers,
                            timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    # --- Section 1: tables selected by candidate class names ---------------
    print("\n1. LOOKING FOR SCORE TABLES:")
    print("-" * 40)

    possible_score_classes = [
        'stats-fullbox clearfix',
        'stats-fullbox',
        'clearfix',
        'all-center',
    ]

    for class_name in possible_score_classes:
        class_tables = soup.find_all('table', class_=class_name)
        print(f"   Tables with class '{class_name}': {len(class_tables)}")

        for number, table in enumerate(class_tables, start=1):
            print(f"   Table {number} preview: {table.get_text()[:100]}...")

    # --- Section 2: every table that contains at least one digit -----------
    print("\n2. LOOKING FOR ANY TABLES WITH NUMBERS (POTENTIAL SCORES):")
    print("-" * 55)

    for number, table in enumerate(soup.find_all('table'), start=1):
        table_text = table.get_text()
        if any(char.isdigit() for char in table_text):
            print(f"   Table {number}: {table.get('class', 'No class')}")
            print(f"   Preview: {table_text[:150]}...")
            print("   Full HTML snippet:")
            print(f"   {str(table)[:300]}...")
            print()

    # --- Section 3: "(W-L, W-L)" record patterns in the page text ----------
    print("\n3. SEARCHING FOR TEAM RECORDS PATTERNS:")
    print("-" * 45)

    page_text = soup.get_text()

    record_pattern = r'\(\s*\d+-\d+\s*,\s*\d+-\d+\s*\)'
    record_matches = re.findall(record_pattern, page_text)

    if record_matches:
        print(f"   Found {len(record_matches)} potential record patterns:")
        for match in record_matches:
            print(f"   - {match}")
    else:
        print("   No record patterns found with format '(W-L, W-L)'")

        # Fall back to any bare W-L pair to see what the page does contain.
        simple_pattern = r'\d+-\d+'
        simple_matches = re.findall(simple_pattern, page_text)
        print(f"   Found {len(simple_matches)} W-L patterns: {simple_matches[:10]}...")

    # --- Section 4: third down notations ------------------------------------
    print("\n4. SEARCHING FOR THIRD DOWN PATTERNS:")
    print("-" * 42)

    third_down_patterns = [
        r'\d+\s+of\s+\d+',  # "2 of 15"
        r'\d+%\s*\(\s*\d+\s+of\s+\d+\s*\)',  # "67% (10 of 15)"
        r'\(\s*\d+\s+of\s+\d+\s*\)',  # "(10 of 15)"
    ]

    for pattern in third_down_patterns:
        pattern_matches = re.findall(pattern, page_text)
        if pattern_matches:
            print(f"   Pattern '{pattern}' found {len(pattern_matches)} times:")
            for match in pattern_matches[:5]:  # Show first 5
                print(f"   - {match}")

    # --- Section 5: layout of the statistics table --------------------------
    print("\n5. LOOKING FOR ACTUAL STATS TABLE STRUCTURE:")
    print("-" * 50)

    # First 'all-center' table that mentions FIRST DOWNS, if any.
    stats_table = None
    for table in soup.find_all('table', class_='all-center'):
        if 'FIRST DOWNS' in table.get_text():
            stats_table = table
            break

    if stats_table:
        print("   Found stats table with FIRST DOWNS")
        print("   Table structure:")

        for i, row in enumerate(stats_table.find_all('tr')[:10]):  # First 10 rows
            cells = row.find_all(['td', 'th'])
            if cells:
                cell_texts = [cell.get_text().strip() for cell in cells]
                print(f"   Row {i}: {cell_texts}")
    else:
        # Say so explicitly instead of ending the report with an empty section.
        print("   No 'all-center' table containing FIRST DOWNS was found")

    return soup

In [3]:
import requests
from bs4 import BeautifulSoup

def test_box_score_parsing(box_score_path, expected_results=None, timeout=30):
    """Fetch one box score, run the parser on it and print a test report.

    The report covers parsing errors, team names, final scores, team
    records, key statistics, third down and fumble parsing, a table of every
    extracted stat and, optionally, a comparison with expected values.

    Args:
        box_score_path: Site-relative path of the box score page.
        expected_results: Optional dict mapping parser output keys to the
            exact values they must have.
        timeout: Seconds to wait for the server before giving up; without a
            timeout ``requests`` may block indefinitely.

    Returns:
        True when at least 4 of the 5 success criteria are met and, if
        ``expected_results`` was given, every expected value matched.
        False otherwise, including when the page cannot be fetched.
    """
    # Imported locally so the function works on its own in a notebook cell.
    import re

    print(f"\n{'='*60}")
    print(f"TESTING BOX SCORE: {box_score_path}")
    print(f"{'='*60}")

    base_url = "https://www.d3football.com"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    }

    try:
        response = requests.get(base_url + box_score_path, headers=headers,
                                timeout=timeout)
    except requests.RequestException as e:
        print(f"✗ Request failed: {e}")
        return False

    print(f"✓ HTTP Status: {response.status_code}")
    if response.status_code != 200:
        print("✗ Failed to fetch page")
        return False

    soup = BeautifulSoup(response.text, 'html.parser')
    game_stats = parse_complete_box_score(soup, box_score_path)

    # Test 1: Basic Structure
    print("\n1. BASIC STRUCTURE TESTS:")
    print(f"   - Box score URL: {game_stats.get('box_score_url', 'MISSING')}")
    print(f"   - Parsing errors: {len(game_stats.get('parsing_errors', []))}")

    if game_stats['parsing_errors']:
        print(f"   ✗ Errors found: {game_stats['parsing_errors']}")
    else:
        print("   ✓ No parsing errors")

    # Test 2: Team Names
    print("\n2. TEAM NAME TESTS:")
    team1 = game_stats.get('team1', 'MISSING')
    team2 = game_stats.get('team2', 'MISSING')
    print(f"   - Team 1: {team1}")
    print(f"   - Team 2: {team2}")

    if team1 != 'MISSING' and team2 != 'MISSING':
        print("   ✓ Both team names extracted")
    else:
        print("   ✗ Missing team names")

    # Test 3: Final Scores
    print("\n3. FINAL SCORE TESTS:")
    team1_score = game_stats.get('team1_final_score', 'MISSING')
    team2_score = game_stats.get('team2_final_score', 'MISSING')
    print(f"   - {team1} final score: {team1_score}")
    print(f"   - {team2} final score: {team2_score}")

    # A missing score is not checked here; it is covered by its own
    # criterion in the overall assessment.
    scores_valid = True
    try:
        if team1_score != 'MISSING':
            int(team1_score)
        if team2_score != 'MISSING':
            int(team2_score)
        print("   ✓ Scores are numeric")
    except ValueError:
        print("   ✗ Scores are not valid numbers")
        scores_valid = False

    # Test 4: Team Records
    print("\n4. TEAM RECORD TESTS:")
    record_format = re.compile(r'^\d+-\d+$')  # e.g. "2-1" or "0-3"
    for team_num in [1, 2]:
        team_name = game_stats.get(f'team{team_num}', f'Team{team_num}')
        record_before = game_stats.get(f'team{team_num}_record_before', 'MISSING')
        record_after = game_stats.get(f'team{team_num}_record_after', 'MISSING')

        print(f"   - {team_name}:")
        print(f"     Record before: {record_before}")
        print(f"     Record after:  {record_after}")

        for label, record in (('Before', record_before), ('After', record_after)):
            if record == 'MISSING':
                continue
            if record_format.match(record):
                print(f"     ✓ {label} record format valid")
            else:
                print(f"     ✗ {label} record format invalid")

    # Test 5: Key Statistics
    print("\n5. KEY STATISTICS TESTS:")

    # Stat names are the team1_ keys with only the leading prefix removed.
    prefix = 'team1_'
    stat_names = {key[len(prefix):] for key in game_stats if key.startswith(prefix)}

    print(f"   - Total unique stats extracted: {len(stat_names)}")

    key_stats = ['first_downs', 'third_down_efficiency', 'total_offense',
                 'fumbles_number-lost', 'interceptions', 'time_of_possession']

    found_key_stats = 0
    for stat in key_stats:
        if stat in stat_names:
            found_key_stats += 1
            print(f"   ✓ Found: {stat}")
        else:
            print(f"   ✗ Missing: {stat}")

    print(f"   - Key stats found: {found_key_stats}/{len(key_stats)}")

    # Test 6: Third Down Efficiency Parsing
    print("\n6. THIRD DOWN EFFICIENCY PARSING:")
    for team_num in [1, 2]:
        team_name = game_stats.get(f'team{team_num}', f'Team{team_num}')
        td_raw = game_stats.get(f'team{team_num}_third_down_efficiency', 'MISSING')
        td_raw_full = game_stats.get(f'team{team_num}_third_down_raw', 'MISSING')
        td_pct = game_stats.get(f'team{team_num}_third_down_pct', 'MISSING')
        td_conv = game_stats.get(f'team{team_num}_third_down_conversions', 'MISSING')
        td_att = game_stats.get(f'team{team_num}_third_down_att', 'MISSING')

        print(f"   - {team_name}:")
        print(f"     Raw (short): {td_raw}")
        print(f"     Raw (full): {repr(td_raw_full)}")  # repr makes newlines visible
        print(f"     Percentage: {td_pct}")
        print(f"     Conversions: {td_conv}")
        print(f"     Attempts: {td_att}")

        if td_pct != 'MISSING' and td_att != 'MISSING' and td_conv != 'MISSING':
            print("     ✓ Third down data parsed successfully")
        else:
            print("     ✗ Third down parsing incomplete")

    # Test 7: Fumbles Parsing
    print("\n7. FUMBLES PARSING:")
    for team_num in [1, 2]:
        team_name = game_stats.get(f'team{team_num}', f'Team{team_num}')
        fumbles_raw = game_stats.get(f'team{team_num}_fumbles_number-lost', 'MISSING')
        fumbles = game_stats.get(f'team{team_num}_fumbles', 'MISSING')
        fumbles_lost = game_stats.get(f'team{team_num}_fumbles_lost', 'MISSING')

        print(f"   - {team_name}:")
        print(f"     Raw: {fumbles_raw}")
        print(f"     Fumbles: {fumbles}")
        print(f"     Lost: {fumbles_lost}")

        if fumbles != 'MISSING' and fumbles_lost != 'MISSING':
            print("     ✓ Fumbles data parsed successfully")
        else:
            print("     ✗ Fumbles parsing incomplete")

    # Test 8: Display All Stats in Organized Way
    print("\n8. COMPLETE STATISTICS SUMMARY:")
    print(f"   {'':<30} {'Team 1':<15} {'Team 2':<15}")
    print(f"   {'-'*30} {'-'*15} {'-'*15}")

    for stat in sorted(stat_names):
        team1_val = game_stats.get(f'team1_{stat}', 'N/A')
        team2_val = game_stats.get(f'team2_{stat}', 'N/A')
        print(f"   {stat:<30} {str(team1_val):<15} {str(team2_val):<15}")

    # Test 9: Expected Results Validation (if provided)
    expectations_met = True
    if expected_results:
        print("\n9. EXPECTED RESULTS VALIDATION:")

        for key, expected_value in expected_results.items():
            actual_value = game_stats.get(key, 'MISSING')
            if actual_value == expected_value:
                print(f"   ✓ {key}: {actual_value}")
            else:
                print(f"   ✗ {key}: expected '{expected_value}', got '{actual_value}'")
                expectations_met = False

        if expectations_met:
            print("   ✓ All expected results match!")
        else:
            print("   ✗ Some expected results don't match")

    # Overall Success Assessment
    print(f"\n{'='*60}")
    success_criteria = [
        len(game_stats['parsing_errors']) == 0,
        team1 != 'MISSING' and team2 != 'MISSING',
        team1_score != 'MISSING' and team2_score != 'MISSING',
        found_key_stats >= len(key_stats) // 2,  # At least half the key stats
        scores_valid,
    ]

    success_count = sum(success_criteria)
    print(f"SUCCESS RATE: {success_count}/{len(success_criteria)} criteria met")

    # A mismatch against values the caller explicitly asked for always fails
    # the run, however many of the generic criteria were met.
    if success_count >= 4 and expectations_met:
        print("✓ OVERALL RESULT: PASSED")
        return True

    print("✗ OVERALL RESULT: FAILED")
    return False


# Script entry point: run the full parsing report against one known box score.
if __name__ == '__main__':
    test_box_score_parsing('/seasons/2022/boxscores/20220903_pwzg.xml')

    # To also check specific values, pass a dict of expected results:
    # expected = {
    #     'team1': 'Some Team',
    #     'team2': 'Other Team',
    #     'team1_final_score': '21',
    #     'team2_final_score': '14'
    # }
    # test_box_score_parsing("/seasons/2022/boxscores/20220903_pwzg.xml", expected)


TESTING BOX SCORE: /seasons/2022/boxscores/20220903_pwzg.xml
✓ HTTP Status: 200

1. BASIC STRUCTURE TESTS:
   - Box score URL: /seasons/2022/boxscores/20220903_pwzg.xml
   - Parsing errors: 0
   ✓ No parsing errors

2. TEAM NAME TESTS:
   - Team 1: Defiance
   - Team 2: Mount Union
   ✓ Both team names extracted

3. FINAL SCORE TESTS:
   - Defiance final score: 0
   - Mount Union final score: 65
   ✓ Scores are numeric

4. TEAM RECORD TESTS:
   - Defiance:
     Record before: 0-0
     Record after:  0-1
     ✓ Before record format valid
     ✓ After record format valid
   - Mount Union:
     Record before: 0-0
     Record after:  1-0
     ✓ Before record format valid
     ✓ After record format valid

5. KEY STATISTICS TESTS:
   - Total unique stats extracted: 24
   ✓ Found: first_downs
   ✓ Found: third_down_efficiency
   ✓ Found: total_offense
   ✓ Found: fumbles_number-lost
   ✓ Found: interceptions
   ✓ Found: time_of_possession
   - Key stats found: 6/6

6. THIRD DOWN EFFICIENCY P

In [4]:
import requests
from bs4 import BeautifulSoup


# You'll need to update this with an actual box score path
base_url = "https://www.d3football.com"
box_score_path = "/seasons/2022/boxscores/20220903_pwzg.xml"  # Get this from clicking a BX link

# Browser-like User-Agent sent with the request
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
}

# The timeout keeps the cell from hanging forever if the site never answers
response = requests.get(base_url + box_score_path, headers=headers, timeout=30)
print(f"Status: {response.status_code}")

# Parse the HTML and run the parser on it
soup = BeautifulSoup(response.text, 'html.parser')
game_stats = parse_complete_box_score(soup, box_score_path)

# Stat names are the team1_ keys with only the leading prefix removed; the
# same set drives both the count and the listing so the two always agree
stat_names = {key[len('team1_'):] for key in game_stats if key.startswith('team1_')}

# Display results
print(f"Teams: {game_stats.get('team1', 'Unknown')} vs {game_stats.get('team2', 'Unknown')}")
print(f"\nTotal stats found: {len(stat_names)}")

# Show all the stats we extracted
print("\nExtracted Statistics:")
print("-" * 50)

# One line per stat, team1 value against team2 value
for stat in sorted(stat_names):
    team1_val = game_stats.get(f'team1_{stat}', 'N/A')
    team2_val = game_stats.get(f'team2_{stat}', 'N/A')
    print(f"{stat:30} {team1_val:>10} vs {team2_val:>10}")

# Check for any errors
if game_stats['parsing_errors']:
    print("\nErrors encountered:", game_stats['parsing_errors'])

# First, run the inspector (this fetches the same page again)
soup = inspect_box_score_html(box_score_path)

# Then test the improved parser
test_box_score_parsing(box_score_path)

Status: 200
Teams: Defiance vs Mount Union

Total stats found: 24

Extracted Statistics:
--------------------------------------------------
final_score                             0 vs         65
first_downs                             9 vs         28
fourth_down_efficiency                 0% vs       100%
fumbles                                 1 vs          0
fumbles_lost                            0 vs          0
fumbles_number-lost               1    -0 vs    0    -0
interception_return_yards               0 vs         92
interceptions                           1 vs          3
interceptions_number-yards            1-0 vs       3-92
net_yards_passing                      74 vs        226
net_yards_rushing                      77 vs        361
penalties_number-yards               3-25 vs       2-10
punts_number-yards                 11-325 vs       1-16
record_after                          0-1 vs        1-0
record_before                         0-0 vs        0-0
sacks_number-yards  

True