In [None]:
%pip install -q openpyxl
%pip install -q pyarrow

In [None]:
# Initial Functions Setup
def normalize_name(name):
    """Build a ``lastname_initial`` matching key from a player name.

    Periods are stripped and the text is lower-cased. If the final token is
    a single letter it is taken as the first initial ("Lastname F" layout);
    otherwise the final token is the surname and the initial is the first
    letter of the first token ("First Lastname" layout). Missing values
    give "" and names with fewer than two tokens are returned lower-cased
    with spaces turned into underscores.
    """
    if pd.isna(name):
        return ""
    cleaned = str(name).replace('.', '').lower()
    tokens = cleaned.split()
    if len(tokens) < 2:
        return cleaned.replace(' ', '_')
    tail = tokens[-1]
    if len(tail) == 1:
        # "Lastname F": the trailing letter is the initial.
        surname, initial = tokens[-2], tail
    else:
        # "First Lastname": tokens from split() are never empty.
        surname, initial = tail, tokens[0][0]
    return f"{surname}_{initial}"

def normalize_jeff_name(name):
    """Build a ``lastname_initial`` key from a "First ... Last" player name.

    The name is lower-cased; the last token is the surname and the initial
    is the first letter of the first token. Missing values give "" and
    names with fewer than two tokens are returned lower-cased with spaces
    turned into underscores.
    """
    if pd.isna(name):
        return ""
    lowered = str(name).lower()
    tokens = lowered.split()
    if len(tokens) >= 2:
        return f"{tokens[-1]}_{tokens[0][0]}"
    return lowered.replace(' ', '_')

def normalize_tournament_name(name):
    """Lower-case a tournament name and fold year-end finals aliases.

    'masters cup', 'atp finals' and 'wta finals' are each rewritten to
    'masters' (in that order); surrounding whitespace is removed. Missing
    values give "".
    """
    if pd.isna(name):
        return ""
    result = str(name).lower()
    for alias in ('masters cup', 'atp finals', 'wta finals'):
        result = result.replace(alias, 'masters')
    return result.strip()

def load_excel_data(file_path):
    """Read one Excel workbook of matches into a DataFrame.

    Returns the loaded frame when it has a 'Date' column. Returns an empty
    DataFrame (after printing a message) when the column is absent or when
    reading fails for any reason.
    """
    try:
        frame = pd.read_excel(file_path)
        if 'Date' in frame.columns:
            print(f"Loaded {len(frame)} matches from {file_path}")
            return frame
        print(f"Warning: No Date column in {file_path}")
    except Exception as e:
        # Best-effort loader: report the problem and hand back an empty frame.
        print(f"Error loading {file_path}: {e}")
    return pd.DataFrame()

def get_tournament_tier_weight(tournament_name: str) -> float:
    """Map a tournament name to a tier weight by substring matching.

    Tiers are checked in order and the first hit wins: Grand Slams (1.0),
    the listed Masters events (0.9), the listed 500-level events (0.7),
    then ITF/Challenger/juniors (0.2). Missing or unmatched names get 0.5.
    """
    if pd.isna(tournament_name):
        return 0.5
    lowered = tournament_name.lower()
    # Ordered (weight, keywords) table; order matters because a name can
    # contain keywords from more than one tier.
    tiers = (
        (1.0, ('roland garros', 'wimbledon', 'australian open', 'us open')),
        (0.9, ('indian wells', 'miami', 'monte carlo', 'madrid', 'rome',
               'canada', 'cincinnati', 'shanghai', 'paris masters')),
        (0.7, ('stuttgart', 'barcelona', 'hamburg', 'halle', 'queens',
               'washington', 'dubai', 'rotterdam')),
        (0.2, ('itf', 'challenger', 'juniors')),
    )
    for weight, keywords in tiers:
        for keyword in keywords:
            if keyword in lowered:
                return weight
    return 0.5

def calculate_recency_weight(match_date, reference_date='2025-07-01'):
    """Return an exponential-decay weight for how recent a match is.

    The weight is ``exp(-0.0005 * days)`` where ``days`` is the whole-day
    gap from ``match_date`` to ``reference_date``.

    Args:
        match_date: a 'YYYYMMDD' string, a ``datetime`` (including
            subclasses), or any object exposing ``year``/``month``/``day``
            such as ``datetime.date``.
        reference_date: 'YYYY-MM-DD' string the gap is measured against.

    Returns:
        The decay weight, or 0.5 when the input cannot be interpreted.
    """
    try:
        if isinstance(match_date, str):
            match_dt = datetime.strptime(match_date, '%Y%m%d')
        elif isinstance(match_date, datetime):
            match_dt = match_date
        else:
            # Plain dates cannot be subtracted from a datetime; promote them
            # to midnight so they are weighted instead of falling back to 0.5.
            match_dt = datetime(match_date.year, match_date.month, match_date.day)
        ref_dt = datetime.strptime(reference_date, '%Y-%m-%d')
        days_ago = (ref_dt - match_dt).days
        return np.exp(-0.0005 * days_ago)
    except (TypeError, ValueError, AttributeError, OverflowError):
        # Unparseable or unsupported input: neutral fallback weight.
        return 0.5

def load_jeff_comprehensive_data():
    """Read every available charting CSV under the Jeff data directory.

    Returns a dict ``{'men': {...}, 'women': {...}}`` mapping dataset keys
    to DataFrames. Missing gender folders or files are skipped. Frames that
    have a 'player' column gain a 'Player_canonical' column computed with
    ``normalize_jeff_name``.
    """
    root = os.path.expanduser("~/Desktop/data/Jeff 6.14.25")
    # Dataset key -> filename template; '{}' becomes 'm' or 'w'.
    templates = {
        'matches': 'charting-{}-matches.csv',
        'points_2020s': 'charting-{}-points-2020s.csv',
        'overview': 'charting-{}-stats-Overview.csv',
        'serve_basics': 'charting-{}-stats-ServeBasics.csv',
        'return_outcomes': 'charting-{}-stats-ReturnOutcomes.csv',
        'return_depth': 'charting-{}-stats-ReturnDepth.csv',
        'key_points_serve': 'charting-{}-stats-KeyPointsServe.csv',
        'key_points_return': 'charting-{}-stats-KeyPointsReturn.csv',
        'net_points': 'charting-{}-stats-NetPoints.csv',
        'rally': 'charting-{}-stats-Rally.csv',
        'serve_direction': 'charting-{}-stats-ServeDirection.csv',
        'serve_influence': 'charting-{}-stats-ServeInfluence.csv',
        'shot_direction': 'charting-{}-stats-ShotDirection.csv',
        'shot_dir_outcomes': 'charting-{}-stats-ShotDirOutcomes.csv',
        'shot_types': 'charting-{}-stats-ShotTypes.csv',
        'snv': 'charting-{}-stats-SnV.csv',
        'sv_break_split': 'charting-{}-stats-SvBreakSplit.csv',
        'sv_break_total': 'charting-{}-stats-SvBreakTotal.csv'
    }
    data = {'men': {}, 'women': {}}

    for gender, code in (('men', 'm'), ('women', 'w')):
        folder = os.path.join(root, gender)
        if not os.path.exists(folder):
            continue
        for key, template in templates.items():
            filename = template.format(code)
            file_path = os.path.join(folder, filename)
            if not os.path.exists(file_path):
                continue
            df = pd.read_csv(file_path, low_memory=False)
            if 'player' in df.columns:
                df['Player_canonical'] = df['player'].apply(normalize_jeff_name)
            data[gender][key] = df
            print(f"Loaded {gender}/{filename}: {len(df)} records")
    return data

def load_all_tennis_data():
    """Concatenate the yearly match workbooks for both tours.

    Looks for ``<year>_m.xlsx`` / ``<year>_w.xlsx`` (2020 through 2025)
    under the men's and women's folders, parses 'Date', and tags each frame
    with 'gender' ("M"/"W") and 'year'. Returns one combined DataFrame, or
    an empty DataFrame when nothing was loaded.
    """
    root = os.path.expanduser("~/Desktop/data")
    frames = []

    for folder_name, gender_code in (("tennisdata_men", "M"), ("tennisdata_women", "W")):
        folder = os.path.join(root, folder_name)
        if not os.path.exists(folder):
            continue
        suffix = gender_code.lower()
        for year in range(2020, 2026):
            workbook = os.path.join(folder, f"{year}_{suffix}.xlsx")
            if not os.path.exists(workbook):
                continue
            df = load_excel_data(workbook)
            if df.empty or 'Date' not in df.columns:
                continue
            # Unparseable dates become NaT rather than raising.
            df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
            df['gender'] = gender_code
            df['year'] = df['Date'].dt.year
            frames.append(df)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [None]:
import numpy as np
import os
import requests
from datetime import datetime, date, timedelta
import re
import pandas as pd
import time
import pickle
import shutil
from unidecode import unidecode

# API Configuration
# --- API‑Tennis authentication ---
# NOTE(review): this credential is hard-coded in source (it is assigned again
# further down the file). Anyone with access to the notebook can read it;
# consider rotating it and loading it from the environment instead.
API_KEY = "adfc70491c47895e5fffdc6428bbf36a561989d4bffcfa9ecfba8d91e947b4fb"
# Single endpoint; the API method is selected via the `method` query parameter.
BASE = "https://api.api-tennis.com/tennis/"

def call(method: str, **params):
    """Issue one GET request to the API and return its ``result`` field.

    ``method`` and the module-level ``API_KEY`` are sent as query
    parameters together with any extra ``params``.

    Raises:
        requests.HTTPError: on a non-2xx response.
        RuntimeError: when the JSON body's ``error`` field is not "0";
            the whole payload is attached.
        KeyError: when the body carries no ``result`` field.
    """
    query = {"method": method, "APIkey": API_KEY}
    query.update(params)
    response = requests.get(BASE, params=query, timeout=30)
    response.raise_for_status()
    payload = response.json()
    error_code = str(payload.get("error", "0"))
    if error_code == "0":
        return payload["result"]
    raise RuntimeError(payload)

def normalize_name(name):
    """Build a ``lastname_initial`` matching key from a player name.

    Periods are stripped and the text is lower-cased. If the final token is
    a single letter it is taken as the first initial ("Lastname F" layout);
    otherwise the final token is the surname and the initial is the first
    letter of the first token ("First Lastname" layout). Missing values
    give "" and names with fewer than two tokens are returned lower-cased
    with spaces turned into underscores.
    """
    if pd.isna(name):
        return ""
    cleaned = str(name).replace('.', '').lower()
    tokens = cleaned.split()
    if len(tokens) < 2:
        return cleaned.replace(' ', '_')
    tail = tokens[-1]
    if len(tail) == 1:
        # "Lastname F": the trailing letter is the initial.
        surname, initial = tokens[-2], tail
    else:
        # "First Lastname": tokens from split() are never empty.
        surname, initial = tail, tokens[0][0]
    return f"{surname}_{initial}"

def normalize_jeff_name(name):
    """Build a ``lastname_initial`` key from a "First ... Last" player name.

    The name is lower-cased; the last token is the surname and the initial
    is the first letter of the first token. Missing values give "" and
    names with fewer than two tokens are returned lower-cased with spaces
    turned into underscores.
    """
    if pd.isna(name):
        return ""
    lowered = str(name).lower()
    tokens = lowered.split()
    if len(tokens) >= 2:
        return f"{tokens[-1]}_{tokens[0][0]}"
    return lowered.replace(' ', '_')

def normalize_tournament_name(name):
    """Lower-case a tournament name and fold year-end finals aliases.

    'masters cup', 'atp finals' and 'wta finals' are each rewritten to
    'masters' (in that order); surrounding whitespace is removed. Missing
    values give "".
    """
    if pd.isna(name):
        return ""
    result = str(name).lower()
    for alias in ('masters cup', 'atp finals', 'wta finals'):
        result = result.replace(alias, 'masters')
    return result.strip()

def load_excel_data(file_path):
    """Read one Excel workbook of matches into a DataFrame.

    Returns the loaded frame when it has a 'Date' column. Returns an empty
    DataFrame (after printing a message) when the column is absent or when
    reading fails for any reason.
    """
    try:
        frame = pd.read_excel(file_path)
        if 'Date' in frame.columns:
            print(f"Loaded {len(frame)} matches from {file_path}")
            return frame
        print(f"Warning: No Date column in {file_path}")
    except Exception as e:
        # Best-effort loader: report the problem and hand back an empty frame.
        print(f"Error loading {file_path}: {e}")
    return pd.DataFrame()

def get_tournament_tier_weight(tournament_name: str) -> float:
    """Map a tournament name to a tier weight by substring matching.

    Tiers are checked in order and the first hit wins: Grand Slams (1.0),
    the listed Masters events (0.9), the listed 500-level events (0.7),
    then ITF/Challenger/juniors (0.2). Missing or unmatched names get 0.5.
    """
    if pd.isna(tournament_name):
        return 0.5
    lowered = tournament_name.lower()
    # Ordered (weight, keywords) table; order matters because a name can
    # contain keywords from more than one tier.
    tiers = (
        (1.0, ('roland garros', 'wimbledon', 'australian open', 'us open')),
        (0.9, ('indian wells', 'miami', 'monte carlo', 'madrid', 'rome',
               'canada', 'cincinnati', 'shanghai', 'paris masters')),
        (0.7, ('stuttgart', 'barcelona', 'hamburg', 'halle', 'queens',
               'washington', 'dubai', 'rotterdam')),
        (0.2, ('itf', 'challenger', 'juniors')),
    )
    for weight, keywords in tiers:
        for keyword in keywords:
            if keyword in lowered:
                return weight
    return 0.5

def calculate_recency_weight(match_date, reference_date='2025-07-01'):
    """Return an exponential-decay weight for how recent a match is.

    The weight is ``exp(-0.0005 * days)`` where ``days`` is the whole-day
    gap from ``match_date`` to ``reference_date``.

    Args:
        match_date: a 'YYYYMMDD' string, a ``datetime`` (including
            subclasses), or any object exposing ``year``/``month``/``day``
            such as ``datetime.date``.
        reference_date: 'YYYY-MM-DD' string the gap is measured against.

    Returns:
        The decay weight, or 0.5 when the input cannot be interpreted.
    """
    try:
        if isinstance(match_date, str):
            match_dt = datetime.strptime(match_date, '%Y%m%d')
        elif isinstance(match_date, datetime):
            match_dt = match_date
        else:
            # Plain dates cannot be subtracted from a datetime; promote them
            # to midnight so they are weighted instead of falling back to 0.5.
            match_dt = datetime(match_date.year, match_date.month, match_date.day)
        ref_dt = datetime.strptime(reference_date, '%Y-%m-%d')
        days_ago = (ref_dt - match_dt).days
        return np.exp(-0.0005 * days_ago)
    except (TypeError, ValueError, AttributeError, OverflowError):
        # Unparseable or unsupported input: neutral fallback weight.
        return 0.5

def load_jeff_comprehensive_data():
    """Read every available charting CSV under the Jeff data directory.

    Returns a dict ``{'men': {...}, 'women': {...}}`` mapping dataset keys
    to DataFrames. Missing gender folders or files are skipped. Frames that
    have a 'player' column gain a 'Player_canonical' column computed with
    ``normalize_jeff_name``.
    """
    root = os.path.expanduser("~/Desktop/data/Jeff 6.14.25")
    # Dataset key -> filename template; '{}' becomes 'm' or 'w'.
    templates = {
        'matches': 'charting-{}-matches.csv',
        'points_2020s': 'charting-{}-points-2020s.csv',
        'overview': 'charting-{}-stats-Overview.csv',
        'serve_basics': 'charting-{}-stats-ServeBasics.csv',
        'return_outcomes': 'charting-{}-stats-ReturnOutcomes.csv',
        'return_depth': 'charting-{}-stats-ReturnDepth.csv',
        'key_points_serve': 'charting-{}-stats-KeyPointsServe.csv',
        'key_points_return': 'charting-{}-stats-KeyPointsReturn.csv',
        'net_points': 'charting-{}-stats-NetPoints.csv',
        'rally': 'charting-{}-stats-Rally.csv',
        'serve_direction': 'charting-{}-stats-ServeDirection.csv',
        'serve_influence': 'charting-{}-stats-ServeInfluence.csv',
        'shot_direction': 'charting-{}-stats-ShotDirection.csv',
        'shot_dir_outcomes': 'charting-{}-stats-ShotDirOutcomes.csv',
        'shot_types': 'charting-{}-stats-ShotTypes.csv',
        'snv': 'charting-{}-stats-SnV.csv',
        'sv_break_split': 'charting-{}-stats-SvBreakSplit.csv',
        'sv_break_total': 'charting-{}-stats-SvBreakTotal.csv'
    }
    data = {'men': {}, 'women': {}}

    for gender, code in (('men', 'm'), ('women', 'w')):
        folder = os.path.join(root, gender)
        if not os.path.exists(folder):
            continue
        for key, template in templates.items():
            filename = template.format(code)
            file_path = os.path.join(folder, filename)
            if not os.path.exists(file_path):
                continue
            df = pd.read_csv(file_path, low_memory=False)
            if 'player' in df.columns:
                df['Player_canonical'] = df['player'].apply(normalize_jeff_name)
            data[gender][key] = df
            print(f"Loaded {gender}/{filename}: {len(df)} records")
    return data

def load_all_tennis_data():
    """Concatenate the yearly match workbooks for both tours.

    Looks for ``<year>_m.xlsx`` / ``<year>_w.xlsx`` (2020 through 2025)
    under the men's and women's folders, parses 'Date', and tags each frame
    with 'gender' ("M"/"W") and 'year'. Returns one combined DataFrame, or
    an empty DataFrame when nothing was loaded.
    """
    root = os.path.expanduser("~/Desktop/data")
    frames = []

    for folder_name, gender_code in (("tennisdata_men", "M"), ("tennisdata_women", "W")):
        folder = os.path.join(root, folder_name)
        if not os.path.exists(folder):
            continue
        suffix = gender_code.lower()
        for year in range(2020, 2026):
            workbook = os.path.join(folder, f"{year}_{suffix}.xlsx")
            if not os.path.exists(workbook):
                continue
            df = load_excel_data(workbook)
            if df.empty or 'Date' not in df.columns:
                continue
            # Unparseable dates become NaT rather than raising.
            df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
            df['gender'] = gender_code
            df['year'] = df['Date'].dt.year
            frames.append(df)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

def get_fallback_defaults(gender_key):
    """Return a fresh dict of static per-match feature defaults.

    The base values apply to every ``gender_key``; when it is exactly
    'women', a handful of serve and net figures are overridden.
    """
    defaults = dict(
        serve_pts=80, aces=6, double_faults=3, first_serve_pct=0.62,
        first_serve_won=35, second_serve_won=16, break_points_saved=4,
        return_pts_won=30, winners_total=28, winners_fh=16, winners_bh=12,
        unforced_errors=28, unforced_fh=16, unforced_bh=12,
        serve_wide_pct=0.3, serve_t_pct=0.4, serve_body_pct=0.3,
        return_deep_pct=0.4, return_shallow_pct=0.3, return_very_deep_pct=0.2,
        key_points_serve_won_pct=0.6, key_points_aces_pct=0.05, key_points_first_in_pct=0.55,
        key_points_return_won_pct=0.35, key_points_return_winners=0.02,
        net_points_won_pct=0.65, net_winners_pct=0.3, passed_at_net_pct=0.3,
        rally_server_winners_pct=0.15, rally_server_unforced_pct=0.2,
        rally_returner_winners_pct=0.1, rally_returner_unforced_pct=0.25,
        shot_crosscourt_pct=0.5, shot_down_line_pct=0.25, shot_inside_out_pct=0.15,
        serve_volley_frequency=0.02, serve_volley_success_pct=0.6,
        return_error_net_pct=0.1, return_error_wide_pct=0.05,
        aggression_index=0.5, consistency_index=0.5, pressure_performance=0.5, net_game_strength=0.5,
    )

    if gender_key != 'women':
        return defaults

    women_overrides = (
        ('serve_pts', 75), ('aces', 4), ('first_serve_pct', 0.60),
        ('first_serve_won', 32), ('second_serve_won', 15),
        ('serve_volley_frequency', 0.01), ('net_points_won_pct', 0.60),
    )
    for feature, value in women_overrides:
        defaults[feature] = value
    return defaults

def calculate_comprehensive_weighted_defaults(jeff_data):
    """Derive per-gender default feature values from the charting data.

    For each gender that has a 'matches' table (the table is only checked
    for presence, not read), the medians of the 'Total' rows of the
    'overview' table supply the basic serve/return/winner/error defaults;
    every feature still missing afterwards is filled from
    ``get_fallback_defaults``. Genders without a 'matches' table keep an
    empty dict in the result.

    Returns:
        ``{'men': {...}, 'women': {...}}`` with float feature values.
    """
    print("Calculating comprehensive weighted defaults from Jeff's data...")

    # (feature name, overview column, value used when the column is absent).
    # A column of None marks a fixed value that is never read from the data.
    median_specs = (
        ('serve_pts', 'serve_pts', 77.0),
        ('aces', 'aces', 5.0),
        ('double_faults', 'dfs', 3.0),
        ('first_serve_pct', None, 0.62),
        ('first_serve_won', 'first_won', 35.0),
        ('second_serve_won', 'second_won', 16.0),
        ('break_points_saved', 'bp_saved', 4.0),
        ('return_pts_won', 'return_pts_won', 30.0),
        ('winners_total', 'winners', 28.0),
        ('winners_fh', 'winners_fh', 16.0),
        ('winners_bh', 'winners_bh', 12.0),
        ('unforced_errors', 'unforced', 28.0),
        ('unforced_fh', 'unforced_fh', 16.0),
        ('unforced_bh', 'unforced_bh', 12.0),
    )

    defaults = {'men': {}, 'women': {}}
    for gender in ('men', 'women'):
        if gender not in jeff_data:
            continue
        tables = jeff_data[gender]

        print(f"\nProcessing {gender}'s comprehensive data...")
        if tables.get('matches') is None:
            print(f"No matches data for {gender}")
            continue

        gender_defaults = {}

        if 'overview' in tables:
            overview = tables['overview']
            totals = overview[overview['set'] == 'Total'].copy()
            if len(totals) > 0:
                for feature, column, missing_value in median_specs:
                    if column is not None and column in totals.columns:
                        gender_defaults[feature] = float(totals[column].median())
                    else:
                        gender_defaults[feature] = missing_value

        # Anything the overview did not provide comes from the static table.
        for feature, value in get_fallback_defaults(gender).items():
            gender_defaults.setdefault(feature, float(value))

        defaults[gender] = gender_defaults
        print(f"Calculated defaults for {gender}: {len(gender_defaults)} features")

    return defaults

def extract_comprehensive_jeff_features(player_canonical, gender, jeff_data, weighted_defaults=None):
    """Return the feature dict for one player, seeded with defaults.

    Args:
        player_canonical: canonical player key matched against the
            'Player_canonical' column of the overview table.
        gender: 'M' selects the men's tables; any other value selects the
            women's tables.
        jeff_data: ``{'men': {...}, 'women': {...}}`` dataset dict.
        weighted_defaults: optional per-gender default dicts. A missing or
            empty entry for the gender falls back to
            ``get_fallback_defaults`` so the result is never left without
            baseline features.

    Returns:
        A new dict of features. When the player has an overview 'Total' row
        with positive ``serve_pts``, the last such row overrides the basic
        serve/return/winner/error features; otherwise only defaults are
        returned.
    """
    gender_key = 'men' if gender == 'M' else 'women'

    if gender_key not in jeff_data:
        return get_fallback_defaults(gender_key)

    # An empty per-gender dict means the defaults calculation was skipped for
    # that gender; treat it like "no defaults" instead of returning nothing.
    gender_defaults = weighted_defaults.get(gender_key) if weighted_defaults else None
    if gender_defaults:
        features = gender_defaults.copy()
    else:
        features = get_fallback_defaults(gender_key)

    # Overview stats
    overview_df = jeff_data[gender_key].get('overview')
    if overview_df is None or 'Player_canonical' not in overview_df.columns:
        return features

    player_overview = overview_df[
        (overview_df['Player_canonical'] == player_canonical) &
        (overview_df['set'] == 'Total')
    ]
    if len(player_overview) == 0:
        return features

    # The last matching row is used as the player's most recent record
    # (this relies on the table's row order).
    latest = player_overview.iloc[-1]
    serve_pts = latest.get('serve_pts', 80)
    if serve_pts > 0:
        features.update({
            'serve_pts': float(serve_pts),
            'aces': float(latest.get('aces', 0)),
            'double_faults': float(latest.get('dfs', 0)),
            # serve_pts is known to be positive here, so the ratio is safe.
            'first_serve_pct': float(latest.get('first_in', 0)) / float(serve_pts),
            'first_serve_won': float(latest.get('first_won', 0)),
            'second_serve_won': float(latest.get('second_won', 0)),
            'break_points_saved': float(latest.get('bp_saved', 0)),
            'return_pts_won': float(latest.get('return_pts_won', 0)),
            'winners_total': float(latest.get('winners', 0)),
            'winners_fh': float(latest.get('winners_fh', 0)),
            'winners_bh': float(latest.get('winners_bh', 0)),
            'unforced_errors': float(latest.get('unforced', 0)),
            'unforced_fh': float(latest.get('unforced_fh', 0)),
            'unforced_bh': float(latest.get('unforced_bh', 0))
        })

    return features

def clean_data_for_parquet(df):
    """Return a copy of ``df`` with column types tidied for parquet export.

    Per column:
      * object columns in which at least one value parses as a number are
        converted with ``pd.to_numeric(errors='coerce')`` (non-numeric
        entries become NaN);
      * all other object columns are converted to strings, with original
        nulls and the literal texts 'nan' / '<NA>' kept as missing values;
        if the column label contains 'date' the strings are then parsed to
        datetimes (unparseable entries become NaT);
      * numeric columns have +/-inf replaced by NaN so the dtype stays
        numeric.

    The input frame is not modified.
    """
    df_clean = df.copy()

    for col in df_clean.columns:
        series = df_clean[col]

        if series.dtype == 'object':
            # Try to convert to numeric first
            as_numeric = pd.to_numeric(series, errors='coerce')
            if not as_numeric.isna().all():
                series = as_numeric
            else:
                # Remember real nulls before astype(str) turns them into
                # text such as 'None' or 'nan'.
                was_null = series.isna()
                text = series.astype(str)
                series = text.mask(was_null | text.isin(['nan', '<NA>']), pd.NA)

                # Handle date columns (labels may be non-string, hence str()).
                if 'date' in str(col).lower():
                    try:
                        series = pd.to_datetime(series, errors='coerce')
                    except (ValueError, TypeError, OverflowError):
                        # Best effort: keep the string column as it is.
                        pass

        # Replace infinities with NaN (not pd.NA) so float columns keep
        # their numeric dtype.
        if pd.api.types.is_numeric_dtype(series):
            series = series.replace([np.inf, -np.inf], np.nan)

        df_clean[col] = series

    return df_clean

# CACHE SETUP
# All cache artefacts live in one directory under the user's Desktop data folder.
CACHE_DIR = os.path.expanduser("~/Desktop/data/cache")
# Processed match table (parquet).
HD_PATH   = os.path.join(CACHE_DIR, "historical_data.parquet")
# Pickled dict of charting DataFrames.
JEFF_PATH = os.path.join(CACHE_DIR, "jeff_data.pkl")
# Pickled per-gender default feature values.
DEF_PATH  = os.path.join(CACHE_DIR, "weighted_defaults.pkl")

def _save_historical_to_cache(historical_data):
    """Write the match table as parquet, falling back to CSV; return success."""
    try:
        print("  Cleaning historical data for parquet...")
        historical_clean = clean_data_for_parquet(historical_data)
        print(f"  Saving historical data: {historical_clean.shape}")
        historical_clean.to_parquet(HD_PATH, index=False, engine='pyarrow')
        print("  ✓ Historical data saved")
        return True
    except Exception as e:
        print(f"ERROR saving cache: {e}")
        print(f"Error details: {type(e).__name__}: {str(e)}")

    # Try alternative save methods
    try:
        print("  Trying alternative CSV save...")
        historical_data.to_csv(HD_PATH.replace('.parquet', '.csv'), index=False)
        print("  ✓ Saved as CSV instead")
        return True
    except Exception as e2:
        print(f"  CSV save also failed: {e2}")
        return False


def _pickle_to_cache(obj, path, start_message, done_message):
    """Pickle ``obj`` to ``path``; report and return False on failure."""
    print(start_message)
    try:
        with open(path, "wb") as f:
            pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
    except Exception as e:
        print(f"ERROR saving cache: {e}")
        print(f"Error details: {type(e).__name__}: {str(e)}")
        return False
    print(done_message)
    return True


def safe_save_to_cache(historical_data, jeff_data, weighted_defaults):
    """Persist the three pipeline outputs to the cache directory.

    The historical table is written as parquet (CSV if that fails); the
    Jeff data and the weighted defaults are pickled. Each artefact is saved
    independently, so a failure of one does not prevent the others from
    being written.

    Returns:
        True only when all three artefacts were written (the historical
        table counting as written in either format); False otherwise.
    """
    print("\n=== SAVING TO CACHE ===")

    # Check cache directory
    os.makedirs(CACHE_DIR, exist_ok=True)

    historical_ok = _save_historical_to_cache(historical_data)
    jeff_ok = _pickle_to_cache(
        jeff_data, JEFF_PATH, "  Saving Jeff data...", "  ✓ Jeff data saved"
    )
    defaults_ok = _pickle_to_cache(
        weighted_defaults, DEF_PATH,
        "  Saving weighted defaults...", "  ✓ Weighted defaults saved"
    )

    if historical_ok and jeff_ok and defaults_ok:
        print("✓ All data cached successfully")
        return True
    return False

def generate_comprehensive_historical_all_years_fixed(*, fast: bool = True, n_sample: int = 500):
    """Build the historical match table enriched with Jeff charting features.

    Runs six printed steps: load the Jeff datasets, compute default feature
    values, load the yearly match workbooks, normalize names/dates/odds,
    add empty winner_*/loser_* feature columns, and fill those columns row
    by row for matches dated on or before 2025-06-10.

    Args:
        fast: when True, a random sample of at most ``n_sample`` rows
            (``random_state=1``) is used instead of the full table.
        n_sample: sample size used in fast mode.

    Returns:
        ``(tennis_data, jeff_data, weighted_defaults)``. When a step fails,
        the first element is an empty DataFrame and the other two hold
        whatever was produced before the failure (empty dicts if nothing).
    """
    print("=== STARTING DATA GENERATION ===")

    # Step 1: Load Jeff's data
    print("Step 1: Loading Jeff's comprehensive data...")
    try:
        jeff_data = load_jeff_comprehensive_data()
        # Fails only when the result is empty or has neither gender key.
        if not jeff_data or ('men' not in jeff_data and 'women' not in jeff_data):
            print("ERROR: Jeff data loading failed")
            return pd.DataFrame(), {}, {}

        print(f"✓ Jeff data loaded successfully")
        print(f"  - Men's datasets: {len(jeff_data.get('men', {}))}")
        print(f"  - Women's datasets: {len(jeff_data.get('women', {}))}")

    except Exception as e:
        print(f"ERROR loading Jeff data: {e}")
        return pd.DataFrame(), {}, {}

    # Step 2: Calculate weighted defaults
    print("Step 2: Calculating weighted defaults...")
    try:
        weighted_defaults = calculate_comprehensive_weighted_defaults(jeff_data)
        # NOTE(review): this only catches an empty/None result; a dict whose
        # per-gender entries are empty still passes this check.
        if not weighted_defaults:
            print("ERROR: Weighted defaults calculation failed")
            return pd.DataFrame(), jeff_data, {}

        print(f"✓ Weighted defaults calculated")
        print(f"  - Men's features: {len(weighted_defaults.get('men', {}))}")
        print(f"  - Women's features: {len(weighted_defaults.get('women', {}))}")

    except Exception as e:
        print(f"ERROR calculating weighted defaults: {e}")
        return pd.DataFrame(), jeff_data, {}

    # Step 3: Load tennis match data
    print("Step 3: Loading tennis match data...")
    try:
        tennis_data = load_all_tennis_data()
        if tennis_data.empty:
            print("ERROR: No tennis data loaded")
            return pd.DataFrame(), jeff_data, weighted_defaults

        print(f"✓ Tennis data loaded: {len(tennis_data)} matches")

        # Fast mode for testing
        if fast:
            total_rows = len(tennis_data)
            take = min(n_sample, total_rows)
            # Fixed seed keeps the sample reproducible; the index is reset so
            # row labels run 0..take-1.
            tennis_data = tennis_data.sample(take, random_state=1).reset_index(drop=True)
            print(f"[FAST MODE] Using sample of {take}/{total_rows} rows")

    except Exception as e:
        print(f"ERROR loading tennis data: {e}")
        return pd.DataFrame(), jeff_data, weighted_defaults

    # Step 4: Process tennis data
    print("Step 4: Processing tennis data...")
    try:
        # Normalize player names
        tennis_data['winner_canonical'] = tennis_data['Winner'].apply(normalize_name)
        tennis_data['loser_canonical'] = tennis_data['Loser'].apply(normalize_name)
        tennis_data['tournament_canonical'] = tennis_data['Tournament'].apply(normalize_tournament_name)

        # Fix dates
        tennis_data['Date'] = pd.to_datetime(tennis_data['Date'], errors='coerce')
        # Plain date objects, used for the cutoff comparison in step 6.
        tennis_data['date'] = tennis_data['Date'].dt.date

        # Add odds data - SIMPLIFIED to avoid errors
        # When the PSW/PSL column is absent, .get returns the scalar 0 and the
        # whole new column is filled with 0.
        tennis_data['tennis_data_odds1'] = pd.to_numeric(tennis_data.get('PSW', 0), errors='coerce')
        tennis_data['tennis_data_odds2'] = pd.to_numeric(tennis_data.get('PSL', 0), errors='coerce')

        # Add ranking difference
        if 'WRank' in tennis_data.columns and 'LRank' in tennis_data.columns:
            tennis_data['rank_difference'] = abs(pd.to_numeric(tennis_data['WRank'], errors='coerce') -
                                                 pd.to_numeric(tennis_data['LRank'], errors='coerce'))

        print(f"✓ Tennis data processed")

    except Exception as e:
        print(f"ERROR processing tennis data: {e}")
        return pd.DataFrame(), jeff_data, weighted_defaults

    # Step 5: Add Jeff feature columns
    print("Step 5: Adding Jeff feature columns...")
    try:
        # Feature names; each gets a winner_ and a loser_ column.
        all_jeff_features = [
            'serve_pts', 'aces', 'double_faults', 'first_serve_pct', 'first_serve_won',
            'second_serve_won', 'break_points_saved', 'return_pts_won',
            'winners_total', 'winners_fh', 'winners_bh', 'unforced_errors', 'unforced_fh', 'unforced_bh',
            'serve_wide_pct', 'serve_t_pct', 'serve_body_pct',
            'return_deep_pct', 'return_shallow_pct', 'return_very_deep_pct',
            'key_points_serve_won_pct', 'key_points_aces_pct', 'key_points_first_in_pct',
            'key_points_return_won_pct', 'key_points_return_winners',
            'net_points_won_pct', 'net_winners_pct', 'passed_at_net_pct',
            'rally_server_winners_pct', 'rally_server_unforced_pct',
            'rally_returner_winners_pct', 'rally_returner_unforced_pct',
            'shot_crosscourt_pct', 'shot_down_line_pct', 'shot_inside_out_pct',
            'serve_volley_frequency', 'serve_volley_success_pct',
            'return_error_net_pct', 'return_error_wide_pct',
            'aggression_index', 'consistency_index', 'pressure_performance', 'net_game_strength'
        ]

        # Initialize feature columns with proper data types
        # Assigning an empty float64 Series aligns on the index, so every row
        # starts out as NaN.
        for feature in all_jeff_features:
            tennis_data[f'winner_{feature}'] = pd.Series(dtype='float64')
            tennis_data[f'loser_{feature}'] = pd.Series(dtype='float64')

        print(f"✓ Added {len(all_jeff_features) * 2} feature columns")

    except Exception as e:
        print(f"ERROR adding feature columns: {e}")
        return pd.DataFrame(), jeff_data, weighted_defaults

    # Step 6: Extract Jeff features
    print("Step 6: Extracting Jeff features...")
    try:
        total_matches = len(tennis_data)
        matches_with_jeff_features = 0

        # Test feature extraction first
        # Smoke test on the first men's overview row; the result is only printed.
        if 'men' in jeff_data and 'overview' in jeff_data['men'] and len(jeff_data['men']['overview']) > 0:
            test_player = jeff_data['men']['overview']['Player_canonical'].iloc[0]
            test_features = extract_comprehensive_jeff_features(test_player, 'M', jeff_data, weighted_defaults)
            print(f"✓ Feature extraction test passed for {test_player}")
            print(f"  Sample features: serve_pts={test_features.get('serve_pts', 'N/A')}")

        # NOTE(review): features are looked up again for every row, even when
        # the same player appears many times; caching per (player, gender)
        # looks possible.
        for idx, row in tennis_data.iterrows():
            if idx % 100 == 0:  # More frequent updates for small datasets
                print(f"  Processing match {idx}/{total_matches}")

            try:
                gender = row['gender']

                # Only extract Jeff features for matches before cutoff
                # Rows after the cutoff keep their NaN feature columns.
                if row['date'] <= date(2025, 6, 10):
                    winner_features = extract_comprehensive_jeff_features(
                        row['winner_canonical'], gender, jeff_data, weighted_defaults
                    )
                    loser_features = extract_comprehensive_jeff_features(
                        row['loser_canonical'], gender, jeff_data, weighted_defaults
                    )

                    # Assign features with proper error handling
                    # Only keys that match a pre-created column are written.
                    for feature_name, feature_value in winner_features.items():
                        col_name = f'winner_{feature_name}'
                        if col_name in tennis_data.columns:
                            tennis_data.at[idx, col_name] = feature_value

                    for feature_name, feature_value in loser_features.items():
                        col_name = f'loser_{feature_name}'
                        if col_name in tennis_data.columns:
                            tennis_data.at[idx, col_name] = feature_value

                    # Counts rows where both feature dicts are non-empty.
                    if winner_features and loser_features:
                        matches_with_jeff_features += 1

            except Exception as e:
                # Row-level failures are skipped; only those on row labels
                # below 5 are reported.
                if idx < 5:  # Only print first few errors
                    print(f"  Warning: Error processing match {idx}: {e}")
                continue

        print(f"✓ Jeff features extracted for {matches_with_jeff_features}/{total_matches} matches")

    except Exception as e:
        print(f"ERROR extracting Jeff features: {e}")
        return pd.DataFrame(), jeff_data, weighted_defaults

    print(f"=== DATA GENERATION COMPLETE ===")
    print(f"Final data shape: {tennis_data.shape}")
    print(f"Columns: {len(tennis_data.columns)}")

    return tennis_data, jeff_data, weighted_defaults

# ------------------------------------------------------------------
# API-Tennis helpers
# ------------------------------------------------------------------
from pathlib import Path
# On-disk cache for pickled API responses, kept in the user's home directory.
CACHE_API = Path.home() / ".api_tennis_cache"
CACHE_API.mkdir(exist_ok=True)

# Use a key supplied through the environment when there is one; the literal
# below is only a fallback, so an externally configured key is not overwritten.
# NOTE(review): the fallback credential is committed in source (it also
# appears near the top of the file); consider rotating it and removing it.
os.environ.setdefault("API_TENNIS_KEY", "adfc70491c47895e5fffdc6428bbf36a561989d4bffcfa9ecfba8d91e947b4fb")
API_KEY = os.getenv("API_TENNIS_KEY")
BASE = "https://api.api-tennis.com/tennis/"

def api(method: str, **params):
    """Issue one GET request to the API and return its ``result`` field.

    ``method`` and the module-level ``API_KEY`` are sent as query
    parameters together with any extra ``params``. A body without a
    ``result`` field yields an empty list.

    Raises:
        requests.HTTPError: on a non-2xx response.
        RuntimeError: when the JSON body's ``error`` field is not "0";
            the whole payload is attached.
    """
    query = {"method": method, "APIkey": API_KEY}
    query.update(params)
    response = requests.get(
        "https://api.api-tennis.com/tennis/",
        params=query,
        timeout=30
    )
    response.raise_for_status()
    payload = response.json()
    error_code = str(payload.get("error", 0))
    if error_code == "0":
        return payload.get("result", [])
    raise RuntimeError(payload)

def tournaments_lookup():
    """Return the tournament list, fetching it once and caching it on disk.

    The pickled response is stored as ``tournaments.pkl`` inside
    ``CACHE_API``; later calls read that file instead of hitting the API.
    """
    cache_file = CACHE_API / "tournaments.pkl"
    if not cache_file.exists():
        tournaments = api("get_tournaments")
        cache_file.write_bytes(pickle.dumps(tournaments, 4))
        return tournaments
    return pickle.loads(cache_file.read_bytes())

def standings_lookup(day: date, league: str = "ATP"):
    """Return standings for ``league``, cached per ISO week of ``day``.

    The API is queried with ``event_type`` ("Men" for ATP, "Women" for
    anything else) rather than ``league``. The reply is pickled under
    ``<league>_<iso-year>_<iso-week>.pkl`` in ``CACHE_API`` and reused on
    later calls for the same league and week.

    NOTE: the request carries no date, so the file for a given week holds
    whatever standings the API returned when that week was first requested.
    """
    iso_year, iso_week = day.isocalendar()[:2]
    cache_file = CACHE_API / f"{league}_{iso_year}_{iso_week:02d}.pkl"
    if cache_file.exists():
        return pickle.loads(cache_file.read_bytes())

    if league.upper() == "ATP":
        event_type = "Men"
    else:
        event_type = "Women"
    standings = api("get_standings", event_type=event_type)
    cache_file.write_bytes(pickle.dumps(standings, 4))
    return standings


# Clear cache first.
# CACHE_DIR is removed wholesale and recreated empty, so the generation step
# below always starts from scratch instead of reusing earlier cached output.
print("=== CLEARING CACHE ===")
if os.path.exists(CACHE_DIR):
    shutil.rmtree(CACHE_DIR)
    print(f"Removed cache directory: {CACHE_DIR}")
os.makedirs(CACHE_DIR, exist_ok=True)
print("Created fresh cache directory")

# Build the three module-level data sets used by every later cell:
# the match table, the Jeff data and the weighted defaults.
print("Starting data generation...")
historical_data, jeff_data, weighted_defaults = generate_comprehensive_historical_all_years_fixed(
    fast=True, n_sample=500
)


# ------------------------------------------------------------------
# Step 7 - API-Tennis ingestion   (run AFTER Step 6)
# Appends finished API-Tennis fixtures to `historical_data`.
# ------------------------------------------------------------------
print("Step 7: Appending API-Tennis data …")

from pathlib import Path
import json, pickle, time

# ---------- constants --------------------------------------------------------
START_CUTOFF = date(2025, 6, 11)          # first day Jeff is missing; ingestion starts here
ODDS_SWITCH  = date(2025, 6, 23)          # API-Tennis odds are only requested from this day on
CACHE_API    = Path.home() / ".api_tennis_cache"  # same cache directory as the helper cell above
CACHE_API.mkdir(exist_ok=True)

# ---------- helpers ----------------------------------------------------------
def fixtures_finished(day: date) -> list[dict]:
    """Return the fixtures of ``day`` whose ``event_status`` is "Finished".

    A single-day window (``date_start == date_stop``) is requested from
    ``get_fixtures`` with the timezone fixed to UTC.
    """
    fixtures = api(
        "get_fixtures",
        date_start=day.isoformat(),
        date_stop=day.isoformat(),
        timezone="UTC",
    )
    finished = []
    for fixture in fixtures:
        if fixture.get("event_status") == "Finished":
            finished.append(fixture)
    return finished

def statistics_block(match_key: int) -> dict:
    """Return serve/return statistics for one match, or ``{}`` if unavailable.

    Only the first entry of the ``get_statistics`` reply is used. Any failure
    while fetching it (including an empty reply) yields an empty dict. Missing
    fields default to 0; the first-serve percentage is rescaled to a fraction.
    """
    try:
        stats = api("get_statistics", match_id=match_key)[0]
    except Exception:
        return {}

    def as_count(field: str) -> int:
        # Integer statistic with 0 as the default when the field is absent.
        return int(stats.get(field, 0))

    return {
        "aces"            : as_count("aces"),
        "double_faults"   : as_count("double_faults"),
        "first_serve_pct" : float(stats.get("first_serve_percentage", 0)) / 100,
        "first_serve_won" : as_count("first_serve_points_won"),
        "second_serve_won": as_count("second_serve_points_won"),
        "return_pts_won"  : as_count("return_points_won"),
    }

def odds_block(match_key: int, day: date) -> tuple[float|None,float|None]:
    """Return ``(home, away)`` odds from the "Home/Away" market of a match.

    ``(None, None)`` is returned when ``day`` is before ``ODDS_SWITCH``, when
    the ``get_odds`` request fails, or when no "Home/Away" market is present.
    A side that does not appear in the market stays ``None``.
    """
    no_odds = (None, None)
    if day < ODDS_SWITCH:
        return no_odds

    try:
        markets = api("get_odds", match_id=match_key)
    except Exception:
        return no_odds

    # First market named "Home/Away", if any.
    market = None
    for candidate in markets:
        if candidate["odd_name"] == "Home/Away":
            market = candidate
            break
    if not market:
        return no_odds

    home_price = None
    away_price = None
    for selection in market["value"]:
        side = selection["type"]
        if side == "Home":
            home_price = float(selection["odd"])
        elif side == "Away":
            away_price = float(selection["odd"])
    return (home_price, away_price)

def standings_lookup(day: date, league="ATP") -> list[dict]:
    """Return standings for ``league``, cached per ISO week of ``day``.

    This definition replaces the earlier ``standings_lookup`` once the cell
    runs, so it has to send the same query: the API selects the tour through
    ``event_type`` ("Men" / "Women"), not through a ``league`` parameter.

    Args:
        day: Any day of the wanted week; only its ISO year/week name the
            cache file.
        league: "ATP" (case-insensitive) selects the men's standings; any
            other value selects the women's.

    Returns:
        The rows returned by ``get_standings`` (possibly read from cache).

    NOTE(review): the request carries no date, so a cache file presumably
    holds the standings current at fetch time - confirm against the API docs
    before relying on historical weeks.
    """
    iso_year, iso_week = day.isocalendar()[:2]
    fn = CACHE_API / f"{league}_{iso_year}_{iso_week:02d}.pkl"
    if fn.exists():
        return pickle.loads(fn.read_bytes())

    event_type = "Men" if league.upper() == "ATP" else "Women"
    rows = api("get_standings", event_type=event_type)
    fn.write_bytes(pickle.dumps(rows, 4))
    return rows

def ranking_map(day: date, league: str) -> dict[int,int]:
    """Map each ``player_key`` to its ``place`` in the standings.

    Both fields are converted to ``int``; the standings come from
    ``standings_lookup(day, league)``.
    """
    placements = {}
    for entry in standings_lookup(day, league):
        player_key = int(entry["player_key"])
        placements[player_key] = int(entry["place"])
    return placements

# ---------- H2H cache --------------------------------------------------------
def h2h_features(p1_key: int, p2_key: int) -> dict:
    """Return head-to-head features for an ordered pair of players.

    Results are cached in ``CACHE_API`` under ``h2h_<p1>_<p2>.pkl``. Note that
    the cache key is order-sensitive and that a cached ``h2h_gap_days`` is the
    gap measured when the entry was first computed.

    Args:
        p1_key: API key of the "first" player.
        p2_key: API key of the "second" player.

    Returns:
        ``{}`` when no head-to-head data is available, otherwise a dict with
        ``h2h_played``, ``h2h_p1_wins``, ``h2h_p2_wins``, ``h2h_gap_days`` and
        one ``p1_<surface>`` / ``p2_<surface>`` win counter per surface seen.
    """
    tag = CACHE_API / f"h2h_{p1_key}_{p2_key}.pkl"
    if tag.exists():
        return pickle.loads(tag.read_bytes())

    def cache_empty() -> dict:
        # Remember "no data" so the pair is not requested again.
        tag.write_bytes(pickle.dumps({}, 4))
        return {}

    try:
        result = api("get_H2H",
                     first_player_key=p1_key,
                     second_player_key=p2_key)
    except requests.RequestException:
        # Network/HTTP failure is transient: answer "no data" for now but do
        # NOT cache it, otherwise one timeout would hide this pair forever.
        return {}
    except Exception:
        # API-level error (e.g. RuntimeError raised by api()): treated as a
        # definitive "no data" and cached, as before.
        return cache_empty()

    try:
        blk = result[0]
    except (IndexError, KeyError, TypeError):
        return cache_empty()

    rows = (blk.get("H2H") if isinstance(blk, dict) else None) or []
    if not rows:
        return cache_empty()

    def first_player_won(ev: dict) -> bool:
        # A missing/None winner counts as "not the first player".
        return str(ev.get("event_winner") or "").startswith("First")

    wins1 = sum(first_player_won(ev) for ev in rows)
    wins2 = len(rows) - wins1

    # Parse as UTC-aware timestamps so the subtraction below never mixes
    # tz-naive and tz-aware values (which raises TypeError in pandas).
    last_dt = max(pd.to_datetime(ev["event_date"], utc=True) for ev in rows)
    gap = (pd.Timestamp.now(tz="UTC") - last_dt).days

    surf_cnt = {}
    for ev in rows:
        surf = ev.get("court_surface") or "Hard"   # unknown surface counts as Hard
        key = ("p1" if first_player_won(ev) else "p2") + "_" + surf
        surf_cnt[key] = surf_cnt.get(key, 0) + 1

    out = {
        "h2h_played"   : len(rows),
        "h2h_p1_wins"  : wins1,
        "h2h_p2_wins"  : wins2,
        "h2h_gap_days" : gap,
        **surf_cnt
    }
    tag.write_bytes(pickle.dumps(out, 4))
    return out

# ---------- streaming loop ---------------------------------------------------
# Walk every day from START_CUTOFF to today, turn each finished API-Tennis
# fixture that is not yet in `historical_data` into a row, then merge the new
# rows into `historical_data`.

# --- ensure event_key column exists ---------------------------------
if "event_key" not in historical_data.columns:
    historical_data["event_key"] = pd.NA
# Keys already ingested on/after the cutoff; used to skip repeat fixtures.
existing_keys = set(
    historical_data.loc[historical_data["date"] >= START_CUTOFF, "event_key"]
        .dropna().astype(int)
)

append_buf = []
for d in pd.date_range(START_CUTOFF, date.today()):
    day = d.date()
    # league -> ranking map, built once per day instead of once per match.
    day_rankings = {}
    for ev in fixtures_finished(day):
        k = int(ev["event_key"])
        if k in existing_keys:
            continue

        # A finished fixture without a declared winner cannot be labelled
        # Winner/Loser, so it is skipped rather than guessed.
        winner_flag = ev.get("event_winner") or ""
        if not winner_flag:
            continue

        p1_name, p2_name = ev["event_first_player"], ev["event_second_player"]
        p1_key, p2_key = int(ev["first_player_key"]), int(ev["second_player_key"])
        if winner_flag.startswith("First"):
            winner, loser = p1_name, p2_name
            winner_key, loser_key = p1_key, p2_key
        else:
            winner, loser = p2_name, p1_name
            winner_key, loser_key = p2_key, p1_key

        row = {
            "Date"      : pd.to_datetime(ev["event_date"]),
            "date"      : pd.to_datetime(ev["event_date"]).date(),
            "event_key" : k,
            "Tournament": ev["tournament_name"],
            "round"     : ev.get("tournament_round", ""),
            "Surface"   : ev.get("court_surface") or "Hard",
            "Winner"    : winner,
            "Loser"     : loser,
            "score_raw" : json.dumps(ev.get("scores", "")),
            "source_rank": 1                         # Jeff 0, API 1, tennis-data 2
        }

        # stats
        row.update(statistics_block(k))

        # odds
        # NOTE(review): o1/o2 are the Home/Away prices, i.e. presumably in
        # first/second-player order, not winner/loser order - confirm what
        # downstream code expects in tennis_data_odds1/2.
        o1, o2 = odds_block(k, day)
        row["tennis_data_odds1"] = o1
        row["tennis_data_odds2"] = o2

        # rankings: WRank/LRank belong to the winner/loser, so they must be
        # looked up by winner/loser key, not by first/second player.
        league = "WTA" if "wta" in ev["event_type_type"].lower() else "ATP"
        if league not in day_rankings:
            day_rankings[league] = ranking_map(day, league)
        rmap = day_rankings[league]
        row["WRank"] = rmap.get(winner_key, pd.NA)
        row["LRank"] = rmap.get(loser_key,  pd.NA)

        # H2H (h2h_p1_* / h2h_p2_* follow the API's first/second player order)
        row.update(h2h_features(p1_key, p2_key))

        append_buf.append(row)
        existing_keys.add(k)

print(f"  finished streaming: {len(append_buf)} new rows")

if append_buf:
    api_df = pd.DataFrame(append_buf)
    api_df["Date"] = pd.to_datetime(api_df["Date"])
    api_df["date"] = api_df["Date"].dt.date

    # One row per (date, tournament, winner, loser); when sources collide the
    # lowest source_rank wins. The sort is stable so rows with the same (or a
    # missing) source_rank keep their original relative order.
    HIST_KEY = ["date", "Tournament", "Winner", "Loser"]
    historical_data = (
        pd.concat([historical_data, api_df], ignore_index=True)
          .sort_values("source_rank", kind="stable")
          .drop_duplicates(subset=HIST_KEY, keep="first")
          .reset_index(drop=True)
    )

print("API-Tennis merge complete")

# Save to cache with error handling.
# Caching is attempted only when all three data sets are non-empty; otherwise
# the whole generation run is reported as failed.
if len(historical_data) > 0 and jeff_data and weighted_defaults:
    success = safe_save_to_cache(historical_data, jeff_data, weighted_defaults)
    if success:
        print("✓ Data generation and caching completed successfully")
    else:
        print("✗ Data generation completed but caching failed")
else:
    print("✗ Data generation failed")

# Final status: shape of the match table and availability of the other two sets.
print(f"\n=== FINAL STATUS ===")
print(f"Historical data: {historical_data.shape}")
print(f"Jeff data available: {bool(jeff_data)}")
print(f"Weighted defaults available: {bool(weighted_defaults)}")

if len(historical_data) > 0:
    # Check Jeff features: list every column whose name contains
    # 'winner_serve_pts' as a quick sign that feature extraction ran.
    jeff_cols = [col for col in historical_data.columns if 'winner_serve_pts' in col]
    print(f"Jeff feature columns: {jeff_cols}")

    if 'winner_serve_pts' in historical_data.columns:
        non_null_count = historical_data['winner_serve_pts'].notna().sum()
        print(f"Non-null serve_pts values: {non_null_count}")

        if non_null_count > 0:
            # Show the first few populated values as a sanity check.
            sample_values = historical_data['winner_serve_pts'].dropna().head(5)
            print(f"Sample serve_pts: {sample_values.tolist()}")

print("\nData generation complete!")

In [None]:
# LAYER 1 ##
def extract_data_samples():
    """Return the first three rows of the main data sets for inspection.

    Returns:
        ``(jeff_samples, tennis_samples)``: a dict holding the head of the
        men's ``matches``, ``serve_basics`` and ``overview`` tables from the
        module-level ``jeff_data``, and the head of selected columns of the
        module-level ``historical_data``.
    """
    men_tables = jeff_data['men']
    jeff_samples = {
        table: men_tables[table].head(3)
        for table in ('matches', 'serve_basics', 'overview')
    }

    tennis_columns = ['Winner', 'Loser', 'WRank', 'LRank', 'PSW', 'PSL', 'Surface']
    tennis_samples = historical_data[tennis_columns].head(3)

    return jeff_samples, tennis_samples

# Descriptive record of how hold/break figures are currently derived.
# Pure metadata: nothing in this cell computes from it.
hold_break_computation = dict(
    current_method='Jeff aggregated stats from overview dataset',
    available_columns=['serve_pts', 'first_in', 'first_won', 'second_won'],
    computation_level='Per-player aggregate from charting data',
)

# Bayesian
def extract_priors_from_current_data(player_canonical, gender, surface):
    """Build a prior dict for one player from the module-level data sets.

    Returns an empty dict when the player has no rows in ``historical_data``;
    otherwise a dict with ``elo_estimate`` (derived from the recent rank, 1500
    when the rank is falsy), ``serve_effectiveness``, ``return_strength`` and
    ``surface_factor``.
    """
    is_winner = historical_data['winner_canonical'] == player_canonical
    is_loser = historical_data['loser_canonical'] == player_canonical
    player_matches = historical_data[is_winner | is_loser]

    # No history at all: nothing to derive priors from.
    if len(player_matches) == 0:
        return {}

    # Layer 1: Elo approximated from the most recent ranking.
    recent_rank = get_recent_rank(player_canonical, player_matches)
    if recent_rank:
        elo_estimate = 2000 - (recent_rank * 5)
    else:
        elo_estimate = 1500

    jeff_features = extract_jeff_features(player_canonical, gender, jeff_data)

    return {
        'elo_estimate': elo_estimate,
        'serve_effectiveness': jeff_features.get('serve_pts', 0.6),
        'return_strength': jeff_features.get('return_pts_won', 0.3),
        'surface_factor': calculate_surface_adjustment(player_matches, surface)
    }

# Time decay for recent form
def calculate_time_decayed_performance(player_matches, reference_date):
    """Return recency-weighted win and games-won rates for a player.

    Each match is weighted by ``exp(-0.01 * days_ago)`` (about 1% decay per
    day), where ``days_ago`` is measured from ``reference_date``.

    Args:
        player_matches: DataFrame with a datetime ``date`` column plus
            ``is_winner`` and ``games_won_pct`` columns. It is not modified.
        reference_date: Timestamp the match ages are measured from.

    Returns:
        Dict with ``win_rate`` and ``games_won_rate``; both are NaN when
        ``player_matches`` is empty.
    """
    if len(player_matches) == 0:
        # No matches: there is nothing to average.
        return {'win_rate': float('nan'), 'games_won_rate': float('nan')}

    # Kept as a local Series: writing a 'days_ago' column into the caller's
    # frame was an unintended side effect (and warns when it is a slice).
    days_ago = (reference_date - player_matches['date']).dt.days

    # Exponential decay: recent matches weighted heavier.
    weights = np.exp(-0.01 * days_ago)  # 1% daily decay

    return {
        'win_rate': np.average(player_matches['is_winner'], weights=weights),
        'games_won_rate': np.average(player_matches['games_won_pct'], weights=weights)
    }

In [None]:
## TEST ##
# Load the three data sets from the on-disk cache when all files exist;
# otherwise regenerate them once and write the cache.
import os, pickle, pandas as pd

CACHE_DIR = os.path.expanduser("~/Desktop/data/cache")
os.makedirs(CACHE_DIR, exist_ok=True)
HD_PATH   = os.path.join(CACHE_DIR, "historical_data.parquet")
JEFF_PATH = os.path.join(CACHE_DIR, "jeff_data.pkl")
DEF_PATH  = os.path.join(CACHE_DIR, "weighted_defaults.pkl")

# All three files must be present; a partial cache is treated as a miss.
if (os.path.exists(HD_PATH) and
    os.path.exists(JEFF_PATH) and
    os.path.exists(DEF_PATH)):
    print("Loading cached data …")
    historical_data = pd.read_parquet(HD_PATH)
    # NOTE: pickle.load executes arbitrary code from the file; only safe
    # because these files are written by this notebook itself.
    with open(JEFF_PATH, "rb") as fh:
        jeff_data = pickle.load(fh)
    with open(DEF_PATH, "rb") as fh:
        weighted_defaults = pickle.load(fh)
else:
    print("Cache miss – regenerating (one-time slow run).")
    # NOTE(review): this calls generate_comprehensive_historical_all_years,
    # not the *_fixed variant used in the generation cell - confirm intended.
    combined_data, jeff_data, weighted_defaults = generate_comprehensive_historical_all_years()
    historical_data = combined_data
    historical_data.to_parquet(HD_PATH, index=False)
    with open(JEFF_PATH, "wb") as fh:
        pickle.dump(jeff_data, fh, protocol=pickle.HIGHEST_PROTOCOL)
    with open(DEF_PATH, "wb") as fh:
        pickle.dump(weighted_defaults, fh, protocol=pickle.HIGHEST_PROTOCOL)

# Bare string expression: acts only as a section label for the next cell.
"SIMULATION"

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict

def normalize_name_canonical(name):
    """Return a lower-case, whitespace-normalised form of a player name.

    Dots and apostrophes are removed, hyphens become spaces, and runs of
    whitespace collapse to a single space. Missing values give ``""``.
    """
    if pd.isna(name):
        return ""
    # Single pass: drop '.' and "'", turn '-' into a space.
    punctuation = str.maketrans({'.': None, "'": None, '-': ' '})
    cleaned = str(name).strip().translate(punctuation)
    tokens = cleaned.lower().split()
    return ' '.join(tokens)

def extract_jeff_features(player_canonical, gender, jeff_data):
    """Derive serve/return counters for a player from ``jeff_data``.

    ``gender`` "M" selects ``jeff_data['men']``, anything else
    ``jeff_data['women']``. When the gender table or the player entry is
    missing, fixed defaults are returned. Otherwise the counters are computed
    from the entry's ``1stIn``, ``1stWon``, ``2ndWon``, ``df``, ``bpSaved``
    and ``bpFaced`` values (each defaulting to 0).

    NOTE(review): ``return_pts_won`` is computed as break points faced minus
    break points saved - confirm that this is the intended definition.
    """
    gender_key = 'women' if gender != 'M' else 'men'

    has_player = gender_key in jeff_data and player_canonical in jeff_data[gender_key]
    if not has_player:
        return {
            'serve_pts': 60,
            'first_won': 0,
            'second_won': 0,
            'return_pts_won': 20
        }

    record = jeff_data[gender_key][player_canonical]

    first_in = record.get('1stIn', 0)
    first_won = record.get('1stWon', 0)
    second_won = record.get('2ndWon', 0)
    double_faults = record.get('df', 0)

    # Two different totals depending on whether first-serve wins reach the
    # number of first serves in.
    if first_won >= first_in:
        total_serve_pts = first_in + double_faults + (first_won - first_in)
    else:
        total_serve_pts = first_in + second_won + double_faults

    bp_saved = record.get('bpSaved', 0)
    bp_faced = record.get('bpFaced', 0)
    return_pts_won = bp_faced - bp_saved

    return {
        'serve_pts': max(1, total_serve_pts),
        'first_won': first_won,
        'second_won': second_won,
        'return_pts_won': max(0, return_pts_won)
    }

class BayesianTennisModel:
    """Match-outcome model: per-player priors feeding a Monte Carlo simulator.

    Priors come from ``historical_data`` (match results, ranks, surfaces) and
    ``jeff_data`` (serve/return counters). Matches are simulated game by game
    from each player's hold probability.
    """

    def __init__(self):
        # Number of simulated matches behind each probability estimate.
        self.simulation_count = 10000
        # Module-level data sets produced by the data-generation cells.
        self.jeff_data = jeff_data
        self.historical_data = historical_data

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _player_history(self, player_canonical, surface=None, reference_date=None):
        """Rows where the player is winner or loser, optionally narrowed.

        ``surface`` keeps only that surface; ``reference_date`` keeps only
        matches dated on or before it.
        """
        data = self.historical_data
        mask = ((data['winner_canonical'] == player_canonical) |
                (data['loser_canonical'] == player_canonical))
        if surface is not None:
            mask &= data['Surface'] == surface
        if reference_date is not None:
            mask &= pd.to_datetime(data['Date']) <= pd.to_datetime(reference_date)
        return data[mask]

    @staticmethod
    def _latest_blend_score(matches):
        """Last row's ``BlendScore``, or ``None`` if unavailable/missing."""
        if len(matches) == 0 or 'BlendScore' not in matches.columns:
            return None
        score = matches['BlendScore'].iloc[-1]
        return None if pd.isna(score) else score

    def _simulate_matches(self, p1_priors, p2_priors, best_of, simulations):
        """Share of ``simulations`` simulated matches won by player 1."""
        simulations = int(simulations)
        if simulations < 1:
            raise ValueError("simulations must be a positive integer")
        sets_to_win = (best_of + 1) // 2
        wins = 0
        for _ in range(simulations):
            sets_won = [0, 0]
            while max(sets_won) < sets_to_win:
                sets_won[self.simulate_set(p1_priors, p2_priors)] += 1
            if sets_won[0] > sets_won[1]:
                wins += 1
        return wins / simulations

    # ------------------------------------------------------------------
    # Priors
    # ------------------------------------------------------------------
    def default_priors(self):
        """Priors used for a player with no usable match history."""
        return {
            'elo_mean': 1500,
            'elo_std': 200,
            'hold_prob': 0.65,
            'break_prob': 0.35,
            'surface': 'Hard',
            'form_factor': 1.0,
            'confidence': 0.1
        }

    def extract_refined_priors(self, player_canonical, gender, surface, reference_date):
        """Build the prior dict for one player as of ``reference_date``.

        Only matches dated on or before ``reference_date`` are used: later
        matches would leak future results, and their negative age makes the
        recency weights ``exp(-0.05 * days_ago)`` blow up. Players without
        such history get ``default_priors()``.

        Returns:
            Dict with ``elo_mean``, ``elo_std``, ``hold_prob``, ``break_prob``,
            ``surface``, ``form_factor`` and ``confidence``.
        """
        player_matches = self._player_history(
            player_canonical, reference_date=reference_date
        ).copy()

        if len(player_matches) == 0:
            return self.default_priors()

        # Prefer surface-specific history, but only with at least 5 matches.
        surface_matches = player_matches[player_matches['Surface'] == surface]
        if len(surface_matches) < 5:
            surface_matches = player_matches

        recent_matches = surface_matches.tail(20).copy()
        recent_matches['days_ago'] = (
            pd.to_datetime(reference_date) - pd.to_datetime(recent_matches['Date'])
        ).dt.days
        weights = np.exp(-0.05 * recent_matches['days_ago'])

        base_elo = self.get_player_weighted_elo(player_canonical, surface, reference_date)
        surface_factor = self.calculate_surface_adaptation(player_canonical, surface)
        elo_prior = base_elo * surface_factor

        jeff_features = extract_jeff_features(player_canonical, gender, self.jeff_data)

        serve_pts = jeff_features['serve_pts']
        serve_won = jeff_features['first_won'] + jeff_features['second_won']
        # No recorded serve points won means "no serve data" (this is what the
        # feature defaults look like), so use the default hold probability
        # instead of letting 0 be clamped up to the 0.3 floor.
        if serve_pts > 0 and serve_won > 0:
            hold_prob = serve_won / serve_pts
        else:
            hold_prob = 0.65

        # NOTE(review): break_prob is 1 - return_pts_won / serve_pts; confirm
        # this is the intended definition. It is not read by the simulator.
        return_pts = jeff_features['return_pts_won']
        total_return_pts = serve_pts
        break_prob = (1 - return_pts / total_return_pts) if total_return_pts > 0 else 0.35

        return {
            'elo_mean': elo_prior,
            'elo_std': 150,
            'hold_prob': min(0.95, max(0.3, hold_prob)),
            'break_prob': max(0.05, min(0.7, break_prob)),
            'surface': surface,
            'form_factor': self.calculate_form_spike(recent_matches, weights, player_canonical),
            'confidence': max(0.05, min(1.0, len(recent_matches) / 15))
        }

    # ------------------------------------------------------------------
    # Ranking / surface adjustments
    # ------------------------------------------------------------------
    def calculate_ranking_differential_odds(self, p1_ranking, p2_ranking):
        """Convert a ranking differential to player 1's implied probability.

        Returns 0.5 when either ranking is 0 or missing, or when the two
        rankings are equal; otherwise a step function of
        ``p2_ranking - p1_ranking``.
        """
        if pd.isna(p1_ranking) or pd.isna(p2_ranking):
            return 0.5
        if p1_ranking == 0 or p2_ranking == 0:
            return 0.5

        ranking_diff = p2_ranking - p1_ranking
        if ranking_diff == 0:
            return 0.5  # equally ranked: no edge either way

        steps = (
            (50, 0.85), (20, 0.75), (10, 0.65), (0, 0.55),
            (-10, 0.45), (-20, 0.35), (-50, 0.25),
        )
        for threshold, probability in steps:
            if ranking_diff > threshold:
                return probability
        return 0.15

    def calculate_upset_frequency(self, ranking_diff, surface, historical_data):
        """Upset frequency for a ranking differential, clamped to [0.05, 0.45].

        Counts rows with ``WRank - LRank > ranking_diff`` relative to rows with
        ``|WRank - LRank| >= |ranking_diff|``. The counts are restricted to
        ``surface`` when at least 10 such rows exist there; otherwise (or when
        ``surface`` is ``'fallback'``) all surfaces are used. Returns 0.1 when
        no row qualifies.
        """
        rank_gap = historical_data['WRank'] - historical_data['LRank']
        upset_mask = rank_gap > ranking_diff
        total_mask = rank_gap.abs() >= abs(ranking_diff)

        if surface != 'fallback':
            on_surface = historical_data['Surface'] == surface
            if (total_mask & on_surface).sum() >= 10:
                upset_mask &= on_surface
                total_mask &= on_surface
            # else: too few matches on this surface -> use every surface

        total_count = int(total_mask.sum())
        if total_count == 0:
            return 0.1

        upset_rate = int(upset_mask.sum()) / total_count
        return min(0.45, max(0.05, upset_rate))

    def calculate_surface_performance_ratio(self, player_canonical, surface, opponent_canonical, reference_date):
        """Ratio of the player's to the opponent's recent win rate on a surface.

        Uses each side's last 20 matches on ``surface`` up to
        ``reference_date``. Returns 1.0 when either side has fewer than 3 such
        matches or the opponent has no wins.
        """
        player_surface_matches = self._player_history(
            player_canonical, surface=surface, reference_date=reference_date
        ).tail(20)
        opponent_surface_matches = self._player_history(
            opponent_canonical, surface=surface, reference_date=reference_date
        ).tail(20)

        if len(player_surface_matches) < 3 or len(opponent_surface_matches) < 3:
            return 1.0

        player_wins = (player_surface_matches['winner_canonical'] == player_canonical).sum()
        opponent_wins = (opponent_surface_matches['winner_canonical'] == opponent_canonical).sum()

        player_ratio = player_wins / len(player_surface_matches)
        opponent_ratio = opponent_wins / len(opponent_surface_matches)

        return player_ratio / opponent_ratio if opponent_ratio > 0 else 1.0

    # ------------------------------------------------------------------
    # Prediction entry points
    # ------------------------------------------------------------------
    def run_simulation(self, p1_priors, p2_priors, iterations):
        """Simulate ``iterations`` best-of-3 matches.

        Returns a one-element list holding player 1's win share (callers
        index ``[0]``). ``iterations`` is honoured here; it used to be
        ignored in favour of ``self.simulation_count``.
        """
        return [self._simulate_matches(p1_priors, p2_priors, 3, iterations)]

    def predict_match_outcome(self, player1_canonical, player2_canonical, surface, gender, date):
        """Return player 1's calibrated win probability, clamped to [0.05, 0.95].

        Blends the simulated probability (60%), the ranking-implied
        probability (25%) and a surface component (15%), then shrinks the
        result by the historical upset frequency.
        """
        p1_priors = self.extract_refined_priors(player1_canonical, gender, surface, date)
        p2_priors = self.extract_refined_priors(player2_canonical, gender, surface, date)

        base_prob = self.run_simulation(p1_priors, p2_priors, 1000)[0]

        p1_rank = self.get_player_ranking(player1_canonical, date)
        p2_rank = self.get_player_ranking(player2_canonical, date)
        ranking_prob = self.calculate_ranking_differential_odds(p1_rank, p2_rank)

        ranking_diff = p1_rank - p2_rank
        upset_adjustment = self.calculate_upset_frequency(ranking_diff, surface, self.historical_data)

        surface_ratio = self.calculate_surface_performance_ratio(player1_canonical, surface, player2_canonical, date)
        # The ratio is unbounded (1.0 means "no edge"); map it onto [0, 1) so
        # it can be averaged with the other two probabilities. Blending the
        # raw ratio pushed evenly matched players above 0.5.
        surface_prob = surface_ratio / (1 + surface_ratio)

        calibrated_prob = (
            0.6 * base_prob + 0.25 * ranking_prob + 0.15 * surface_prob
        ) * (1 - upset_adjustment * 0.1)

        return max(0.05, min(0.95, calibrated_prob))

    def get_player_ranking(self, player_canonical, date):
        """Get the player's ranking from their latest match up to ``date``.

        Returns 999 when the player has no such match or the rank recorded
        for it is missing.
        """
        player_matches = self._player_history(
            player_canonical, reference_date=date
        ).sort_values('Date', ascending=False)

        if len(player_matches) == 0:
            return 999

        latest_match = player_matches.iloc[0]
        rank_column = 'WRank' if latest_match['winner_canonical'] == player_canonical else 'LRank'
        rank = latest_match.get(rank_column, 999)
        # A NaN rank would make every later comparison False and be scored
        # as the worst ranking bucket.
        return 999 if pd.isna(rank) else rank

    def calculate_match_probability(self, player1_canonical, player2_canonical, gender, surface, reference_date, best_of=3):
        """Simulate a match and return probabilities, confidence and priors."""
        player1_priors = self.extract_refined_priors(player1_canonical, gender, surface, reference_date)
        player2_priors = self.extract_refined_priors(player2_canonical, gender, surface, reference_date)

        probability = self.simulate_match(player1_priors, player2_priors, best_of)
        confidence = min(player1_priors['confidence'], player2_priors['confidence'])

        return {
            'player1_win_probability': probability,
            'player2_win_probability': 1 - probability,
            'confidence': confidence,
            'player1_priors': player1_priors,
            'player2_priors': player2_priors
        }

    def calculate_form_spike(self, recent_matches, weights, player_canonical):
        """Ratio of weighted actual to rank-expected win rate, capped at 2.0.

        Returns 1.0 when there are no matches or the ranks needed for the
        expectation are missing.
        """
        if len(recent_matches) == 0:
            return 1.0

        player_won = recent_matches['winner_canonical'] == player_canonical

        weight_values = np.asarray(weights, dtype=float)
        if not np.all(np.isfinite(weight_values)) or weight_values.sum() <= 0:
            # Unusable weights (all zero / NaN / inf): plain average instead
            # of a ZeroDivisionError or a NaN result.
            weight_values = np.ones(len(recent_matches))
        weighted_win_rate = np.average(player_won.astype(int), weights=weight_values)

        # WRank/LRank belong to the winner/loser of each row, so pick the
        # player's and the opponent's rank row by row.
        winner_rank = pd.to_numeric(recent_matches['WRank'], errors='coerce')
        loser_rank = pd.to_numeric(recent_matches['LRank'], errors='coerce')
        player_ranks = winner_rank.where(player_won, loser_rank).dropna()
        avg_opponent_rank = loser_rank.where(player_won, winner_rank).mean()

        if len(player_ranks) == 0 or pd.isna(avg_opponent_rank):
            return 1.0

        rank_diff = player_ranks.iloc[-1] - avg_opponent_rank
        expected_win_rate = 1 / (1 + 10 ** (rank_diff / 400))
        return min(2.0, weighted_win_rate / max(0.1, expected_win_rate))

    # ------------------------------------------------------------------
    # Simulation
    # ------------------------------------------------------------------
    def simulate_match(self, player1_priors, player2_priors, best_of=3):
        """Share of ``self.simulation_count`` simulated matches won by player 1."""
        return self._simulate_matches(
            player1_priors, player2_priors, best_of, self.simulation_count
        )

    def simulate_set(self, p1_priors, p2_priors):
        """Simulate one set; return 0 if player 1 wins it, 1 otherwise.

        Player 1 serves first and serve alternates every game. A set ends at
        6+ games with a two-game lead, or by tiebreak at 6-6.
        """
        games = [0, 0]
        server = 0
        while True:
            hold_prob = p1_priors['hold_prob'] if server == 0 else p2_priors['hold_prob']
            game_winner = server if np.random.random() < hold_prob else 1 - server
            games[game_winner] += 1
            server = 1 - server
            if games[0] >= 6 and games[0] - games[1] >= 2:
                return 0
            elif games[1] >= 6 and games[1] - games[0] >= 2:
                return 1
            elif games[0] == 6 and games[1] == 6:
                return self.simulate_tiebreak(p1_priors, p2_priors)

    def simulate_tiebreak(self, p1_priors, p2_priors):
        """Simulate a first-to-7, win-by-2 tiebreak; return the winner index.

        NOTE: each point is decided with the server's game-level
        ``hold_prob``, i.e. the hold probability doubles as a point-win
        probability here.
        """
        points = [0, 0]
        server = 0
        serve_count = 0
        while True:
            hold_prob = p1_priors['hold_prob'] if server == 0 else p2_priors['hold_prob']
            point_winner = server if np.random.random() < hold_prob else 1 - server
            points[point_winner] += 1
            serve_count += 1
            # Service changes after the 1st point and then every two points,
            # i.e. after every odd-numbered point (1, 3, 5, ...).
            if serve_count % 2 == 1:
                server = 1 - server
            if points[0] >= 7 and points[0] - points[1] >= 2:
                return 0
            elif points[1] >= 7 and points[1] - points[0] >= 2:
                return 1

    # ------------------------------------------------------------------
    # Elo / surface helpers
    # ------------------------------------------------------------------
    def get_player_weighted_elo(self, player_canonical, surface, reference_date):
        """Elo-like rating from the player's latest ``BlendScore``.

        Looks at the latest match on ``surface`` up to ``reference_date``,
        then at the latest match on any surface, and returns 1500 when no
        usable ``BlendScore`` exists.

        NOTE(review): the surface-specific path scales BlendScore by 50 and
        the any-surface path by 200 - confirm the difference is intended.
        """
        surface_score = self._latest_blend_score(
            self._player_history(player_canonical, surface=surface, reference_date=reference_date)
        )
        if surface_score is not None:
            return 1500 + surface_score * 50

        overall_score = self._latest_blend_score(
            self._player_history(player_canonical, reference_date=reference_date)
        )
        if overall_score is not None:
            return 1500 + overall_score * 200

        return 1500

    def calculate_surface_adaptation(self, player_canonical, target_surface):
        """Surface win rate relative to overall win rate, clamped to [0.7, 1.5].

        Returns 1.0 with fewer than 10 matches overall, fewer than 3 on the
        target surface, or no wins at all.
        """
        player_matches = self._player_history(player_canonical).copy()

        if len(player_matches) < 10:
            return 1.0

        surface_matches = player_matches[player_matches['Surface'] == target_surface]
        if len(surface_matches) < 3:
            return 1.0

        surface_wins = (surface_matches['winner_canonical'] == player_canonical).sum()
        surface_win_rate = surface_wins / len(surface_matches)

        total_wins = (player_matches['winner_canonical'] == player_canonical).sum()
        baseline_win_rate = total_wins / len(player_matches)

        if baseline_win_rate == 0:
            return 1.0

        adaptation_ratio = surface_win_rate / baseline_win_rate
        return max(0.7, min(1.5, adaptation_ratio))

    # ------------------------------------------------------------------
    # Evaluation
    # ------------------------------------------------------------------
    def evaluate_predictions(self, test_data):
        """Evaluate model accuracy on a test dataset.

        Each row is predicted with the actual winner as player 1, so a
        prediction counts as correct when the probability exceeds 0.5.
        Returns 0 for an empty dataset.

        NOTE: priors are built from ``self.historical_data`` up to and
        including the match date, so a test match that is also in the history
        contributes to its own prediction.
        """
        correct = 0
        total = 0

        for _, match in test_data.iterrows():
            prob = self.predict_match_outcome(
                match['winner_canonical'],
                match['loser_canonical'],
                match['Surface'],
                match['gender'],
                match['Date']
            )

            predicted_winner = match['winner_canonical'] if prob > 0.5 else match['loser_canonical']
            actual_winner = match['winner_canonical']

            if predicted_winner == actual_winner:
                correct += 1
            total += 1

        return correct / total if total > 0 else 0

def convert_to_canonical(name):
    """Alias for ``normalize_name_canonical``: return the canonical name."""
    canonical = normalize_name_canonical(name)
    return canonical

# Shared model instance; its constructor reads the module-level `jeff_data`
# and `historical_data`, so those must already exist when this line runs.
model = BayesianTennisModel()

In [None]:
## LAYER 2 ##
def apply_contextual_adjustments(self, priors, player_canonical, opponent_canonical, match_context):
    """Layer 2: adjust a copy of ``priors`` for fatigue, injury, form and opponent.

    ``priors`` itself is left untouched. ``match_context`` must provide
    ``reference_date`` (plus whatever the helper methods read from it).

    Returns:
        The adjusted copy of ``priors``.
    """
    adjusted = priors.copy()

    # Fatigue: up to 15% off the hold probability, up to 30% more Elo spread.
    fatigue = self.calculate_fatigue_index(player_canonical, match_context['reference_date'])
    adjusted['hold_prob'] *= (1 - fatigue * 0.15)
    adjusted['elo_std'] *= (1 + fatigue * 0.3)

    # Injury: scales hold probability down and break probability up.
    injury = self.get_injury_factor(player_canonical, match_context['reference_date'])
    adjusted['hold_prob'] *= injury
    adjusted['break_prob'] *= (2 - injury)

    # Form: a hot streak (form_factor > 1.2) is discounted by how little of
    # it is judged sustainable.
    sustainability = self.calculate_form_sustainability(player_canonical, match_context)
    if adjusted['form_factor'] > 1.2:
        discount = 1 - ((adjusted['form_factor'] - 1) * (1 - sustainability))
        adjusted['hold_prob'] *= discount
        adjusted['elo_mean'] *= discount

    # Opponent quality: sigmoid of the Elo gap scales the break probability.
    opponent_elo = self.estimate_opponent_elo(opponent_canonical, match_context)
    elo_gap = adjusted['elo_mean'] - opponent_elo
    quality = 1 / (1 + np.exp(-elo_gap / 200))
    adjusted['break_prob'] *= quality

    return adjusted

def calculate_fatigue_index(self, player_canonical, reference_date):
    """Return a 0-1 fatigue score from the player's last 14 days of matches.

    Each match contributes its duration in hours (``minutes`` column, 120
    when absent) damped by ``exp(-0.1 * days_ago)``; the sum is divided by 10
    and capped at 1.0. No recent matches gives 0.0.
    """
    recent_matches = self.get_recent_matches(player_canonical, reference_date, days=14)
    if len(recent_matches) == 0:
        return 0.0

    reference = pd.to_datetime(reference_date)

    def contribution(match):
        # Hours played, discounted by how long ago the match was.
        days_ago = (reference - pd.to_datetime(match['Date'])).days
        hours = match.get('minutes', 120) / 60
        return hours * np.exp(-0.1 * days_ago)

    fatigue_score = sum(contribution(match) for _, match in recent_matches.iterrows())
    return min(1.0, fatigue_score / 10)

def get_injury_factor(self, player_canonical, reference_date):
    """Return a multiplicative injury factor, never below 0.5.

    Players listed in the hard-coded table get their own base factor, all
    others 0.95. Every recent retirement reported by
    ``check_recent_retirements`` multiplies the factor by 0.8.

    NOTE(review): the table keys look like "lastname_initial"; confirm this
    matches the format of ``player_canonical`` passed by callers.
    """
    # Placeholder table - to be replaced with actual injury tracking.
    fragility_table = {
        'nadal_r': 0.85,
        'murray_a': 0.80,
        'thiem_d': 0.75,
        'badosa_p': 0.70
    }
    factor = fragility_table.get(player_canonical, 0.95)

    retirements = self.check_recent_retirements(player_canonical, reference_date)
    if retirements > 0:
        factor *= (0.8 ** retirements)

    return max(0.5, factor)

def calculate_form_sustainability(self, player_canonical, match_context):
    """Score (at most 1.0) how sustainable the player's recent form looks.

    Uses the matches of the 21 days before ``match_context['reference_date']``.
    Each match is scored ``1 / (1 + opponent_rank / 100)`` (0.5 when the rank
    is missing); the result is the mean score times ``1 - std`` of the scores.
    Fewer than 3 matches gives 0.5.
    """
    recent_matches = self.get_recent_matches(
        player_canonical, match_context['reference_date'], days=21
    )
    if len(recent_matches) < 3:
        return 0.5

    def opponent_quality(match):
        # The opponent is the loser when the player won, the winner otherwise.
        if match['winner_canonical'] == player_canonical:
            opponent_rank = match['LRank']
        else:
            opponent_rank = match['WRank']
        if pd.notna(opponent_rank):
            return 1 / (1 + opponent_rank / 100)
        return 0.5

    quality_scores = [opponent_quality(match) for _, match in recent_matches.iterrows()]

    mean_quality = np.mean(quality_scores)
    consistency = 1 - np.std(quality_scores)
    return min(1.0, mean_quality * consistency)

def estimate_opponent_elo(self, opponent_canonical, match_context):
    """Return the opponent's prior ``elo_mean`` for quality weighting.

    The priors are built with ``extract_refined_priors`` from the
    ``gender``, ``surface`` and ``reference_date`` entries of ``match_context``.
    """
    gender = match_context['gender']
    surface = match_context['surface']
    reference_date = match_context['reference_date']
    opponent_priors = self.extract_refined_priors(
        opponent_canonical, gender, surface, reference_date
    )
    return opponent_priors['elo_mean']

def get_recent_matches(self, player_canonical, reference_date, days=14):
    """Return the player's matches in the ``days`` up to ``reference_date``.

    The window is ``[reference_date - days, reference_date]``; matches dated
    after ``reference_date`` are excluded so that callers measuring "days ago"
    never see negative ages. Rows whose ``Date`` cannot be parsed are dropped.

    Args:
        player_canonical: Canonical player name matched against the
            ``winner_canonical`` / ``loser_canonical`` columns.
        reference_date: End of the window (anything ``pd.to_datetime`` accepts).
        days: Length of the window in days.

    Returns:
        A copy of the matching rows sorted by ``Date`` (with ``Date`` converted
        to datetime), or an empty DataFrame when the lookup fails.
    """
    try:
        window_end = pd.to_datetime(reference_date)
        cutoff_date = window_end - pd.Timedelta(days=days)

        player_matches = self.historical_data[
            ((self.historical_data['winner_canonical'] == player_canonical) |
             (self.historical_data['loser_canonical'] == player_canonical))
        ].copy()

        if len(player_matches) == 0:
            return player_matches

        # Force string conversion then datetime to avoid mixed types
        player_matches['Date'] = pd.to_datetime(player_matches['Date'].astype(str), errors='coerce')
        player_matches = player_matches.dropna(subset=['Date'])
        in_window = (player_matches['Date'] >= cutoff_date) & (player_matches['Date'] <= window_end)
        player_matches = player_matches[in_window]

        return player_matches.sort_values('Date')
    except Exception:
        # Best effort: any data problem yields "no recent matches". Unlike a
        # bare `except:`, this lets KeyboardInterrupt/SystemExit propagate.
        return pd.DataFrame()

def check_recent_retirements(self, player_canonical, reference_date):
    """Placeholder for counting a player's recent retirements/walkovers.

    No retirement tracking is wired in yet, so both arguments are ignored
    and the count is always zero.
    """
    # A real implementation depends on how retirement flags are stored.
    return 0

In [None]:
## LAYER 3 ##
def simulate_match(self, player1_priors, player2_priors, best_of=3, tiebreak_sets=(1, 2, 3)):
    """Layer 3: Monte Carlo match simulation with Bayesian priors.

    Plays ``self.simulation_count`` independent matches set by set via
    ``self.simulate_set`` and reports how often player 1 wins.

    Args:
        player1_priors: Priors for player 1, passed through to ``simulate_set``.
        player2_priors: Priors for player 2, passed through to ``simulate_set``.
        best_of: Maximum number of sets; the first player to reach
            ``(best_of + 1) // 2`` sets wins the match.
        tiebreak_sets: 1-based set numbers that are decided by a tiebreak;
            every other set is simulated with ``tiebreak=False``. With the
            default, sets 4 and 5 of a best-of-5 match are played without
            a tiebreak.

    Returns:
        Fraction of simulated matches won by player 1.
    """
    simulations = self.simulation_count
    sets_needed = (best_of + 1) // 2
    wins = 0

    for _ in range(simulations):
        sets_won = [0, 0]  # [player1, player2]

        while max(sets_won) < sets_needed:
            # The set about to start is one more than the sets completed.
            # Counting players with at least one set (as before) capped the
            # number at 3 and mislabelled every set after the first.
            set_number = sum(sets_won) + 1
            set_winner = self.simulate_set(
                player1_priors,
                player2_priors,
                tiebreak=set_number in tiebreak_sets
            )
            sets_won[set_winner] += 1

        if sets_won[0] > sets_won[1]:
            wins += 1

    return wins / simulations

def simulate_set(self, p1_priors, p2_priors, tiebreak=True):
    """Simulate one set game by game and return the winner's index (0 or 1).

    Player 1 serves the opening game and serve alternates every game. The
    server wins a game with probability equal to their ``hold_prob``. The
    set ends once a player has at least six games and leads by two; at 6-6
    the result is delegated to ``self.simulate_tiebreak`` when ``tiebreak``
    is truthy, otherwise play continues.
    """
    priors = (p1_priors, p2_priors)
    games = [0, 0]
    server = 0  # player 1 opens the set on serve

    while True:
        hold_prob = priors[server]['hold_prob']
        held = np.random.random() < hold_prob
        game_winner = server if held else 1 - server

        games[game_winner] += 1
        server = 1 - server  # serve changes hands after every game

        lead = games[0] - games[1]
        if max(games) >= 6 and abs(lead) >= 2:
            return 0 if lead > 0 else 1
        if games == [6, 6] and tiebreak:
            return self.simulate_tiebreak(p1_priors, p2_priors)

def simulate_tiebreak(self, p1_priors, p2_priors):
    """Simulate a tiebreak point by point and return the winner (0 or 1).

    Player 1 serves the first point; after that the serve changes hands
    after every odd-numbered point, so each player serves two points in a
    row (order 0, 1, 1, 0, 0, 1, 1, ...). The tiebreak ends when a player
    has at least seven points and leads by two.

    Args:
        p1_priors: Mapping with a ``hold_prob`` entry for player 1.
        p2_priors: Mapping with a ``hold_prob`` entry for player 2.

    Returns:
        0 if player 1 wins the tiebreak, 1 if player 2 wins.
    """
    priors = (p1_priors, p2_priors)
    points = [0, 0]
    server = 0
    points_played = 0

    while True:
        # NOTE(review): 'hold_prob' is reused as the server's chance of
        # winning a single point -- presumably it is a per-game figure, so
        # confirm this is the intended point-level probability.
        hold_prob = priors[server]['hold_prob']
        server_wins = np.random.random() < hold_prob
        point_winner = server if server_wins else 1 - server

        points[point_winner] += 1
        points_played += 1

        # Serve changes after points 1, 3, 5, ... The previous rule
        # (after point 1 and after every even point) gave player 2 a
        # single serve before handing it back.
        if points_played % 2 == 1:
            server = 1 - server

        lead = points[0] - points[1]
        if max(points) >= 7 and abs(lead) >= 2:
            return 0 if lead > 0 else 1

def simulate_match(self, player1_priors, player2_priors, best_of=3, tiebreak_sets=(1, 2, 3)):
    """Estimate player 1's match-win probability by Monte Carlo simulation.

    Runs ``self.simulation_count`` matches, each played set by set through
    ``self.simulate_set`` until one player has ``(best_of + 1) // 2`` sets.

    Args:
        player1_priors: Priors for player 1, passed through to ``simulate_set``.
        player2_priors: Priors for player 2, passed through to ``simulate_set``.
        best_of: Maximum number of sets in the match.
        tiebreak_sets: 1-based set numbers decided by a tiebreak; all other
            sets are simulated with ``tiebreak=False``.

    Returns:
        Fraction of simulated matches won by player 1.
    """
    simulations = self.simulation_count
    sets_needed = (best_of + 1) // 2
    wins = 0

    for _ in range(simulations):
        sets_won = [0, 0]

        while max(sets_won) < sets_needed:
            # Number of the upcoming set = completed sets + 1. The earlier
            # count of "players holding at least one set" could never
            # exceed 3, so later sets were matched against the wrong number.
            set_number = sum(sets_won) + 1
            set_winner = self.simulate_set(
                player1_priors,
                player2_priors,
                tiebreak=set_number in tiebreak_sets
            )
            sets_won[set_winner] += 1

        if sets_won[0] > sets_won[1]:
            wins += 1

    return wins / simulations

def simulate_set(self, p1_priors, p2_priors, tiebreak=True):
    """Play out a single set and return the index of the winner (0 or 1).

    Games alternate serve starting with player 1, and the server takes the
    game with probability ``hold_prob``. A player wins the set with six or
    more games and a two-game margin; at 6-6 ``self.simulate_tiebreak``
    decides the set when ``tiebreak`` is truthy.
    """
    both_priors = (p1_priors, p2_priors)
    score = [0, 0]
    serving = 0

    while True:
        hold_prob = both_priors[serving]['hold_prob']
        if np.random.random() < hold_prob:
            game_winner = serving
        else:
            game_winner = 1 - serving

        score[game_winner] += 1
        serving = 1 - serving

        margin = score[0] - score[1]
        if score[0] >= 6 and margin >= 2:
            return 0
        if score[1] >= 6 and margin <= -2:
            return 1
        if score == [6, 6] and tiebreak:
            return self.simulate_tiebreak(p1_priors, p2_priors)

def simulate_tiebreak(self, p1_priors, p2_priors):
    """Simulate a tiebreak point by point and return the winner (0 or 1).

    This definition was cut off after reading player 1's ``hold_prob``,
    leaving a ``while True`` loop with no exit; the body is completed here.
    Player 1 serves the first point and the serve then changes hands after
    every odd-numbered point (order 0, 1, 1, 0, 0, ...). The tiebreak ends
    when a player has at least seven points and leads by two.

    Args:
        p1_priors: Mapping with a ``hold_prob`` entry for player 1.
        p2_priors: Mapping with a ``hold_prob`` entry for player 2.

    Returns:
        0 if player 1 wins the tiebreak, 1 if player 2 wins.
    """
    points = [0, 0]
    server = 0
    serve_count = 0

    while True:
        # NOTE(review): 'hold_prob' doubles as the server's single-point win
        # probability here -- confirm that is the intended quantity.
        if server == 0:
            hold_prob = p1_priors['hold_prob']
            point_winner = 0 if np.random.random() < hold_prob else 1
        else:
            hold_prob = p2_priors['hold_prob']
            point_winner = 1 if np.random.random() < hold_prob else 0

        points[point_winner] += 1
        serve_count += 1

        # Change server after points 1, 3, 5, ... so each player serves two
        # consecutive points after the opening point.
        if serve_count % 2 == 1:
            server = 1 - server

        if points[0] >= 7 and points[0] - points[1] >= 2:
            return 0
        if points[1] >= 7 and points[1] - points[0] >= 2:
            return 1

In [None]:
# Tomorrow's slate

import requests
from datetime import date, timedelta

import os

# The api-tennis.com key can be supplied through the API_TENNIS_KEY
# environment variable so it does not have to live in the notebook.
# NOTE(review): the fallback below is a credential committed in source --
# rotate it and drop the literal once the environment variable is in use.
API_KEY = os.environ.get(
    "API_TENNIS_KEY",
    "adfc70491c47895e5fffdc6428bbf36a561989d4bffcfa9ecfba8d91e947b4fb",
)
# Endpoint that every request in this cell is sent to
BASE = "https://api.api-tennis.com/tennis/"

def get_matches_for_date(target_date, timeout=30):
    """Fetch the fixtures for one date and flatten them into match dicts.

    Args:
        target_date: Date string sent as both ``date_start`` and
            ``date_stop`` of the ``get_fixtures`` request.
        timeout: Seconds to wait for the API before ``requests`` raises,
            so a stalled connection cannot hang the notebook.

    Returns:
        A list with one dict per event in the response's ``result`` entry,
        holding the event key, both player names, tournament name and
        round, status, event type, surface, time and date. Surface is
        ``'Unknown'`` unless the tournament name is one of the Grand Slam
        names in the mapping below.

    Raises:
        RuntimeError: If the HTTP status is not 200.
        KeyError: If an event lacks ``event_first_player`` or
            ``event_second_player``.
    """
    params = {
        "method": "get_fixtures",
        "APIkey": API_KEY,
        "date_start": target_date,
        "date_stop": target_date
    }
    response = requests.get(BASE, params=params, timeout=timeout)
    if response.status_code != 200:
        raise RuntimeError(f"HTTP {response.status_code}")

    # Surface mapping
    TOURNAMENT_SURFACES = {
        'ATP Wimbledon': 'Grass',
        'WTA Wimbledon': 'Grass',
        'ATP French Open': 'Clay',
        'WTA French Open': 'Clay',
        'ATP US Open': 'Hard',
        'WTA US Open': 'Hard',
        'ATP Australian Open': 'Hard',
        'WTA Australian Open': 'Hard'
    }

    # A missing or null "result" is treated as "no fixtures" rather than
    # crashing the loop on None.
    events = response.json().get("result") or []

    return [
        {
            'event_key': event.get('event_key'),
            'player1_name': event['event_first_player'],
            'player2_name': event['event_second_player'],
            'tournament_name': event.get('tournament_name', 'Unknown'),
            'tournament_round': event.get('tournament_round', ''),
            'event_status': event.get('event_status', ''),
            'event_type_type': event.get('event_type_type', ''),
            'surface': TOURNAMENT_SURFACES.get(event.get('tournament_name', ''), 'Unknown'),
            'time': event.get('event_time', ''),
            'date': event.get('event_date', '')
        }
        for event in events
    ]

def get_high_confidence_matches(target_date, min_confidence=0.2):
    """List the day's matches whose simulated favourite is clear enough.

    Each fixture returned by ``get_matches_for_date`` is simulated with the
    global ``model``. Confidence is the absolute distance of player 1's win
    probability from 0.5; fixtures below ``min_confidence`` are dropped.

    Returns:
        A list of dicts with ``match``, ``favorite``, ``probability`` and
        ``confidence`` keys, ordered by confidence, highest first.
    """
    picks = []
    for fixture in get_matches_for_date(target_date):
        first_name = fixture['player1_name']
        second_name = fixture['player2_name']

        canonical_names = [convert_to_canonical(n) for n in (first_name, second_name)]
        # NOTE(review): priors are always requested for 'men', including for
        # fixtures from the women's tour -- confirm this is intended.
        priors = [
            model.extract_refined_priors(name, 'men', fixture['surface'], target_date)
            for name in canonical_names
        ]

        p1_win_prob = model.simulate_match(*priors)
        confidence = abs(p1_win_prob - 0.5)
        if not confidence >= min_confidence:
            continue

        picks.append({
            'match': f"{first_name} vs {second_name}",
            'favorite': first_name if p1_win_prob > 0.5 else second_name,
            'probability': max(p1_win_prob, 1 - p1_win_prob),
            'confidence': confidence
        })

    picks.sort(key=lambda pick: pick['confidence'], reverse=True)
    return picks

# Usage
# Date strings in ISO format for today and tomorrow
today = date.today().isoformat()
tomorrow = (date.today() + timedelta(days=1)).isoformat()

# Fetch both slates when the cell runs (one API request per date)
todays_matches = get_matches_for_date(today)
tomorrows_matches = get_matches_for_date(tomorrow)

In [None]:
# Notebook display: show today's fixtures (swap in tomorrows_matches to see tomorrow's slate)
todays_matches

In [None]:
# Get top 5 picks
def get_top_confidence_matches(target_date, top_n=5, min_confidence=0.05):
    """Return the ``top_n`` most confident picks for ``target_date``.

    The fetch / simulate / filter / sort pipeline is the one implemented by
    ``get_high_confidence_matches``; this function reuses it instead of
    carrying a second copy and keeps only the leading ``top_n`` entries.

    Args:
        target_date: Date string forwarded to the fixture lookup.
        top_n: Maximum number of picks to return.
        min_confidence: Minimum absolute distance of player 1's win
            probability from 0.5 for a match to be considered.

    Returns:
        Up to ``top_n`` dicts with ``match``, ``favorite``, ``probability``
        and ``confidence`` keys, highest confidence first.
    """
    ranked = get_high_confidence_matches(target_date, min_confidence=min_confidence)
    return ranked[:top_n]

if __name__ == "__main__":
    # Today's five strongest picks at a confidence floor of 0.15
    target_date = date.today().isoformat()
    picks = get_top_confidence_matches(target_date, top_n=5, min_confidence=0.15)

    for i, pick in enumerate(picks, 1):
        # One numbered, indented summary per pick, followed by a blank line
        print(
            f"{i}. {pick['match']}",
            f"   Favorite: {pick['favorite']}",
            f"   Win Prob: {pick['probability']:.2%}",
            f"   Confidence: {pick['confidence']:.5%}\n",
            sep="\n",
        )

In [None]:
# See picks
from datetime import date

# Today's five most confident picks, with a confidence floor of 0.05
picks = get_top_confidence_matches(
    date.today().isoformat(),
    top_n=5,
    min_confidence=0.05,
)

# Print a numbered summary for each pick, separated by blank lines
for i, pick in enumerate(picks, 1):
    print(
        f"{i}. {pick['match']}",
        f"   Favorite: {pick['favorite']}",
        f"   Win Prob: {pick['probability']:.2%}",
        f"   Confidence: {pick['confidence']:.1%}\n",
        sep="\n",
    )

In [None]:
import pandas as pd

# Notebook display: tabulate the picks, one row per pick dict
pd.DataFrame(picks)

In [None]:
# Chronological split: rows dated before the cutoff train, the rest test
split_date = '2023-01-01'
match_dates = pd.to_datetime(historical_data['Date'])
train_data = historical_data[match_dates < split_date]
test_data = historical_data[match_dates >= split_date]

# Restrict the existing model to the training rows
model.historical_data = train_data

# Score the model on the first 100 held-out matches
accuracy = model.evaluate_predictions(test_data.head(100))
print(f"Enhanced model accuracy: {accuracy:.3f}")

# Score a freshly constructed model on the same sample
# NOTE(review): how `model` differs from a default BayesianTennisModel is
# not visible in this cell -- confirm the two are configured differently,
# otherwise the comparison measures nothing.
model_baseline = BayesianTennisModel()
model_baseline.historical_data = train_data
baseline_accuracy = model_baseline.evaluate_predictions(test_data.head(100))
print(f"Baseline accuracy: {baseline_accuracy:.3f}")
print(f"Improvement: {accuracy - baseline_accuracy:.3f}")

In [None]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import html

class TennisAbstractScraper:
    """Scrape match-charting tables from a Tennis Abstract match page.

    The page's inline scripts assign each stats table to a JavaScript string
    variable (``var name = '<table>...';``), and ``span.rounds`` elements tie
    a visible section label to one of those variables through the span's
    ``id``. The ``scrape_*`` methods fetch the page, pull out one section
    and return a list of flat per-player records.
    """

    # Seconds to wait for a page before giving up instead of hanging forever
    REQUEST_TIMEOUT = 30

    # Serve Influence column header -> key used in the output record
    MAP_SERVE_INFL = {
        'Wide %':   'serve_wide_pct',
        'T %':      'serve_t_pct',
        'Body %':   'serve_body_pct'
    }

    # Inline script assignments of the form: var name = '...';
    _JS_STRING_VAR = re.compile(r"var\s+(\w+)\s*=\s*'([\s\S]*?)';", flags=re.S)

    # Cells shaped like "12 (7/5)": total, forehand, backhand
    _TOTAL_FH_BH = re.compile(r'(\d+)\s*\((\d+)/(\d+)\)')

    # Serve Basics columns 1..6 in the order they are read (column 0 is the
    # player). NOTE(review): positions are assumed, not read from the header
    # row -- confirm against a live page.
    _SERVE_BASICS_FIELDS = (
        'serve_pts', 'aces', 'dfs', 'first_in', 'first_won', 'second_won'
    )

    # NOTE(review): Stats Overview percentages are turned into counts using
    # these fixed serve-point totals, not totals read from the page. They
    # look like placeholders from one match -- confirm before relying on
    # the derived counts.
    _ASSUMED_SERVE_PTS = {'Total': 67, 'SET 1': 40}
    _ASSUMED_SERVE_PTS_OTHER = 27

    def __init__(self):
        self.headers = {"User-Agent": "Mozilla/5.0"}

    # ------------------------------------------------------------------
    # Page loading
    # ------------------------------------------------------------------
    def _load_page(self, url):
        """Fetch ``url``; return ``(soup, blocks)`` where ``blocks`` maps each
        inline JS string variable name to its raw (still escaped) value."""
        resp = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(resp.text, "lxml")
        scripts = [tag.string for tag in soup.find_all("script") if tag.string]
        blocks = dict(self._JS_STRING_VAR.findall("\n".join(scripts)))
        return soup, blocks

    @staticmethod
    def _section_labels(soup):
        """Map each ``span.rounds`` label text to the span's ``id``."""
        return {
            span.get_text(strip=True): span["id"]
            for span in soup.select("span.rounds")
            if span.has_attr("id")
        }

    def _load_sections(self, url):
        """Return ``{section label: unescaped HTML}`` for every labelled
        section whose JS variable is present on the page."""
        soup, blocks = self._load_page(url)
        return {
            label: html.unescape(blocks[token])
            for label, token in self._section_labels(soup).items()
            if token in blocks
        }

    # ------------------------------------------------------------------
    # Public scrapers
    # ------------------------------------------------------------------
    # Stats Overview
    def scrape_stats_overview(self, url):
        """Return Jeff-format records built from the "Stats Overview" section.

        Raises:
            KeyError: If the section yields rows but ``url`` does not match
                the charting-URL pattern (no ``Date``/``tournament``).
        """
        sections = self._load_sections(url)
        match_info = self._parse_match_url(url)
        stats_data = self._extract_stats_overview_table(sections.get("Stats Overview", ""))
        return self._convert_to_jeff_format(stats_data, match_info)

    # Serve Basics
    def scrape_serve_basics(self, url):
        """Return Jeff-format records built from the "Serve Basics" section.

        Raises:
            KeyError: If the section yields rows but ``url`` does not match
                the charting-URL pattern (no ``Date``/``tournament``).
        """
        sections = self._load_sections(url)
        match_info = self._parse_match_url(url)
        serve_data = self._parse_serve_basics(sections.get("Serve Basics", ""))
        return self._convert_serve_basics_to_jeff(serve_data, match_info)

    def scrape_serve_influence(self, url):
        """Return serve-direction fractions per player from the ``serve`` block.

        Reads the JS variable named ``serve`` directly (not via the section
        labels). Returns ``[]`` when the block or its table is missing.
        """
        _, blocks = self._load_page(url)
        html_block = html.unescape(blocks.get('serve', ''))
        if not html_block:
            return []

        tbl = BeautifulSoup(html_block, 'html.parser').table
        if tbl is None or tbl.tr is None:
            return []

        heads = [c.get_text(strip=True) for c in tbl.tr.find_all(['th', 'td'])]
        out = []
        for row in tbl.find_all('tr')[1:]:
            cells = [c.get_text(strip=True) for c in row.find_all('td')]
            if cells:  # rows without <td> cells carry no player data
                out.append(self._serve_influence_record(heads, cells))
        return out

    def _serve_influence_record(self, heads, cells):
        """Build one record from a header list and a row's cell texts.

        ``cells[0]`` is the player; each later cell whose header appears in
        ``MAP_SERVE_INFL`` is stored as a fraction (``'50%'`` -> ``0.5``).
        Cells that are not numeric are left out of the record.
        """
        rec = {'Player_canonical': self._normalize_player_name(cells[0])}
        for head, value in zip(heads[1:], cells[1:]):
            # Looked up through self: a bare MAP_SERVE_INFL is not visible
            # from inside a method and raised NameError.
            key = self.MAP_SERVE_INFL.get(head)
            if not key:
                continue
            try:
                rec[key] = float(value.rstrip('%')) / 100
            except ValueError:
                continue
        return rec

    # ------------------------------------------------------------------
    # Table parsing
    # ------------------------------------------------------------------
    @staticmethod
    def _table_rows(html_content):
        """Return ``(headers, data_rows)`` for the first table in
        ``html_content``, or ``None`` when there is no table or no rows.

        ``data_rows`` holds the ``<td>`` texts of every row after the first
        that has at least as many cells as the header row.
        """
        table = BeautifulSoup(html_content, 'html.parser').find('table')
        if table is None:
            return None

        rows = table.find_all('tr')
        if not rows:
            return None

        headers = [th.get_text(strip=True) for th in rows[0].find_all(['th', 'td'])]
        data_rows = []
        for row in rows[1:]:
            cells = [td.get_text(strip=True) for td in row.find_all('td')]
            if len(cells) >= len(headers):
                data_rows.append(cells)
        return headers, data_rows

    @staticmethod
    def _iter_player_rows(data_rows):
        """Yield ``(set_name, player_name, row)`` for every player row.

        Rows start under the set name ``"Total"``; a row whose first cell
        starts with ``'SET'`` switches the current set and is not yielded.
        Empty rows and rows with an empty first cell are skipped.
        """
        current_set = "Total"
        for row in data_rows:
            if not row or not row[0]:
                continue
            if row[0].startswith('SET'):
                current_set = row[0]
                continue
            yield current_set, row[0], row

    @staticmethod
    def _cell(row, idx, default):
        """Return ``row[idx]``, or ``default`` when the row is too short."""
        return row[idx] if len(row) > idx else default

    @staticmethod
    def _int_cell(row, idx):
        """Return ``row[idx]`` as an int, or 0 when missing or not all digits."""
        return int(row[idx]) if len(row) > idx and row[idx].isdigit() else 0

    @classmethod
    def _split_counts(cls, text):
        """Parse ``'12 (7/5)'`` into ``(12, 7, 5)``; ``(0, 0, 0)`` otherwise."""
        found = cls._TOTAL_FH_BH.match(text)
        if not found:
            return 0, 0, 0
        return tuple(int(group) for group in found.groups())

    @staticmethod
    def _pct_fraction(text):
        """Turn a cell such as ``'64%'`` into ``0.64``; 0.0 if not numeric."""
        try:
            return float(text.rstrip('%')) / 100
        except ValueError:
            return 0.0

    @staticmethod
    def _to_int(text):
        """Return ``int(text)``, or 0 when the text is not an integer."""
        try:
            return int(text)
        except ValueError:
            return 0

    def _parse_serve_basics(self, html_content):
        """Parse Serve Basics section - serves, aces, double faults breakdown"""
        parsed = self._table_rows(html_content)
        if parsed is None:
            return {}
        return self._parse_serve_basics_data(*parsed)

    def _parse_serve_basics_data(self, headers, data_rows):
        """Convert Serve Basics rows to ``{set: {player: {field: int}}}``.

        ``headers`` is accepted for signature compatibility but not used;
        columns are read by position (see ``_SERVE_BASICS_FIELDS``).
        """
        stats_data = {}
        for set_name, player_name, row in self._iter_player_rows(data_rows):
            stats_data.setdefault(set_name, {})[player_name] = {
                field: self._int_cell(row, idx)
                for idx, field in enumerate(self._SERVE_BASICS_FIELDS, start=1)
            }
        return stats_data

    def _convert_serve_basics_to_jeff(self, serve_data, match_info):
        """Convert serve basics data to Jeff format records"""
        jeff_records = []

        for set_name, set_data in serve_data.items():
            for player, data in set_data.items():
                jeff_records.append({
                    'match_id': f"{match_info['Date']}-{player.replace(' ', '_')}",
                    'Date': match_info['Date'],
                    'Tournament': match_info['tournament'],
                    'player': player,
                    'Player_canonical': self._normalize_player_name(player),
                    'set': set_name,
                    'serve_pts': data['serve_pts'],
                    'aces': data['aces'],
                    'dfs': data['dfs'],
                    'first_in': data['first_in'],
                    'first_won': data['first_won'],
                    'second_won': data['second_won']
                })

        return jeff_records

    def _extract_stats_overview_table(self, html_content):
        """Parse the Stats Overview HTML into ``{set: {player: stats}}``."""
        parsed = self._table_rows(html_content)
        if parsed is None:
            return {}
        return self._parse_tennis_stats(*parsed)

    def _parse_tennis_stats(self, headers, data_rows):
        """Convert Stats Overview rows to ``{set: {player: {stat: str}}}``.

        Columns are read by position: 1-5 and 7 are percentage cells, 6 is
        ``saved/faced`` break points, 8 and 9 are ``total (fh/bh)`` cells for
        winners and unforced errors. Missing columns fall back to zeros.
        ``headers`` is accepted for signature compatibility but not used.
        """
        stats_data = {}
        for set_name, player_name, row in self._iter_player_rows(data_rows):
            winners, winners_fh, winners_bh = self._split_counts(self._cell(row, 8, "0 (0/0)"))
            ufe, ufe_fh, ufe_bh = self._split_counts(self._cell(row, 9, "0 (0/0)"))

            stats_data.setdefault(set_name, {})[player_name] = {
                'aces_pct': self._cell(row, 1, '0%'),
                'df_pct': self._cell(row, 2, '0%'),
                'first_in_pct': self._cell(row, 3, '0%'),
                'first_won_pct': self._cell(row, 4, '0%'),
                'second_won_pct': self._cell(row, 5, '0%'),
                'bp_saved': self._cell(row, 6, '0/0'),
                'rpw_pct': self._cell(row, 7, '0%'),
                'winners': str(winners),
                'winners_fh': str(winners_fh),
                'winners_bh': str(winners_bh),
                'ufe': str(ufe),
                'ufe_fh': str(ufe_fh),
                'ufe_bh': str(ufe_bh)
            }

        return stats_data

    def _parse_match_url(self, url):
        """Split a charting URL into date, gender, tournament, round and players.

        Expects ``YYYYMMDD-[M|W]-tournament-round-player1-player2.html``;
        underscores in the tournament and player parts become spaces.
        Returns ``{}`` when the URL does not match.
        """
        pattern = r'(\d{8})-([MW])-(.+?)-(.+?)-(.+?)-(.+?)\.html'
        match = re.search(pattern, url)
        if not match:
            return {}

        date_str, gender, tournament, round_info, player1, player2 = match.groups()
        return {
            'Date': date_str,
            'gender': gender,
            'tournament': tournament.replace('_', ' '),
            'round': round_info,
            'player1': player1.replace('_', ' '),
            'player2': player2.replace('_', ' ')
        }

    def _convert_to_jeff_format(self, stats_data, match_info):
        """Convert Stats Overview data to Jeff-format count records.

        Percentages are multiplied by an assumed serve-point total (see
        ``_ASSUMED_SERVE_PTS``) and truncated to ints. Percentage and
        break-point cells that are not numeric count as zero.
        """
        jeff_records = []
        for set_name, set_data in stats_data.items():
            serve_pts = self._ASSUMED_SERVE_PTS.get(set_name, self._ASSUMED_SERVE_PTS_OTHER)

            for player, data in set_data.items():
                aces = int(self._pct_fraction(data['aces_pct']) * serve_pts)
                dfs = int(self._pct_fraction(data['df_pct']) * serve_pts)
                first_in = int(self._pct_fraction(data['first_in_pct']) * serve_pts)
                second_serves = serve_pts - first_in

                first_won = (
                    int(self._pct_fraction(data['first_won_pct']) * first_in)
                    if first_in > 0 else 0
                )
                second_won = (
                    int(self._pct_fraction(data['second_won_pct']) * second_serves)
                    if second_serves > 0 else 0
                )

                # 'saved/faced'; a lone number is read as saved with 0 faced
                bp_parts = data['bp_saved'].split('/')
                bp_saved = self._to_int(bp_parts[0])
                bp_faced = self._to_int(bp_parts[1]) if len(bp_parts) > 1 else 0

                return_pts_won = int(self._pct_fraction(data['rpw_pct']) * serve_pts)

                jeff_records.append({
                    'match_id': f"{match_info['Date']}-{player.replace(' ', '_')}",
                    'Date': match_info['Date'],
                    'Tournament': match_info['tournament'],
                    'player': player,
                    'Player_canonical': self._normalize_player_name(player),
                    'set': set_name,
                    'serve_pts': serve_pts,
                    'aces': aces,
                    'dfs': dfs,
                    'first_in': first_in,
                    'first_won': first_won,
                    'second_won': second_won,
                    'bp_saved': bp_saved,
                    'bp_faced': bp_faced,
                    'return_pts_won': return_pts_won,
                    'winners': int(data['winners']),
                    'winners_fh': int(data['winners_fh']),
                    'winners_bh': int(data['winners_bh']),
                    'unforced': int(data['ufe']),
                    'unforced_fh': int(data['ufe_fh']),
                    'unforced_bh': int(data['ufe_bh'])
                })
        return jeff_records

    def _normalize_player_name(self, name):
        """Return ``lastname_firstinitial`` (lowercase, dots removed); names
        with fewer than two parts are lowercased with spaces as underscores."""
        parts = name.lower().replace('.', '').split()
        if len(parts) >= 2:
            return f"{parts[-1]}_{parts[0][0]}"
        return name.lower().replace(' ', '_')

    def debug_available_sections(self, url):
        """Print the section labels found on the page and return the
        ``{label: span id}`` mapping."""
        soup, _ = self._load_page(url)
        labels = self._section_labels(soup)

        print("Available sections:")
        for label in labels:
            print(f"- '{label}'")
        return labels

def test_extraction_completeness(self, url):
    """Test all available sections and validate data structure"""
    sections = self.debug_available_sections(url)

    results = {}
    for section_name in sections.keys():
        try:
            # Test each section extraction
            extracted_data = self._test_section_extraction(url, section_name)
            results[section_name] = len(extracted_data) > 0
        except Exception as e:
            results[section_name] = f"Error: {e}"

    return results

In [None]:
# Smoke-test the three scrapers against one charted match page.
# Each call below fetches the page again over the network.
scraper = TennisAbstractScraper()
url = "https://www.tennisabstract.com/charting/20250628-W-Eastbourne-F-Maya_Joint-Alexandra_Eala.html"

# Test Stats Overview
print("=== STATS OVERVIEW ===")
overview_data = scraper.scrape_stats_overview(url)
for record in overview_data:
    print(record)

# Test Serve Basics
print("\n=== SERVE BASICS ===")
serve_data = scraper.scrape_serve_basics(url)
for record in serve_data:
    print(record)

# Test Serve Influence (serve-direction fractions per player)
print("\n=== SERVE INFLUENCE ===")
serve_infl_data = scraper.scrape_serve_influence(url)
for record in serve_infl_data:
    print(record)
