In [None]:
import numpy as np
import os
import requests
from datetime import datetime, date, timedelta
import re
import pandas as pd

from unidecode import unidecode
import subprocess
import sys

# API Configuration
# SECURITY: an API key is committed in source below. It is kept only as a
# fallback so existing runs keep working; rotate it and supply the key through
# the API_TENNIS_KEY environment variable instead.
# setdefault() respects a key that is already present in the environment
# instead of overwriting it with the embedded one.
os.environ.setdefault("API_TENNIS_KEY", "adfc70491c47895e5fffdc6428bbf36a561989d4bffcfa9ecfba8d91e947b4fb")
API_KEY = os.getenv("API_TENNIS_KEY")
BASE = "https://api.api-tennis.com/tennis/"

def call(method: str, **params):
    """Send a GET request to the API endpoint and return its ``result`` payload.

    Args:
        method: Value sent as the ``method`` query parameter.
        **params: Extra query parameters merged into the request.

    Returns:
        The ``result`` entry of the decoded JSON response.

    Raises:
        requests.HTTPError: If the HTTP status indicates a failure.
        RuntimeError: If the response's ``error`` field is anything but "0";
            the whole decoded response is attached to the exception.
    """
    query = dict(method=method, APIkey=API_KEY)
    query.update(params)

    response = requests.get(BASE, params=query, timeout=30)
    response.raise_for_status()

    payload = response.json()
    error_flag = str(payload.get("error", "0"))
    if error_flag == "0":
        return payload["result"]
    raise RuntimeError(payload)

def fetch_day(day):
    """Fetch the fixtures of a single day through the ``get_fixtures`` method.

    Args:
        day: Either a ``'YYYY-MM-DD'`` string or an object exposing
            ``isoformat()``.

    Returns:
        Whatever ``call`` returns for a start/stop range covering only ``day``.
    """
    target = datetime.strptime(day, '%Y-%m-%d').date() if isinstance(day, str) else day
    iso_day = target.isoformat()
    return call("get_fixtures", date_start=iso_day, date_stop=iso_day)

def normalize_name_canonical(name: str) -> str:
    """Build a canonical ``lastname_initial`` key from a player name.

    The name is transliterated to ASCII, lower-cased, stripped of dots and has
    its whitespace collapsed. Names containing ``/`` only get the slash
    replaced by ``_``. Otherwise "Lastname F" and "First Lastname" both map to
    ``lastname_f``; a single-word name is returned as cleaned.
    """
    ascii_name = unidecode(str(name))
    lowered = ascii_name.lower().replace(".", "").strip()
    collapsed = re.sub(r'\s+', ' ', lowered)

    if '/' in collapsed:
        return collapsed.replace('/', '_')

    tokens = collapsed.split()
    if len(tokens) < 2:
        return collapsed

    # A one-character final token is treated as the first-name initial.
    if len(tokens[-1]) == 1:
        return f"{tokens[-2]}_{tokens[-1]}"
    return f"{tokens[-1]}_{tokens[0][0]}"

def determine_surface(tournament_name: str) -> str:
    """Guess the court surface from a tournament name.

    Known venue keywords are checked first (clay venues, then grass, then
    hard); after that the words "clay" / "grass" in the name decide. Anything
    else, including an empty or missing name, is reported as 'Hard'.
    """
    lowered = tournament_name.lower() if tournament_name else ""

    venues_by_surface = (
        ('Clay', ('roland garros', 'french open', 'monte carlo',
                  'rome', 'madrid', 'barcelona')),
        ('Grass', ('wimbledon', 'queens', 'halle')),
        ('Hard', ('australian open', 'us open', 'indian wells',
                  'miami', 'canada', 'cincinnati')),
    )
    for surface, venues in venues_by_surface:
        for venue in venues:
            if venue in lowered:
                return surface

    # No known venue: fall back to a surface word in the name itself.
    for word, surface in (('clay', 'Clay'), ('grass', 'Grass')):
        if word in lowered:
            return surface
    return 'Hard'

def normalize_name(name):
    """Return a ``lastname_initial`` matching key for a player name.

    Missing values give ``""``. Dots are dropped and the name is lower-cased.
    A name with fewer than two words is returned with spaces turned into
    underscores. "Lastname F" and "First Lastname" both become ``lastname_f``.
    """
    if pd.isna(name):
        return ""

    cleaned = str(name).replace('.', '').lower()
    tokens = cleaned.split()
    if len(tokens) < 2:
        return cleaned.replace(' ', '_')

    trailing = tokens[-1]
    if len(trailing) == 1:
        # "Lastname F": the trailing single letter is the first initial.
        surname, initial = tokens[-2], trailing
    else:
        # "First Lastname": take the initial from the leading token.
        surname, initial = trailing, tokens[0][:1]

    return f"{surname}_{initial}"

def normalize_jeff_name(name):
    """Return a ``lastname_initial`` key for a "First Last" style name.

    Missing values give ``""``. The name is lower-cased; with fewer than two
    words it is returned with spaces turned into underscores. Otherwise the
    last word is the surname and the first letter of the first word is the
    initial.
    """
    if pd.isna(name):
        return ""

    lowered = str(name).lower()
    tokens = lowered.split()
    if len(tokens) < 2:
        return lowered.replace(' ', '_')

    given, surname = tokens[0], tokens[-1]
    return f"{surname}_{given[:1]}"

def normalize_tournament_name(name):
    """Lower-case a tournament name and fold season-finale aliases.

    "masters cup", "atp finals" and "wta finals" are each rewritten to
    "masters"; surrounding whitespace is trimmed. Missing values give ``""``.
    """
    if pd.isna(name):
        return ""

    normalized = str(name).lower()
    # Applied in this order, one after another, on the running result.
    for alias in ('masters cup', 'atp finals', 'wta finals'):
        normalized = normalized.replace(alias, 'masters')
    return normalized.strip()

def determine_gender_from_tournament(tournament_name: str) -> str:
    """Return 'W' when the tournament name carries a women's-tour keyword, else 'M'.

    An empty or missing name yields 'M'.
    """
    lowered = tournament_name.lower() if tournament_name else ""

    womens_markers = ('wta', 'women', 'ladies', 'womens')
    if any(marker in lowered for marker in womens_markers):
        return 'W'

    # Men's keywords ('atp', 'men', 'mens') and the "unclear" default both
    # resolve to 'M', so every remaining name maps to 'M'.
    return 'M'

def load_excel_data(file_path):
    """Read an Excel file into a DataFrame, requiring a ``Date`` column.

    Returns an empty DataFrame (after printing a message) when the file has no
    ``Date`` column or when reading it raises.
    """
    empty = pd.DataFrame()
    try:
        frame = pd.read_excel(file_path)
        if 'Date' in frame.columns:
            print(f"Loaded {len(frame)} matches from {file_path}")
            return frame
        print(f"Warning: No Date column in {file_path}")
        return empty
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return empty

def get_tournament_tier_weight(tournament_name: str) -> float:
    """Map a tournament name to a tier weight by keyword.

    Tiers are tried in order and the first keyword hit wins:
    Grand Slams 1.0, Masters/WTA 1000 0.9, ATP/WTA 500 0.7,
    ITF/Challenger/Juniors 0.2. Anything else, and a missing name, is 0.5.
    """
    if pd.isna(tournament_name):
        return 0.5

    lowered = tournament_name.lower()

    tiers = (
        # Grand Slams
        (1.0, ('roland garros', 'wimbledon', 'australian open', 'us open')),
        # Masters/WTA 1000
        (0.9, ('indian wells', 'miami', 'monte carlo', 'madrid', 'rome',
               'canada', 'cincinnati', 'shanghai', 'paris masters')),
        # ATP 500/WTA 500
        (0.7, ('stuttgart', 'barcelona', 'hamburg', 'halle', 'queens',
               'washington', 'dubai', 'rotterdam')),
        # ITF/Challenger/Juniors
        (0.2, ('itf', 'challenger', 'juniors')),
    )
    for weight, keywords in tiers:
        if any(keyword in lowered for keyword in keywords):
            return weight

    # ATP 250/WTA 250 (everything else)
    return 0.5

def _coerce_match_datetime(match_date):
    """Convert a match date in any supported representation to a ``datetime``.

    Raises ``TypeError``/``ValueError``/``OverflowError`` for values that
    cannot be interpreted as a calendar date.
    """
    if isinstance(match_date, datetime):
        return match_date
    if isinstance(match_date, date):
        return datetime.combine(match_date, datetime.min.time())
    if isinstance(match_date, str):
        try:
            return datetime.strptime(match_date, '%Y%m%d')
        except ValueError:
            # Also accept the ISO layout used by ``reference_date``.
            return datetime.strptime(match_date, '%Y-%m-%d')
    if isinstance(match_date, bool):
        raise TypeError("bool is not a date")
    if isinstance(match_date, (int, np.integer)):
        # Numeric YYYYMMDD, e.g. a CSV date column parsed as integers.
        return datetime.strptime(str(int(match_date)), '%Y%m%d')
    if isinstance(match_date, (float, np.floating)):
        as_int = int(match_date)  # ValueError for NaN, OverflowError for inf
        if as_int != match_date:
            raise ValueError(f"non-integral date value: {match_date!r}")
        return datetime.strptime(str(as_int), '%Y%m%d')
    raise TypeError(f"unsupported date type: {type(match_date).__name__}")


def calculate_recency_weight(match_date, reference_date='2025-07-01'):
    """Calculate an exponential-decay weight from how old a match is.

    Args:
        match_date: Match date as a ``'YYYYMMDD'`` or ``'YYYY-MM-DD'`` string,
            an integer/float ``YYYYMMDD`` value, a ``datetime`` or a ``date``.
        reference_date: ``'YYYY-MM-DD'`` string the age is measured against.

    Returns:
        ``exp(-0.0005 * days_ago)`` (about 0.69 for a match 730 days old), or
        the neutral fallback ``0.5`` when either date cannot be interpreted.
    """
    try:
        match_dt = _coerce_match_datetime(match_date)
        ref_dt = datetime.strptime(reference_date, '%Y-%m-%d')
        days_ago = (ref_dt - match_dt).days
        weight = np.exp(-0.0005 * days_ago)
    except (TypeError, ValueError, OverflowError, AttributeError):
        # Unparseable or missing date: keep the original neutral fallback.
        return 0.5

    # A not-a-time input propagates as NaN; treat it as a missing date.
    return 0.5 if np.isnan(weight) else weight

def load_jeff_comprehensive_data():
    """Read every available charting CSV for men and women into DataFrames.

    Files are looked up under ``~/Desktop/data/Jeff 6.14.25/<men|women>``.
    Missing directories or files are skipped silently. When a file has a
    ``player`` column, a ``Player_canonical`` column is added from it.

    Returns:
        ``{'men': {...}, 'women': {...}}`` mapping dataset keys to DataFrames.
    """
    root = os.path.expanduser("~/Desktop/data/Jeff 6.14.25")

    # File mappings with actual names; ``{}`` receives the gender letter.
    templates = {
        'matches': 'charting-{}-matches.csv',
        'points_2020s': 'charting-{}-points-2020s.csv',
        'overview': 'charting-{}-stats-Overview.csv',
        'serve_basics': 'charting-{}-stats-ServeBasics.csv',
        'return_outcomes': 'charting-{}-stats-ReturnOutcomes.csv',
        'return_depth': 'charting-{}-stats-ReturnDepth.csv',
        'key_points_serve': 'charting-{}-stats-KeyPointsServe.csv',
        'key_points_return': 'charting-{}-stats-KeyPointsReturn.csv',
        'net_points': 'charting-{}-stats-NetPoints.csv',
        'rally': 'charting-{}-stats-Rally.csv',
        'serve_direction': 'charting-{}-stats-ServeDirection.csv',
        'serve_influence': 'charting-{}-stats-ServeInfluence.csv',
        'shot_direction': 'charting-{}-stats-ShotDirection.csv',
        'shot_dir_outcomes': 'charting-{}-stats-ShotDirOutcomes.csv',
        'shot_types': 'charting-{}-stats-ShotTypes.csv',
        'snv': 'charting-{}-stats-SnV.csv',
        'sv_break_split': 'charting-{}-stats-SvBreakSplit.csv',
        'sv_break_total': 'charting-{}-stats-SvBreakTotal.csv'
    }

    loaded = {'men': {}, 'women': {}}

    # Men first, then women; the directory name doubles as the result key.
    for gender, letter in (('men', 'm'), ('women', 'w')):
        gender_dir = os.path.join(root, gender)
        if not os.path.exists(gender_dir):
            continue

        for key, template in templates.items():
            filename = template.format(letter)
            csv_path = os.path.join(gender_dir, filename)
            if not os.path.exists(csv_path):
                continue

            frame = pd.read_csv(csv_path, low_memory=False)
            if 'player' in frame.columns:
                frame['Player_canonical'] = frame['player'].apply(normalize_jeff_name)
            loaded[gender][key] = frame
            print(f"Loaded {gender}/{filename}: {len(frame)} records")

    return loaded

def calculate_comprehensive_weighted_defaults(jeff_data):
    """Calculate weighted defaults from all Jeff datasets.

    For each of 'men' and 'women' found in ``jeff_data`` this builds one dict
    of default feature values:

    * Overview stats: a weighted median over the ``set == 'Total'`` rows,
      where each row's weight is tournament tier weight times recency weight.
    * Serve direction, return depth, key points and net points: plain medians
      of per-row ratios over the ``row == 'Total'`` rows.
    * Any key not produced above is filled from ``get_fallback_defaults``.

    A gender without a 'matches' dataset is skipped and keeps an empty dict.

    Args:
        jeff_data: Dict keyed by 'men'/'women', each mapping dataset names
            ('matches', 'overview', 'serve_direction', ...) to DataFrames.

    Returns:
        ``{'men': {...}, 'women': {...}}`` of default feature values.
    """
    print("Calculating comprehensive weighted defaults from Jeff's data...")

    defaults = {'men': {}, 'women': {}}

    for gender in ['men', 'women']:
        if gender not in jeff_data:
            continue

        print(f"\nProcessing {gender}'s comprehensive data...")

        # Load matches for tournament context and dates
        matches_df = jeff_data[gender].get('matches')
        if matches_df is None:
            print(f"No matches data for {gender}")
            continue

        gender_defaults = {}

        # Overview stats - basic serving/returning
        if 'overview' in jeff_data[gender]:
            overview_df = jeff_data[gender]['overview']
            match_totals = overview_df[overview_df['set'] == 'Total'].copy()
            # Attach each row's match date and tournament (left join on match_id).
            match_totals = match_totals.merge(
                matches_df[['match_id', 'Date', 'Tournament']],
                on='match_id',
                how='left'
            )

            # One weight per match_totals row: tier weight * recency weight.
            weights = []
            for _, row in match_totals.iterrows():
                tournament_weight = get_tournament_tier_weight(row.get('Tournament', ''))
                recency_weight = calculate_recency_weight(row.get('Date', '20200101'))
                weights.append(tournament_weight * recency_weight)

            # Weighted quantile: sort values, accumulate their weights, and
            # return the first value whose cumulative weight reaches
            # quantile * total weight. Returns 0 for empty input.
            def weighted_quantile(values, weights, quantile=0.5):
                if len(values) == 0:
                    return 0
                sorted_indices = np.argsort(values)
                sorted_weights = np.array(weights)[sorted_indices]
                cumsum_weights = np.cumsum(sorted_weights)
                total_weight = cumsum_weights[-1]
                target_weight = quantile * total_weight
                index = np.searchsorted(cumsum_weights, target_weight)
                if index >= len(values):
                    index = len(values) - 1
                # NOTE(review): `values` is a pandas Series here, so this is a
                # label lookup; it presumably relies on the merged frame having
                # a default 0..n-1 index - confirm.
                return values[sorted_indices[index]]

            serve_pts = match_totals['serve_pts'].fillna(0)
            aces = match_totals['aces'].fillna(0)
            dfs = match_totals['dfs'].fillna(0)
            first_in = match_totals['first_in'].fillna(0)
            first_won = match_totals['first_won'].fillna(0)
            second_won = match_totals['second_won'].fillna(0)
            bp_saved = match_totals['bp_saved'].fillna(0)
            return_pts_won = match_totals['return_pts_won'].fillna(0)
            winners = match_totals['winners'].fillna(0)
            winners_fh = match_totals['winners_fh'].fillna(0)
            winners_bh = match_totals['winners_bh'].fillna(0)
            unforced = match_totals['unforced'].fillna(0)
            unforced_fh = match_totals['unforced_fh'].fillna(0)
            unforced_bh = match_totals['unforced_bh'].fillna(0)

            # np.maximum(serve_pts, 1) guards the ratio against division by zero.
            gender_defaults.update({
                'serve_pts': weighted_quantile(serve_pts, weights),
                'aces': weighted_quantile(aces, weights),
                'double_faults': weighted_quantile(dfs, weights),
                'first_serve_pct': weighted_quantile(first_in / np.maximum(serve_pts, 1), weights),
                'first_serve_won': weighted_quantile(first_won, weights),
                'second_serve_won': weighted_quantile(second_won, weights),
                'break_points_saved': weighted_quantile(bp_saved, weights),
                'return_pts_won': weighted_quantile(return_pts_won, weights),
                'winners_total': weighted_quantile(winners, weights),
                'winners_fh': weighted_quantile(winners_fh, weights),
                'winners_bh': weighted_quantile(winners_bh, weights),
                'unforced_errors': weighted_quantile(unforced, weights),
                'unforced_fh': weighted_quantile(unforced_fh, weights),
                'unforced_bh': weighted_quantile(unforced_bh, weights)
            })

            # Calculate weighted composite indices from available data
            # Missing values are filled with 80 / 28 / 28 here (not 0 as above).
            serve_pts_vals = match_totals['serve_pts'].fillna(80)
            winners_vals = match_totals['winners'].fillna(28)
            unforced_vals = match_totals['unforced'].fillna(28)

            # Same formula shape as calculate_playing_style_indices, with the
            # constants 0.02, 0.25, 0.1 and 0.05 standing in for the
            # serve_volley_frequency, shot_down_line_pct, return_error_net_pct
            # and return_error_wide_pct values from get_fallback_defaults.
            aggression_values = (winners_vals / np.maximum(serve_pts_vals, 1) * 2 + 0.02 * 10 + 0.25 * 2) / 3
            consistency_values = 1 - (unforced_vals / np.maximum(serve_pts_vals, 1) + 0.1 + 0.05) / 3

            # Calculate pressure performance from key points data
            # Stays an empty list unless both key-points datasets have Total rows.
            pressure_values = []
            if ('key_points_serve' in jeff_data[gender] and 'key_points_return' in jeff_data[gender]):
                key_serve_df = jeff_data[gender]['key_points_serve']
                key_return_df = jeff_data[gender]['key_points_return']

                key_serve_totals = key_serve_df[key_serve_df['row'] == 'Total'].copy()
                key_return_totals = key_return_df[key_return_df['row'] == 'Total'].copy()

                if len(key_serve_totals) > 0 and len(key_return_totals) > 0:
                    serve_won_pct = key_serve_totals['won'].fillna(0) / np.maximum(key_serve_totals['serve_pts'].fillna(1), 1)
                    return_won_pct = key_return_totals['won'].fillna(0) / np.maximum(key_return_totals['return_pts'].fillna(1), 1)
                    # NOTE(review): the two Series come from different frames and
                    # are added by index label - verify the rows actually
                    # correspond to the same player/match.
                    pressure_values = (serve_won_pct + return_won_pct) / 2

            # Calculate net game strength
            # Stays an empty list unless the net-points dataset has Total rows.
            net_strength_values = []
            if 'net_points' in jeff_data[gender]:
                net_df = jeff_data[gender]['net_points']
                net_totals = net_df[net_df['row'] == 'Total'].copy()
                if len(net_totals) > 0:
                    net_strength_values = net_totals['won'].fillna(0) / np.maximum(net_totals['total'].fillna(1), 1)

            # All four indices are clamped to [0, 1]. `.median()` is only
            # evaluated when the value is non-empty, i.e. when it was replaced
            # by a Series above; otherwise the literal 0.55 / 0.6 is used.
            gender_defaults.update({
                'aggression_index': max(0, min(1, weighted_quantile(aggression_values, weights))),
                'consistency_index': max(0, min(1, weighted_quantile(consistency_values, weights))),
                'pressure_performance': max(0, min(1, pressure_values.median())) if len(pressure_values) > 0 else 0.55,
                'net_game_strength': max(0, min(1, net_strength_values.median())) if len(net_strength_values) > 0 else 0.6
            })

        # Serve direction defaults
        if 'serve_direction' in jeff_data[gender]:
            serve_dir_df = jeff_data[gender]['serve_direction']
            serve_totals = serve_dir_df[serve_dir_df['row'] == 'Total'].copy()

            if len(serve_totals) > 0:
                # Deuce-court and ad-court counts are summed per direction.
                total_wide = serve_totals['deuce_wide'].fillna(0) + serve_totals['ad_wide'].fillna(0)
                total_t = serve_totals['deuce_t'].fillna(0) + serve_totals['ad_t'].fillna(0)
                total_body = serve_totals['deuce_middle'].fillna(0) + serve_totals['ad_middle'].fillna(0)
                total_serves = total_wide + total_t + total_body

                gender_defaults.update({
                    'serve_wide_pct': (total_wide / np.maximum(total_serves, 1)).median(),
                    'serve_t_pct': (total_t / np.maximum(total_serves, 1)).median(),
                    'serve_body_pct': (total_body / np.maximum(total_serves, 1)).median()
                })

        # Return depth defaults
        if 'return_depth' in jeff_data[gender]:
            return_depth_df = jeff_data[gender]['return_depth']
            return_totals = return_depth_df[return_depth_df['row'] == 'Total'].copy()

            if len(return_totals) > 0:
                returnable = return_totals['returnable'].fillna(1)
                deep = return_totals['deep'].fillna(0)
                shallow = return_totals['shallow'].fillna(0)
                very_deep = return_totals['very_deep'].fillna(0)

                gender_defaults.update({
                    'return_deep_pct': (deep / np.maximum(returnable, 1)).median(),
                    'return_shallow_pct': (shallow / np.maximum(returnable, 1)).median(),
                    'return_very_deep_pct': (very_deep / np.maximum(returnable, 1)).median()
                })

        # Key points defaults
        if 'key_points_serve' in jeff_data[gender]:
            key_serve_df = jeff_data[gender]['key_points_serve']
            key_serve_totals = key_serve_df[key_serve_df['row'] == 'Total'].copy()

            if len(key_serve_totals) > 0:
                serve_pts = key_serve_totals['serve_pts'].fillna(1)
                won = key_serve_totals['won'].fillna(0)
                aces = key_serve_totals['aces'].fillna(0)
                first_in = key_serve_totals['first_in'].fillna(0)

                gender_defaults.update({
                    'key_points_serve_won_pct': (won / np.maximum(serve_pts, 1)).median(),
                    'key_points_aces_pct': (aces / np.maximum(serve_pts, 1)).median(),
                    'key_points_first_in_pct': (first_in / np.maximum(serve_pts, 1)).median()
                })

        if 'key_points_return' in jeff_data[gender]:
            key_return_df = jeff_data[gender]['key_points_return']
            key_return_totals = key_return_df[key_return_df['row'] == 'Total'].copy()

            if len(key_return_totals) > 0:
                return_pts = key_return_totals['return_pts'].fillna(1)
                won = key_return_totals['won'].fillna(0)
                winners = key_return_totals['winners'].fillna(0)

                gender_defaults.update({
                    'key_points_return_won_pct': (won / np.maximum(return_pts, 1)).median(),
                    'key_points_return_winners': (winners / np.maximum(return_pts, 1)).median()
                })

        # Net points defaults
        if 'net_points' in jeff_data[gender]:
            net_df = jeff_data[gender]['net_points']
            net_totals = net_df[net_df['row'] == 'Total'].copy()

            if len(net_totals) > 0:
                total_net = net_totals['total'].fillna(1)
                won = net_totals['won'].fillna(0)
                winners = net_totals['winners'].fillna(0)
                passed = net_totals['passed'].fillna(0)

                gender_defaults.update({
                    'net_points_won_pct': (won / np.maximum(total_net, 1)).median(),
                    'net_winners_pct': (winners / np.maximum(total_net, 1)).median(),
                    'passed_at_net_pct': (passed / np.maximum(total_net, 1)).median()
                })

        # Add remaining defaults from fallback
        # Only keys that were not computed above are filled in.
        fallback = get_fallback_defaults('men' if gender == 'men' else 'women')
        for key, value in fallback.items():
            if key not in gender_defaults:
                gender_defaults[key] = value

        defaults[gender] = gender_defaults
        print(f"Calculated comprehensive weighted defaults for {gender}: {len(gender_defaults)} features")

    return defaults

def extract_comprehensive_jeff_features(player_canonical, gender, jeff_data, weighted_defaults=None):
    """Build a player's feature dict from defaults plus their overview stats.

    NOTE: a function with this same name is defined again further down this
    file; at import time the later definition replaces this one.

    Args:
        player_canonical: Key matched against the ``Player_canonical`` column.
        gender: 'M' selects the 'men' datasets; anything else selects 'women'.
        jeff_data: Dict keyed by 'men'/'women' mapping dataset names to DataFrames.
        weighted_defaults: Optional per-gender default dicts; when the gender
            is present its dict is copied as the starting point, otherwise
            ``get_fallback_defaults`` is used.

    Returns:
        The defaults, overridden by the last ``set == 'Total'`` overview row of
        the player when such a row exists and its ``serve_pts`` is positive.
    """
    gender_key = 'men' if gender == 'M' else 'women'
    if gender_key not in jeff_data:
        return get_fallback_defaults(gender_key)

    if weighted_defaults and gender_key in weighted_defaults:
        features = weighted_defaults[gender_key].copy()
    else:
        features = get_fallback_defaults(gender_key)

    # Overview stats
    datasets = jeff_data[gender_key]
    if 'overview' not in datasets:
        return features

    overview_df = datasets['overview']
    if 'Player_canonical' not in overview_df.columns:
        return features

    is_player = overview_df['Player_canonical'] == player_canonical
    is_total = overview_df['set'] == 'Total'
    player_rows = overview_df[is_player & is_total]
    if len(player_rows) == 0:
        return features

    # The last matching row in file order is used.
    latest = player_rows.iloc[-1]
    serve_pts = latest.get('serve_pts', 80)

    def stat(column):
        # Value of ``column`` in the chosen row, 0 when the column is absent.
        return latest.get(column, 0)

    if serve_pts > 0:
        features.update({
            'serve_pts': serve_pts,
            'aces': stat('aces'),
            'double_faults': stat('dfs'),
            'first_serve_pct': stat('first_in') / serve_pts,
            'first_serve_won': stat('first_won'),
            'second_serve_won': stat('second_won'),
            'break_points_saved': stat('bp_saved'),
            'return_pts_won': stat('return_pts_won'),
            'winners_total': stat('winners'),
            'winners_fh': stat('winners_fh'),
            'winners_bh': stat('winners_bh'),
            'unforced_errors': stat('unforced'),
            'unforced_fh': stat('unforced_fh'),
            'unforced_bh': stat('unforced_bh')
        })

    return features

def calculate_playing_style_indices(features):
    """Derive four composite style indices, each clamped to [0, 1].

    Args:
        features: Mapping of feature names to numbers; absent keys count as 0
            (``serve_pts`` as 60, and never less than 1 as a divisor).

    Returns:
        Dict with ``aggression_index``, ``consistency_index``,
        ``pressure_performance`` and ``net_game_strength``.
    """
    def clamp(value):
        # Restrict a value to the closed interval [0, 1].
        return max(0, min(1, value))

    def feature(name):
        return features.get(name, 0)

    serve_pts = max(1, features.get('serve_pts', 60))

    # Aggression Index
    winner_rate = feature('winners_total') / serve_pts
    aggression = (
        winner_rate * 2 +
        feature('serve_volley_frequency') * 10 +
        feature('shot_down_line_pct') * 2
    ) / 3

    # Consistency Index
    error_rate = feature('unforced_errors') / serve_pts
    consistency = 1 - (
        error_rate +
        feature('return_error_net_pct') +
        feature('return_error_wide_pct')
    ) / 3

    # Pressure Performance
    pressure_perf = (
        feature('key_points_serve_won_pct') +
        feature('key_points_return_won_pct')
    ) / 2

    # Net Game Strength
    net_game_strength = feature('net_points_won_pct')

    indices = {}
    indices['aggression_index'] = clamp(aggression)
    indices['consistency_index'] = clamp(consistency)
    indices['pressure_performance'] = clamp(pressure_perf)
    indices['net_game_strength'] = clamp(net_game_strength)
    return indices


def get_fallback_defaults(gender_key):
    """Return hard-coded default feature values for one gender.

    Args:
        gender_key: 'women' applies the women's overrides; any other value
            returns the base table unchanged.

    Returns:
        A fresh dict of 43 feature defaults.
    """
    defaults = dict(
        # Overview counts
        serve_pts=80,
        aces=6,
        double_faults=3,
        first_serve_pct=0.62,
        first_serve_won=35,
        second_serve_won=16,
        break_points_saved=4,
        return_pts_won=30,
        winners_total=28,
        winners_fh=16,
        winners_bh=12,
        unforced_errors=28,
        unforced_fh=16,
        unforced_bh=12,
        # Serve direction
        serve_wide_pct=0.3,
        serve_t_pct=0.4,
        serve_body_pct=0.3,
        # Return depth
        return_deep_pct=0.4,
        return_shallow_pct=0.3,
        return_very_deep_pct=0.2,
        # Key points
        key_points_serve_won_pct=0.6,
        key_points_aces_pct=0.05,
        key_points_first_in_pct=0.55,
        key_points_return_won_pct=0.35,
        key_points_return_winners=0.02,
        # Net play
        net_points_won_pct=0.65,
        net_winners_pct=0.3,
        passed_at_net_pct=0.3,
        # Rally
        rally_server_winners_pct=0.15,
        rally_server_unforced_pct=0.2,
        rally_returner_winners_pct=0.1,
        rally_returner_unforced_pct=0.25,
        # Shot direction
        shot_crosscourt_pct=0.5,
        shot_down_line_pct=0.25,
        shot_inside_out_pct=0.15,
        # Serve and volley
        serve_volley_frequency=0.02,
        serve_volley_success_pct=0.6,
        # Return errors
        return_error_net_pct=0.1,
        return_error_wide_pct=0.05,
        # Composite indices
        aggression_index=0.5,
        consistency_index=0.5,
        pressure_performance=0.5,
        net_game_strength=0.5,
    )

    if gender_key != 'women':
        return defaults

    # Adjust for gender differences
    womens_overrides = dict(
        serve_pts=75,
        aces=4,
        first_serve_pct=0.60,
        first_serve_won=32,
        second_serve_won=15,
        serve_volley_frequency=0.01,
        net_points_won_pct=0.60,
    )
    defaults.update(womens_overrides)
    return defaults

def load_all_tennis_data():
    """Load the yearly men's and women's Excel files into one DataFrame.

    Looks under ``~/Desktop/data/tennisdata_men`` and ``tennisdata_women`` for
    ``<year>_m.xlsx`` / ``<year>_w.xlsx`` with years 2020 through 2025. Each
    loaded frame gets a parsed ``Date`` column (unparseable values become NaT)
    plus ``gender`` and ``year`` columns. Missing folders and files are skipped.

    Returns:
        The concatenation of all loaded frames, or an empty DataFrame when
        nothing was loaded.
    """
    root = os.path.expanduser("~/Desktop/data")
    frames = []

    # (folder name, file suffix, gender code): men's data first, then women's.
    sources = (
        ("tennisdata_men", "m", 'M'),
        ("tennisdata_women", "w", 'W'),
    )
    for folder, suffix, gender_code in sources:
        folder_path = os.path.join(root, folder)
        if not os.path.exists(folder_path):
            continue

        for year in range(2020, 2026):
            workbook = os.path.join(folder_path, f"{year}_{suffix}.xlsx")
            if not os.path.exists(workbook):
                continue

            frame = load_excel_data(workbook)
            if frame.empty or 'Date' not in frame.columns:
                continue

            frame['Date'] = pd.to_datetime(frame['Date'], errors='coerce')
            frame['gender'] = gender_code
            frame['year'] = frame['Date'].dt.year
            frames.append(frame)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

def calculate_data_quality_v2(row):
    """Score a match row as earned points divided by attainable points.

    ``row`` only needs a ``.get(key, default)`` method. Components:

    * Winner and Loser present: 0.3, always attainable.
    * WRank and LRank present: 0.3; if absent it still counts as attainable
      when the row's ``date`` is on or after 2025-06-11.
    * ``tennis_data_odds1`` present: 0.2; never counted when absent.
    * ``winner_aces`` or ``winner_serve_pts`` present: 0.2; if absent it still
      counts as attainable when ``date`` is on or before 2025-06-10.

    A row without ``date`` is treated as dated 2000-01-01.
    """
    ranks_expected_from = date(2025, 6, 11)
    jeff_expected_until = date(2025, 6, 10)

    def match_day():
        # Looked up lazily so rows that never need it are not compared.
        return row.get('date', date(2000,1,1))

    earned = 0
    attainable = 0

    # Core match info (always counts) - 30%
    has_players = pd.notna(row.get('Winner')) and pd.notna(row.get('Loser'))
    if has_players:
        earned += 0.3
    attainable += 0.3

    # Ranking data (when available) - 30%
    has_ranks = pd.notna(row.get('WRank')) and pd.notna(row.get('LRank'))
    if has_ranks:
        earned += 0.3
        attainable += 0.3
    elif match_day() >= ranks_expected_from:
        attainable += 0.3

    # Odds data (when available) - 20%
    if pd.notna(row.get('tennis_data_odds1')):
        earned += 0.2
        attainable += 0.2

    # Jeff features (when available) - 20%
    has_jeff = pd.notna(row.get('winner_aces')) or pd.notna(row.get('winner_serve_pts'))
    if has_jeff:
        earned += 0.2
        attainable += 0.2
    elif match_day() <= jeff_expected_until:
        attainable += 0.2

    if attainable > 0:
        return earned / attainable
    return 0

def _latest_total_row(datasets, dataset_name, player_canonical, label_col='row'):
    """Return the player's last 'Total' row from one Jeff dataset, or None.

    None is returned when the dataset is absent, has no ``Player_canonical``
    column, or holds no row for the player whose *label_col* equals 'Total'.
    "Last" means last in the frame's existing row order.
    """
    if dataset_name not in datasets:
        return None
    frame = datasets[dataset_name]
    if 'Player_canonical' not in frame.columns:
        return None
    matching = frame[
        (frame['Player_canonical'] == player_canonical) &
        (frame[label_col] == 'Total')
    ]
    if len(matching) == 0:
        return None
    return matching.iloc[-1]


def _update_share_features(features, datasets, player_canonical, dataset_name,
                           denominator_col, column_by_feature):
    """Add ``column / denominator`` features from one dataset's 'Total' row.

    Nothing is written when the row is unavailable or the denominator is not
    positive.  A missing denominator column counts as 1 and a missing
    numerator column as 0.
    """
    latest = _latest_total_row(datasets, dataset_name, player_canonical)
    if latest is None:
        return
    denominator = latest.get(denominator_col, 1)
    if denominator > 0:
        features.update({
            feature: latest.get(column, 0) / denominator
            for feature, column in column_by_feature.items()
        })


def extract_comprehensive_jeff_features(player_canonical, gender, jeff_data, weighted_defaults=None):
    """Extract features from all Jeff datasets with Player_canonical checks.

    Starts from the per-gender defaults (``weighted_defaults[gender_key]`` when
    supplied, otherwise ``get_fallback_defaults``) and overwrites them with
    values from the player's last 'Total' row in each available dataset.
    Composite indices from ``calculate_playing_style_indices`` are merged in
    last.

    Args:
        player_canonical: value matched against each dataset's
            ``Player_canonical`` column.
        gender: ``'M'`` selects ``jeff_data['men']``; anything else selects
            ``jeff_data['women']``.
        jeff_data: mapping of gender key to a mapping of dataset name to
            DataFrame.
        weighted_defaults: optional mapping of gender key to a default feature
            dict.

    Returns:
        A feature dict.  When ``jeff_data`` has no entry for the gender, the
        bare fallback defaults are returned without composite indices.
    """
    gender_key = 'men' if gender == 'M' else 'women'

    if gender_key not in jeff_data:
        return get_fallback_defaults(gender_key)

    if weighted_defaults and gender_key in weighted_defaults:
        features = weighted_defaults[gender_key].copy()
    else:
        features = get_fallback_defaults(gender_key)

    datasets = jeff_data[gender_key]

    # Overview stats (this dataset labels its total row in the 'set' column)
    latest = _latest_total_row(datasets, 'overview', player_canonical, label_col='set')
    if latest is not None:
        serve_pts = latest.get('serve_pts', 80)
        if serve_pts > 0:
            features.update({
                'serve_pts': serve_pts,
                'aces': latest.get('aces', 0),
                'double_faults': latest.get('dfs', 0),
                'first_serve_pct': latest.get('first_in', 0) / serve_pts,
                'first_serve_won': latest.get('first_won', 0),
                'second_serve_won': latest.get('second_won', 0),
                'break_points_saved': latest.get('bp_saved', 0),
                'return_pts_won': latest.get('return_pts_won', 0),
                'winners_total': latest.get('winners', 0),
                'winners_fh': latest.get('winners_fh', 0),
                'winners_bh': latest.get('winners_bh', 0),
                'unforced_errors': latest.get('unforced', 0),
                'unforced_fh': latest.get('unforced_fh', 0),
                'unforced_bh': latest.get('unforced_bh', 0)
            })

    # Serve direction patterns
    latest = _latest_total_row(datasets, 'serve_direction', player_canonical)
    if latest is not None:
        wide = latest.get('deuce_wide', 0) + latest.get('ad_wide', 0)
        down_t = latest.get('deuce_t', 0) + latest.get('ad_t', 0)
        body = latest.get('deuce_middle', 0) + latest.get('ad_middle', 0)
        # Body serves belong in the denominator: without them serve_body_pct
        # is divided by a total that excludes it and the shares do not sum to 1.
        total_serves = wide + down_t + body

        if total_serves > 0:
            features.update({
                'serve_wide_pct': wide / total_serves,
                'serve_t_pct': down_t / total_serves,
                'serve_body_pct': body / total_serves
            })

    # Return depth
    _update_share_features(features, datasets, player_canonical, 'return_depth', 'returnable', {
        'return_deep_pct': 'deep',
        'return_shallow_pct': 'shallow',
        'return_very_deep_pct': 'very_deep',
    })

    # Key points serve
    _update_share_features(features, datasets, player_canonical, 'key_points_serve', 'serve_pts', {
        'key_points_serve_won_pct': 'won',
        'key_points_aces_pct': 'aces',
        'key_points_first_in_pct': 'first_in',
    })

    # Key points return
    _update_share_features(features, datasets, player_canonical, 'key_points_return', 'return_pts', {
        'key_points_return_won_pct': 'won',
        'key_points_return_winners': 'winners',
    })

    # Net points
    _update_share_features(features, datasets, player_canonical, 'net_points', 'total', {
        'net_points_won_pct': 'won',
        'net_winners_pct': 'winners',
        'passed_at_net_pct': 'passed',
    })

    # Rally patterns
    _update_share_features(features, datasets, player_canonical, 'rally', 'total', {
        'rally_server_winners_pct': 'server_winners',
        'rally_server_unforced_pct': 'server_unforced',
        'rally_returner_winners_pct': 'returner_winners',
        'rally_returner_unforced_pct': 'returner_unforced',
    })

    # Shot direction
    _update_share_features(features, datasets, player_canonical, 'shot_direction', 'total', {
        'shot_crosscourt_pct': 'crosscourt',
        'shot_down_line_pct': 'down_line',
        'shot_inside_out_pct': 'inside_out',
    })

    # Serve and volley (two different denominators, so handled explicitly)
    latest = _latest_total_row(datasets, 'snv', player_canonical)
    if latest is not None:
        serve_pts = latest.get('serve_pts', 1)
        snv_pts = latest.get('snv', 0)

        if serve_pts > 0:
            features.update({
                'serve_volley_frequency': snv_pts / serve_pts,
                # max(1, ...) avoids dividing by zero when no serve-and-volley points exist
                'serve_volley_success_pct': latest.get('snv_won', 0) / max(1, snv_pts)
            })

    # Return outcomes
    _update_share_features(features, datasets, player_canonical, 'return_outcomes', 'return_attempts', {
        'return_error_net_pct': 'net',
        'return_error_wide_pct': 'wide',
    })

    # Calculate composite indices
    features.update(calculate_playing_style_indices(features))

    return features

# Clear corrupted cache and regenerate data
import os, pickle, pandas as pd
from datetime import date
import shutil

# Cache locations: one directory holding the three generated artefacts.
CACHE_DIR = os.path.expanduser("~/Desktop/data/cache")
HD_PATH, JEFF_PATH, DEF_PATH = (
    os.path.join(CACHE_DIR, artefact)
    for artefact in ("historical_data.parquet", "jeff_data.pkl", "weighted_defaults.pkl")
)

print("=== CLEARING CORRUPTED CACHE ===")
# Delete each known cache file first so every removal is reported individually.
for file_path in (HD_PATH, JEFF_PATH, DEF_PATH):
    if not os.path.exists(file_path):
        continue
    os.remove(file_path)
    print(f"Removed: {file_path}")

# Then drop whatever else is left in the directory and start from an empty one.
if os.path.exists(CACHE_DIR):
    shutil.rmtree(CACHE_DIR)
    print(f"Removed cache directory: {CACHE_DIR}")

os.makedirs(CACHE_DIR, exist_ok=True)
print("Created fresh cache directory")

def generate_comprehensive_historical_all_years_fixed(*, fast: bool = False, n_sample: int = 500):
    """Build the historical match table with Jeff features and API-Tennis rows.

    Steps (each prints progress; a failing step aborts the run):
      1. load Jeff's datasets via ``load_jeff_comprehensive_data``;
      2. compute defaults via ``calculate_comprehensive_weighted_defaults``;
      3. load match data via ``load_all_tennis_data`` (optionally sampled);
      4. add canonical names, a ``date`` column and cleaned odds columns;
      5. add empty ``winner_*`` / ``loser_*`` Jeff feature columns;
      6. fill those columns for matches dated on or before 2025-06-10;
      7. append finished API-Tennis fixtures from 2025-06-10 through today
         that are not already present (same winner, loser and date).

    Args:
        fast: when true, work on a random sample of the match data.
        n_sample: maximum number of rows in that sample.

    Returns:
        ``(tennis_data, jeff_data, weighted_defaults)``.  When a step fails the
        first element is an empty DataFrame and the other two hold whatever
        was loaded before the failure (``{}`` if nothing).
    """
    print("=== STARTING DATA GENERATION ===")

    # Step 1: Load Jeff's data
    print("Step 1: Loading Jeff's comprehensive data...")
    try:
        jeff_data = load_jeff_comprehensive_data()
        if not jeff_data or ('men' not in jeff_data and 'women' not in jeff_data):
            print("ERROR: Jeff data loading failed")
            return pd.DataFrame(), {}, {}

        print(f"✓ Jeff data loaded successfully")
        print(f"  - Men's datasets: {len(jeff_data.get('men', {}))}")
        print(f"  - Women's datasets: {len(jeff_data.get('women', {}))}")

    except Exception as e:
        print(f"ERROR loading Jeff data: {e}")
        return pd.DataFrame(), {}, {}

    # Step 2: Calculate weighted defaults
    print("Step 2: Calculating weighted defaults...")
    try:
        weighted_defaults = calculate_comprehensive_weighted_defaults(jeff_data)
        if not weighted_defaults:
            print("ERROR: Weighted defaults calculation failed")
            return pd.DataFrame(), jeff_data, {}

        print(f"✓ Weighted defaults calculated")
        print(f"  - Men's features: {len(weighted_defaults.get('men', {}))}")
        print(f"  - Women's features: {len(weighted_defaults.get('women', {}))}")

    except Exception as e:
        print(f"ERROR calculating weighted defaults: {e}")
        return pd.DataFrame(), jeff_data, {}

    # Step 3: Load tennis match data
    print("Step 3: Loading tennis match data...")
    try:
        tennis_data = load_all_tennis_data()
        if tennis_data.empty:
            print("ERROR: No tennis data loaded")
            return pd.DataFrame(), jeff_data, weighted_defaults

        print(f"✓ Tennis data loaded: {len(tennis_data)} matches")
        # --------------------------------------------------------------
        # Optional fast‑mode: work on a random subset for quick testing
        if fast:
            total_rows = len(tennis_data)
            take = min(n_sample, total_rows)
            # fixed random_state keeps the sample reproducible between runs
            tennis_data = tennis_data.sample(take, random_state=1).reset_index(drop=True)
            print(f"[fast‑mode] using sample of {take}/{total_rows} rows")
        # --------------------------------------------------------------

    except Exception as e:
        print(f"ERROR loading tennis data: {e}")
        return pd.DataFrame(), jeff_data, weighted_defaults

    # Step 4: Process tennis data
    print("Step 4: Processing tennis data...")
    try:
        # Normalize player names
        tennis_data['winner_canonical'] = tennis_data['Winner'].apply(normalize_name)
        tennis_data['loser_canonical'] = tennis_data['Loser'].apply(normalize_name)
        tennis_data['tournament_canonical'] = tennis_data['Tournament'].apply(normalize_tournament_name)

        # Fix dates
        tennis_data['Date'] = pd.to_datetime(tennis_data['Date'], errors='coerce')
        tennis_data['date'] = tennis_data['Date'].dt.date

        # Add odds data (PSW/PSL preferred, AvgW/AvgL as fallback)
        tennis_data['tennis_data_odds1'] = pd.to_numeric(tennis_data.get('PSW', tennis_data.get('AvgW', 0)), errors='coerce')
        tennis_data['tennis_data_odds2'] = pd.to_numeric(tennis_data.get('PSL', tennis_data.get('AvgL', 0)), errors='coerce')

        # --- Odds cleaning and derived columns ---
        # Average odds (AvgW/AvgL preferred, PSW/PSL as fallback)
        tennis_data['odds_win_avg'] = pd.to_numeric(tennis_data.get('AvgW', tennis_data.get('PSW', 0)), errors='coerce')
        tennis_data['odds_lose_avg'] = pd.to_numeric(tennis_data.get('AvgL', tennis_data.get('PSL', 0)), errors='coerce')
        # Preserve the PSW/PSL prices as numeric columns.  A missing column
        # becomes NaN rather than None, so the arithmetic below yields NaN
        # instead of raising and aborting the whole step.
        missing_odds = float('nan')
        tennis_data['odds_win_open']  = pd.to_numeric(tennis_data.get('PSW', missing_odds), errors='coerce')
        tennis_data['odds_lose_open'] = pd.to_numeric(tennis_data.get('PSL', missing_odds), errors='coerce')
        # Relative move of the average price against odds_win_open.  Despite
        # the column name this is a plain fraction, not basis points.
        tennis_data['line_move_bps'] = (tennis_data['odds_win_avg'] - tennis_data['odds_win_open']) / tennis_data['odds_win_open']

        # Add ranking difference
        if 'WRank' in tennis_data.columns and 'LRank' in tennis_data.columns:
            tennis_data['rank_difference'] = abs(tennis_data['WRank'] - tennis_data['LRank'])

        # Drop raw odds columns (keep cleaned/derived)
        tennis_data.drop(
            columns=[c for c in ['PSW','PSL','AvgW','AvgL','MaxW','MaxL','MinW','MinL'] if c in tennis_data.columns],
            inplace=True,
            errors='ignore'
        )

        print(f"✓ Tennis data processed")

    except Exception as e:
        print(f"ERROR processing tennis data: {e}")
        return pd.DataFrame(), jeff_data, weighted_defaults

    # Step 5: Add Jeff feature columns
    print("Step 5: Adding Jeff feature columns...")
    try:
        all_jeff_features = [
            'serve_pts', 'aces', 'double_faults', 'first_serve_pct', 'first_serve_won',
            'second_serve_won', 'break_points_saved', 'return_pts_won',
            'winners_total', 'winners_fh', 'winners_bh', 'unforced_errors', 'unforced_fh', 'unforced_bh',
            'serve_wide_pct', 'serve_t_pct', 'serve_body_pct',
            'return_deep_pct', 'return_shallow_pct', 'return_very_deep_pct',
            'key_points_serve_won_pct', 'key_points_aces_pct', 'key_points_first_in_pct',
            'key_points_return_won_pct', 'key_points_return_winners',
            'net_points_won_pct', 'net_winners_pct', 'passed_at_net_pct',
            'rally_server_winners_pct', 'rally_server_unforced_pct',
            'rally_returner_winners_pct', 'rally_returner_unforced_pct',
            'shot_crosscourt_pct', 'shot_down_line_pct', 'shot_inside_out_pct',
            'serve_volley_frequency', 'serve_volley_success_pct',
            'return_error_net_pct', 'return_error_wide_pct',
            'aggression_index', 'consistency_index', 'pressure_performance', 'net_game_strength'
        ]

        # ---- fast column initialisation (no fragmentation) -----------------
        # All columns are built in one frame and attached with a single concat.
        winner_cols = {f"winner_{f}": pd.NA for f in all_jeff_features}
        loser_cols  = {f"loser_{f}":  pd.NA for f in all_jeff_features}

        feature_df  = pd.DataFrame(winner_cols | loser_cols, index=tennis_data.index)
        tennis_data = pd.concat([tennis_data, feature_df], axis=1, copy=False)
        # --------------------------------------------------------------------

        print(f"✓ Added {len(all_jeff_features) * 2} feature columns")

    except Exception as e:
        print(f"ERROR adding feature columns: {e}")
        return pd.DataFrame(), jeff_data, weighted_defaults

    # Step 6: Extract Jeff features
    print("Step 6: Extracting Jeff features...")
    try:
        total_matches = len(tennis_data)
        matches_with_jeff_features = 0
        jeff_cutoff = date(2025, 6, 10)

        # Test feature extraction first
        if 'men' in jeff_data and 'overview' in jeff_data['men']:
            test_player = jeff_data['men']['overview']['Player_canonical'].iloc[0]
            test_features = extract_comprehensive_jeff_features(test_player, 'M', jeff_data, weighted_defaults)
            print(f"✓ Feature extraction test passed for {test_player}")
            print(f"  Sample features: serve_pts={test_features.get('serve_pts', 'N/A')}")

        # Extraction takes only (player, gender) plus the two fixed lookups,
        # so each player's features are computed once and reused for every
        # match they appear in.  The cached dicts are only read below.
        feature_cache = {}

        def _cached_features(player_canonical, player_gender):
            cache_key = (player_canonical, player_gender)
            if cache_key not in feature_cache:
                feature_cache[cache_key] = extract_comprehensive_jeff_features(
                    player_canonical, player_gender, jeff_data, weighted_defaults
                )
            return feature_cache[cache_key]

        for idx, row in tennis_data.iterrows():
            if idx % 1000 == 0:
                print(f"  Processing match {idx}/{total_matches}")

            try:
                gender = row['gender']

                # Only extract Jeff features for matches before cutoff
                if row['date'] <= jeff_cutoff:
                    winner_features = _cached_features(row['winner_canonical'], gender)
                    loser_features = _cached_features(row['loser_canonical'], gender)

                    # Assign features (only those with a pre-created column)
                    for feature_name, feature_value in winner_features.items():
                        col_name = f'winner_{feature_name}'
                        if col_name in tennis_data.columns:
                            tennis_data.at[idx, col_name] = feature_value

                    for feature_name, feature_value in loser_features.items():
                        col_name = f'loser_{feature_name}'
                        if col_name in tennis_data.columns:
                            tennis_data.at[idx, col_name] = feature_value

                    if winner_features and loser_features:
                        matches_with_jeff_features += 1

            except Exception as e:
                # Best effort per row: a bad row (e.g. missing date) is skipped.
                if idx < 10:  # Only print first few errors
                    print(f"  Warning: Error processing match {idx}: {e}")
                continue

        print(f"✓ Jeff features extracted for {matches_with_jeff_features}/{total_matches} matches")

    except Exception as e:
        print(f"ERROR extracting Jeff features: {e}")
        return pd.DataFrame(), jeff_data, weighted_defaults

    # ------------------------------------------------------------------
    # Step 7: Append API‑Tennis fixtures/completed matches since Jeff cutoff
    # ------------------------------------------------------------------
    print("Step 7: Appending API‑Tennis data …")
    try:
        from datetime import timedelta

        def _api_rows_for_day(day):
            """Return one row dict per finished, resolvable fixture on *day*."""
            rows = []
            try:
                fixtures = fetch_day(day)
            except Exception as err:
                print(f"  ⚠️  API error for {day}: {err}")
                return rows

            for fx in fixtures:
                if not isinstance(fx, dict):
                    continue  # unexpected payload entry
                # ---- finished‑match filter --------------------------------
                status_raw = fx.get("status") or fx.get("event_status") or ""
                status     = str(status_raw).lower()
                if not any(s in status for s in ("completed", "finished", "ended", "ft", "fulltime")):
                    continue  # skip in‑progress / scheduled fixtures
                # ---- winner / loser names ---------------------------------
                winner = fx.get("winner_name") or fx.get("event_winner_name")
                loser  = fx.get("opponent_name")
                if not winner:
                    # Derive from the event_winner flag.  Both the numeric
                    # form ("1"/"2") and the textual form ("First Player" /
                    # "Second Player", as handled by fixtures_finished) are
                    # accepted.
                    who = str(fx.get("event_winner", "")).strip().lower()
                    p1  = fx.get("event_first_player")
                    p2  = fx.get("event_second_player")
                    if who == "1" or who.startswith("first"):
                        winner, loser = p1, p2
                    elif who == "2" or who.startswith("second"):
                        winner, loser = p2, p1
                if not winner or not loser:
                    continue  # cannot resolve players

                # ---- other fields -----------------------------------------
                try:
                    match_date = datetime.strptime(fx.get("start_date") or fx.get("event_date"), "%Y-%m-%d").date()
                except (TypeError, ValueError):
                    continue  # missing or malformed date: skip this fixture only
                surface = determine_surface(fx.get("tournament_name", ""))

                # A null gender value is treated like a missing one ("m").
                gender_raw = fx.get("gender")
                if gender_raw is None:
                    gender_raw = fx.get("event_gender")
                if gender_raw is None:
                    gender_raw = "m"

                rows.append({
                    "Date": match_date,
                    "date": match_date,
                    "gender": ("M" if str(gender_raw).lower().startswith("m") else "W"),
                    "Winner": winner,
                    "Loser": loser,
                    "Surface": surface,
                    "winner_canonical": normalize_name(winner),
                    "loser_canonical":  normalize_name(loser),
                    "tournament_canonical": normalize_tournament_name(fx.get("tournament_name", "")),
                    # NOTE(review): the player1/player2 odds fallbacks are used
                    # as winner/loser odds regardless of who won - confirm.
                    "tennis_data_odds1": pd.to_numeric(fx.get("odds_winner") or fx.get("event_odds_player1"), errors="coerce"),
                    "tennis_data_odds2": pd.to_numeric(fx.get("odds_loser")  or fx.get("event_odds_player2"), errors="coerce"),
                    "WRank": pd.NA,
                    "LRank": pd.NA,
                })
            return rows

        today   = date.today()
        cutoff  = date(2025, 6, 10)                 # Jeff data stops at 10 Jun 2025
        days    = [cutoff + timedelta(days=i) for i in range((today - cutoff).days + 1)]
        print(f"  Fetching API‑Tennis for {len(days)} days ({cutoff} → {today})")

        api_rows = []
        for d in days:
            api_rows.extend(_api_rows_for_day(d))

        if api_rows:
            api_df = pd.DataFrame(api_rows)

            # Remove rows already present in tennis_data (exact winner/loser/date)
            merged = tennis_data.merge(
                api_df[['winner_canonical', 'loser_canonical', 'date']],
                on=['winner_canonical', 'loser_canonical', 'date'],
                how='inner'
            )
            mask_dup = api_df.set_index(['winner_canonical', 'loser_canonical', 'date']).index.isin(
                merged.set_index(['winner_canonical', 'loser_canonical', 'date']).index
            )
            api_df = api_df[~mask_dup]

            tennis_data = pd.concat([tennis_data, api_df], ignore_index=True)
            print(f"  ✓ Appended {len(api_df)} new API‑Tennis rows")
        else:
            print("  ✓ No API‑Tennis rows to append")
        # ------------------------------------------------------------------

    except Exception as e:
        print(f"ERROR appending API‑Tennis data: {e}")
        return pd.DataFrame(), jeff_data, weighted_defaults

    print(f"=== DATA GENERATION COMPLETE ===")
    print(f"Final data shape: {tennis_data.shape}")
    print(f"Columns: {len(tennis_data.columns)}")

    return tennis_data, jeff_data, weighted_defaults

# Build the dataset.
# fast=True keeps development runs short; drop it for a full rebuild.
print("Starting fresh data generation...")
historical_data, jeff_data, weighted_defaults = generate_comprehensive_historical_all_years_fixed(fast=True, n_sample=500)

# Persist the three artefacts only when every one of them is non-empty.
if not (len(historical_data) > 0 and jeff_data and weighted_defaults):
    print("\n=== CACHE NOT SAVED ===")
    print("Data generation failed - cache not created")
else:
    print("\n=== SAVING TO CACHE ===")
    try:
        historical_data.to_parquet(HD_PATH, index=False)
        with open(JEFF_PATH, "wb") as f:
            pickle.dump(jeff_data, f, protocol=pickle.HIGHEST_PROTOCOL)
        with open(DEF_PATH, "wb") as f:
            pickle.dump(weighted_defaults, f, protocol=pickle.HIGHEST_PROTOCOL)
        print("✓ Data cached successfully")
    except Exception as e:
        # A failed write is reported but does not stop the notebook cell.
        print(f"ERROR saving cache: {e}")

# Summary of what the run produced
print("\n=== FINAL STATUS ===")
print(f"Historical data: {historical_data.shape}")
print(f"Jeff data: {bool(jeff_data)}")
print(f"Weighted defaults: {bool(weighted_defaults)}")

if len(historical_data) > 0:
    # Spot-check one Jeff feature column to confirm extraction filled values
    jeff_cols = [name for name in historical_data.columns if name.startswith('winner_serve_pts')]
    print(f"Jeff feature columns: {jeff_cols}")

    if 'winner_serve_pts' in historical_data.columns:
        non_null_count = historical_data['winner_serve_pts'].notna().sum()
        print(f"Non-null serve_pts values: {non_null_count}")

        # First five populated values as a sample
        sample_values = historical_data['winner_serve_pts'].dropna().head(5)
        print(f"Sample serve_pts: {sample_values.tolist()}")

print("Data generation complete!")

In [41]:
# -------------------------------------------------------------------
# API-Tennis standalone utilities – copy into a scratch cell
# -------------------------------------------------------------------
import os, json, time, pickle, requests, pandas as pd
from datetime import date, timedelta, datetime
from pathlib import Path

# API Configuration
# NOTE(review): a live-looking API key is committed here in plain text; it
# should be rotated and supplied through the environment instead.
# setdefault keeps a key that is already set in the environment rather than
# silently overwriting it with the embedded one.
os.environ.setdefault("API_TENNIS_KEY", "adfc70491c47895e5fffdc6428bbf36a561989d4bffcfa9ecfba8d91e947b4fb")
API_KEY = os.getenv("API_TENNIS_KEY")
BASE = "https://api.api-tennis.com/tennis/"
from datetime import date

# ------------------------------------------------------------------ #
# one-line wrapper that never crashes on missing "result"
def api(method: str, **params):
    """Call one API-Tennis method and return its ``result`` payload.

    The request is a GET to ``BASE`` carrying ``method``, the API key and any
    extra keyword arguments as query parameters, with a 30 second timeout.

    Returns:
        The ``result`` entry of the JSON response, or ``[]`` when the response
        has no such entry.

    Raises:
        requests.HTTPError: on a non-success HTTP status.
        RuntimeError: when the response's ``error`` field is anything but 0;
            the whole decoded response is attached as the argument.
    """
    query = {"method": method, "APIkey": API_KEY}
    query.update(params)

    response = requests.get(BASE, params=query, timeout=30)
    response.raise_for_status()

    payload = response.json()
    error_flag = str(payload.get("error", 0))
    if error_flag != "0":
        raise RuntimeError(payload)
    return payload.get("result", [])


# ---------- 1. finished fixtures for any single day  -------------- #
def fixtures_finished(day: date) -> pd.DataFrame:
    """Return the finished fixtures of one day as a DataFrame.

    Fixtures are fetched with ``get_fixtures`` for *day* only (timezone
    ``America/New_York``).  Events whose ``event_status`` is not ``"Finished"``
    are ignored, as are finished events whose ``event_winner`` flag names
    neither the first nor the second player (for example a null flag), since
    no winner can be assigned to them.

    Returns:
        A DataFrame with columns ``date``, ``event_key``, ``tournament``,
        ``round``, ``surface``, ``winner``, ``loser`` and ``score``.  The
        columns are present even when no fixture qualifies.
    """
    columns = ["date", "event_key", "tournament", "round", "surface", "winner", "loser", "score"]

    res = api("get_fixtures", date_start=day, date_stop=day, timezone="America/New_York")
    rows = []
    for ev in res:
        if ev.get("event_status") != "Finished":
            continue
        p1, p2 = ev["event_first_player"], ev["event_second_player"]
        # "First Player" / "Second Player"; may be missing or null
        winner_flag = str(ev.get("event_winner") or "")
        if winner_flag.startswith("First"):
            winner, loser = p1, p2
        elif winner_flag.startswith("Second"):
            winner, loser = p2, p1
        else:
            # Unknown flag: skip rather than crediting the second player.
            continue
        rows.append({
            "date": ev["event_date"],
            "event_key": ev["event_key"],
            "tournament": ev["tournament_name"],
            "round": ev.get("tournament_round", ""),
            "surface": ev.get("court_surface", ""),  # string may be empty
            "winner": winner,
            "loser": loser,
            "score": ev.get("scores", ""),
        })
    # Explicit columns keep the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)


# Example: print the first rows of the finished fixtures for 2025-06-25.
# This performs a live API request every time the cell runs.
print("=== finished fixtures 2025-06-25 ===")
print(fixtures_finished(date(2025, 6, 25)).head())

# ---------- 2. static tournament → surface lookup ----------------- #
# On-disk cache directory in the user's home, shared by the helpers below;
# created on first use (exist_ok avoids an error when it is already there).
CACHE = Path.home() / ".api_tennis_cache"
CACHE.mkdir(exist_ok=True)


def tournament_surface_map():
    """Return a mapping of tournament name to surface, cached on disk.

    When ``tournaments.pkl`` exists in ``CACHE`` it is unpickled and returned
    as is.  Otherwise the tournament list is fetched with ``get_tournaments``
    and every name is classified by keyword: grass keywords first, then clay
    keywords, with ``"Hard"`` for everything else.  The mapping is keyed by
    the original (not lower-cased) name and written to the cache file.
    """
    cache_file = CACHE / "tournaments.pkl"
    if cache_file.exists():
        return pickle.loads(cache_file.read_bytes())

    grass_words = ("wimbledon", "halle", "queens")
    clay_words = ("french open", "roland", "monte carlo", "madrid", "rome", "clay")

    def _surface_for(lowered_name):
        # Grass is tested before clay, matching the keyword precedence.
        if any(word in lowered_name for word in grass_words):
            return "Grass"
        if any(word in lowered_name for word in clay_words):
            return "Clay"
        return "Hard"

    mapping = {}
    for entry in api("get_tournaments"):
        original_name = entry["tournament_name"]
        mapping[original_name] = _surface_for(original_name.lower())

    cache_file.write_bytes(pickle.dumps(mapping, protocol=4))
    return mapping


# Build (or load from the on-disk cache) the tournament → surface mapping and
# show one lookup; .get prints None when the name is not in the mapping.
surf_map = tournament_surface_map()
print("\nSurface for Rio Open ->", surf_map.get("ATP Rio Open"))


# ---------- 3. quick H2H fetch (returns DataFrame) ---------------- #
def h2h(player_key1: str, player_key2: str) -> pd.DataFrame:
    """Fetch the head-to-head history of two players as a DataFrame.

    Results are cached per ordered key pair in ``CACHE``; a cached frame is
    returned without contacting the API.  A failed request is retried once
    after a three second pause (the API sometimes throttles); the error of
    the second attempt is propagated to the caller.
    """
    cache_file = CACHE / f"h2h_{player_key1}_{player_key2}.pkl"
    if cache_file.exists():
        return pickle.loads(cache_file.read_bytes())

    def _fetch_and_store():
        history = api("get_H2H", first_player_key=player_key1, second_player_key=player_key2)
        frame = pd.DataFrame(history)
        cache_file.write_bytes(pickle.dumps(frame, protocol=4))
        return frame

    try:
        return _fetch_and_store()
    except Exception:
        # First failure: wait, then fall through to the single retry below.
        time.sleep(3)
    return _fetch_and_store()

# example H2H between generic keys "584" and "52"
# print(h2h("584","52").head())

=== finished fixtures 2025-06-25 ===
         date  event_key            tournament  \
0  2025-06-25   12046056   Lima Challenger Men   
1  2025-06-25   12046060   Lima Challenger Men   
2  2025-06-25   12046061   Lima Challenger Men   
3  2025-06-25   12046064  Milan Challenger Men   
4  2025-06-25   12046065  Milan Challenger Men   

                               round surface              winner  \
0   Lima Challenger Men - 1/8-finals               Britto/ Carou   
1   Lima Challenger Men - 1/8-finals          Bangoura/ Stepanov   
2   Lima Challenger Men - 1/8-finals                   Li/ Tobon   
3  Milan Challenger Men - 1/8-finals                Goldhoff/ Ho   
4  Milan Challenger Men - 1/8-finals           Berrettini/ Fonio   

                      loser                                              score  
0    De La Fuente/ Olivieri  [{'score_first': '6', 'score_second': '3', 'sc...  
1           Monge/ Nakamine  [{'score_first': '6', 'score_second': '0', 'sc...  
2         

In [None]:
# Quick diagnostic: row coverage by data-feed period
import numpy as np
from datetime import date

# Tag each row once with the period its match date falls into. The column is
# only computed when it does not exist yet, so re-running the cell keeps
# earlier tags.
period_col = "source_period"
if period_col not in historical_data.columns:
    historical_data[period_col] = np.select(
        [
            # up to and including 2025-06-10
            historical_data['date'] <= date(2025, 6, 10),
            # 2025-06-11 through 2025-06-22, both ends inclusive
            historical_data['date'].between(date(2025, 6, 11), date(2025, 6, 22)),
            # 2025-06-23 onwards
            historical_data['date'] >= date(2025, 6, 23)
        ],
        # NOTE(review): 'P3 _api' contains a space unlike the other labels -
        # looks like a typo for 'P3_api'; confirm nothing matches on it
        # before changing.
        ['P1_all', 'P2_td_api', 'P3 _api'],
        # rows matching none of the conditions (e.g. missing dates)
        default='UNKNOWN'
    )

# Row count per period label
print("=== rows per period ===")
print(historical_data.value_counts(period_col))

# Optional cross-tab: feed vs period. The feed label is derived from the same
# date boundaries: after 2025-06-22 -> 'API', up to 2025-06-10 -> 'Jeff',
# anything else -> 'tennis-data'.
hist_with_feed = historical_data.assign(
    feed=np.where(historical_data['date'] >  date(2025, 6, 22), 'API',
         np.where(historical_data['date'] <= date(2025, 6, 10), 'Jeff', 'tennis-data'))
)

print("\n=== feed × period matrix ===")
print(pd.crosstab(hist_with_feed[period_col], hist_with_feed['feed']))
# Totals after generation (full or sample); the second figure counts rows
# dated 2025-06-23 or later.
print("Total rows in historical_data:", len(historical_data))
print("Rows tagged API in historical_data:",
      ((historical_data['date'] >= date(2025, 6, 23))).sum())

In [None]:
# Test Data Loading - Fixed Version
import os
import pickle
import pandas as pd
from datetime import date

# The three cache artefacts sit side by side in one directory.
CACHE_DIR = os.path.expanduser("~/Desktop/data/cache")
os.makedirs(CACHE_DIR, exist_ok=True)
HD_PATH = os.path.join(CACHE_DIR, "historical_data.parquet")
JEFF_PATH = os.path.join(CACHE_DIR, "jeff_data.pkl")
DEF_PATH = os.path.join(CACHE_DIR, "weighted_defaults.pkl")

hd_cached = os.path.exists(HD_PATH)
jeff_cached = os.path.exists(JEFF_PATH)
defaults_cached = os.path.exists(DEF_PATH)

print("Checking cache files...")
print(f"Historical data exists: {hd_cached}")
print(f"Jeff data exists: {jeff_cached}")
print(f"Defaults exist: {defaults_cached}")

# The cache is only usable when every artefact is present.
if hd_cached and jeff_cached and defaults_cached:
    print("Loading cached data...")
    historical_data = pd.read_parquet(HD_PATH)
    with open(JEFF_PATH, "rb") as jeff_file:
        jeff_data = pickle.load(jeff_file)
    with open(DEF_PATH, "rb") as defaults_file:
        weighted_defaults = pickle.load(defaults_file)
    print("✓ Cache loaded successfully")
else:
    print("Cache miss – regenerating data...")
    historical_data, jeff_data, weighted_defaults = generate_comprehensive_historical_all_years()

    # Persist all three results so the next run can skip regeneration.
    print("Saving to cache...")
    historical_data.to_parquet(HD_PATH, index=False)
    with open(JEFF_PATH, "wb") as jeff_file:
        pickle.dump(jeff_data, jeff_file, protocol=pickle.HIGHEST_PROTOCOL)
    with open(DEF_PATH, "wb") as defaults_file:
        pickle.dump(weighted_defaults, defaults_file, protocol=pickle.HIGHEST_PROTOCOL)
    print("✓ Data cached successfully")

# Overall shape and top-level keys of what was loaded.
print(f"\n=== DATA SUMMARY ===")
print(f"Historical data shape: {historical_data.shape}")
print(f"Jeff data keys: {list(jeff_data.keys()) if jeff_data else 'None'}")
print(f"Weighted defaults keys: {list(weighted_defaults.keys()) if weighted_defaults else 'None'}")

has_rows = len(historical_data) > 0
has_serve_pts = 'winner_serve_pts' in historical_data.columns

if not has_rows:
    print("No data loaded - check the data loading function")
else:
    # Columns whose name contains either serve-points marker.
    serve_markers = ('winner_serve_pts', 'loser_serve_pts')
    jeff_feature_cols = [
        column for column in historical_data.columns
        if any(marker in column for marker in serve_markers)
    ]
    print(f"Jeff feature columns found: {jeff_feature_cols}")

    if has_serve_pts:
        winner_serve = historical_data['winner_serve_pts']
        nulls = winner_serve.isna().sum()
        non_nulls = winner_serve.notna().sum()
        print(f"winner_serve_pts: {non_nulls} values, {nulls} nulls")

        # First few populated values as a quick eyeball check.
        sample_values = winner_serve.dropna().head(5)
        print(f"Sample serve_pts values: {sample_values.tolist()}")
    else:
        print("winner_serve_pts column not found")

    # Both ends of the column list.
    print(f"\nFirst 10 columns: {historical_data.columns[:10].tolist()}")
    print(f"Last 10 columns: {historical_data.columns[-10:].tolist()}")

    # One sample row; absent fields print as 'N/A'.
    print(f"\nSample match data:")
    sample_row = historical_data.iloc[0]
    print(f"Winner: {sample_row.get('Winner', 'N/A')}")
    print(f"Loser: {sample_row.get('Loser', 'N/A')}")
    print(f"Date: {sample_row.get('Date', 'N/A')}")
    print(f"Surface: {sample_row.get('Surface', 'N/A')}")
    print(f"Winner serve pts: {sample_row.get('winner_serve_pts', 'N/A')}")
    print(f"Loser serve pts: {sample_row.get('loser_serve_pts', 'N/A')}")

print(f"\n=== FINAL STATUS ===")
print(f"Data loaded: {has_rows}")
print(f"Jeff features available: {has_serve_pts}")
print(f"Ready for modeling: {has_rows and has_serve_pts}")

In [None]:
# LAYER 1 ##
def extract_data_samples():
    """Return the first three rows of a few source tables for inspection.

    Reads the module-level ``jeff_data`` and ``historical_data``.

    Returns:
        A ``(jeff_samples, tennis_samples)`` tuple: a dict of three-row
        previews of the men's 'matches', 'serve_basics' and 'overview'
        tables, and a three-row preview of selected ``historical_data``
        columns.
    """
    mens_tables = jeff_data['men']

    # Jeff Sackmann data samples
    jeff_samples = {
        table_name: mens_tables[table_name].head(3)
        for table_name in ('matches', 'serve_basics', 'overview')
    }

    # Tennis-data samples
    preview_columns = ['Winner', 'Loser', 'WRank', 'LRank', 'PSW', 'PSL', 'Surface']
    tennis_samples = historical_data[preview_columns].head(3)

    return jeff_samples, tennis_samples

# Hold/break computation method verification
# Descriptive record of how hold/break figures are sourced; nothing in this
# cell reads it.
hold_break_computation = dict(
    current_method='Jeff aggregated stats from overview dataset',
    available_columns=['serve_pts', 'first_in', 'first_won', 'second_won'],
    computation_level='Per-player aggregate from charting data',
)

# Bayesian
def extract_priors_from_current_data(player_canonical, gender, surface):
    """Build a prior dictionary for one player from the loaded data.

    Reads the module-level ``historical_data`` and ``jeff_data``.

    Args:
        player_canonical: Canonical player name, matched against the
            'winner_canonical' and 'loser_canonical' columns.
        gender: Passed through to ``extract_jeff_features``.
        surface: Passed through to ``calculate_surface_adjustment``.

    Returns:
        An empty dict when the player has no rows in ``historical_data``;
        otherwise a dict with 'elo_estimate', 'serve_effectiveness',
        'return_strength' and 'surface_factor'.
    """
    involves_player = (
        (historical_data['winner_canonical'] == player_canonical)
        | (historical_data['loser_canonical'] == player_canonical)
    )
    player_matches = historical_data[involves_player]

    if len(player_matches) == 0:
        return {}

    # Layer 1: Elo approximation from rankings. A falsy rank (None, 0)
    # falls back to a flat 1500.
    recent_rank = get_recent_rank(player_canonical, player_matches)
    if recent_rank:
        elo_estimate = 2000 - (recent_rank * 5)
    else:
        elo_estimate = 1500

    # Jeff feature extraction
    jeff_features = extract_jeff_features(player_canonical, gender, jeff_data)

    return {
        'elo_estimate': elo_estimate,
        'serve_effectiveness': jeff_features.get('serve_pts', 0.6),
        'return_strength': jeff_features.get('return_pts_won', 0.3),
        'surface_factor': calculate_surface_adjustment(player_matches, surface),
    }

# Time decay for recent form
def calculate_time_decayed_performance(player_matches, reference_date):
    """Compute exponentially time-weighted performance figures.

    Each match is weighted by ``exp(-0.01 * days_ago)``, so matches closer to
    ``reference_date`` count more.

    Args:
        player_matches: DataFrame with a datetime 'date' column plus
            'is_winner' and 'games_won_pct' columns. It is not modified.
        reference_date: Timestamp the match ages are measured from.

    Returns:
        Dict with 'win_rate' and 'games_won_rate'. Both are NaN when
        ``player_matches`` is empty.
    """
    if len(player_matches) == 0:
        # np.average raises on an empty weight vector; report "no data".
        return {'win_rate': float('nan'), 'games_won_rate': float('nan')}

    # Kept in a local Series: writing a 'days_ago' column back would mutate
    # the caller's frame (or a slice of it).
    days_ago = (reference_date - player_matches['date']).dt.days

    # Exponential decay: recent matches weighted heavier
    weights = np.exp(-0.01 * days_ago)  # 1% daily decay

    return {
        'win_rate': np.average(player_matches['is_winner'], weights=weights),
        'games_won_rate': np.average(player_matches['games_won_pct'], weights=weights),
    }

In [None]:
## TEST ##
import os
import pickle
import pandas as pd

# Cache layout: one directory holding the parquet frame and two pickles.
CACHE_DIR = os.path.expanduser("~/Desktop/data/cache")
os.makedirs(CACHE_DIR, exist_ok=True)
HD_PATH = os.path.join(CACHE_DIR, "historical_data.parquet")
JEFF_PATH = os.path.join(CACHE_DIR, "jeff_data.pkl")
DEF_PATH = os.path.join(CACHE_DIR, "weighted_defaults.pkl")

# all() over a generator short-circuits on the first missing file.
cache_complete = all(
    os.path.exists(cache_path) for cache_path in (HD_PATH, JEFF_PATH, DEF_PATH)
)

if not cache_complete:
    print("Cache miss – regenerating (one-time slow run).")
    combined_data, jeff_data, weighted_defaults = generate_comprehensive_historical_all_years()
    historical_data = combined_data
    historical_data.to_parquet(HD_PATH, index=False)
    with open(JEFF_PATH, "wb") as jeff_out:
        pickle.dump(jeff_data, jeff_out, protocol=pickle.HIGHEST_PROTOCOL)
    with open(DEF_PATH, "wb") as defaults_out:
        pickle.dump(weighted_defaults, defaults_out, protocol=pickle.HIGHEST_PROTOCOL)
else:
    print("Loading cached data …")
    historical_data = pd.read_parquet(HD_PATH)
    with open(JEFF_PATH, "rb") as jeff_in:
        jeff_data = pickle.load(jeff_in)
    with open(DEF_PATH, "rb") as defaults_in:
        weighted_defaults = pickle.load(defaults_in)

# Bare string: shown as the cell's output value in a notebook.
"SIMULATION"

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict

def normalize_name_canonical(name):
    """Return a lower-case, whitespace-collapsed form of a player name.

    Periods and apostrophes are dropped and hyphens become spaces. Missing
    values (anything ``pd.isna`` reports as missing) yield an empty string.

    NOTE(review): this redefines the ``normalize_name_canonical`` declared at
    the top of the file, which returns an underscore-joined form. Confirm
    which format the 'winner_canonical' / 'loser_canonical' columns use.
    """
    if pd.isna(name):
        return ""

    # One translation pass: delete '.' and "'", turn '-' into a space.
    punctuation_map = str.maketrans({'.': None, "'": None, '-': ' '})
    cleaned = str(name).strip().translate(punctuation_map)

    # split() with no argument also collapses runs of whitespace.
    tokens = cleaned.lower().split()
    return ' '.join(tokens)

def extract_jeff_features(player_canonical, gender, jeff_data):
    """Extract serve/return count features for one player from Jeff Sackmann data.

    Args:
        player_canonical: Key looked up inside ``jeff_data[gender_key]``.
        gender: 'M', 'men', 'male' or 'atp' (case-insensitive) select the
            men's table; any other value selects the women's table.
        jeff_data: Mapping with 'men' / 'women' entries, each mapping a
            player key to a dict-like record of counts.

    Returns:
        Dict with 'serve_pts', 'first_won', 'second_won' and
        'return_pts_won'. Fixed fallback values are returned when the gender
        table or the player is missing.

    NOTE(review): elsewhere in this file ``jeff_data['men']`` is indexed by
    dataset name ('matches', 'serve_basics', 'overview'). Confirm it is also
    keyed by player here, otherwise the fallback is always returned.
    """
    # Previously only the exact string 'M' meant men, so callers passing
    # 'men' were silently routed to the women's table.
    normalized_gender = str(gender).strip().lower()
    gender_key = 'men' if normalized_gender in ('m', 'men', 'male', 'atp') else 'women'

    if gender_key not in jeff_data or player_canonical not in jeff_data[gender_key]:
        return {
            'serve_pts': 60,
            'first_won': 0,
            'second_won': 0,
            'return_pts_won': 20
        }

    player_data = jeff_data[gender_key][player_canonical]

    first_in = player_data.get('1stIn', 0)
    first_won = player_data.get('1stWon', 0)
    second_won = player_data.get('2ndWon', 0)
    double_faults = player_data.get('df', 0)

    # Same two-branch estimate as before, with the branches spelled out.
    # NOTE(review): neither branch is an obvious serve-point total; verify
    # against the source column definitions.
    if first_won >= first_in:
        total_serve_pts = first_in + double_faults + (first_won - first_in)
    else:
        total_serve_pts = first_in + second_won + double_faults

    break_points_saved = player_data.get('bpSaved', 0)
    break_points_faced = player_data.get('bpFaced', 0)
    # Break points faced minus saved.
    # NOTE(review): confirm this is the intended "return points won" proxy.
    return_pts_won = break_points_faced - break_points_saved

    return {
        'serve_pts': max(1, total_serve_pts),
        'first_won': first_won,
        'second_won': second_won,
        'return_pts_won': max(0, return_pts_won)
    }

class BayesianTennisModel:
    def __init__(self):
        self.simulation_count = 10000
        self.jeff_data = jeff_data
        self.historical_data = historical_data

    def default_priors(self):
        return {
            'elo_mean': 1500,
            'elo_std': 200,
            'hold_prob': 0.65,
            'break_prob': 0.35,
            'surface': 'Hard',
            'form_factor': 1.0,
            'confidence': 0.1
        }

    def extract_refined_priors(self, player_canonical, gender, surface, reference_date):
        player_matches = self.historical_data[
            (self.historical_data['winner_canonical'] == player_canonical) |
            (self.historical_data['loser_canonical'] == player_canonical)
        ].copy()

        if len(player_matches) == 0:
            return self.default_priors()

        surface_matches = player_matches[player_matches['Surface'] == surface]
        if len(surface_matches) < 5:
            surface_matches = player_matches

        recent_matches = surface_matches.tail(20).copy()
        recent_matches['days_ago'] = (pd.to_datetime(reference_date) - pd.to_datetime(recent_matches['Date'])).dt.days
        weights = np.exp(-0.05 * recent_matches['days_ago'])

        base_elo = self.get_player_weighted_elo(player_canonical, surface, reference_date)
        surface_factor = self.calculate_surface_adaptation(player_canonical, surface)
        elo_prior = base_elo * surface_factor

        jeff_features = extract_jeff_features(player_canonical, gender, self.jeff_data)

        serve_pts = jeff_features['serve_pts']
        serve_won = jeff_features['first_won'] + jeff_features['second_won']
        hold_prob = serve_won / serve_pts if serve_pts > 0 else 0.65

        return_pts = jeff_features['return_pts_won']
        total_return_pts = serve_pts
        break_prob = (1 - return_pts / total_return_pts) if total_return_pts > 0 else 0.35

        return {
            'elo_mean': elo_prior,
            'elo_std': 150,
            'hold_prob': min(0.95, max(0.3, hold_prob)),
            'break_prob': max(0.05, min(0.7, break_prob)),
            'surface': surface,
            'form_factor': self.calculate_form_spike(recent_matches, weights, player_canonical),
            'confidence': max(0.05, min(1.0, len(recent_matches) / 15))
        }

    def calculate_ranking_differential_odds(self, p1_ranking, p2_ranking):
        """Convert ranking differential to implied probability"""
        if p1_ranking == 0 or p2_ranking == 0:
            return 0.5

        ranking_diff = p2_ranking - p1_ranking

        if ranking_diff > 50:
            return 0.85
        elif ranking_diff > 20:
            return 0.75
        elif ranking_diff > 10:
            return 0.65
        elif ranking_diff > 0:
            return 0.55
        elif ranking_diff > -10:
            return 0.45
        elif ranking_diff > -20:
            return 0.35
        elif ranking_diff > -50:
            return 0.25
        else:
            return 0.15

    def calculate_upset_frequency(self, ranking_diff, surface, historical_data):
        """Calculate upset frequency by ranking differential and surface"""
        upset_matches = historical_data[
            ((historical_data['WRank'] - historical_data['LRank']) > ranking_diff) &
            (historical_data['Surface'] == surface)
        ]

        total_matches = historical_data[
            (abs(historical_data['WRank'] - historical_data['LRank']) >= abs(ranking_diff)) &
            (historical_data['Surface'] == surface)
        ]

        if len(total_matches) < 10 and surface != 'fallback':
            return self.calculate_upset_frequency(ranking_diff, 'fallback', historical_data)

        if surface == 'fallback':
            upset_matches = historical_data[
                (historical_data['WRank'] - historical_data['LRank']) > ranking_diff
            ]
            total_matches = historical_data[
                abs(historical_data['WRank'] - historical_data['LRank']) >= abs(ranking_diff)
            ]

        if len(total_matches) == 0:
            return 0.1

        upset_rate = len(upset_matches) / len(total_matches)
        return min(0.45, max(0.05, upset_rate))

    def calculate_surface_performance_ratio(self, player_canonical, surface, opponent_canonical, reference_date):
        """Calculate player's surface-specific performance vs opponent's baseline"""
        player_surface_matches = self.historical_data[
            ((self.historical_data['winner_canonical'] == player_canonical) |
             (self.historical_data['loser_canonical'] == player_canonical)) &
            (self.historical_data['Surface'] == surface) &
            (pd.to_datetime(self.historical_data['Date']) <= pd.to_datetime(reference_date))
        ].tail(20)

        opponent_surface_matches = self.historical_data[
            ((self.historical_data['winner_canonical'] == opponent_canonical) |
             (self.historical_data['loser_canonical'] == opponent_canonical)) &
            (self.historical_data['Surface'] == surface) &
            (pd.to_datetime(self.historical_data['Date']) <= pd.to_datetime(reference_date))
        ].tail(20)

        if len(player_surface_matches) < 3 or len(opponent_surface_matches) < 3:
            return 1.0

        player_wins = len(player_surface_matches[player_surface_matches['winner_canonical'] == player_canonical])
        opponent_wins = len(opponent_surface_matches[opponent_surface_matches['winner_canonical'] == opponent_canonical])

        player_ratio = player_wins / len(player_surface_matches)
        opponent_ratio = opponent_wins / len(opponent_surface_matches)

        return player_ratio / opponent_ratio if opponent_ratio > 0 else 1.0

    def run_simulation(self, p1_priors, p2_priors, iterations):
        return [self.simulate_match(p1_priors, p2_priors)]

    def predict_match_outcome(self, player1_canonical, player2_canonical, surface, gender, date):
        p1_priors = self.extract_refined_priors(player1_canonical, gender, surface, date)
        p2_priors = self.extract_refined_priors(player2_canonical, gender, surface, date)

        base_prob = self.run_simulation(p1_priors, p2_priors, 1000)[0]

        p1_rank = self.get_player_ranking(player1_canonical, date)
        p2_rank = self.get_player_ranking(player2_canonical, date)
        ranking_prob = self.calculate_ranking_differential_odds(p1_rank, p2_rank)

        ranking_diff = p1_rank - p2_rank
        upset_adjustment = self.calculate_upset_frequency(ranking_diff, surface, self.historical_data)

        surface_ratio = self.calculate_surface_performance_ratio(player1_canonical, surface, player2_canonical, date)

        calibrated_prob = (0.6 * base_prob + 0.25 * ranking_prob + 0.15 * surface_ratio) * (1 - upset_adjustment * 0.1)

        return max(0.05, min(0.95, calibrated_prob))

    def get_player_ranking(self, player_canonical, date):
        """Get player ranking at specific date"""
        date_obj = pd.to_datetime(date)

        player_matches = self.historical_data[
            ((self.historical_data['winner_canonical'] == player_canonical) |
             (self.historical_data['loser_canonical'] == player_canonical)) &
            (pd.to_datetime(self.historical_data['Date']) <= date_obj)
        ].sort_values('Date', ascending=False)

        if len(player_matches) == 0:
            return 999

        latest_match = player_matches.iloc[0]

        if latest_match['winner_canonical'] == player_canonical:
            return latest_match.get('WRank', 999)
        else:
            return latest_match.get('LRank', 999)

    def calculate_match_probability(self, player1_canonical, player2_canonical, gender, surface, reference_date, best_of=3):
        player1_priors = self.extract_refined_priors(player1_canonical, gender, surface, reference_date)
        player2_priors = self.extract_refined_priors(player2_canonical, gender, surface, reference_date)

        probability = self.simulate_match(player1_priors, player2_priors, best_of)
        confidence = min(player1_priors['confidence'], player2_priors['confidence'])

        return {
            'player1_win_probability': probability,
            'player2_win_probability': 1 - probability,
            'confidence': confidence,
            'player1_priors': player1_priors,
            'player2_priors': player2_priors
        }

    def calculate_form_spike(self, recent_matches, weights, player_canonical):
        if len(recent_matches) == 0:
            return 1.0

        wins = (recent_matches['winner_canonical'] == player_canonical).astype(int)
        weighted_win_rate = np.average(wins, weights=weights)

        avg_opponent_rank = recent_matches['LRank'].fillna(recent_matches['WRank']).mean()
        player_rank = recent_matches['WRank'].fillna(recent_matches['LRank']).iloc[-1]

        if pd.notna(avg_opponent_rank) and pd.notna(player_rank):
            rank_diff = player_rank - avg_opponent_rank
            expected_win_rate = 1 / (1 + 10**(rank_diff/400))
            form_spike = min(2.0, weighted_win_rate / max(0.1, expected_win_rate))
        else:
            form_spike = 1.0

        return form_spike

    def simulate_match(self, player1_priors, player2_priors, best_of=3):
        wins = 0
        for _ in range(self.simulation_count):
            sets_won = [0, 0]
            while max(sets_won) < (best_of + 1) // 2:
                set_winner = self.simulate_set(player1_priors, player2_priors)
                sets_won[set_winner] += 1
            if sets_won[0] > sets_won[1]:
                wins += 1
        return wins / self.simulation_count

    def simulate_set(self, p1_priors, p2_priors):
        games = [0, 0]
        server = 0
        while True:
            hold_prob = p1_priors['hold_prob'] if server == 0 else p2_priors['hold_prob']
            game_winner = server if np.random.random() < hold_prob else 1 - server
            games[game_winner] += 1
            server = 1 - server
            if games[0] >= 6 and games[0] - games[1] >= 2:
                return 0
            elif games[1] >= 6 and games[1] - games[0] >= 2:
                return 1
            elif games[0] == 6 and games[1] == 6:
                return self.simulate_tiebreak(p1_priors, p2_priors)

    def simulate_tiebreak(self, p1_priors, p2_priors):
        points = [0, 0]
        server = 0
        serve_count = 0
        while True:
            hold_prob = p1_priors['hold_prob'] if server == 0 else p2_priors['hold_prob']
            point_winner = server if np.random.random() < hold_prob else 1 - server
            points[point_winner] += 1
            serve_count += 1
            if serve_count == 1 or serve_count % 2 == 0:
                server = 1 - server
            if points[0] >= 7 and points[0] - points[1] >= 2:
                return 0
            elif points[1] >= 7 and points[1] - points[0] >= 2:
                return 1

    def get_player_weighted_elo(self, player_canonical, surface, reference_date):
        recent_match = self.historical_data[
            ((self.historical_data['winner_canonical'] == player_canonical) |
             (self.historical_data['loser_canonical'] == player_canonical)) &
            (self.historical_data['Surface'] == surface)
        ].tail(1)

        if len(recent_match) > 0 and 'BlendScore' in recent_match.columns:
            blend_score = recent_match['BlendScore'].iloc[0]
            return 1500 + blend_score * 50

        any_surface_match = self.historical_data[
            (self.historical_data['winner_canonical'] == player_canonical) |
            (self.historical_data['loser_canonical'] == player_canonical)
        ].tail(1)

        if len(any_surface_match) > 0 and 'BlendScore' in any_surface_match.columns:
            return 1500 + any_surface_match['BlendScore'].iloc[0] * 200

        return 1500

    def calculate_surface_adaptation(self, player_canonical, target_surface):
        player_matches = self.historical_data[
            (self.historical_data['winner_canonical'] == player_canonical) |
            (self.historical_data['loser_canonical'] == player_canonical)
        ].copy()

        if len(player_matches) < 10:
            return 1.0

        surface_matches = player_matches[player_matches['Surface'] == target_surface]
        if len(surface_matches) < 3:
            return 1.0

        surface_wins = (surface_matches['winner_canonical'] == player_canonical).sum()
        surface_win_rate = surface_wins / len(surface_matches)

        total_wins = (player_matches['winner_canonical'] == player_canonical).sum()
        baseline_win_rate = total_wins / len(player_matches)

        if baseline_win_rate == 0:
            return 1.0

        adaptation_ratio = surface_win_rate / baseline_win_rate
        return max(0.7, min(1.5, adaptation_ratio))

    def evaluate_predictions(self, test_data):
        """Evaluate model accuracy on test dataset"""
        correct = 0
        total = 0

        for _, match in test_data.iterrows():
            prob = self.predict_match_outcome(
                match['winner_canonical'],
                match['loser_canonical'],
                match['Surface'],
                match['gender'],
                match['Date']
            )

            predicted_winner = match['winner_canonical'] if prob > 0.5 else match['loser_canonical']
            actual_winner = match['winner_canonical']

            if predicted_winner == actual_winner:
                correct += 1
            total += 1

        return correct / total if total > 0 else 0

def convert_to_canonical(name):
    """Return the canonical form of ``name`` via ``normalize_name_canonical``."""
    canonical_name = normalize_name_canonical(name)
    return canonical_name

# Shared model instance; the constructor reads the module-level jeff_data and
# historical_data, so both must be loaded before this line runs.
model = BayesianTennisModel()

In [None]:
## LAYER 2 ##
def apply_contextual_adjustments(self, priors, player_canonical, opponent_canonical, match_context):
    """Layer 2: adjust a player's priors for fatigue, injury, form and opponent.

    Args:
        priors: Prior dict with 'hold_prob', 'break_prob', 'elo_mean',
            'elo_std' and 'form_factor'. A shallow copy is adjusted; the
            input dict is left as is.
        player_canonical: Player whose priors are being adjusted.
        opponent_canonical: Opponent used for the quality weighting.
        match_context: Dict providing at least 'reference_date'; it is also
            handed to the form and opponent helpers.

    Returns:
        The adjusted copy of ``priors``.
    """
    reference_date = match_context['reference_date']
    tuned = priors.copy()

    # Fatigue: up to a 15% hold penalty and up to 30% more Elo uncertainty.
    fatigue = self.calculate_fatigue_index(player_canonical, reference_date)
    tuned['hold_prob'] = tuned['hold_prob'] * (1 - fatigue * 0.15)
    tuned['elo_std'] = tuned['elo_std'] * (1 + fatigue * 0.3)

    # Injury: scales hold_prob directly and break_prob by (2 - factor).
    injury = self.get_injury_factor(player_canonical, reference_date)
    tuned['hold_prob'] = tuned['hold_prob'] * injury
    tuned['break_prob'] = tuned['break_prob'] * (2 - injury)

    # Hot streaks (form_factor above 1.2) are discounted by how unsustainable
    # the recent form looks.
    sustainability = self.calculate_form_sustainability(player_canonical, match_context)
    form_factor = tuned['form_factor']
    if form_factor > 1.2:
        discount = 1 - ((form_factor - 1) * (1 - sustainability))
        tuned['hold_prob'] = tuned['hold_prob'] * discount
        tuned['elo_mean'] = tuned['elo_mean'] * discount

    # Opponent quality: logistic function of the Elo gap (scale 200).
    opponent_elo = self.estimate_opponent_elo(opponent_canonical, match_context)
    elo_gap = tuned['elo_mean'] - opponent_elo
    quality_weight = 1 / (1 + np.exp(-elo_gap / 200))
    tuned['break_prob'] = tuned['break_prob'] * quality_weight

    return tuned

def calculate_fatigue_index(self, player_canonical, reference_date):
    """Fatigue score in [0, 1] from match load over the previous 14 days.

    Each match contributes ``(minutes / 60) * exp(-0.1 * days_ago)``; the sum
    is divided by 10 and capped at 1.0.

    Args:
        player_canonical: Player to evaluate.
        reference_date: Date the match ages are measured from.

    Returns:
        0.0 when there are no recent matches, otherwise the capped score.
    """
    recent_matches = self.get_recent_matches(player_canonical, reference_date, days=14)

    if len(recent_matches) == 0:
        return 0.0

    reference_ts = pd.to_datetime(reference_date)
    default_minutes = 120  # Default 2 hours

    # Calculate cumulative fatigue
    fatigue_score = 0.0
    for _, match in recent_matches.iterrows():
        days_ago = (reference_ts - pd.to_datetime(match['Date'])).days
        if days_ago < 0:
            # A match after the reference date cannot have tired the player,
            # and a negative age would blow up the exponential.
            continue

        match_duration = match.get('minutes', default_minutes)
        if pd.isna(match_duration):
            # A NaN duration used to turn the whole score into NaN, which
            # min() below then reported as maximum fatigue (1.0).
            match_duration = default_minutes

        # Exponential decay with match duration weighting
        fatigue_score += (match_duration / 60) * np.exp(-0.1 * days_ago)

    return min(1.0, fatigue_score / 10)  # Normalize to 0-1

def get_injury_factor(self, player_canonical, reference_date):
    """Return a multiplicative injury factor, never lower than 0.5.

    Players in the hard-coded table get their listed factor and everyone
    else gets 0.95. Each recent retirement reported by
    ``check_recent_retirements`` multiplies the factor by a further 0.8.
    """
    # Injury memory bank - replace with actual injury tracking
    fragility_table = dict(
        nadal_r=0.85,
        murray_a=0.80,
        thiem_d=0.75,
        badosa_p=0.70,
    )
    factor = fragility_table.get(player_canonical, 0.95)

    # Check for recent retirement/walkover flags
    retirement_count = self.check_recent_retirements(player_canonical, reference_date)
    if retirement_count > 0:
        factor = factor * (0.8 ** retirement_count)

    # Floor at 0.5.
    return factor if factor > 0.5 else 0.5

def calculate_form_sustainability(self, player_canonical, match_context):
    """Score in (-inf, 1.0] for how sustainable the player's recent form is.

    Looks at matches from the 21 days before
    ``match_context['reference_date']``. Each match gets a quality score of
    ``1 / (1 + opponent_rank / 100)`` (0.5 when the rank is missing); the
    result is the mean score times ``1 - std`` of the scores, capped at 1.0.

    Returns:
        0.5 when fewer than three recent matches are available.
    """
    recent_matches = self.get_recent_matches(player_canonical, match_context['reference_date'], days=21)

    if len(recent_matches) < 3:
        return 0.5

    def opponent_quality(match):
        # The opponent is the loser when the player won, else the winner.
        if match['winner_canonical'] == player_canonical:
            opponent_rank = match['LRank']
        else:
            opponent_rank = match['WRank']
        if pd.notna(opponent_rank):
            return 1 / (1 + opponent_rank / 100)
        return 0.5

    # Quality-weighted recent performance
    quality_scores = [opponent_quality(row) for _, row in recent_matches.iterrows()]

    mean_quality = np.mean(quality_scores)
    consistency = 1 - np.std(quality_scores)

    return min(1.0, mean_quality * consistency)

def estimate_opponent_elo(self, opponent_canonical, match_context):
    """Return the opponent's 'elo_mean' prior for quality weighting.

    ``match_context`` must provide 'gender', 'surface' and 'reference_date',
    which are forwarded to ``extract_refined_priors``.
    """
    gender = match_context['gender']
    surface = match_context['surface']
    reference_date = match_context['reference_date']

    priors = self.extract_refined_priors(opponent_canonical, gender, surface, reference_date)
    return priors['elo_mean']

def get_recent_matches(self, player_canonical, reference_date, days=14):
    """Return the player's matches in the ``days`` leading up to ``reference_date``.

    Args:
        player_canonical: Name matched against 'winner_canonical' and
            'loser_canonical' in ``self.historical_data``.
        reference_date: End of the window (inclusive); anything
            ``pd.to_datetime`` accepts.
        days: Length of the look-back window in days.

    Returns:
        A copy of the matching rows sorted by 'Date', with 'Date' converted
        to datetime and unparseable dates dropped. An empty DataFrame is
        returned when the lookup fails (best-effort behaviour).
    """
    try:
        reference_ts = pd.to_datetime(reference_date)
        cutoff_date = reference_ts - pd.Timedelta(days=days)

        history = self.historical_data
        involves_player = (
            (history['winner_canonical'] == player_canonical)
            | (history['loser_canonical'] == player_canonical)
        )
        player_matches = history[involves_player].copy()

        if len(player_matches) == 0:
            return player_matches

        # Force string conversion then datetime to avoid mixed types
        player_matches['Date'] = pd.to_datetime(player_matches['Date'].astype(str), errors='coerce')
        player_matches = player_matches.dropna(subset=['Date'])

        # Bound the window on both sides: without the upper bound, matches
        # played after the reference date counted as "recent".
        in_window = (
            (player_matches['Date'] >= cutoff_date)
            & (player_matches['Date'] <= reference_ts)
        )
        return player_matches[in_window].sort_values('Date')
    except Exception:
        # Best effort: any data problem yields an empty result. Exception,
        # not a bare except, so KeyboardInterrupt/SystemExit still propagate.
        return pd.DataFrame()

def check_recent_retirements(self, player_canonical, reference_date):
    """Count recent retirements/walkovers for a player.

    Placeholder: no retirement tracking exists yet, so the count is always 0
    regardless of the arguments.
    """
    # Implementation depends on your data structure for retirement flags
    retirement_count = 0
    return retirement_count

In [None]:
## LAYER 3 ##
def simulate_match(self, player1_priors, player2_priors, best_of=3, tiebreak_sets=None):
    """Layer 3: Monte Carlo match simulation with Bayesian priors.

    Args:
        player1_priors: Priors passed to ``self.simulate_set`` for player 1.
        player2_priors: Priors passed to ``self.simulate_set`` for player 2.
        best_of: Maximum number of sets; the first player to win
            ``(best_of + 1) // 2`` sets wins the match.
        tiebreak_sets: 1-based set numbers decided by a tiebreak at 6-6.
            ``None`` (the default) means every set, which is what the former
            ``[1, 2, 3]`` default produced in practice.

    Returns:
        Share of ``self.simulation_count`` simulated matches won by player 1.
    """
    sets_to_win = (best_of + 1) // 2
    simulations = self.simulation_count
    wins = 0

    for _ in range(simulations):
        sets_won = [0, 0]  # [player1, player2]

        while max(sets_won) < sets_to_win:
            # The set about to be played is "sets completed + 1". The old
            # expression counted players with at least one set, so it could
            # never exceed 3 and mislabelled later sets.
            set_number = sum(sets_won) + 1
            use_tiebreak = tiebreak_sets is None or set_number in tiebreak_sets
            set_winner = self.simulate_set(
                player1_priors,
                player2_priors,
                tiebreak=use_tiebreak
            )
            sets_won[set_winner] += 1

        if sets_won[0] > sets_won[1]:
            wins += 1

    return wins / simulations

def simulate_set(self, p1_priors, p2_priors, tiebreak=True):
    """Simulate a single set with service alternation.

    Player 1 serves the first game. The server wins a game with probability
    equal to their 'hold_prob'. The set ends when a player has at least six
    games and leads by two, or, if ``tiebreak`` is truthy, through
    ``self.simulate_tiebreak`` at 6-6.

    Returns:
        0 if player 1 wins the set, 1 if player 2 does.
    """
    priors_by_player = (p1_priors, p2_priors)
    score = [0, 0]
    serving = 0  # 0 = player1 serves first

    while True:
        # The server holds with their own hold probability; otherwise the
        # returner takes the game.
        hold_prob = priors_by_player[serving]['hold_prob']
        held = np.random.random() < hold_prob
        game_winner = serving if held else 1 - serving

        score[game_winner] += 1
        serving = 1 - serving  # Alternate serve

        # Check set completion
        lead = score[0] - score[1]
        if score[0] >= 6 and lead >= 2:
            return 0
        if score[1] >= 6 and lead <= -2:
            return 1
        if score[0] == 6 and score[1] == 6 and tiebreak:
            return self.simulate_tiebreak(p1_priors, p2_priors)

def simulate_tiebreak(self, p1_priors, p2_priors):
    """Simulate a tiebreak with point-by-point serve alternation.

    'hold_prob' is reused as the probability that the server wins a point.
    Player 1 serves the first point; serve then changes after every
    odd-numbered point, so each player serves two points in a row. The
    tiebreak is first to seven points with a two-point lead.

    Returns:
        0 if player 1 wins the tiebreak, 1 if player 2 does.
    """
    points = [0, 0]
    server = 0
    serve_count = 0

    while True:
        # Determine point winner
        hold_prob = p1_priors['hold_prob'] if server == 0 else p2_priors['hold_prob']
        point_winner = server if np.random.random() < hold_prob else 1 - server

        points[point_winner] += 1
        serve_count += 1

        # Change server after points 1, 3, 5, ... The previous rule switched
        # after points 1, 2, 4, 6, giving the second server only one point.
        if serve_count % 2 == 1:
            server = 1 - server

        # Check tiebreak completion
        if points[0] >= 7 and points[0] - points[1] >= 2:
            return 0
        elif points[1] >= 7 and points[1] - points[0] >= 2:
            return 1

def simulate_match(self, player1_priors, player2_priors, best_of=3, tiebreak_sets=None):
    """Estimate player 1's match win probability by repeated simulation.

    Args:
        player1_priors: Priors forwarded to ``self.simulate_set``.
        player2_priors: Priors forwarded to ``self.simulate_set``.
        best_of: Maximum number of sets in the match.
        tiebreak_sets: 1-based numbers of the sets that use a tiebreak at
            6-6; ``None`` (default) applies a tiebreak to every set, matching
            what the earlier ``[1, 2, 3]`` default did in practice.

    Returns:
        Fraction of ``self.simulation_count`` simulated matches that player 1
        won.
    """
    target_sets = (best_of + 1) // 2
    simulations = self.simulation_count
    wins = 0

    for _ in range(simulations):
        sets_won = [0, 0]

        while max(sets_won) < target_sets:
            # Number of the set being played = completed sets + 1 (the old
            # code counted players holding a set, capping the value at 3).
            current_set = sum(sets_won) + 1
            tiebreak_applies = tiebreak_sets is None or current_set in tiebreak_sets
            set_winner = self.simulate_set(
                player1_priors,
                player2_priors,
                tiebreak=tiebreak_applies
            )
            sets_won[set_winner] += 1

        if sets_won[0] > sets_won[1]:
            wins += 1

    return wins / simulations

def simulate_set(self, p1_priors, p2_priors, tiebreak=True):
    """Play out one set game by game and return the winner's index.

    Player 1 (index 0) serves first and serve alternates each game. A server
    wins the game with probability 'hold_prob' from their priors. With
    ``tiebreak`` truthy, 6-6 is settled by ``self.simulate_tiebreak``.

    Returns:
        0 or 1, the index of the player who won the set.
    """
    tally = [0, 0]
    current_server = 0

    while True:
        server_priors = p1_priors if current_server == 0 else p2_priors
        hold_prob = server_priors['hold_prob']
        if np.random.random() < hold_prob:
            game_winner = current_server
        else:
            game_winner = 1 - current_server

        tally[game_winner] += 1
        current_server = 1 - current_server

        first, second = tally
        if first >= 6 and first - second >= 2:
            return 0
        if second >= 6 and second - first >= 2:
            return 1
        if first == 6 and second == 6 and tiebreak:
            return self.simulate_tiebreak(p1_priors, p2_priors)

def simulate_tiebreak(self, p1_priors, p2_priors):
    points = [0, 0]
    server = 0
    serve_count = 0

    while True:
        if server == 0:
            hold_prob = p1_priors['hold_prob']

In [None]:
# Tomorrow's slate

import requests
from datetime import date, timedelta

import os

# Read the key from the environment instead of embedding the secret here;
# API_TENNIS_KEY is the variable the configuration at the top of this file
# already populates and reads.
API_KEY = os.getenv("API_TENNIS_KEY")
BASE = "https://api.api-tennis.com/tennis/"

def get_matches_for_date(target_date):
    """Fetch one day's fixtures from the API and flatten them into plain dicts.

    Args:
        target_date: Day to query; sent as both ``date_start`` and
            ``date_stop``. Callers in this notebook pass 'YYYY-MM-DD' strings.

    Returns:
        list[dict]: One dict per event with the keys ``event_key``,
        ``player1_name``, ``player2_name``, ``tournament_name``,
        ``tournament_round``, ``event_status``, ``event_type_type``,
        ``surface``, ``time`` and ``date``. ``surface`` is 'Unknown' unless
        the tournament name is an exact key of the Grand Slam table below.

    Raises:
        RuntimeError: On a non-200 HTTP status, or when the response body
            carries a non-zero ``error`` field.
        KeyError: If an event lacks ``event_first_player`` or
            ``event_second_player``.
    """
    # Exact tournament name -> surface; only the four Grand Slams are listed.
    TOURNAMENT_SURFACES = {
        'ATP Wimbledon': 'Grass',
        'WTA Wimbledon': 'Grass',
        'ATP French Open': 'Clay',
        'WTA French Open': 'Clay',
        'ATP US Open': 'Hard',
        'WTA US Open': 'Hard',
        'ATP Australian Open': 'Hard',
        'WTA Australian Open': 'Hard'
    }

    params = {
        "method": "get_fixtures",
        "APIkey": API_KEY,
        "date_start": target_date,
        "date_stop": target_date
    }
    # Same 30 s timeout as the `call` helper at the top of the file, so a
    # stalled connection cannot hang the notebook indefinitely.
    response = requests.get(BASE, params=params, timeout=30)
    if response.status_code != 200:
        raise RuntimeError(f"HTTP {response.status_code}")

    data = response.json()
    # Mirror `call`: a non-zero `error` field means the request failed even
    # though the HTTP status was 200.
    if str(data.get("error", "0")) != "0":
        raise RuntimeError(data)

    matches = []
    # `or []` also covers a present-but-null `result`.
    for event in data.get("result") or []:
        tournament_name = event.get('tournament_name', '')
        matches.append({
            'event_key': event.get('event_key'),
            'player1_name': event['event_first_player'],
            'player2_name': event['event_second_player'],
            'tournament_name': event.get('tournament_name', 'Unknown'),
            'tournament_round': event.get('tournament_round', ''),
            'event_status': event.get('event_status', ''),
            'event_type_type': event.get('event_type_type', ''),
            'surface': TOURNAMENT_SURFACES.get(tournament_name, 'Unknown'),
            'time': event.get('event_time', ''),
            'date': event.get('event_date', '')
        })

    return matches

def get_high_confidence_matches(target_date, min_confidence=0.2):
    """Return every fixture on ``target_date`` whose simulated edge is large enough.

    For each fixture the two player names are converted with
    ``convert_to_canonical``, priors are pulled from ``model`` and a match is
    simulated. "Confidence" is the distance of player 1's win probability
    from 0.5; fixtures below ``min_confidence`` are dropped.

    Returns:
        list[dict]: Dicts with ``match``, ``favorite``, ``probability`` and
        ``confidence``, ordered from most to least confident.
    """
    picks = []
    for fixture in get_matches_for_date(target_date):
        first_name = fixture['player1_name']
        second_name = fixture['player2_name']

        first_key = convert_to_canonical(first_name)
        second_key = convert_to_canonical(second_name)

        # NOTE(review): priors are always requested with the 'men' label,
        # whatever the fixture's event type is - confirm this is intended.
        first_priors = model.extract_refined_priors(first_key, 'men', fixture['surface'], target_date)
        second_priors = model.extract_refined_priors(second_key, 'men', fixture['surface'], target_date)

        p1_win_prob = model.simulate_match(first_priors, second_priors)
        confidence = abs(p1_win_prob - 0.5)

        if not (confidence >= min_confidence):
            continue

        picks.append({
            'match': f"{first_name} vs {second_name}",
            'favorite': first_name if p1_win_prob > 0.5 else second_name,
            'probability': max(p1_win_prob, 1 - p1_win_prob),
            'confidence': confidence
        })

    picks.sort(key=lambda pick: pick['confidence'], reverse=True)
    return picks

# Usage: resolve both ISO dates first, then pull each day's fixtures.
today, tomorrow = (
    (date.today() + timedelta(days=offset)).isoformat() for offset in (0, 1)
)

todays_matches = get_matches_for_date(today)
tomorrows_matches = get_matches_for_date(tomorrow)

In [None]:
# Display today's fixtures; swap in `tomorrows_matches` to inspect tomorrow's slate.
todays_matches

In [None]:
# Get top 5 picks
def get_top_confidence_matches(target_date, top_n=5, min_confidence=0.05):
    """Return the ``top_n`` most lopsided fixtures on ``target_date``.

    Each fixture is simulated with priors from ``model``. "Confidence" is the
    distance of player 1's simulated win probability from 0.5; fixtures below
    ``min_confidence`` are discarded before ranking.

    Returns:
        list[dict]: At most ``top_n`` dicts with ``match``, ``favorite``,
        ``probability`` and ``confidence``, most confident first.
    """
    ranked = []
    for fixture in get_matches_for_date(target_date):
        first_name = fixture['player1_name']
        second_name = fixture['player2_name']

        first_key = convert_to_canonical(first_name)
        second_key = convert_to_canonical(second_name)

        # NOTE(review): priors are always requested with the 'men' label,
        # whatever the fixture's event type is - confirm this is intended.
        first_priors = model.extract_refined_priors(first_key, 'men', fixture['surface'], target_date)
        second_priors = model.extract_refined_priors(second_key, 'men', fixture['surface'], target_date)

        p1_win_prob = model.simulate_match(first_priors, second_priors)
        confidence = abs(p1_win_prob - 0.5)

        if not (confidence >= min_confidence):
            continue

        ranked.append({
            'match': f"{first_name} vs {second_name}",
            'favorite': first_name if p1_win_prob > 0.5 else second_name,
            'probability': max(p1_win_prob, 1 - p1_win_prob),
            'confidence': confidence
        })

    ranked.sort(key=lambda pick: pick['confidence'], reverse=True)
    return ranked[:top_n]

if __name__ == "__main__":
    # Rank today's fixtures and print the five strongest picks (15% minimum edge).
    target_date = date.today().isoformat()
    picks = get_top_confidence_matches(target_date, top_n=5, min_confidence=0.15)

    for i, pick in enumerate(picks, 1):
        # One print per pick; the trailing "\n" on the last field leaves a
        # blank line between picks.
        print(
            f"{i}. {pick['match']}",
            f"   Favorite: {pick['favorite']}",
            f"   Win Prob: {pick['probability']:.2%}",
            f"   Confidence: {pick['confidence']:.5%}\n",
            sep="\n",
        )

In [None]:
# See picks
from datetime import date

# Today's top five picks with at least a 5% edge over a coin flip.
picks = get_top_confidence_matches(date.today().isoformat(), top_n=5, min_confidence=0.05)

# One print per pick; the trailing "\n" leaves a blank line between picks.
for i, pick in enumerate(picks, 1):
    print(
        f"{i}. {pick['match']}",
        f"   Favorite: {pick['favorite']}",
        f"   Win Prob: {pick['probability']:.2%}",
        f"   Confidence: {pick['confidence']:.1%}\n",
        sep="\n",
    )

In [None]:
import pandas as pd

# Render the current `picks` list as a table for notebook display.
pd.DataFrame(picks)

In [None]:
# Chronological split: rows dated before the cutoff train the model, the
# rest are held out for evaluation.
split_date = '2023-01-01'
match_dates = pd.to_datetime(historical_data['Date'])
train_data = historical_data[match_dates < split_date]
test_data = historical_data[match_dates >= split_date]

# Point the existing model at the training rows only.
model.historical_data = train_data

# Score the existing model on the first 100 held-out rows.
accuracy = model.evaluate_predictions(test_data.head(100))
print(f"Enhanced model accuracy: {accuracy:.3f}")

# Score a freshly constructed BayesianTennisModel on the same rows.
model_baseline = BayesianTennisModel()
model_baseline.historical_data = train_data
baseline_accuracy = model_baseline.evaluate_predictions(test_data.head(100))
print(f"Baseline accuracy: {baseline_accuracy:.3f}")
print(f"Improvement: {accuracy - baseline_accuracy:.3f}")

In [None]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import html

class TennisAbstractScraper:
    """Scrape charting tables from a Tennis Abstract match page into flat records.

    The page's table HTML is read out of JavaScript string variables
    (``var name = '...';``) found in its ``<script>`` tags. ``span.rounds``
    elements map a human-readable section label to the variable that holds
    that section's HTML.
    """

    # Column header in the serve-influence table -> key in the output record.
    MAP_SERVE_INFL = {
        'Wide %': 'serve_wide_pct',
        'T %': 'serve_t_pct',
        'Body %': 'serve_body_pct',
    }

    # Seconds to wait for a page before giving up.
    REQUEST_TIMEOUT = 30

    _JS_VAR_RE = re.compile(r"var\s+(\w+)\s*=\s*'([\s\S]*?)';", flags=re.S)
    _COUNT_SPLIT_RE = re.compile(r'(\d+)\s*\((\d+)/(\d+)\)')
    _MATCH_URL_RE = re.compile(r'(\d{8})-([MW])-(.+?)-(.+?)-(.+?)-(.+?)\.html')

    def __init__(self):
        self.headers = {"User-Agent": "Mozilla/5.0"}

    # ------------------------------------------------------------------
    # Page loading (shared by every public entry point)
    # ------------------------------------------------------------------
    def _load_page(self, url):
        """Fetch ``url``; return ``(soup, blocks)`` where blocks maps JS var name -> raw string."""
        resp = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(resp.text, "lxml")
        all_js = "\n".join(tag.string for tag in soup.find_all("script") if tag.string)
        return soup, dict(self._JS_VAR_RE.findall(all_js))

    @staticmethod
    def _section_labels(soup):
        """Map each ``span.rounds`` label text to that span's ``id``."""
        return {
            span.get_text(strip=True): span["id"]
            for span in soup.select("span.rounds")
            if span.has_attr("id")  # a span without an id cannot name a block
        }

    def _load_sections(self, url):
        """Return ``{section label: unescaped HTML}`` for every label with a matching JS block."""
        soup, blocks = self._load_page(url)
        return {
            label: html.unescape(blocks[token])
            for label, token in self._section_labels(soup).items()
            if token in blocks
        }

    # ------------------------------------------------------------------
    # Public entry points
    # ------------------------------------------------------------------
    def scrape_stats_overview(self, url):
        """Return one record per (set, player) from the "Stats Overview" section.

        Returns an empty list when the section or its table is missing.
        """
        sections = self._load_sections(url)
        match_info = self._parse_match_url(url)
        stats_data = self._extract_stats_overview_table(sections.get("Stats Overview", ""))
        return self._convert_to_jeff_format(stats_data, match_info)

    def scrape_serve_basics(self, url):
        """Return one record per (set, player) from the "Serve Basics" section.

        Returns an empty list when the section or its table is missing.
        """
        sections = self._load_sections(url)
        match_info = self._parse_match_url(url)
        serve_data = self._parse_serve_basics(sections.get("Serve Basics", ""))
        return self._convert_serve_basics_to_jeff(serve_data, match_info)

    def scrape_serve_influence(self, url):
        """Return one record per table row from the JS block named ``serve``.

        Each record has ``Player_canonical`` plus whichever ``MAP_SERVE_INFL``
        columns were present and numeric, stored as fractions (0-1).
        Returns an empty list when the block or its table is missing.
        """
        _, blocks = self._load_page(url)
        html_block = html.unescape(blocks.get('serve', ''))
        if not html_block:
            return []

        tbl = BeautifulSoup(html_block, 'html.parser').table
        if tbl is None or tbl.tr is None:
            return []

        heads = [c.get_text(strip=True) for c in tbl.tr.find_all(['th', 'td'])]
        rows = [
            [c.get_text(strip=True) for c in row.find_all('td')]
            for row in tbl.find_all('tr')[1:]
        ]
        return self._serve_influence_records(heads, rows)

    def debug_available_sections(self, url):
        """Print and return the ``{label: span id}`` mapping found on the page."""
        soup, _ = self._load_page(url)
        labels = self._section_labels(soup)

        print("Available sections:")
        for label in labels:
            print(f"- '{label}'")
        return labels

    # ------------------------------------------------------------------
    # Cell-level parsing helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _parse_pct(text):
        """Turn '61%' into 0.61; return None for blank or non-numeric text."""
        try:
            return float(str(text).rstrip('%')) / 100
        except ValueError:
            return None

    @staticmethod
    def _cell(row, idx, default):
        """Return ``row[idx]``, or ``default`` when the row is too short."""
        return row[idx] if len(row) > idx else default

    @staticmethod
    def _cell_int(row, idx):
        """Return ``row[idx]`` as an int, or 0 when missing or not all digits."""
        return int(row[idx]) if len(row) > idx and row[idx].isdigit() else 0

    @classmethod
    def _split_counts(cls, text):
        """Parse 'total (fh/bh)' into three ints; (0, 0, 0) when it does not match."""
        found = cls._COUNT_SPLIT_RE.match(text)
        if not found:
            return 0, 0, 0
        return tuple(int(part) for part in found.groups())

    @staticmethod
    def _split_ratio(text):
        """Parse 'a/b' into two ints; a missing or non-numeric part counts as 0."""
        parts = [p.strip() for p in str(text).split('/')[:2]]
        numbers = [int(p) if p.isdigit() else 0 for p in parts]
        numbers += [0] * (2 - len(numbers))
        return numbers[0], numbers[1]

    @staticmethod
    def _table_rows(html_content):
        """Return ``(headers, data_rows)`` for the first table, or None if there is none.

        Data rows with fewer ``<td>`` cells than there are header cells are dropped.
        """
        table = BeautifulSoup(html_content, 'html.parser').find('table')
        if table is None:
            return None

        rows = table.find_all('tr')
        if not rows:
            return None

        headers = [th.get_text(strip=True) for th in rows[0].find_all(['th', 'td'])]
        data_rows = []
        for row in rows[1:]:
            cells = [td.get_text(strip=True) for td in row.find_all('td')]
            if len(cells) >= len(headers):
                data_rows.append(cells)
        return headers, data_rows

    # ------------------------------------------------------------------
    # Serve influence
    # ------------------------------------------------------------------
    def _serve_influence_records(self, heads, rows):
        """Build serve-influence records from header texts and rows of cell texts."""
        out = []
        for cells in rows:
            # Header-only (<th>) rows yield no <td> cells; skip them and
            # rows with a blank player cell.
            if not cells or not cells[0]:
                continue
            rec = {'Player_canonical': self._normalize_player_name(cells[0])}
            for head, value in zip(heads[1:], cells[1:]):
                key = self.MAP_SERVE_INFL.get(head)
                if not key:
                    continue
                fraction = self._parse_pct(value)
                if fraction is not None:  # leave the key out for '-' or blanks
                    rec[key] = fraction
            out.append(rec)
        return out

    # ------------------------------------------------------------------
    # Serve basics
    # ------------------------------------------------------------------
    def _parse_serve_basics(self, html_content):
        """Parse the Serve Basics HTML into ``{set: {player: counts}}``; {} if no table."""
        parsed = self._table_rows(html_content)
        if parsed is None:
            return {}
        return self._parse_serve_basics_data(*parsed)

    def _parse_serve_basics_data(self, headers, data_rows):
        """Convert Serve Basics rows to ``{set name: {player: counts}}``.

        Rows before the first 'SET...' marker are filed under "Total".
        NOTE(review): columns are read by fixed position (1-6); confirm the
        indices against the real table layout. ``headers`` is not consulted.
        """
        stats_data = {}
        current_set = "Total"

        for row in data_rows:
            if not row or not row[0]:
                continue

            if row[0].startswith('SET'):
                current_set = row[0]
                continue

            stats_data.setdefault(current_set, {})[row[0]] = {
                'serve_pts': self._cell_int(row, 1),
                'aces': self._cell_int(row, 2),
                'dfs': self._cell_int(row, 3),
                'first_in': self._cell_int(row, 4),
                'first_won': self._cell_int(row, 5),
                'second_won': self._cell_int(row, 6)
            }

        return stats_data

    def _convert_serve_basics_to_jeff(self, serve_data, match_info):
        """Flatten ``{set: {player: counts}}`` into a list of Jeff-format records.

        Raises:
            KeyError: If there is data to convert but ``match_info`` lacks
                'Date' or 'tournament' (e.g. the URL did not match).
        """
        jeff_records = []

        for set_name, set_data in serve_data.items():
            for player, data in set_data.items():
                jeff_records.append({
                    'match_id': f"{match_info['Date']}-{player.replace(' ', '_')}",
                    'Date': match_info['Date'],
                    'Tournament': match_info['tournament'],
                    'player': player,
                    'Player_canonical': self._normalize_player_name(player),
                    'set': set_name,
                    'serve_pts': data['serve_pts'],
                    'aces': data['aces'],
                    'dfs': data['dfs'],
                    'first_in': data['first_in'],
                    'first_won': data['first_won'],
                    'second_won': data['second_won']
                })

        return jeff_records

    # ------------------------------------------------------------------
    # Stats overview
    # ------------------------------------------------------------------
    def _extract_stats_overview_table(self, html_content):
        """Parse the Stats Overview HTML into ``{set: {player: stats}}``; {} if no table."""
        parsed = self._table_rows(html_content)
        if parsed is None:
            return {}
        return self._parse_tennis_stats(*parsed)

    def _parse_tennis_stats(self, headers, data_rows):
        """Convert Stats Overview rows to ``{set name: {player: stats}}``.

        All values are kept as strings. Columns 8 and 9 are expected in the
        form 'total (fh/bh)' and are split into three fields each. Columns
        are read by fixed position; ``headers`` is not consulted.
        """
        stats_data = {}
        current_set = "Total"

        for row in data_rows:
            if not row or not row[0]:
                continue

            if row[0].startswith('SET'):
                current_set = row[0]
                continue

            winners_total, winners_fh, winners_bh = self._split_counts(
                self._cell(row, 8, "0 (0/0)"))
            ufe_total, ufe_fh, ufe_bh = self._split_counts(
                self._cell(row, 9, "0 (0/0)"))

            stats_data.setdefault(current_set, {})[row[0]] = {
                'aces_pct': self._cell(row, 1, '0%'),
                'df_pct': self._cell(row, 2, '0%'),
                'first_in_pct': self._cell(row, 3, '0%'),
                'first_won_pct': self._cell(row, 4, '0%'),
                'second_won_pct': self._cell(row, 5, '0%'),
                'bp_saved': self._cell(row, 6, '0/0'),
                'rpw_pct': self._cell(row, 7, '0%'),
                'winners': str(winners_total),
                'winners_fh': str(winners_fh),
                'winners_bh': str(winners_bh),
                'ufe': str(ufe_total),
                'ufe_fh': str(ufe_fh),
                'ufe_bh': str(ufe_bh)
            }

        return stats_data

    def _parse_match_url(self, url):
        """Extract date, gender, tournament, round and players from a charting URL.

        Returns an empty dict when the URL does not match the
        ``YYYYMMDD-G-Tournament-Round-Player1-Player2.html`` pattern.
        """
        found = self._MATCH_URL_RE.search(url)
        if not found:
            return {}

        date_str, gender, tournament, round_info, player1, player2 = found.groups()
        return {
            'Date': date_str,
            'gender': gender,  # the pattern only admits 'M' or 'W'
            'tournament': tournament.replace('_', ' '),
            'round': round_info,
            'player1': player1.replace('_', ' '),
            'player2': player2.replace('_', ' ')
        }

    def _convert_to_jeff_format(self, stats_data, match_info):
        """Flatten ``{set: {player: stats}}`` into Jeff-format records with counts.

        Percentages are converted to truncated integer counts. Blank or
        non-numeric percentage / break-point cells count as 0.

        Raises:
            KeyError: If there is data to convert but ``match_info`` lacks
                'Date' or 'tournament' (e.g. the URL did not match).
        """
        pct_fields = ('aces_pct', 'df_pct', 'first_in_pct', 'first_won_pct',
                      'second_won_pct', 'rpw_pct')
        jeff_records = []

        for set_name, set_data in stats_data.items():
            for player, data in set_data.items():
                # NOTE(review): serve-point totals are fixed constants here
                # (67 / 40 / 27) rather than read from the page, so every
                # count derived below is an approximation - confirm intent.
                serve_pts = 67 if set_name == 'Total' else (40 if set_name == 'SET 1' else 27)

                share = {name: self._parse_pct(data[name]) or 0.0 for name in pct_fields}

                aces = int(share['aces_pct'] * serve_pts)
                dfs = int(share['df_pct'] * serve_pts)
                first_in = int(share['first_in_pct'] * serve_pts)
                first_won = int(share['first_won_pct'] * first_in) if first_in > 0 else 0
                second_serves = serve_pts - first_in
                second_won = int(share['second_won_pct'] * second_serves) if second_serves > 0 else 0

                bp_saved, bp_faced = self._split_ratio(data['bp_saved'])

                return_pts_won = int(share['rpw_pct'] * serve_pts)

                jeff_records.append({
                    'match_id': f"{match_info['Date']}-{player.replace(' ', '_')}",
                    'Date': match_info['Date'],
                    'Tournament': match_info['tournament'],
                    'player': player,
                    'Player_canonical': self._normalize_player_name(player),
                    'set': set_name,
                    'serve_pts': serve_pts,
                    'aces': aces,
                    'dfs': dfs,
                    'first_in': first_in,
                    'first_won': first_won,
                    'second_won': second_won,
                    'bp_saved': bp_saved,
                    'bp_faced': bp_faced,
                    'return_pts_won': return_pts_won,
                    'winners': int(data['winners']),
                    'winners_fh': int(data['winners_fh']),
                    'winners_bh': int(data['winners_bh']),
                    'unforced': int(data['ufe']),
                    'unforced_fh': int(data['ufe_fh']),
                    'unforced_bh': int(data['ufe_bh'])
                })
        return jeff_records

    def _normalize_player_name(self, name):
        """Return '<last word>_<first initial>' in lower case, dots removed.

        Names with fewer than two words are lower-cased with spaces turned
        into underscores.
        """
        parts = name.lower().replace('.', '').split()
        if len(parts) >= 2:
            return f"{parts[-1]}_{parts[0][0]}"
        return name.lower().replace(' ', '_')

def test_extraction_completeness(self, url):
    """Probe every section listed for ``url`` and report whether it yields data.

    Calls ``self.debug_available_sections(url)`` to list sections, then
    ``self._test_section_extraction(url, name)`` for each one.

    Returns:
        dict: section name -> True/False (non-empty extraction or not), or
        an "Error: ..." string when the extraction raised.

    NOTE(review): this is defined at module level although it takes
    ``self``; confirm whether it was meant to be a method of the scraper
    class and that ``_test_section_extraction`` exists there.
    """
    def probe(section_name):
        # Any failure is reported as text so one bad section does not stop the sweep.
        try:
            extracted = self._test_section_extraction(url, section_name)
            return len(extracted) > 0
        except Exception as e:
            return f"Error: {e}"

    available = self.debug_available_sections(url)
    return {section_name: probe(section_name) for section_name in available.keys()}

In [None]:
# Smoke-test the three scrape_* entry points against one charted match page.
# Each call below performs its own HTTP request for `url`.
scraper = TennisAbstractScraper()
url = "https://www.tennisabstract.com/charting/20250628-W-Eastbourne-F-Maya_Joint-Alexandra_Eala.html"

# Stats Overview: one record per (set, player)
print("=== STATS OVERVIEW ===")
overview_data = scraper.scrape_stats_overview(url)
for record in overview_data:
    print(record)

# Serve Basics: one record per (set, player)
print("\n=== SERVE BASICS ===")
serve_data = scraper.scrape_serve_basics(url)
for record in serve_data:
    print(record)

# Serve Influence: one record per table row
print("\n=== SERVE INFLUENCE ===")
serve_infl_data = scraper.scrape_serve_influence(url)
for record in serve_infl_data:
    print(record)
