In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import defaultdict

In [2]:

# Columns kept from every raw match file, in the order they appear in the
# merged output. Columns a file does not contain are skipped for that file.
required_columns = [
    'surface', 'tourney_level', 'tourney_date', 'match_num',
    'winner_id', 'winner_seed', 'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
    'loser_id', 'loser_seed', 'loser_name', 'loser_hand', 'loser_ht', 'loser_ioc', 'loser_age',
    'best_of', 'round', 'minutes',
    'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced',
    'l_ace', 'l_df', 'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced',
    'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points'
]

data_dir = Path("data/unprocessed")
# Sorted so the concatenation order is deterministic across runs.
csv_files = sorted(data_dir.glob("*.csv"))

# Fail early with an actionable message: with no inputs, pd.concat([]) would
# otherwise raise an opaque "No objects to concatenate" error further down.
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {data_dir.resolve()}")

dfs = []

for csv_file in csv_files:
    df = pd.read_csv(csv_file)

    # Keep only the required columns this particular file actually provides;
    # any column missing here ends up as NaN for these rows after the concat.
    available_cols = [col for col in required_columns if col in df.columns]
    dfs.append(df[available_cols])


merged_df = pd.concat(dfs, ignore_index=True)

# Save the merged DataFrame to a CSV file
output_dir = Path("data/processed")
# parents=True so the cell also works if the output location is nested under
# directories that do not exist yet.
output_dir.mkdir(parents=True, exist_ok=True)

output_file = output_dir / "atp_matches_2000_2024.csv"
merged_df.to_csv(output_file, index=False)

In [3]:
# Order rows by tourney_date, then match_num, and renumber the index from 0
# so positional order and index labels agree for the row-by-row pass below.
merged_df = (
    merged_df
    .sort_values(by=['tourney_date', 'match_num'])
    .reset_index(drop=True)
)


In [4]:
# Placeholder columns for the rolling features that are filled in row by row
# later. They are initialised with NaN rather than None so that pandas creates
# float64 columns instead of object columns; the later `.notna()` filter
# treats NaN and None the same way, so downstream filtering is unaffected.
rolling_stat_names = ['ace', 'df', 'svpt', '1stIn', '1stWon', '2ndWon', 'SvGms', 'bpSaved', 'bpFaced']

merged_df['win_pct_avg_5'] = np.nan
merged_df['loser_win_pct_avg_5'] = np.nan

# One '<prefix><stat>_avg' column per side and stat: all 'w_' columns first,
# then all 'l_' columns, matching the original column order.
for side_prefix in ('w_', 'l_'):
    for stat_name in rolling_stat_names:
        merged_df[f'{side_prefix}{stat_name}_avg'] = np.nan

In [5]:
import copy

# Template for one player's running history: one empty list per tracked stat,
# plus 'results' (1 for a win, 0 for a loss). It is deep-copied per player so
# that no two players share the same list objects.
empty_dict = {
    key: []
    for key in (
        'ace', 'df', 'svpt', '1stIn', '1stWon', '2ndWon',
        'SvGms', 'bpSaved', 'bpFaced', 'results',
    )
}

# player_id -> that player's history dict (same shape as `empty_dict`).
players_dict = defaultdict(dict)

# Size of the trailing window: how many of a player's most recent matches feed
# each average, and how many prior matches a player needs before features are
# computed at all.
number_of_matches = 5

def update_dict(row, winner=True):
    """Append one match's serve stats and result to a player's history.

    Args:
        row: Match record indexable by column name. Must provide 'winner_id'
            or 'loser_id' and the matching 'w_*' or 'l_*' stat columns.
        winner: When truthy, record the match for the winner (reads the 'w_'
            columns, stores result 1); otherwise for the loser ('l_' columns,
            result 0).

    Side effects:
        Mutates the module-level ``players_dict``. A player seen for the first
        time gets a fresh deep copy of ``empty_dict`` as their history.
    """
    if winner:
        side, player_id, outcome = 'w_', row['winner_id'], 1
    else:
        side, player_id, outcome = 'l_', row['loser_id'], 0

    if player_id not in players_dict:
        players_dict[player_id] = copy.deepcopy(empty_dict)

    history = players_dict[player_id]
    for stat in ('ace', 'df', 'svpt', '1stIn', '1stWon', '2ndWon', 'SvGms', 'bpSaved', 'bpFaced'):
        history[stat].append(row[f'{side}{stat}'])
    history['results'].append(outcome)


def has_enough_matches(row):
    """Tell whether both players of a match already have enough history.

    Args:
        row: Match record indexable by 'winner_id' and 'loser_id'.

    Returns:
        True only if each of the two players is present in ``players_dict``
        and has at least ``number_of_matches`` recorded results.
    """
    return all(
        # The membership test comes first so an unseen player is never looked up.
        pid in players_dict
        and len(players_dict[pid]['results']) >= number_of_matches
        for pid in (row['winner_id'], row['loser_id'])
    )

def update_df_with_averages(idx, row):
    """Write both players' trailing-window features into ``merged_df`` at ``idx``.

    For the winner and the loser of ``row``, the last ``number_of_matches``
    entries of their history in ``players_dict`` are reduced to:
      * a win fraction ('win_pct_avg_5' / 'loser_win_pct_avg_5'), and
      * the mean of every serve stat ('w_<stat>_avg' / 'l_<stat>_avg').

    Args:
        idx: Index label of the row in ``merged_df`` to write to.
        row: Match record indexable by 'winner_id' and 'loser_id'. Both
            players must already have a non-empty history in ``players_dict``.

    Side effects:
        Assigns into the module-level ``merged_df`` via ``.at``.
    """
    stat_names = ('ace', 'df', 'svpt', '1stIn', '1stWon', '2ndWon', 'SvGms', 'bpSaved', 'bpFaced')
    player_ids = {'w_': row['winner_id'], 'l_': row['loser_id']}
    win_pct_columns = {'w_': 'win_pct_avg_5', 'l_': 'loser_win_pct_avg_5'}

    # Win fraction = number of 1s among the most recent results / window length.
    recent_results = {
        side: players_dict[pid]['results'][-number_of_matches:]
        for side, pid in player_ids.items()
    }
    for side, results in recent_results.items():
        merged_df.at[idx, win_pct_columns[side]] = sum(results) / len(results)

    # NOTE(review): np.mean propagates NaN, so if any stat in the window is
    # missing the resulting average is NaN. Confirm whether that is intended.
    for side, pid in player_ids.items():
        history = players_dict[pid]
        for stat in stat_names:
            merged_df.at[idx, f'{side}{stat}_avg'] = np.mean(history[stat][-number_of_matches:])


# Walk the matches in row order. Features for a match are written *before*
# that match is appended to the histories, so they only reflect earlier rows.
for idx, row in merged_df.iterrows():
    if has_enough_matches(row):
        update_df_with_averages(idx, row)

    # Record the match for the winner first, then for the loser.
    for is_winner in (True, False):
        update_dict(row, winner=is_winner)

In [6]:
# Keep only the matches for which rolling features were written
# (win_pct_avg_5 is still missing wherever a player lacked enough history).
has_rolling_features = merged_df['win_pct_avg_5'].notna()
merged_df_filtered = merged_df.loc[has_rolling_features].copy()

In [7]:
# Raw per-match columns removed from the final table; the trailing-window
# '*_avg' columns derived from the serve stats are kept.
match_level_columns = ['minutes']
winner_stat_columns = [
    'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon', 'w_SvGms', 'w_bpSaved', 'w_bpFaced',
]
loser_stat_columns = [
    'l_ace', 'l_df', 'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced',
]
columns_to_drop = match_level_columns + winner_stat_columns + loser_stat_columns

final_df = merged_df_filtered.drop(columns=columns_to_drop)

In [8]:
# Persist the final feature table; the row index is not written to the file.
final_df.to_csv(
    path_or_buf='data/processed/atp_matches_2000_2024_final.csv',
    index=False,
)

In [None]:
# Intentionally empty: the stray bare name `g` that was here raised NameError on Restart & Run All.