# Services

## Data preprocessing

In [1]:
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns



Read the CSV files into DataFrames

In [2]:
# Location of the ATP CSV files, relative to the notebook's working directory.
data_folder = "data/"
dataset = "huge_dataset/tennis_atp/"
atp_dir = data_folder + dataset

# One DataFrame per season, 2000 through 2024, read in chronological order.
season_frames = [
    pd.read_csv(f"{atp_dir}atp_matches_{season}.csv")
    for season in range(2000, 2025)
]

# Bind each season to the numbered name used by later cells
# (atp1 is the 2000 file, ..., atp25 is the 2024 file).
(atp1, atp2, atp3, atp4, atp5,
 atp6, atp7, atp8, atp9, atp10,
 atp11, atp12, atp13, atp14, atp15,
 atp16, atp17, atp18, atp19, atp20,
 atp21, atp22, atp23, atp24, atp25) = season_frames

# Player reference table; later cells read player_id, name_first and name_last from it.
players = pd.read_csv(atp_dir + "atp_players.csv")

In [3]:
# Stack every loaded season (atp1 .. atp25) into a single frame.
# All 25 frames must be listed: atp18 (the 2017 file) has to be included,
# otherwise a whole season is silently missing from the combined data.
atp = pd.concat([atp1, atp2, atp3, atp4, atp5, atp6, atp7, atp8, atp9, atp10,
                 atp11, atp12, atp13, atp14, atp15, atp16, atp17, atp18, atp19,
                 atp20, atp21, atp22, atp23, atp24, atp25])

In [4]:
# Restrict the analysis to a single season (this overrides the combined frame).
# Work on a copy so that the derived columns and value replacements applied to
# `atp` by later cells do not alter the raw `atp25` frame.
atp = atp25.copy()
# Season label; used as the filename prefix of the exported CSVs.
year = '2024'

The columns of this dataset that best capture serving performance are:


- **w_ace** (Winner's number of aces)  
  - More aces generally indicate a strong server.  

- **w_df** (Winner's number of double faults)  
  - Too many double faults indicate inconsistency in serving.  

- **w_svpt** (Winner's total serve points played)  
  - Helps in calculating serve efficiency.  

- **w_1stIn** (Winner's first serves made)  
  - Higher values show how often the first serve goes in.  

- **w_1stWon** (Winner's first-serve points won)  
  - Shows effectiveness of first serves.  

- **w_2ndWon** (Winner's second-serve points won)  
  - Indicates reliability under pressure if the first serve fails.  

- **w_SvGms** (Winner's service games played)  
  - Allows calculation of serve performance per game.  

- **w_bpSaved** (Winner's break points saved)  
  - A strong server is good at saving break points.  

- **w_bpFaced** (Winner's break points faced)  
  - Fewer break points faced might indicate dominance on serve.  

Derived Metrics to Consider

- **Ace Rate** = `w_ace / w_svpt`  
- **First Serve %** = `w_1stIn / w_svpt`  
- **First Serve Win %** = `w_1stWon / w_1stIn`  
- **Second Serve Win %** = `w_2ndWon / (w_svpt - w_1stIn)`  
- **Break Point Save %** = `w_bpSaved / w_bpFaced` 

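To find the **best server ever**, we would aggregate these stats over an entire career and compare players. The same formulas are applied to the loser's `l_*` columns, and each player's per-match rates are averaged over all of their matches, whether won or lost. Note that the cells below currently restrict the data to a single season (`atp = atp25`, i.e. 2024).  
Players with high ace rates, high first-serve win %, and strong break point save rates are likely the best servers.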

In [5]:
# Per-match serve rates for the winner ("w") and the loser ("l") of every match.
#
# Only the denominators are protected against zero. A zero numerator (no aces,
# no break points saved, ...) is a real observation and must stay 0 so that it
# counts towards the player's average; turning it into NaN would make the
# groupby mean below skip that match and inflate the result.
metric_names = ["Ace Rate", "First Serve %", "First Serve Win %",
                "Second Serve Win %", "Break Point Save %"]


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator element-wise, with NaN where the denominator is 0."""
    return numerator / denominator.replace(0, float("nan"))


for side in ("w", "l"):
    serve_points = atp[f"{side}_svpt"]
    first_in = atp[f"{side}_1stIn"]

    atp[f"{side}_Ace Rate"] = _safe_ratio(atp[f"{side}_ace"], serve_points)
    atp[f"{side}_First Serve %"] = _safe_ratio(first_in, serve_points)
    atp[f"{side}_First Serve Win %"] = _safe_ratio(atp[f"{side}_1stWon"], first_in)
    # Second-serve points = serve points that did not start with a first serve in.
    atp[f"{side}_Second Serve Win %"] = _safe_ratio(
        atp[f"{side}_2ndWon"], serve_points - first_in
    )
    atp[f"{side}_Break Point Save %"] = _safe_ratio(
        atp[f"{side}_bpSaved"], atp[f"{side}_bpFaced"]
    )

# Create a unified dataset by stacking winner and loser data
winners_df = atp[["winner_id"] + [f"w_{metric}" for metric in metric_names]].copy()
losers_df = atp[["loser_id"] + [f"l_{metric}" for metric in metric_names]].copy()

# Rename columns for consistency so both frames share one schema
winners_df.columns = ["player_id"] + metric_names
losers_df.columns = ["player_id"] + metric_names

players_df = pd.concat([winners_df, losers_df])

# Average each metric per player (NaN matches are skipped by mean); players with
# no valid observation for a metric end up with 0 for it.
players_avg = players_df.groupby("player_id").mean().fillna(0)

In [6]:
# Matches won per player: every row of `atp` contributes one win to its winner_id.
winner_counts = atp["winner_id"].value_counts()

# Matches lost per player, counted the same way from loser_id.
loser_counts = atp["loser_id"].value_counts()

# Total appearances = wins + losses. fill_value=0 covers players that show up
# on only one side; the sum is cast back to int after the aligned addition.
player_match_counts = (
    winner_counts
    .add(loser_counts, fill_value=0)
    .astype(int)
    .rename_axis("player_id")
    .rename("num_matches")
    .reset_index()
)

In [7]:
# Attach each player's total match count to the averaged serve metrics.
# Any existing "num_matches" column is dropped first so the cell can be re-run
# safely: a second plain merge would create "num_matches_x"/"num_matches_y"
# and the lookup below would then raise a KeyError.
players_avg = (
    players_avg
    .drop(columns="num_matches", errors="ignore")
    .merge(player_match_counts, on="player_id", how="left")
)

# Replace NaN with 0 (in case some players are missing from the count)
players_avg["num_matches"] = players_avg["num_matches"].fillna(0).astype(int)

In [8]:
# Keep only players with at least `min_matches` matches, then attach their names.
min_matches = 30
has_enough_matches = players_avg["num_matches"].ge(min_matches)
filtered_players = players_avg.loc[has_enough_matches]

player_names = players.loc[:, ["player_id", "name_first", "name_last"]]
best_servers = pd.merge(filtered_players, player_names, how="left", on="player_id")

# Path prefix under which the per-metric ranking CSVs are written.
output_folder = "results/services/"

In [None]:
def save_category_csv(df, value_col, category, year, output_folder):
    """Write one ranking table to ``{output_folder}{year}_{category}.csv``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``name_first``, ``name_last``, ``num_matches`` and
        ``value_col``. It is not modified.
    value_col : str
        Name of the metric column to export.
    category : str
        Label used in the output filename.
    year : str
        Season label used as the filename prefix.
    output_folder : str
        Path prefix concatenated directly in front of the filename, so it
        should end with a path separator. The target directory is created if
        it does not exist yet.

    The CSV has the columns ``Player`` (first and last name joined by a
    space), ``value_col`` and ``num_matches``, in the row order of ``df``,
    and is written without the index.
    """
    # Build the export on a copy so the caller's frame does not gain a 'Player' column.
    out_df = df[[value_col, 'num_matches']].copy()
    out_df.insert(0, 'Player', df['name_first'] + ' ' + df['name_last'])

    # to_csv does not create missing directories, so make sure the target exists.
    out_path = Path(f"{output_folder}{year}_{category}.csv")
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_df.to_csv(out_path, index=False)

# Keep only players with at least `min_matches` matches and attach their names.
min_matches = 30
enough_matches = players_avg['num_matches'] >= min_matches
filtered_players = players_avg[enough_matches]
name_columns = players[['player_id', 'name_first', 'name_last']]
best_servers = filtered_players.merge(name_columns, on='player_id', how='left')

# Path prefix handed to save_category_csv for every exported file.
output_folder = 'results/services/'


def _rank_and_save(metric):
    """Sort best_servers by `metric` (highest first), export it, and return the sorted frame."""
    ranked = best_servers.sort_values(metric, ascending=False)
    save_category_csv(ranked, metric, metric, year, output_folder)
    return ranked


# One ranking per serving metric; each call also writes the matching CSV.
ace_rate = _rank_and_save("Ace Rate")
first_serve = _rank_and_save("First Serve %")
first_serve_win = _rank_and_save("First Serve Win %")
second_serve_win = _rank_and_save("Second Serve Win %")
break_point_serve = _rank_and_save("Break Point Save %")