# Summary Statistics Notebook

### Load Packages

In [1]:
import pandas as pd
import os

os.getcwd() # Print current working directory

'/Users/cjgimena/Desktop/Github/consulting-spring2025/season_report/summary_page'

In [2]:
# Step up two directories so the relative 'data/...' paths used below resolve.
# NOTE(review): this is not idempotent — re-running the cell climbs two more
# levels each time, so run it exactly once per kernel session.
os.chdir('../..') # go back two directories
os.getcwd()

'/Users/cjgimena/Desktop/Github/consulting-spring2025'

### Read Data

In [3]:
# USER INPUT: Specify Player!
# The name is interpolated into the data/mens/<player>/combined.xlsx path and
# compared for exact equality against the 'Player1'/'Player2' columns, so the
# spelling must match both.
player_name = 'Rudy Quan'

In [17]:
# Parse the player's combined workbook once. Passing a list to `sheet_name`
# makes read_excel return a dict keyed by sheet name, so the file is not
# re-opened and re-parsed for every sheet.
combined_sheets = pd.read_excel(
    f'data/mens/{player_name}/combined.xlsx',
    sheet_name=['Shots', 'Points', 'Sets'],
)
combined_data_shots = combined_sheets['Shots']
combined_data_points = combined_sheets['Points']
combined_data_sets = combined_sheets['Sets']

In [5]:
# Subset 2024-2025 Season Matches!
# NOTE(review): the hard-coded [:229] keeps only the first 229 rows of the CSV;
# presumably those are the 2024-2025 season rows — confirm, since the cutoff
# goes stale as soon as rows are added to or reordered in the file.
mens_results = pd.read_csv('data/mens/mens_results.csv')[:229]

def filter_player(data, player_name):
    """Return the rows of `data` where `player_name` played a season event.

    A row is kept when `player_name` equals 'Player1' or 'Player2' and its
    'Event Name' starts with 'Dual Match' or '2024 ITA'.

    Parameters
    ----------
    data : pd.DataFrame
        Results table with 'Player1', 'Player2' and 'Event Name' columns.
    player_name : str
        Name compared for exact equality against both player columns.

    Returns
    -------
    pd.DataFrame
        The matching rows with their original index; `data` is not modified.
    """
    involves_player = (data['Player1'] == player_name) | (data['Player2'] == player_name)

    # na=False: a missing event name would otherwise put NaN into the mask,
    # and boolean indexing with a NaN-containing mask raises.
    season_event = data['Event Name'].str.startswith(('Dual Match', '2024 ITA'), na=False)

    return data[involves_player & season_event]

# Replace the loaded results table with only this player's season matches.
mens_results = filter_player(mens_results, player_name)

### Longest Rally Function

In [6]:
def longest_rally(data):
    """Return the largest value found in the 'Shot' column of `data`.

    Raises
    ------
    ValueError
        If `data` has no 'Shot' column.
    """
    has_shot_column = "Shot" in data.columns
    if not has_shot_column:
        raise ValueError("The column 'Shot' was not found in the 'Shots' sheet.")

    # The maximum 'Shot' value is reported as the longest rally length.
    return data["Shot"].max()

In [7]:
# Longest rally across every sheet row loaded for this player.
longest_rally_length = longest_rally(combined_data_shots)
longest_rally_length

32

### Average Time On Court

In [8]:
def average_court_time(data):
    """Return the mean per-match time on court formatted as 'H:MM'.

    'Duration' values are summed per '__source_file__' (one group per source
    file), divided by 60 and rounded to one decimal; the mean of those
    per-file totals is then truncated to whole minutes and split into hours
    and minutes.
    NOTE(review): the division by 60 assumes 'Duration' is in seconds — confirm.

    Parameters
    ----------
    data : pd.DataFrame
        'Sets' sheet with 'Duration' and '__source_file__' columns. It is not
        modified; non-numeric durations are ignored.

    Returns
    -------
    str
        'H:MM', or '0:00' when no row has a usable duration.

    Raises
    ------
    ValueError
        If either required column is missing.
    """
    for required in ('Duration', '__source_file__'):
        if required not in data.columns:
            raise ValueError(f"The column '{required}' was not found in the 'Sets' sheet.")

    # Coerce on a copy (assign returns a new frame) so the caller's DataFrame
    # keeps its original 'Duration' values; unparseable entries become NaN.
    sets = data.assign(Duration=pd.to_numeric(data['Duration'], errors='coerce'))
    sets = sets[sets['Duration'].notna()]

    # Total per source file, converted to minutes.
    match_minutes = (
        sets.groupby('__source_file__')['Duration']
        .sum()
        .div(60)
        .round(1)
    )

    avg_minutes = match_minutes.mean()
    if pd.isna(avg_minutes):
        # No valid durations at all: return a well-formed placeholder instead
        # of failing on unset hours/minutes.
        return "0:00"

    hours, mins = divmod(int(avg_minutes), 60)
    return f"{hours}:{mins:02d}"

In [9]:
# Average time on court per source file, as an 'H:MM' string.
# NOTE(review): the displayed '9:12' (over nine hours per match) looks
# implausibly long — verify the units of 'Duration' and that each
# '__source_file__' value corresponds to a single match.
average_time_on_court = average_court_time(combined_data_sets)
average_time_on_court

'9:12'

### Tiebreak Wins

In [15]:
# Work
# Scratch cell: keep only matches whose score contains a '7-6(' / '6-7(' set
# or a '1-0' / '0-1' substring; rows with a missing score are dropped.
# NOTE(review): rows 1 and 2 of the output share date, players and opening
# sets — possibly the same match listed twice; confirm before trusting totals.
tiebreak_data = mens_results[mens_results['Score'].str.contains(r'6-7\(|7-6\(|1-0|0-1', na=False)].reset_index(drop=True)
tiebreak_data

Unnamed: 0,Event Name,Date,Player1,Player1 UTR,Player2,Player2 UTR,Score
0,"Dual Match: University of California, Los Ange...",2025-04-26,Aidan Kim,13.75,Rudy Quan,13.67,"7-6(3), 6-2"
1,Dual Match: Ohio State University vs Universit...,2025-04-03,Alexander Bernard,13.33,Rudy Quan,13.67,"7-6(5), 0-6"
2,"Dual Match: University of California, Los Ange...",2025-04-03,Alexander Bernard,13.33,Rudy Quan,13.67,"7-6(5), 0-6, 3-1"
3,"Dual Match: University of California, Los Ange...",2025-03-20,Kenta Miyoshi,13.92,Rudy Quan,13.67,"7-6(5), 5-1"
4,Dual Match: Purdue University vs University of...,2025-03-08,Rudy Quan,13.67,Aleksa Krivokapic,12.0,"4-6, 6-3, 1-0(3)"
5,"Dual Match: University of California, Berkeley...",2025-01-31,Carl Emil Overbeck,13.49,Rudy Quan,13.67,"6-7(7), 6-3, 6-4"
6,Dual Match: University of San Diego vs Univers...,2025-01-19,Rudy Quan,13.67,Iiro Vasa,12.9,"6-3, 7-6(5)"


In [None]:
# Work
# Scratch check on a single row: count the score patterns that are credited
# to Player1 ('7-6(' sets plus '1-0' occurrences).
score = tiebreak_data.loc[1, 'Score']
score.count("7-6(") + score.count('1-0')

1

In [16]:
# Work
def tiebreaker_win_loss(score, player1, player2, player_name):
    """Count tiebreak sets won and lost by `player_name` in one score string.

    The score is read from Player1's point of view: each '7-6(' or '1-0'
    occurrence is credited to Player1, each '6-7(' or '0-1' occurrence to
    Player2.

    Parameters
    ----------
    score : str
        Match score, e.g. '4-6, 6-3, 1-0(3)'.
    player1, player2 : str
        Names in the order the score is written.
    player_name : str
        Player whose wins and losses are wanted.

    Returns
    -------
    tuple[int, int]
        (wins, losses); (0, 0) when `player_name` is neither player.
    """
    player1_tiebreaks = score.count("7-6(") + score.count('1-0')
    player2_tiebreaks = score.count("6-7(") + score.count('0-1')

    if player1 == player_name:
        return player1_tiebreaks, player2_tiebreaks
    if player2 == player_name:
        return player2_tiebreaks, player1_tiebreaks

    # The player did not take part in this match: nothing to count. (This
    # path previously fell through with `win`/`loss` never assigned.)
    return 0, 0


# Run the per-row counter, expand each (win, loss) tuple into two columns on
# tiebreak_data, then total both columns.
tiebreak_data[['win', 'loss']] = tiebreak_data.apply(lambda x:tiebreaker_win_loss(x['Score'], x['Player1'], x['Player2'], player_name), axis=1).apply(pd.Series)
tiebreak_data[['win', 'loss']].sum()

win     3
loss    4
dtype: int64

In [37]:
def analyze_tiebreak_data(data, player_name):
    """Total the tiebreak sets won and lost by `player_name` across matches.

    Scores are written from Player1's point of view: every '7-6(' or '1-0'
    occurrence counts for Player1 and every '6-7(' or '0-1' occurrence for
    Player2. Rows with a missing score, and rows where `player_name` is
    neither player, contribute nothing.

    Parameters
    ----------
    data : pd.DataFrame
        Results table with 'Score', 'Player1' and 'Player2' columns. It is
        not modified.
    player_name : str
        Player whose tiebreak record is wanted.

    Returns
    -------
    tuple[int, int]
        (total_wins, total_losses); (0, 0) when nothing matches, including
        for an empty table.
    """
    # Missing scores become empty strings so they count zero everywhere.
    scores = data['Score'].fillna('')

    # Vectorised counts replace the row-wise apply, which broke on an empty
    # frame and on rows that did not involve the player.
    player1_tiebreaks = scores.str.count(r'7-6\(') + scores.str.count('1-0')
    player2_tiebreaks = scores.str.count(r'6-7\(') + scores.str.count('0-1')

    is_player1 = data['Player1'] == player_name
    # Player1 takes precedence if a name ever appears on both sides.
    is_player2 = (data['Player2'] == player_name) & ~is_player1

    total_wins = player1_tiebreaks[is_player1].sum() + player2_tiebreaks[is_player2].sum()
    total_losses = player2_tiebreaks[is_player1].sum() + player1_tiebreaks[is_player2].sum()

    return int(total_wins), int(total_losses)


In [36]:
# Output tiebreaker wins
# analyze_tiebreak_data returns (wins, losses); index 0 selects the wins.
tiebreaker_wins = analyze_tiebreak_data(mens_results, player_name)[0]
tiebreaker_wins

3

### Average Winners

In [35]:
def get_average_winners(data):
    """Return the whole-number average of winners per match.

    Counts rows where 'Point Winner' is 'host' and 'Detail' is either
    'Forehand Winner' or 'Backhand Winner', then floor-divides by the number
    of distinct '__source_file__' values.
    NOTE(review): presumably 'host' marks the tracked player — confirm.

    Parameters
    ----------
    data : pd.DataFrame
        'Points' sheet with '__source_file__', 'Point Winner' and 'Detail'
        columns.

    Returns
    -------
    int
        Winners per match, rounded down; 0 when there are no matches.
    """
    # One source file per match; nunique ignores missing values.
    num_matches = data['__source_file__'].nunique()
    if num_matches == 0:
        return 0

    host_points = data[data['Point Winner'] == 'host']

    # isin().sum() yields 0 for an absent winner type, whereas indexing
    # value_counts() by label raised KeyError.
    total_winners = host_points['Detail'].isin(['Forehand Winner', 'Backhand Winner']).sum()

    return int(total_winners) // num_matches

In [34]:
# Output Average Winners
# Forehand + backhand winners per match, floor-divided to a whole number.
average_winners = get_average_winners(combined_data_points)
average_winners

12

### Three Set Matches Won

### Output CSV