# Summary Statistics Notebook

### Load Packages

In [1]:
import pandas as pd
import re
import os

os.getcwd() # Print current working directory

'/Users/cjgimena/Desktop/Github/consulting-spring2025/season_report/summary_page'

In [2]:
os.chdir('../..') # go back two directories
os.getcwd()

'/Users/cjgimena/Desktop/Github/consulting-spring2025'

### Read Data

In [7]:
# USER INPUT: Specify Player!
player_name = 'Rudy Quan'
# player_name = 'Kaylan Bigun'
# # player_name = 'Emon Van Loben Sels'
# player_name = 'Alexander Hoogmartens'
# player_name = 'Spencer Johnson'
# player_name = 'Aadarsh Tripathi'
# player_name = 'Giacomo Revelli'
# player_name = 'Gianluca Ballotta'



In [None]:
combined_data_shots = pd.read_excel(f'data/mens/{player_name}/combined.xlsx', sheet_name='Shots')
combined_data_points = pd.read_excel(f'data/mens/{player_name}/combined.xlsx', sheet_name='Points')
combined_data_sets = pd.read_excel(f'data/mens/{player_name}/combined.xlsx', sheet_name='Sets')

In [10]:
# Subset 2024-2025 Season Matches!
mens_results = pd.read_csv('data/mens/mens_results.csv')[:229]

def filter_player(data, player_name):
    data = data[(data['Player1'] == player_name) | (data['Player2'] == player_name)]

    # data = data[data['Event Name'].str.startswith(('Dual Match', '2024 ITA', '2024-25 NCAA Division'))]
    return data

mens_results_player = filter_player(mens_results, player_name)

In [11]:
mens_results_player

Unnamed: 0,Event Name,Date,Player1,Player1 UTR,Player2,Player2 UTR,Score
2,"Dual Match: University of California, Los Ange...",2025-04-26,Aidan Kim,13.75,Rudy Quan,13.67,"7-6(3), 6-2"
10,Dual Match: Michigan State University vs Unive...,2025-04-25,Rudy Quan,13.67,Aristotelis THANOS,13.61,"4-6, 3-1"
17,Dual Match: University of Michigan vs Universi...,2025-04-24,Rudy Quan,13.67,William Cooksey,12.0,"6-3, 5-6"
23,"Dual Match: University of California, Los Ange...",2025-04-19,Rudy Quan,13.67,Calvin MUELLER,13.0,"3-6, 6-3, 6-1"
25,"Dual Match: University of California, Los Ange...",2025-04-17,Michael Minasyan,12.0,Rudy Quan,13.67,"2-6, 6-3"
32,Dual Match: Michigan State University vs Unive...,2025-04-12,Rudy Quan,13.67,Ozan Baris,13.83,"6-1, 6-2"
36,"Dual Match: University of California, Los Ange...",2025-04-10,Rudy Quan,13.67,William Cooksey,12.0,"6-3, 6-3"
44,"Dual Match: University of California, Los Ange...",2025-04-05,Rudy Quan,13.67,Charl Morgan,12.0,"6-3, 6-3"
48,Dual Match: Ohio State University vs Universit...,2025-04-03,Alexander Bernard,13.33,Rudy Quan,13.67,"7-6(5), 0-6"
52,"Dual Match: University of California, Los Ange...",2025-04-03,Alexander Bernard,13.33,Rudy Quan,13.67,"7-6(5), 0-6, 3-1"


### Longest Rally Function

In [None]:
def longest_rally(data):

    # Error Check
    if "Shot" not in data.columns:
        raise ValueError("The column 'Shot' was not found in the 'Shots' sheet.")
    
    # Find the index of the max shot value
    max_rally_length = data["Shot"].max()  

    # Return the value of the max shot
    return max_rally_length

In [None]:
longest_rally_length = longest_rally(combined_data_shots)
longest_rally_length

### Average Time On Court

In [None]:
def average_court_time(data):
    
    if 'Duration' not in data.columns:
        raise ValueError("The column 'Duration' was not found in the 'Sets' sheet.")
    
    if '__source_file__' not in data.columns:
        raise ValueError("The column '__source_file__' was not found in the 'Sets' sheet.")
    
    # Make sure the column values are Numeric | errors argument sets not numerics to NA
    data['Duration'] = pd.to_numeric(data['Duration'], errors='coerce')

    # Create Boolean Mask to filter out NA values
    data = data[data['Duration'].notna()]

    # Group by 'Duration' and sum values then convert to minutes
    match_durations = (
                        data.groupby('__source_file__')['Duration']
                        .sum()
                        .div(60)
                        .round(1)
                        )
    
    # Compute overall average in minutes
    avg_minutes = match_durations.mean()
    if pd.notna(avg_minutes):
        hours = int(avg_minutes) // 60
        mins = int(avg_minutes) % 60            

    duration = f"{hours}:{mins:02d}"
    
    return duration

In [None]:
average_time_on_court = average_court_time(combined_data_sets)
average_time_on_court

### Tiebreak Wins

In [None]:
# Work
tiebreak_data = mens_results[mens_results['Score'].str.contains(r'6-7\(|7-6\(|1-0|0-1', na=False)].reset_index(drop=True)
tiebreak_data

In [None]:
# Work
score = tiebreak_data.loc[1, 'Score']
score.count("7-6(") + score.count('1-0')

In [None]:
# Work
def tiebreaker_win_loss(score, player1, player2, player_name):
# def tiebreaker_win_loss(score):
    if player1 == player_name:
        win = score.count("7-6(") + score.count('1-0')
        loss = score.count("6-7(") + score.count('0-1')
    elif player2 == player_name:
        win = score.count("6-7(") + score.count('0-1')
        loss = score.count("7-6(") + score.count('1-0')
    else:
        pass
    return win, loss


tiebreak_data[['win', 'loss']] = tiebreak_data.apply(lambda x:tiebreaker_win_loss(x['Score'], x['Player1'], x['Player2'], player_name), axis=1).apply(pd.Series)
tiebreak_data[['win', 'loss']].sum()

In [None]:
def analyze_tiebreak_data(data, player_name):

    # Subset data to only include matches with tiebreaker sets
    tiebreak_data = data[data['Score'].str.contains(r'6-7\(|7-6\(|1-0|0-1', na=False)].reset_index(drop=True)

    # Function to use in apply function
    def tiebreaker_win_loss(score, player1, player2, player_name):
        if player1 == player_name:
            win = score.count("7-6(") + score.count('1-0')
            loss = score.count("6-7(") + score.count('0-1')
        elif player2 == player_name:
            win = score.count("6-7(") + score.count('0-1')
            loss = score.count("7-6(") + score.count('1-0')
        else:
            pass
        return win, loss

    # Use the apply function to create tiebreake win/loss count columsn
    tiebreak_data[['win', 'loss']] = tiebreak_data.apply(lambda x:tiebreaker_win_loss(x['Score'], 
                                                                                      x['Player1'], 
                                                                                      x['Player2'], 
                                                                                      player_name), axis=1).apply(pd.Series)
    
    # Assign tiebreaker sets wins and losses to respective variables
    total_wins = tiebreak_data['win'].sum()
    total_losses = tiebreak_data['loss'].sum()


    return total_wins, total_losses


In [None]:
# Output tiebreaker wins
tiebreaker_wins = analyze_tiebreak_data(mens_results, player_name)[0]
tiebreaker_losses = analyze_tiebreak_data(mens_results, player_name)[1]

tiebreaker_wins, tiebreaker_losses

### Average Winners

In [None]:
def get_average_winners(data):

    # Find the number of matches
    num_matches = len(data['__source_file__'].value_counts())

    # Subset the data for all the points that our player hits
    host_wins = data[data['Point Winner'] == 'host']
    
    # Find the counts of Total Winners overall (Forehand + Backhand)
    total_winners = host_wins['Detail'].value_counts()['Forehand Winner'] + host_wins['Detail'].value_counts()['Backhand Winner']

    # Return value
    return total_winners // num_matches

In [None]:
# Output Average Winners
average_winners = get_average_winners(combined_data_points)
average_winners

### Sets Won

In [None]:
def get_sets_won(data):


return sets_won

In [21]:
sets_won = 0
for _, row in mens_results_player.iterrows():
    if pd.isna(row['Score']) or row['Score'].strip() == "":
        continue
    sets = row['Score'].split(", ")

int(sets[0].split("-")[0].split("(")[0].strip())
#     for set_score in sets:
#         if "-" not in set_score:
#             continue

#         try:
#             player1_score = int(set_score.split("-")[0].split("(")[0].strip())
#             player2_score = int(set_score.split("-")[1].split("(")[0].strip())
#         except ValueError:
#             continue

#         if row['Player1'] == player_name and player1_score > player2_score:
#             sets_won += 1
#         elif row['Player2'] == player_name and player2_score > player1_score:
#             sets_won += 1

# {"Player": player_name, "Sets_Won_This_Season": sets_won}

5

In [None]:
def get_sets_won(data):

    sets_won = 0
    for _, row in data.iterrows():
        if pd.isna(row['Score']) or row['Score'].strip() == "":
            continue
        sets = row['Score'].split(", ")

        for set_score in sets:
            if "-" not in set_score:
                continue

            try:
                player1_score = int(set_score.split("-")[0].split("(")[0].strip())
                player2_score = int(set_score.split("-")[1].split("(")[0].strip())
            except ValueError:
                continue

            
            # Needs to account for unfinished sets (Matches that dont end with 6-*, 7-5, 7-6)


            if row['Player1'] == player_name and player1_score > player2_score:
                sets_won += 1
            elif row['Player2'] == player_name and player2_score > player1_score:
                sets_won += 1

    return {"Player": player_name, "Sets_Won_This_Season": sets_won}


In [16]:
get_sets_won(mens_results_player)

{'Player': 'Rudy Quan', 'Sets_Won_This_Season': 44}

### Three Set Matches Won (WIP)

In [12]:
# Function to find all score pairs
def extract_scores(score):
    return score.split(', ')

# Applying the function to the 'Score' column
mens_results_player[['Set 1', 'Set 2', 'Set 3']] = mens_results_player['Score'].apply(extract_scores).apply(pd.Series)

mens_results_player

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mens_results_player[['Set 1', 'Set 2', 'Set 3']] = mens_results_player['Score'].apply(extract_scores).apply(pd.Series)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mens_results_player[['Set 1', 'Set 2', 'Set 3']] = mens_results_player['Score'].apply(extract_scores).apply(pd.Series)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#

Unnamed: 0,Event Name,Date,Player1,Player1 UTR,Player2,Player2 UTR,Score,Set 1,Set 2,Set 3
2,"Dual Match: University of California, Los Ange...",2025-04-26,Aidan Kim,13.75,Rudy Quan,13.67,"7-6(3), 6-2",7-6(3),6-2,
10,Dual Match: Michigan State University vs Unive...,2025-04-25,Rudy Quan,13.67,Aristotelis THANOS,13.61,"4-6, 3-1",4-6,3-1,
17,Dual Match: University of Michigan vs Universi...,2025-04-24,Rudy Quan,13.67,William Cooksey,12.0,"6-3, 5-6",6-3,5-6,
23,"Dual Match: University of California, Los Ange...",2025-04-19,Rudy Quan,13.67,Calvin MUELLER,13.0,"3-6, 6-3, 6-1",3-6,6-3,6-1
25,"Dual Match: University of California, Los Ange...",2025-04-17,Michael Minasyan,12.0,Rudy Quan,13.67,"2-6, 6-3",2-6,6-3,
32,Dual Match: Michigan State University vs Unive...,2025-04-12,Rudy Quan,13.67,Ozan Baris,13.83,"6-1, 6-2",6-1,6-2,
36,"Dual Match: University of California, Los Ange...",2025-04-10,Rudy Quan,13.67,William Cooksey,12.0,"6-3, 6-3",6-3,6-3,
44,"Dual Match: University of California, Los Ange...",2025-04-05,Rudy Quan,13.67,Charl Morgan,12.0,"6-3, 6-3",6-3,6-3,
48,Dual Match: Ohio State University vs Universit...,2025-04-03,Alexander Bernard,13.33,Rudy Quan,13.67,"7-6(5), 0-6",7-6(5),0-6,
52,"Dual Match: University of California, Los Ange...",2025-04-03,Alexander Bernard,13.33,Rudy Quan,13.67,"7-6(5), 0-6, 3-1",7-6(5),0-6,3-1


In [13]:
def count_threeSet_wins(df):
    count = 0

    #loop through non NaN rows
    for s in df['Score'].dropna():
        #convert string scores to number scores
        sets = re.findall(r'(\d+)-(\d+)', s)

        # filter only 3 set games
        if len(sets) == 3:
            ucla_set_wins = 0
            ucla_set_wins = sum(int(u) > int(o) for u, o in sets)
            # count ucla wins
            if ucla_set_wins >= 2:
                count += 1

    return count

count_threeSet_wins(mens_results_player)


10

### Output Csv