In [86]:
import numpy as np
import pandas as pd
import torch
import openpyxl
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

### PLAN

Write function to pull data for a player for the last 3 years  
Write a function to compare a player to another in their last 3 years  
Separate data by court surface  
Get data by vs righties vs righties  

Live Data: https://rapidapi.com/sportcontentapi/api/tennis-live-data

In [6]:
players = pd.read_csv('data/atp_players.csv')
players

Unnamed: 0,player_id,name_first,name_last,hand,dob,ioc,height,wikidata_id
0,100001,Gardnar,Mulloy,R,19131122.0,USA,185.0,Q54544
1,100002,Pancho,Segura,R,19210620.0,ECU,168.0,Q54581
2,100003,Frank,Sedgman,R,19271002.0,AUS,180.0,Q962049
3,100004,Giuseppe,Merlo,R,19271011.0,ITA,,Q1258752
4,100005,Richard,Gonzalez,R,19280509.0,USA,188.0,Q53554
...,...,...,...,...,...,...,...,...
65014,212913,Pietro,Ricci,U,,ITA,,
65015,212914,Corey,Craig,U,,USA,,
65016,212915,Aleksandar,Ljubojevic,U,,SRB,,
65017,212916,Marko,Milosavljevic,U,,SRB,,


In [7]:
atp_singles = 'data/atp_singles/atp_matches_'

In [8]:
atp_2024 = pd.read_csv(f'{atp_singles}2024.csv')
atp_2024['tourney_date'] = pd.to_datetime(atp_2024['tourney_date'], format='%Y%m%d')
print(atp_2024.columns)

Index(['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'loser_id', 'loser_seed', 'loser_entry', 'loser_name', 'loser_hand',
       'loser_ht', 'loser_ioc', 'loser_age', 'score', 'best_of', 'round',
       'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
       'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df', 'l_svpt',
       'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced',
       'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points'],
      dtype='object')


In [9]:
atp_2024.head()

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
0,2024-0339,Brisbane,Hard,32,A,2024-01-01,300,105777,2.0,,...,58.0,44.0,16.0,11.0,8.0,9.0,14.0,2570.0,8.0,3660.0
1,2024-0339,Brisbane,Hard,32,A,2024-01-01,299,208029,1.0,,...,35.0,31.0,10.0,11.0,5.0,7.0,8.0,3660.0,39.0,1122.0
2,2024-0339,Brisbane,Hard,32,A,2024-01-01,298,105777,2.0,,...,39.0,24.0,14.0,10.0,5.0,7.0,14.0,2570.0,55.0,902.0
3,2024-0339,Brisbane,Hard,32,A,2024-01-01,297,208029,1.0,,...,51.0,31.0,16.0,10.0,3.0,5.0,8.0,3660.0,116.0,573.0
4,2024-0339,Brisbane,Hard,32,A,2024-01-01,296,126128,,,...,37.0,27.0,16.0,10.0,5.0,8.0,39.0,1122.0,44.0,1021.0


In [10]:
### Score Reader
def score_reader(score):
    w_games_won = 0
    l_games_won = 0
    tiebreaks = 0
    tiebreak_points = []
    num_sets = 1

    i = 1
    for num in score[1:-1]:
        if num == '-':
            w_games_won += int(score[i-1])
            l_games_won += int(score[i+1])
        if num == '(':
            tiebreaks += 1
            if score[i-1] == '7':
                loser = 'w'
            else:
                loser = 'l'
            tiebreak_points.append(loser, score[i+1])
        if num == ' ':
            num_sets += 1

    total_games = w_games_won + l_games_won
    return w_games_won, l_games_won, total_games, tiebreaks, tiebreak_points, num_sets

In [11]:
def get_player_name(player_id):
    player = players[players['player_id'] == player_id]
    first_name = player['name_first'].iloc[0]
    last_name = player['name_last'].iloc[0]
    return first_name, last_name

In [12]:
def get_all_player_names(player_ids):
    first_names = pd.Series([players.loc[players['player_id'] == pid, 'name_first'].values[0] if not players.loc[players['player_id'] == pid, 'name_first'].empty else None for pid in player_ids])
    last_names = pd.Series([players.loc[players['player_id'] == pid, 'name_last'].values[0] if not players.loc[players['player_id'] == pid, 'name_last'].empty else None for pid in player_ids])
    
    return first_names, last_names

In [13]:
### Convert to Percentages
def get_percentages(orig_df):
    percentages = {}
    
    percentages['tourney_id'] = orig_df['tourney_id']
    percentages['tourney_name'] = orig_df['tourney_name']
    percentages['surface'] = orig_df['surface']
    percentages['draw_size'] = orig_df['draw_size']
    percentages['tourney_date'] = orig_df['tourney_date']
    percentages['match_num'] = orig_df['match_num']
    percentages['winner_id'] = orig_df['winner_id']
    winner_first, winner_last = get_all_player_names(orig_df['winner_id'])
    percentages['winner_name'] = winner_first + ' ' + winner_last
    percentages['winner_seed'] = orig_df['winner_seed']
    percentages['winner_hand'] = orig_df['winner_hand']
    percentages['winner_ht'] = orig_df['winner_ht']
    percentages['winner_age'] = orig_df['winner_age']
    percentages['winner_rank'] = orig_df['winner_rank']
    percentages['winner_rank_points'] = orig_df['winner_rank_points']
    percentages['loser_id'] = orig_df['loser_id']
    loser_first, loser_last = get_all_player_names(orig_df['loser_id'])
    percentages['loser_name'] = loser_first + ' ' + loser_last
    percentages['loser_seed'] = orig_df['loser_seed']
    percentages['loser_hand'] = orig_df['loser_hand']
    percentages['loser_ht'] = orig_df['loser_ht']
    percentages['loser_age'] = orig_df['loser_age']
    percentages['loser_rank'] = orig_df['loser_rank']
    percentages['loser_rank_points'] = orig_df['loser_rank_points']

    percentages['w_ace_perc'] = orig_df['w_ace'] / orig_df['w_svpt']
    percentages['w_df_perc'] = orig_df['w_df'] / (orig_df['w_svpt'] - orig_df['w_1stIn'])
    percentages['w_first_in'] = orig_df['w_1stIn'] / orig_df['w_svpt']
    percentages['w_first_win_perc'] = orig_df['w_1stWon'] / orig_df['w_1stIn']
    percentages['w_second_in'] = 1 - percentages['w_df_perc']
    percentages['w_second_win_perc'] = orig_df['w_2ndWon'] / (orig_df['w_svpt'] - orig_df['w_1stIn'])
    percentages['w_bp_hold_perc'] = orig_df['w_bpSaved'] / orig_df['w_bpFaced']
    percentages['w_bp_win_perc'] = 1 - (orig_df['l_bpSaved'] / orig_df['l_bpFaced'])
    percentages['w_first_return_win_perc'] = 1 - orig_df['l_1stWon'] / orig_df['l_1stIn']
    percentages['w_second_return_win_perc'] = 1 - orig_df['l_2ndWon'] / (orig_df['l_svpt'] - orig_df['l_1stIn'])
    percentages['w_return_win_perc'] = (orig_df['l_bpFaced'] - orig_df['l_bpSaved']) / orig_df['l_SvGms']

    percentages['l_ace_perc'] = orig_df['l_ace'] / orig_df['l_svpt']
    percentages['l_df_perc'] = orig_df['l_df'] / (orig_df['l_svpt'] - atp_2024['l_1stIn'])
    percentages['l_first_in'] = orig_df['l_1stIn'] / orig_df['l_svpt']
    percentages['l_first_win_perc'] = orig_df['l_1stWon'] / orig_df['l_1stIn']
    percentages['l_second_in'] = 1 - percentages['l_df_perc']
    percentages['l_second_win_perc'] = orig_df['l_2ndWon'] / (orig_df['l_svpt'] - orig_df['l_1stIn'])
    percentages['l_bp_hold_perc'] = orig_df['l_bpSaved'] / orig_df['l_bpFaced']
    percentages['l_bp_win_perc'] = 1 - percentages['w_bp_hold_perc']
    percentages['l_first_return_win_perc'] = 1 - orig_df['w_1stWon'] / orig_df['w_1stIn']
    percentages['l_second_return_win_perc'] = 1 - orig_df['w_2ndWon'] / (orig_df['w_svpt'] - orig_df['w_1stIn'])
    percentages['l_return_win_perc'] = (orig_df['w_bpFaced'] - orig_df['w_bpSaved']) / orig_df['w_SvGms']
    
    return pd.DataFrame(percentages)

In [14]:
percentages = get_percentages(atp_2024)

In [15]:
# Player Data Search
def search_player(first_name, last_name, df=percentages):
    player_id = players.loc[(players['name_first'] == first_name) & (players['name_last'] == last_name), 'player_id'].iloc[0]
    data = df[(df['winner_id'] == player_id) | (df['loser_id'] == player_id)]
    data = data.sort_values(by='tourney_date', ascending=False)
    return data

In [16]:
### Filter out the stats of the opposing player
def filter_results(df, player_name):
    player_stats = []

    for index, row in df.iterrows():
        if row['winner_name'] == player_name:
            player_stat = {
                'tourney_id': row['tourney_id'],
                'tourney_name': row['tourney_name'], 
                'surface': row['surface'], 
                'tourney_date': row['tourney_date'], 
                'player': row['winner_name'], 
                'opponent': row['loser_name'],
                'result': 'win',
                'ace_percentage' : row['w_ace_perc'],
                'df_percentage': row['w_df_perc'],
                'first_in': row['w_first_in'],
                'first_win_perc': row['w_first_win_perc'],
                'second_in': row['w_second_in'],
                'second_win_perc': row['w_second_win_perc'],
                'bp_hold_perc': row['w_bp_hold_perc'],
                'bp_win_perc': row['w_bp_win_perc'],
                'opp_first_in': row['l_first_in'],
                'first_return_win_perc' : row['w_first_return_win_perc'],
                'second_return_win_perc' : row['w_second_return_win_perc'],
                'return_win_perc': row['w_return_win_perc'],
            }
        elif row['loser_name'] == player_name:
            player_stat = {
                'tourney_id': row['tourney_id'],
                'tourney_name': row['tourney_name'], 
                'surface': row['surface'], 
                'tourney_date': row['tourney_date'], 
                'player': row['loser_name'], 
                'opponent': row['winner_name'],
                'result': 'loss',
                'ace_percentage' : row['l_ace_perc'],
                'df_percentage': row['l_df_perc'],
                'first_in': row['l_first_in'],
                'first_win_perc': row['l_first_win_perc'],
                'second_in': row['l_second_in'],
                'second_win_perc': row['l_second_win_perc'],
                'bp_hold_perc': row['l_bp_hold_perc'],
                'bp_win_perc': row['l_bp_win_perc'],
                'opp_first_in': row['w_first_in'],
                'first_return_win_perc' : row['l_first_return_win_perc'],
                'second_return_win_perc' : row['l_second_return_win_perc'],
                'return_win_perc': row['l_return_win_perc'],
            }
        player_stats.append(player_stat)

    player_stats_df = pd.DataFrame(player_stats)
    
    return player_stats_df


In [17]:
first = "Fabian"
last = "Marozsan"
player_f = filter_results(search_player(first, last), f"{first} {last}")
player_f

Unnamed: 0,tourney_id,tourney_name,surface,tourney_date,player,opponent,result,ace_percentage,df_percentage,first_in,first_win_perc,second_in,second_win_perc,bp_hold_perc,bp_win_perc,opp_first_in,first_return_win_perc,second_return_win_perc,return_win_perc
0,2024-0322,Geneva,Clay,2024-05-20,Fabian Marozsan,Alexander Shevchenko,loss,0.028169,0.064516,0.56338,0.6,0.935484,0.645161,0.5,0.333333,0.621622,0.23913,0.464286,0.090909
1,2024-0416,Rome Masters,Clay,2024-05-06,Fabian Marozsan,Alexander Shevchenko,loss,0.0,0.086957,0.603448,0.542857,0.913043,0.434783,0.333333,,0.622222,0.285714,0.294118,0.0
2,2024-1536,Madrid Masters,Clay,2024-04-22,Fabian Marozsan,Aslan Karatsev,win,0.028169,0.173913,0.676056,0.75,0.826087,0.521739,0.0,0.333333,0.516129,0.354167,0.511111,0.166667
3,2024-1536,Madrid Masters,Clay,2024-04-22,Fabian Marozsan,Francisco Cerundolo,loss,0.014286,0.0,0.757143,0.622642,1.0,0.411765,0.2,0.285714,0.763889,0.290909,0.588235,0.2
4,2024-0425,Barcelona,Clay,2024-04-15,Fabian Marozsan,Facundo Diaz Acosta,loss,0.0,0.0,0.695122,0.578947,1.0,0.52,0.5,1.0,0.701493,0.276596,0.5,0.153846
5,2024-0425,Barcelona,Clay,2024-04-15,Fabian Marozsan,Luca Van Assche,win,0.181818,0.0,0.727273,0.75,1.0,0.666667,,0.6,0.59375,0.473684,0.692308,0.6
6,2024-0403,Miami Masters,Hard,2024-03-18,Fabian Marozsan,Aleksandar Kovacevic,win,0.017241,0.052632,0.672414,0.692308,0.947368,0.684211,1.0,0.5,0.765957,0.333333,0.818182,0.375
7,2024-0403,Miami Masters,Hard,2024-03-18,Fabian Marozsan,Holger Rune,win,0.08,0.111111,0.82,0.756098,0.888889,0.333333,1.0,1.0,0.609756,0.6,0.5625,0.714286
8,2024-0403,Miami Masters,Hard,2024-03-18,Fabian Marozsan,Alexei Popyrin,win,0.042254,0.076923,0.816901,0.689655,0.923077,0.538462,0.875,0.428571,0.625,0.325,0.5,0.3
9,2024-0403,Miami Masters,Hard,2024-03-18,Fabian Marozsan,Alex De Minaur,win,0.0375,0.095238,0.7375,0.627119,0.904762,0.571429,0.4,0.428571,0.596774,0.351351,0.36,0.272727


In [18]:
print(player_f.columns)

Index(['tourney_id', 'tourney_name', 'surface', 'tourney_date', 'player',
       'opponent', 'result', 'ace_percentage', 'df_percentage', 'first_in',
       'first_win_perc', 'second_in', 'second_win_perc', 'bp_hold_perc',
       'bp_win_perc', 'opp_first_in', 'first_return_win_perc',
       'second_return_win_perc', 'return_win_perc'],
      dtype='object')


## Skill Scores

In [19]:
### Service Score
def service_score(first_name, last_name):
    score = 0
    stats = filter_results(search_player(first_name=first_name, last_name=last_name), f"{first_name} {last_name}")
    ace_perc = np.average(stats['ace_percentage'])
    first_in = np.average(stats['first_in'])
    first_win_perc = np.average(stats['first_win_perc'])
    second_win_perc = np.average(stats['second_win_perc'])
    df_perc = np.average(stats['df_percentage'])

    return score


In [20]:
### Return Score
def return_score(first_name, last_name):
    score = 0
    stats = filter_results(search_player(first_name=first_name, last_name=last_name), f"{first_name} {last_name}")
    bp_win_perc = np.average(stats['bp_win_perc'])
    first_return_win_perc = np.average(stats['first_return_win_perc'])
    second_return_win_perc = np.average(stats['second_return_win_perc'])
    return_win_perc = np.average(stats['return_win_perc'])

    return score

In [21]:
### Pressure Score
def pressure_score(first_name, last_name):
    score = 0
    stats = filter_results(search_player(first_name=first_name, last_name=last_name), f"{first_name} {last_name}")
    bp_win_perc = np.average(stats['bp_win_perc'])
    bp_hold_perc = np.average(stats['bp_hold_perc'])

    return score

## Model Experimentation

### Markov Chain 

In [22]:
def serve_return_win_perc(first_name, last_name):
    stat_df = filter_results(search_player(first_name, last_name), f"{first_name} {last_name}")
    serve_df = stat_df[['first_in', 'first_win_perc', 'second_win_perc']].dropna()
    serve_df['service_point_win_perc'] = (serve_df['first_in'] * serve_df['first_win_perc']) + ((1 - serve_df['first_in']) * serve_df['second_win_perc'])
    avg_service_point_win_perc = serve_df['service_point_win_perc'].mean()
    
    return_df = stat_df[['opp_first_in', 'first_return_win_perc' ,'second_return_win_perc']].dropna()
    return_df['return_point_win_perc'] = (return_df['opp_first_in'] * return_df['first_return_win_perc']) + ((1 - return_df['opp_first_in']) * return_df['second_return_win_perc'])
    avg_return_point_win_perc = return_df['return_point_win_perc'].mean()
    
    return avg_service_point_win_perc, avg_return_point_win_perc


In [23]:
serve_pc, return_pc = serve_return_win_perc("Alexander", "Zverev")

print(serve_pc)
print(return_pc)

0.7126916777606077
0.3739518339745773


For transition list with this order:  
['0-0', '15-0', '0-15', '30-0', '15-15', '0-30', '40-0', '30-15', '15-30', '0-40', '40-15', '30-30', '15-40', '40-30', '30-40', 'Deuce', 'Advantage Player 1', 'Advantage Player 2', 'Game Player 1', 'Game Player 2']


In [170]:
def get_game_chain(service_pc):
    transition_matrix = np.zeros((20, 20))
    transition_matrix[19][19] = 1
    transition_matrix[18][18] = 1
    return_pc = 1 - service_pc
    service_coords = [(0, 1), (1, 3), (2, 4), (3, 6), (4, 7), (5, 8), (6, 18), (7, 10), (8, 11), (9, 12), (10, 18), (11, 13), (12, 14), (13, 18), (14, 15), (15, 16), (16, 18), (17, 15)]
    return_coords = [(0, 2), (1, 4), (2, 5), (3, 7), (4, 8), (5, 9), (6, 10), (7, 11), (8, 12), (9, 19), (10, 13), (11, 14), (12, 19), (13, 15), (14, 19), (15, 17), (16, 15), (17, 19)]
    for (i, j) , (x, y) in zip(service_coords, return_coords):
        transition_matrix[i, j] = service_pc
        transition_matrix[x, y] = return_pc
    return transition_matrix

In [36]:
def get_fund_matrix(transition_mat, num_absorbing):
    Q_mat = transition_mat[:-num_absorbing, :-num_absorbing]
    R_mat = transition_mat[:-num_absorbing, -num_absorbing:]
    identity_mat = np.eye(len(transition_mat) - num_absorbing)
    N_mat = np.linalg.inv(identity_mat - Q_mat)
    fund_mat = np.dot(N_mat, R_mat)
    return fund_mat

For transition list with this order:  
["0-0", "1-0", "0-1", "2-0", "1-1", "0-2", "3-0", "2-1", "1-2", "0-3", "4-0", "3-1", "2-2", "1-3", "0-4", "5-0", "4-1", "3-2", "2-3", "1-4",  
"0-5", "5-1", "4-2", "3-3", "2-4", "1-5", "5-2", "4-3", "3-4", "2-5", "5-3", "4-4", "3-5", "5-4", "4-5", "5-5", "6-5", "5-6", "Set Player 1", "Set Player 2", "6-6"]

In [167]:
def get_set_chain(p1_service_game_perc, p2_service_game_perc):
    transition_matrix = np.zeros((41, 41))
    transition_matrix[40][40] = 1
    transition_matrix[39][39] = 1
    transition_matrix[38][38] = 1
    
    p1_return_game_perc = 1 - p2_service_game_perc
    p2_return_game_perc = 1 - p1_service_game_perc
    
    p1_serve_coords = [(0, 1), (3, 6), (4, 7), (5, 8), (10, 15), (11, 16), (12, 17), (13, 18), (14, 19), (21, 38), (22, 26), (23, 27), (24, 28), (25, 29), (30, 38), (31, 33), (32, 34), (35, 36)]
    p2_return_coords = [(0, 2), (3, 7), (4, 8), (5, 9), (10, 16), (11, 17), (12, 18), (13, 19), (14, 20), (21, 26), (22, 27), (23, 28), (24, 29), (25, 39), (30, 33), (31, 34), (32, 39), (35, 37)]
    
    p2_serve_coords = [(1, 4), (2, 5), (6, 10), (7, 11), (8, 12), (9, 13), (15, 21), (16, 22), (17, 23), (18, 24), (19, 25), (20, 39), (26, 30), (27, 31), (28, 32), (29, 39), (33, 35), (34, 39), (36, 40), (37, 39)]
    p1_return_coords = [(1, 3), (2, 4), (6, 11), (7, 12), (8, 13), (9, 14), (15, 38), (16, 21), (17, 22), (18, 23), (19, 24), (20, 25), (26, 38), (27, 30), (28, 31), (29, 32), (33, 38), (34, 35), (36, 38), (37, 40)]
    
    for t in range(0, len(p1_return_coords)):
        if t < 18:
            i, j = p1_serve_coords[t]
            x, y = p2_return_coords[t]
        n, m = p2_serve_coords[t]
        o, p = p1_return_coords[t]
        
        transition_matrix[i, j] = p1_service_game_perc
        transition_matrix[x, y] = p2_return_game_perc
        transition_matrix[n, m] = p2_service_game_perc
        transition_matrix[o, p] = p1_return_game_perc
    
    return transition_matrix

For transition list with this order:  
["0-0", "1-0", "0-1", "2-0", "1-1", "0-2", "3-0", "2-1", "1-2", "0-3", "4-0", "3-1", "2-2", "1-3", "0-4", "5-0", "4-1", "3-2", "2-3", "1-4", "0-5",  
"6-0", "5-1", "4-2", "3-3", "2-4", "1-5", "0-6", "6-1", "5-2", "4-3", "3-4", "2-5", "1-6", "6-2", "5-3", "4-4", "3-5", "2-6", "6-3", "5-4", "4-5", "3-6",  
"6-4", "5-5", "4-6", "6-5", "5-6", "6-6", "Ad Player A", "Ad Player B", "Game Player 1", "Game Player 2"]

In [136]:
def write_tiebreak_transition_matrix():
    items = ["0-0", "1-0", "0-1", "2-0", "1-1", "0-2", "3-0", "2-1", "1-2", "0-3",
             "4-0", "3-1", "2-2", "1-3", "0-4", "5-0", "4-1", "3-2", "2-3", "1-4",
             "0-5", "6-0", "5-1", "4-2", "3-3", "2-4", "1-5", "0-6", "6-1", "5-2",
             "4-3", "3-4", "2-5", "1-6", "6-2", "5-3", "4-4", "3-5", "2-6", "6-3",
             "5-4", "4-5", "3-6", "6-4", "5-5", "4-6", "6-5", "5-6", "6-6", 
             "Ad Player 1", "Ad Player 2"]
    item_index_dict = {item: index for index, item in enumerate(items)}
    p1_serve_coords = []
    p2_return_coords = []
    p2_serve_coords = []
    p1_return_coords = []
    server = True
    
    for idx, item in enumerate(items):
        if item[0].isdigit():
            p1_points = int(item[0])
            p2_points = int(item[-1])
            next_item = items[idx + 1]
            if next_item[0].isdigit() and idx < len(items) - 2:
                next_point_sum = int(next_item[0]) + int(next_item[-1])
            if p1_points + 1 < 7 and p2_points + 1 < 7:
                p1_win_coords = (idx, item_index_dict[f"{p1_points + 1}-{p2_points}"])
                p2_win_coords = (idx, item_index_dict[f"{p1_points}-{p2_points + 1}"])
            elif p1_points + 1 == 7 and p2_points + 1 < 7:
                p1_win_coords = (idx, 51)
                p2_win_coords = (idx, item_index_dict[f"{p1_points}-{p2_points + 1}"])
            elif p1_points + 1 < 7 and p2_points + 1 == 7:
                p1_win_coords = (idx, item_index_dict[f"{p1_points + 1}-{p2_points}"])
                p2_win_coords = (idx, 52)
            elif p1_points + 1 == 7 and p2_points + 1 == 7:
                p1_win_coords = (idx, 49)
                p2_win_coords = (idx, 50)
        else:
            if item[-1] == "1":
                p1_win_coords = (idx, 51)
                p2_win_coords = (idx, 44)
            elif item[-1] == "2":
                p1_win_coords = (idx, 44)
                p2_win_coords = (idx, 52)
        if server:
            p1_serve_coords.append(p1_win_coords)
            p2_return_coords.append(p2_win_coords)
            if next_item[0] == "A" or next_point_sum % 4 == 1 :
                server = False
        elif not server:
            p2_serve_coords.append(p2_win_coords)
            p1_return_coords.append(p1_win_coords)
            if next_point_sum % 4 == 3:
                server = True
            
            
    return p1_serve_coords, p2_return_coords, p2_serve_coords, p1_return_coords

In [144]:
def get_tiebreak_chain(p1_service_perc, p2_service_perc):
    
    transition_matrix = np.zeros((53, 53))
    transition_matrix[52][52] = 1
    transition_matrix[51][51] = 1
    
    p1_serve_coords, p2_return_coords, p2_serve_coords, p1_return_coords = write_tiebreak_transition_matrix()
    
    p1_return_perc = 1 - p2_service_perc
    p2_return_perc = 1 - p1_service_perc
    
    for t in range(0, len(p2_serve_coords)):
        if t < 24:
            n, m = p1_serve_coords[t]
            o, p = p2_return_coords[t]
        i, j = p2_serve_coords[t]
        x, y = p1_return_coords[t]
        
        transition_matrix[i, j] = p2_service_perc
        transition_matrix[x, y] = p1_return_perc
        transition_matrix[n, m] = p1_service_perc
        transition_matrix[o, p] = p2_return_perc
    
    return transition_matrix

In [150]:
def get_set_win_perc(p1_service_perc, p2_service_perc):
    p1_game_transition_matrix = get_game_chain(p1_service_perc)
    p2_game_transition_matrix = get_game_chain(p2_service_perc)
    
    p1_fund_matrix = get_fund_matrix(p1_game_transition_matrix, 2)
    p2_fund_matrix = get_fund_matrix(p2_game_transition_matrix, 2)
    
    p1_service_game_perc = p1_fund_matrix[0][0]
    p2_service_game_perc = p2_fund_matrix[0][1]
    
    set_transition_matrix = get_set_chain(p1_service_game_perc, p2_service_game_perc)
    set_fund_matrix = get_fund_matrix(set_transition_matrix, 3)
    tiebreak_perc = set_fund_matrix[0][-1]
    tiebreak_transition_matrix = get_tiebreak_chain(p1_service_perc, p2_service_perc)
    tiebreak_fund_matrix = get_fund_matrix(tiebreak_transition_matrix, 2)
    p1_tiebreak_winner = tiebreak_fund_matrix[0][0]
    p2_tiebreak_winner = tiebreak_fund_matrix[0][1]

    p1_win_perc = set_fund_matrix[0][0] + tiebreak_perc * p1_tiebreak_winner
    p2_win_perc = set_fund_matrix[0][1] + tiebreak_perc * p2_tiebreak_winner
    
    return p1_win_perc, p2_win_perc

In [164]:
def get_match_prob(p1_service_perc, p2_service_perc, best_of=3):
    p1_set_win_perc, p2_set_win_perc = get_set_win_perc(p1_service_perc, p2_service_perc)
    
    if best_of == 3:
        p1_straight_sets = p1_set_win_perc * p1_set_win_perc
        p1_split_sets = p1_set_win_perc * p1_set_win_perc * p2_set_win_perc * 2
        
        p2_straight_sets = p2_set_win_perc * p2_set_win_perc
        p2_split_sets = p2_set_win_perc * p2_set_win_perc * p1_set_win_perc * 2
        
        return p1_straight_sets + p1_split_sets, p2_straight_sets + p2_split_sets
    elif best_of == 5:
        p1_straight_sets = p1_set_win_perc * p1_set_win_perc * p1_set_win_perc 
        p1_drop_1_sets = p1_set_win_perc * p1_set_win_perc * p1_set_win_perc * p2_set_win_perc * 3
        p1_drop_2_sets = p1_set_win_perc * p1_set_win_perc * p1_set_win_perc * p2_set_win_perc * p2_set_win_perc * 6
        
        p2_straight_sets = p2_set_win_perc * p2_set_win_perc * p2_set_win_perc 
        p2_drop_1_sets = p2_set_win_perc * p2_set_win_perc * p2_set_win_perc * p1_set_win_perc * 3
        p2_drop_2_sets = p2_set_win_perc * p2_set_win_perc * p2_set_win_perc * p1_set_win_perc * p1_set_win_perc * 6
        
        return p1_straight_sets + p1_drop_1_sets + p1_drop_2_sets, p2_straight_sets + p2_drop_1_sets + p2_drop_2_sets
    
    return "THIS IS NOT A POSSIBLE BEST OF OPTION"

In [165]:
print(get_match_prob(0.6, 0.4, 5))

(0.8036560893088448, 0.19634391069115492)
