In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
WL_extensions = ['age', 'entry', 'hand', 'ht', 'id', 'ioc', 'name', 'rank', 'rank_points', 'seed']

def obscure_features(df):
    
    df['player_1'] = pd.concat([df['winner_name'], df['loser_name']], axis = 1).min(axis = 1)
    df['player_2'] = pd.concat([df['winner_name'], df['loser_name']], axis = 1).max(axis = 1)
    
    for ext in WL_extensions:
        p1_feature = np.where(df['player_1'] == df['winner_name'],
                     df['winner_' + ext],
                     df['loser_' + ext])
    
        p2_feature = np.where(df['player_2'] == df['winner_name'],
                     df['winner_' + ext],
                     df['loser_' + ext])
    
        df['player_1_' + ext] = p1_feature
        df['player_2_' + ext] = p2_feature
        
    winner_cols = list(filter(lambda x: x.startswith('winner'), df.columns))
    loser_cols = list(filter(lambda x: x.startswith('loser'), df.columns))
    cols_to_drop = winner_cols + loser_cols
    
    target = df['winner_name']
    
    df.drop(cols_to_drop, axis = 1, inplace = True)
    
    df['winner'] = target
    
    return df

In [3]:
df1 = [pd.read_csv('atp_matches_' + str(year) + '.csv') for year in range(2015,2023)]
df1

[     tourney_id                tourney_name surface  draw_size tourney_level  \
 0      2015-339                    Brisbane    Hard         28             A   
 1      2015-339                    Brisbane    Hard         28             A   
 2      2015-339                    Brisbane    Hard         28             A   
 3      2015-339                    Brisbane    Hard         28             A   
 4      2015-339                    Brisbane    Hard         28             A   
 ...         ...                         ...     ...        ...           ...   
 2938   2015-605                 Tour Finals    Hard          8             F   
 2939   2015-605                 Tour Finals    Hard          8             F   
 2940  2015-D015  Davis Cup WG F: BEL vs GBR    Clay          4             D   
 2941  2015-D015  Davis Cup WG F: BEL vs GBR    Clay          4             D   
 2942  2015-D015  Davis Cup WG F: BEL vs GBR    Clay          4             D   
 
       tourney_date  match

In [4]:
matches = obscure_features(pd.concat(df1, axis = 0))

In [5]:
matches.index = range(matches.shape[0])

In [6]:
# Player rankings from 2015-2023

rankings_10s = pd.read_csv('atp_rankings_10s.csv', header = None, dtype=str, skiprows=1)
rankings_10s.columns = ['week', 'ranking', 'player_id', 'ranking_points']

rankings_20s = pd.read_csv('atp_rankings_20s.csv', header = None, dtype=str, skiprows=1)
rankings_20s.columns = ['week', 'ranking', 'player_id', 'ranking_points']

rankings_current = pd.read_csv('atp_rankings_current.csv', header = None, dtype=str, skiprows=1)
rankings_current.columns = ['week', 'ranking', 'player_id', 'ranking_points']

rankings = pd.concat([rankings_10s, rankings_20s, rankings_current])

rankings.index = range(rankings.shape[0])

rankings

Unnamed: 0,week,ranking,player_id,ranking_points
0,20100104,1,103819,10550
1,20100104,2,104745,9205
2,20100104,3,104925,8310
3,20100104,4,104918,7030
4,20100104,5,105223,6785
...,...,...,...,...
1250612,20240101,2052,131888,1
1250613,20240101,2052,210236,1
1250614,20240101,2052,210649,1
1250615,20240101,2052,211765,1


In [7]:
# Convert ranking dates to datetime
rankings['week'] = pd.to_datetime(rankings['week'], format = '%Y%m%d')
rankings.head()

Unnamed: 0,week,ranking,player_id,ranking_points
0,2010-01-04,1,103819,10550
1,2010-01-04,2,104745,9205
2,2010-01-04,3,104925,8310
3,2010-01-04,4,104918,7030
4,2010-01-04,5,105223,6785


In [8]:
# Importing modules for working with date and time
from datetime import timedelta, date

In [9]:
# Convert tournament dates to datetime
matches['tourney_date'] = pd.to_datetime(matches.tourney_date, format = '%Y%m%d')
matches

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,score,best_of,round,...,player_2_ioc,player_1_name,player_2_name,player_1_rank,player_2_rank,player_1_rank_points,player_2_rank_points,player_1_seed,player_2_seed,winner
0,2015-339,Brisbane,Hard,28,A,2015-01-04,1,6-3 6-1,3,R32,...,USA,John Millman,Rhyne Williams,153.0,220.0,328.0,221.0,,,John Millman
1,2015-339,Brisbane,Hard,28,A,2015-01-04,2,4-6 6-1 6-4,3,R32,...,FIN,Denis Kudla,Jarkko Nieminen,123.0,73.0,440.0,689.0,,,Jarkko Nieminen
2,2015-339,Brisbane,Hard,28,A,2015-01-04,3,6-2 6-2,3,R32,...,AUS,Gilles Simon,James Duckworth,21.0,125.0,1730.0,430.0,6.0,,James Duckworth
3,2015-339,Brisbane,Hard,28,A,2015-01-04,4,6-4 6-4,3,R32,...,FRA,Andrey Golubev,Jeremy Chardy,72.0,31.0,691.0,1195.0,,,Jeremy Chardy
4,2015-339,Brisbane,Hard,28,A,2015-01-04,5,6-7(5) 7-6(6) 6-1,3,R32,...,SVK,Jurgen Melzer,Martin Klizan,110.0,34.0,505.0,1094.0,,,Martin Klizan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21605,2022-M-DC-2022-WG2-PO-GRE-JAM-01,Davis Cup WG2 PO: GRE vs JAM,Clay,4,D,2022-03-04,4,6-4 6-4,3,RR,...,USA,Aristotelis Thanos,Blaise Bicknell,1103.0,1130.0,9.0,8.0,,,Aristotelis Thanos
21606,2022-M-DC-2022-WG2-PO-GRE-JAM-01,Davis Cup WG2 PO: GRE vs JAM,Clay,4,D,2022-03-04,5,4-6 6-4 6-4,3,RR,...,JAM,Petros Tsitsipas,Rowland Phillips,808.0,1390.0,23.0,4.0,,,Petros Tsitsipas
21607,2022-M-DC-2022-WG2-PO-HKG-BEN-01,Davis Cup WG2 PO: HKG vs BEN,Hard,4,D,2022-03-04,1,1-6 6-4 4-1 RET,3,RR,...,HKG,Alexis Klegou,Hong Kit Jack Wong,1881.0,1059.0,1.0,10.0,,,Hong Kit Jack Wong
21608,2022-M-DC-2022-WG2-PO-HKG-BEN-01,Davis Cup WG2 PO: HKG vs BEN,Hard,4,D,2022-03-04,2,6-4 6-4,3,RR,...,BEN,Coleman Wong,Delmas Ntcha,1050.0,,10.0,,,,Coleman Wong


In [10]:
import re

In [11]:
#remove rows whose scores contain letters, indicating a retirement, default, or walkover

score_mask = matches['score'].apply(lambda x: bool(re.findall('[a-zA-Z]', x)))
matches = matches[~score_mask]
matches

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,score,best_of,round,...,player_2_ioc,player_1_name,player_2_name,player_1_rank,player_2_rank,player_1_rank_points,player_2_rank_points,player_1_seed,player_2_seed,winner
0,2015-339,Brisbane,Hard,28,A,2015-01-04,1,6-3 6-1,3,R32,...,USA,John Millman,Rhyne Williams,153.0,220.0,328.0,221.0,,,John Millman
1,2015-339,Brisbane,Hard,28,A,2015-01-04,2,4-6 6-1 6-4,3,R32,...,FIN,Denis Kudla,Jarkko Nieminen,123.0,73.0,440.0,689.0,,,Jarkko Nieminen
2,2015-339,Brisbane,Hard,28,A,2015-01-04,3,6-2 6-2,3,R32,...,AUS,Gilles Simon,James Duckworth,21.0,125.0,1730.0,430.0,6.0,,James Duckworth
3,2015-339,Brisbane,Hard,28,A,2015-01-04,4,6-4 6-4,3,R32,...,FRA,Andrey Golubev,Jeremy Chardy,72.0,31.0,691.0,1195.0,,,Jeremy Chardy
4,2015-339,Brisbane,Hard,28,A,2015-01-04,5,6-7(5) 7-6(6) 6-1,3,R32,...,SVK,Jurgen Melzer,Martin Klizan,110.0,34.0,505.0,1094.0,,,Martin Klizan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21604,2022-M-DC-2022-WG2-PO-GRE-JAM-01,Davis Cup WG2 PO: GRE vs JAM,Clay,4,D,2022-03-04,2,6-3 7-5,3,RR,...,GRE,Blaise Bicknell,Petros Tsitsipas,1130.0,808.0,8.0,23.0,,,Blaise Bicknell
21605,2022-M-DC-2022-WG2-PO-GRE-JAM-01,Davis Cup WG2 PO: GRE vs JAM,Clay,4,D,2022-03-04,4,6-4 6-4,3,RR,...,USA,Aristotelis Thanos,Blaise Bicknell,1103.0,1130.0,9.0,8.0,,,Aristotelis Thanos
21606,2022-M-DC-2022-WG2-PO-GRE-JAM-01,Davis Cup WG2 PO: GRE vs JAM,Clay,4,D,2022-03-04,5,4-6 6-4 6-4,3,RR,...,JAM,Petros Tsitsipas,Rowland Phillips,808.0,1390.0,23.0,4.0,,,Petros Tsitsipas
21608,2022-M-DC-2022-WG2-PO-HKG-BEN-01,Davis Cup WG2 PO: HKG vs BEN,Hard,4,D,2022-03-04,2,6-4 6-4,3,RR,...,BEN,Coleman Wong,Delmas Ntcha,1050.0,,10.0,,,,Coleman Wong


In [12]:
# remove parenthetical portion of score for tie-breakers

scores = matches['score'].copy()
matches.loc[:,'score'] = scores.apply(lambda score: re.sub('\(\d*\)', '', score))
matches

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,score,best_of,round,...,player_2_ioc,player_1_name,player_2_name,player_1_rank,player_2_rank,player_1_rank_points,player_2_rank_points,player_1_seed,player_2_seed,winner
0,2015-339,Brisbane,Hard,28,A,2015-01-04,1,6-3 6-1,3,R32,...,USA,John Millman,Rhyne Williams,153.0,220.0,328.0,221.0,,,John Millman
1,2015-339,Brisbane,Hard,28,A,2015-01-04,2,4-6 6-1 6-4,3,R32,...,FIN,Denis Kudla,Jarkko Nieminen,123.0,73.0,440.0,689.0,,,Jarkko Nieminen
2,2015-339,Brisbane,Hard,28,A,2015-01-04,3,6-2 6-2,3,R32,...,AUS,Gilles Simon,James Duckworth,21.0,125.0,1730.0,430.0,6.0,,James Duckworth
3,2015-339,Brisbane,Hard,28,A,2015-01-04,4,6-4 6-4,3,R32,...,FRA,Andrey Golubev,Jeremy Chardy,72.0,31.0,691.0,1195.0,,,Jeremy Chardy
4,2015-339,Brisbane,Hard,28,A,2015-01-04,5,6-7 7-6 6-1,3,R32,...,SVK,Jurgen Melzer,Martin Klizan,110.0,34.0,505.0,1094.0,,,Martin Klizan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21604,2022-M-DC-2022-WG2-PO-GRE-JAM-01,Davis Cup WG2 PO: GRE vs JAM,Clay,4,D,2022-03-04,2,6-3 7-5,3,RR,...,GRE,Blaise Bicknell,Petros Tsitsipas,1130.0,808.0,8.0,23.0,,,Blaise Bicknell
21605,2022-M-DC-2022-WG2-PO-GRE-JAM-01,Davis Cup WG2 PO: GRE vs JAM,Clay,4,D,2022-03-04,4,6-4 6-4,3,RR,...,USA,Aristotelis Thanos,Blaise Bicknell,1103.0,1130.0,9.0,8.0,,,Aristotelis Thanos
21606,2022-M-DC-2022-WG2-PO-GRE-JAM-01,Davis Cup WG2 PO: GRE vs JAM,Clay,4,D,2022-03-04,5,4-6 6-4 6-4,3,RR,...,JAM,Petros Tsitsipas,Rowland Phillips,808.0,1390.0,23.0,4.0,,,Petros Tsitsipas
21608,2022-M-DC-2022-WG2-PO-HKG-BEN-01,Davis Cup WG2 PO: HKG vs BEN,Hard,4,D,2022-03-04,2,6-4 6-4,3,RR,...,BEN,Coleman Wong,Delmas Ntcha,1050.0,,10.0,,,,Coleman Wong


In [13]:
# Function indicating number of games played in match

def total_games(s):
    num_list = re.findall('\d+', s)
    num_list = [int(x) for x in num_list]
    return sum(num_list)

matches.loc[:,'total_games'] = matches['score'].apply(total_games)
matches

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matches.loc[:,'total_games'] = matches['score'].apply(total_games)


Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,score,best_of,round,...,player_1_name,player_2_name,player_1_rank,player_2_rank,player_1_rank_points,player_2_rank_points,player_1_seed,player_2_seed,winner,total_games
0,2015-339,Brisbane,Hard,28,A,2015-01-04,1,6-3 6-1,3,R32,...,John Millman,Rhyne Williams,153.0,220.0,328.0,221.0,,,John Millman,16
1,2015-339,Brisbane,Hard,28,A,2015-01-04,2,4-6 6-1 6-4,3,R32,...,Denis Kudla,Jarkko Nieminen,123.0,73.0,440.0,689.0,,,Jarkko Nieminen,27
2,2015-339,Brisbane,Hard,28,A,2015-01-04,3,6-2 6-2,3,R32,...,Gilles Simon,James Duckworth,21.0,125.0,1730.0,430.0,6.0,,James Duckworth,16
3,2015-339,Brisbane,Hard,28,A,2015-01-04,4,6-4 6-4,3,R32,...,Andrey Golubev,Jeremy Chardy,72.0,31.0,691.0,1195.0,,,Jeremy Chardy,20
4,2015-339,Brisbane,Hard,28,A,2015-01-04,5,6-7 7-6 6-1,3,R32,...,Jurgen Melzer,Martin Klizan,110.0,34.0,505.0,1094.0,,,Martin Klizan,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21604,2022-M-DC-2022-WG2-PO-GRE-JAM-01,Davis Cup WG2 PO: GRE vs JAM,Clay,4,D,2022-03-04,2,6-3 7-5,3,RR,...,Blaise Bicknell,Petros Tsitsipas,1130.0,808.0,8.0,23.0,,,Blaise Bicknell,21
21605,2022-M-DC-2022-WG2-PO-GRE-JAM-01,Davis Cup WG2 PO: GRE vs JAM,Clay,4,D,2022-03-04,4,6-4 6-4,3,RR,...,Aristotelis Thanos,Blaise Bicknell,1103.0,1130.0,9.0,8.0,,,Aristotelis Thanos,20
21606,2022-M-DC-2022-WG2-PO-GRE-JAM-01,Davis Cup WG2 PO: GRE vs JAM,Clay,4,D,2022-03-04,5,4-6 6-4 6-4,3,RR,...,Petros Tsitsipas,Rowland Phillips,808.0,1390.0,23.0,4.0,,,Petros Tsitsipas,30
21608,2022-M-DC-2022-WG2-PO-HKG-BEN-01,Davis Cup WG2 PO: HKG vs BEN,Hard,4,D,2022-03-04,2,6-4 6-4,3,RR,...,Coleman Wong,Delmas Ntcha,1050.0,,10.0,,,,Coleman Wong,20


In [14]:
matches.index = range(matches.shape[0])
matches

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,score,best_of,round,...,player_1_name,player_2_name,player_1_rank,player_2_rank,player_1_rank_points,player_2_rank_points,player_1_seed,player_2_seed,winner,total_games
0,2015-339,Brisbane,Hard,28,A,2015-01-04,1,6-3 6-1,3,R32,...,John Millman,Rhyne Williams,153.0,220.0,328.0,221.0,,,John Millman,16
1,2015-339,Brisbane,Hard,28,A,2015-01-04,2,4-6 6-1 6-4,3,R32,...,Denis Kudla,Jarkko Nieminen,123.0,73.0,440.0,689.0,,,Jarkko Nieminen,27
2,2015-339,Brisbane,Hard,28,A,2015-01-04,3,6-2 6-2,3,R32,...,Gilles Simon,James Duckworth,21.0,125.0,1730.0,430.0,6.0,,James Duckworth,16
3,2015-339,Brisbane,Hard,28,A,2015-01-04,4,6-4 6-4,3,R32,...,Andrey Golubev,Jeremy Chardy,72.0,31.0,691.0,1195.0,,,Jeremy Chardy,20
4,2015-339,Brisbane,Hard,28,A,2015-01-04,5,6-7 7-6 6-1,3,R32,...,Jurgen Melzer,Martin Klizan,110.0,34.0,505.0,1094.0,,,Martin Klizan,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20893,2022-M-DC-2022-WG2-PO-GRE-JAM-01,Davis Cup WG2 PO: GRE vs JAM,Clay,4,D,2022-03-04,2,6-3 7-5,3,RR,...,Blaise Bicknell,Petros Tsitsipas,1130.0,808.0,8.0,23.0,,,Blaise Bicknell,21
20894,2022-M-DC-2022-WG2-PO-GRE-JAM-01,Davis Cup WG2 PO: GRE vs JAM,Clay,4,D,2022-03-04,4,6-4 6-4,3,RR,...,Aristotelis Thanos,Blaise Bicknell,1103.0,1130.0,9.0,8.0,,,Aristotelis Thanos,20
20895,2022-M-DC-2022-WG2-PO-GRE-JAM-01,Davis Cup WG2 PO: GRE vs JAM,Clay,4,D,2022-03-04,5,4-6 6-4 6-4,3,RR,...,Petros Tsitsipas,Rowland Phillips,808.0,1390.0,23.0,4.0,,,Petros Tsitsipas,30
20896,2022-M-DC-2022-WG2-PO-HKG-BEN-01,Davis Cup WG2 PO: HKG vs BEN,Hard,4,D,2022-03-04,2,6-4 6-4,3,RR,...,Coleman Wong,Delmas Ntcha,1050.0,,10.0,,,,Coleman Wong,20


In [15]:
# Arrange matches chronologically
# This will be important, as many added features are based on a player's previous performance

matches2 = matches.copy().sort_values(by = ['tourney_date', 'tourney_id', 'match_num'])
matches2

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,score,best_of,round,...,player_1_name,player_2_name,player_1_rank,player_2_rank,player_1_rank_points,player_2_rank_points,player_1_seed,player_2_seed,winner,total_games
0,2015-339,Brisbane,Hard,28,A,2015-01-04,1,6-3 6-1,3,R32,...,John Millman,Rhyne Williams,153.0,220.0,328.0,221.0,,,John Millman,16
1,2015-339,Brisbane,Hard,28,A,2015-01-04,2,4-6 6-1 6-4,3,R32,...,Denis Kudla,Jarkko Nieminen,123.0,73.0,440.0,689.0,,,Jarkko Nieminen,27
2,2015-339,Brisbane,Hard,28,A,2015-01-04,3,6-2 6-2,3,R32,...,Gilles Simon,James Duckworth,21.0,125.0,1730.0,430.0,6.0,,James Duckworth,16
3,2015-339,Brisbane,Hard,28,A,2015-01-04,4,6-4 6-4,3,R32,...,Andrey Golubev,Jeremy Chardy,72.0,31.0,691.0,1195.0,,,Jeremy Chardy,20
4,2015-339,Brisbane,Hard,28,A,2015-01-04,5,6-7 7-6 6-1,3,R32,...,Jurgen Melzer,Martin Klizan,110.0,34.0,505.0,1094.0,,,Martin Klizan,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20741,2022-M-DC-2022-FLS-M-AUS-CRO-01,Davis Cup Finals SF: AUS vs CRO,Hard,2,D,2022-11-25,2,6-2 6-2,3,RR,...,Alex De Minaur,Marin Cilic,24.0,17.0,1790.0,2105.0,,,Alex De Minaur,16
20750,2022-M-DC-2022-FLS-M-ITA-CAN-01,Davis Cup Finals SF: ITA vs CAN,Hard,2,D,2022-11-26,1,7-6 6-7 6-4,3,RR,...,Denis Shapovalov,Lorenzo Sonego,18.0,45.0,2075.0,950.0,,,Lorenzo Sonego,36
20751,2022-M-DC-2022-FLS-M-ITA-CAN-01,Davis Cup Finals SF: ITA vs CAN,Hard,2,D,2022-11-26,2,6-3 6-4,3,RR,...,Felix Auger Aliassime,Lorenzo Musetti,6.0,23.0,4195.0,1865.0,,,Felix Auger Aliassime,19
20744,2022-M-DC-2022-FLS-M-CAN-AUS-01,Davis Cup Finals F: CAN vs AUS,Hard,2,D,2022-11-27,1,6-2 6-4,3,RR,...,Denis Shapovalov,Thanasi Kokkinakis,18.0,95.0,2075.0,592.0,,,Denis Shapovalov,18


In [16]:
# want to add columns containing head-to-heads between players from previous matches.

# first add column with both players' names

matches2['players'] = list(zip(matches2['player_1_name'], matches2['player_2_name']))

g = matches2.groupby('players')
df_list = []

#group by (player_1_name, player_2_name) and keep running total of wins by each player

for group, df in g:
    df = df[['winner']]
    
    n = df.shape[0]
    
    player_1_h2h = np.zeros(n)
    player_2_h2h = np.zeros(n)
    
    p1 = group[0]
    p2 = group[1]

    for i in range(1,n):
        if df.iloc[i-1,0] == p1:
            player_1_h2h[i] = player_1_h2h[i-1] + 1
            player_2_h2h[i] = player_2_h2h[i-1]
        else:
            player_1_h2h[i] = player_1_h2h[i-1]
            player_2_h2h[i] = player_2_h2h[i-1] + 1
            
    df['player_1_h2h'] = player_1_h2h
    df['player_2_h2h'] = player_2_h2h
    
    df_list.append(df)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['player_1_h2h'] = player_1_h2h
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['player_2_h2h'] = player_2_h2h
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['player_1_h2h'] = player_1_h2h
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_index

In [17]:
df_h2h = pd.concat(df_list, axis = 0)
df_h2h

Unnamed: 0,winner,player_1_h2h,player_2_h2h
3476,Jeson Patrombon,0.0,0.0
1778,Abdullah Maqdas,0.0,0.0
5963,Jirat Navasirisomboon,0.0,0.0
1776,Shahin Khaledan,0.0,0.0
5965,Wishaya Trongcharoenchaikul,0.0,0.0
...,...,...,...
3435,Ze Zhang,0.0,0.0
14580,Yasutaka Uchiyama,0.0,0.0
13951,Yoshihito Nishioka,0.0,0.0
11009,Yibing Wu,0.0,0.0


In [18]:
df_h2h = df_h2h.drop('winner', axis = 1)
df_h2h

Unnamed: 0,player_1_h2h,player_2_h2h
3476,0.0,0.0
1778,0.0,0.0
5963,0.0,0.0
1776,0.0,0.0
5965,0.0,0.0
...,...,...
3435,0.0,0.0
14580,0.0,0.0
13951,0.0,0.0
11009,0.0,0.0


In [19]:
matches2 = pd.merge( df_h2h,matches2, left_index=True, right_index=True, how='inner')
matches2 = matches2.drop('players', axis = 1)
matches2

Unnamed: 0,player_1_h2h,player_2_h2h,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,score,...,player_1_name,player_2_name,player_1_rank,player_2_rank,player_1_rank_points,player_2_rank_points,player_1_seed,player_2_seed,winner,total_games
3476,0.0,0.0,2016-M-DC-2016-G2-AO-M-PHI-KUW-01,Davis Cup G2 R1: PHI vs KUW,Clay,4,D,2016-03-04,4,7-6 6-3,...,Abdulhamid Mubarak,Jeson Patrombon,,1198.0,,7.0,,,Jeson Patrombon,22
1778,0.0,0.0,2015-D069,Davis Cup G2 PO: IRI vs KUW,Clay,4,D,2015-07-17,4,6-0 7-6 4-6 6-2,...,Abdullah Maqdas,Anoosha Shahgholi,1695.0,1527.0,2.0,3.0,,,Abdullah Maqdas,37
5963,0.0,0.0,2017-M-DC-2017-G2-AO-M-KUW-THA-01,Davis Cup G2 R1: KUW vs THA,Hard,4,D,2017-02-03,1,6-2 6-4 6-7 6-4,...,Abdullah Maqdas,Jirat Navasirisomboon,1298.0,997.0,4.0,12.0,,,Jirat Navasirisomboon,41
1776,0.0,0.0,2015-D069,Davis Cup G2 PO: IRI vs KUW,Clay,4,D,2015-07-17,1,4-6 6-2 6-0 7-5,...,Abdullah Maqdas,Shahin Khaledan,1695.0,2101.0,2.0,1.0,,,Shahin Khaledan,36
5965,0.0,0.0,2017-M-DC-2017-G2-AO-M-KUW-THA-01,Davis Cup G2 R1: KUW vs THA,Hard,4,D,2017-02-03,4,6-4 6-2 5-7 4-6 7-6,...,Abdullah Maqdas,Wishaya Trongcharoenchaikul,1298.0,773.0,4.0,25.0,,,Wishaya Trongcharoenchaikul,53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3435,0.0,0.0,2016-M-DC-2016-G1-AO-M-PAK-CHN-01,Davis Cup G1 R1: PAK vs CHN,Clay,4,D,2016-03-04,5,6-3 6-0,...,Yasir Khan,Ze Zhang,,195.0,,271.0,,,Ze Zhang,15
14580,0.0,0.0,2020-0495,Dubai,Hard,32,A,2020-02-24,280,3-6 6-1 6-2,...,Yasutaka Uchiyama,Yen Hsun Lu,99.0,962.0,604.0,10.0,,,Yasutaka Uchiyama,24
13951,0.0,0.0,2019-M-DC-2019-QLS-M-JPN-CHN-01,Davis Cup QLS R1: JPN vs CHN,Hard,4,D,2019-02-01,4,6-2 6-0,...,Yibing Wu,Yoshihito Nishioka,309.0,66.0,105.0,758.0,,,Yoshihito Nishioka,14
11009,0.0,0.0,2018-5014,Shanghai Masters,Hard,64,M,2018-10-08,263,7-6 6-2,...,Yibing Wu,Zhe Li,415.0,254.0,91.0,214.0,,,Yibing Wu,21


In [20]:
# impute missing player rankings

n = matches2.shape[0]

p1_rank_missing = matches2['player_1_rank'].isnull()
p2_rank_missing = matches2['player_2_rank'].isnull()

for i in range(n):
    if p1_rank_missing[i]:
        week = matches2.loc[i, 'tourney_date']
        week -= timedelta(week.weekday())
        p1_id = matches2.loc[i, 'player_1_id']
        
        mask = (rankings['week'] == week) & (rankings['player_id'] == p1_id)
        if rankings[mask].shape[0] > 1:
            matches2.loc[i, 'player_1_rank'] = rankings.loc[mask, 'ranking'][0]
            
    if p2_rank_missing[i]:
        week = matches2.loc[i, 'tourney_date']
        week -= timedelta(week.weekday())
        p2_id = matches2.loc[i, 'player_2_id']
        
        mask = (rankings['week'] == week) & (rankings['player_id'] == p2_id)
        if rankings[mask].shape[0] > 1:
            matches2.loc[i, 'player_2_rank'] = rankings.loc[mask, 'ranking'][0]
            
matches2

Unnamed: 0,player_1_h2h,player_2_h2h,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,score,...,player_1_name,player_2_name,player_1_rank,player_2_rank,player_1_rank_points,player_2_rank_points,player_1_seed,player_2_seed,winner,total_games
3476,0.0,0.0,2016-M-DC-2016-G2-AO-M-PHI-KUW-01,Davis Cup G2 R1: PHI vs KUW,Clay,4,D,2016-03-04,4,7-6 6-3,...,Abdulhamid Mubarak,Jeson Patrombon,,1198.0,,7.0,,,Jeson Patrombon,22
1778,0.0,0.0,2015-D069,Davis Cup G2 PO: IRI vs KUW,Clay,4,D,2015-07-17,4,6-0 7-6 4-6 6-2,...,Abdullah Maqdas,Anoosha Shahgholi,1695.0,1527.0,2.0,3.0,,,Abdullah Maqdas,37
5963,0.0,0.0,2017-M-DC-2017-G2-AO-M-KUW-THA-01,Davis Cup G2 R1: KUW vs THA,Hard,4,D,2017-02-03,1,6-2 6-4 6-7 6-4,...,Abdullah Maqdas,Jirat Navasirisomboon,1298.0,997.0,4.0,12.0,,,Jirat Navasirisomboon,41
1776,0.0,0.0,2015-D069,Davis Cup G2 PO: IRI vs KUW,Clay,4,D,2015-07-17,1,4-6 6-2 6-0 7-5,...,Abdullah Maqdas,Shahin Khaledan,1695.0,2101.0,2.0,1.0,,,Shahin Khaledan,36
5965,0.0,0.0,2017-M-DC-2017-G2-AO-M-KUW-THA-01,Davis Cup G2 R1: KUW vs THA,Hard,4,D,2017-02-03,4,6-4 6-2 5-7 4-6 7-6,...,Abdullah Maqdas,Wishaya Trongcharoenchaikul,1298.0,773.0,4.0,25.0,,,Wishaya Trongcharoenchaikul,53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3435,0.0,0.0,2016-M-DC-2016-G1-AO-M-PAK-CHN-01,Davis Cup G1 R1: PAK vs CHN,Clay,4,D,2016-03-04,5,6-3 6-0,...,Yasir Khan,Ze Zhang,,195.0,,271.0,,,Ze Zhang,15
14580,0.0,0.0,2020-0495,Dubai,Hard,32,A,2020-02-24,280,3-6 6-1 6-2,...,Yasutaka Uchiyama,Yen Hsun Lu,99.0,962.0,604.0,10.0,,,Yasutaka Uchiyama,24
13951,0.0,0.0,2019-M-DC-2019-QLS-M-JPN-CHN-01,Davis Cup QLS R1: JPN vs CHN,Hard,4,D,2019-02-01,4,6-2 6-0,...,Yibing Wu,Yoshihito Nishioka,309.0,66.0,105.0,758.0,,,Yoshihito Nishioka,14
11009,0.0,0.0,2018-5014,Shanghai Masters,Hard,64,M,2018-10-08,263,7-6 6-2,...,Yibing Wu,Zhe Li,415.0,254.0,91.0,214.0,,,Yibing Wu,21


In [21]:
# a column indicating whether the match was won by player 1 or player 2
# will serve as the target variable for machine learning models

matches2['target'] = np.where(matches2['winner'] == matches2['player_1_name'],
                              'player_1',
                              'player_2')
matches2

Unnamed: 0,player_1_h2h,player_2_h2h,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,score,...,player_2_name,player_1_rank,player_2_rank,player_1_rank_points,player_2_rank_points,player_1_seed,player_2_seed,winner,total_games,target
3476,0.0,0.0,2016-M-DC-2016-G2-AO-M-PHI-KUW-01,Davis Cup G2 R1: PHI vs KUW,Clay,4,D,2016-03-04,4,7-6 6-3,...,Jeson Patrombon,,1198.0,,7.0,,,Jeson Patrombon,22,player_2
1778,0.0,0.0,2015-D069,Davis Cup G2 PO: IRI vs KUW,Clay,4,D,2015-07-17,4,6-0 7-6 4-6 6-2,...,Anoosha Shahgholi,1695.0,1527.0,2.0,3.0,,,Abdullah Maqdas,37,player_1
5963,0.0,0.0,2017-M-DC-2017-G2-AO-M-KUW-THA-01,Davis Cup G2 R1: KUW vs THA,Hard,4,D,2017-02-03,1,6-2 6-4 6-7 6-4,...,Jirat Navasirisomboon,1298.0,997.0,4.0,12.0,,,Jirat Navasirisomboon,41,player_2
1776,0.0,0.0,2015-D069,Davis Cup G2 PO: IRI vs KUW,Clay,4,D,2015-07-17,1,4-6 6-2 6-0 7-5,...,Shahin Khaledan,1695.0,2101.0,2.0,1.0,,,Shahin Khaledan,36,player_2
5965,0.0,0.0,2017-M-DC-2017-G2-AO-M-KUW-THA-01,Davis Cup G2 R1: KUW vs THA,Hard,4,D,2017-02-03,4,6-4 6-2 5-7 4-6 7-6,...,Wishaya Trongcharoenchaikul,1298.0,773.0,4.0,25.0,,,Wishaya Trongcharoenchaikul,53,player_2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3435,0.0,0.0,2016-M-DC-2016-G1-AO-M-PAK-CHN-01,Davis Cup G1 R1: PAK vs CHN,Clay,4,D,2016-03-04,5,6-3 6-0,...,Ze Zhang,,195.0,,271.0,,,Ze Zhang,15,player_2
14580,0.0,0.0,2020-0495,Dubai,Hard,32,A,2020-02-24,280,3-6 6-1 6-2,...,Yen Hsun Lu,99.0,962.0,604.0,10.0,,,Yasutaka Uchiyama,24,player_1
13951,0.0,0.0,2019-M-DC-2019-QLS-M-JPN-CHN-01,Davis Cup QLS R1: JPN vs CHN,Hard,4,D,2019-02-01,4,6-2 6-0,...,Yoshihito Nishioka,309.0,66.0,105.0,758.0,,,Yoshihito Nishioka,14,player_2
11009,0.0,0.0,2018-5014,Shanghai Masters,Hard,64,M,2018-10-08,263,7-6 6-2,...,Zhe Li,415.0,254.0,91.0,214.0,,,Yibing Wu,21,player_1


In [22]:
n = matches2.shape[0]

p1_matches = matches2[['tourney_date', 'match_num', 'player_1_name', 'winner']].copy()
p1_matches.columns = ['tourney_date', 'match_num', 'player', 'winner']
p2_matches = matches2[['tourney_date', 'match_num', 'player_2_name', 'winner']].copy()
p2_matches.columns = ['tourney_date', 'match_num', 'player', 'winner']

p1_matches['winner'] = (p1_matches['winner'] == p1_matches['player']).astype('int')
p2_matches['winner'] = (p2_matches['winner'] == p2_matches['player']).astype('int')

p1_matches['category'] = np.ones(n)
p2_matches['category'] = np.ones(n)*2

all_players = pd.concat([p1_matches, p2_matches], axis = 0)
all_players = all_players.sort_values(by = ['tourney_date', 'match_num'], axis = 0)

groups = all_players.groupby('player')

df_list = []
for group, df in groups:
    df['cumulative_matches'] = range(df.shape[0])
    df['cumulative_wins'] = df['winner'].cumsum().apply(lambda x: max(0, x - 1))
    df_list.append(df)
    
all_players = pd.concat(df_list, axis = 0)

for group, df in all_players.groupby('category'):
    df = df.drop(['tourney_date', 'match_num', 'player', 'winner', 'category'], axis = 1)
    df.columns = ['player_' + str(int(group)) + '_matches',
                  'player_' + str(int(group)) + '_wins']
    matches2 = pd.merge(matches2, df, left_index = True, right_index = True, sort = True)

In [23]:
matches2

Unnamed: 0,player_1_h2h,player_2_h2h,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,score,...,player_2_rank_points,player_1_seed,player_2_seed,winner,total_games,target,player_1_matches,player_1_wins,player_2_matches,player_2_wins
0,0.0,0.0,2015-339,Brisbane,Hard,28,A,2015-01-04,1,6-3 6-1,...,221.0,,,John Millman,16,player_1,0,0,0,0
1,0.0,0.0,2015-339,Brisbane,Hard,28,A,2015-01-04,2,4-6 6-1 6-4,...,689.0,,,Jarkko Nieminen,27,player_2,0,0,0,0
2,0.0,0.0,2015-339,Brisbane,Hard,28,A,2015-01-04,3,6-2 6-2,...,430.0,6.0,,James Duckworth,16,player_2,0,0,0,0
3,0.0,0.0,2015-339,Brisbane,Hard,28,A,2015-01-04,4,6-4 6-4,...,1195.0,,,Jeremy Chardy,20,player_2,0,0,0,0
4,0.0,0.0,2015-339,Brisbane,Hard,28,A,2015-01-04,5,6-7 7-6 6-1,...,1094.0,,,Martin Klizan,33,player_2,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20893,0.0,0.0,2022-M-DC-2022-WG2-PO-GRE-JAM-01,Davis Cup WG2 PO: GRE vs JAM,Clay,4,D,2022-03-04,2,6-3 7-5,...,23.0,,,Blaise Bicknell,21,player_1,2,1,4,1
20894,0.0,0.0,2022-M-DC-2022-WG2-PO-GRE-JAM-01,Davis Cup WG2 PO: GRE vs JAM,Clay,4,D,2022-03-04,4,6-4 6-4,...,8.0,,,Aristotelis Thanos,20,player_1,1,0,3,1
20895,0.0,0.0,2022-M-DC-2022-WG2-PO-GRE-JAM-01,Davis Cup WG2 PO: GRE vs JAM,Clay,4,D,2022-03-04,5,4-6 6-4 6-4,...,4.0,,,Petros Tsitsipas,30,player_1,5,2,2,0
20896,0.0,0.0,2022-M-DC-2022-WG2-PO-HKG-BEN-01,Davis Cup WG2 PO: HKG vs BEN,Hard,4,D,2022-03-04,2,6-4 6-4,...,,,,Coleman Wong,20,player_1,1,0,0,0


In [24]:
def winpct(x, y):
    
    if x == 0:
        return 0
    else:
        return y/x

winpct_vec = np.vectorize(winpct, otypes=[float])
matches2['player_1_win_pct'] = winpct_vec(matches2['player_1_matches'], matches2['player_1_wins'])
matches2['player_2_win_pct'] = winpct_vec(matches2['player_2_matches'], matches2['player_2_wins'])

matches2

Unnamed: 0,player_1_h2h,player_2_h2h,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,score,...,player_2_seed,winner,total_games,target,player_1_matches,player_1_wins,player_2_matches,player_2_wins,player_1_win_pct,player_2_win_pct
0,0.0,0.0,2015-339,Brisbane,Hard,28,A,2015-01-04,1,6-3 6-1,...,,John Millman,16,player_1,0,0,0,0,0.0,0.000000
1,0.0,0.0,2015-339,Brisbane,Hard,28,A,2015-01-04,2,4-6 6-1 6-4,...,,Jarkko Nieminen,27,player_2,0,0,0,0,0.0,0.000000
2,0.0,0.0,2015-339,Brisbane,Hard,28,A,2015-01-04,3,6-2 6-2,...,,James Duckworth,16,player_2,0,0,0,0,0.0,0.000000
3,0.0,0.0,2015-339,Brisbane,Hard,28,A,2015-01-04,4,6-4 6-4,...,,Jeremy Chardy,20,player_2,0,0,0,0,0.0,0.000000
4,0.0,0.0,2015-339,Brisbane,Hard,28,A,2015-01-04,5,6-7 7-6 6-1,...,,Martin Klizan,33,player_2,0,0,0,0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20893,0.0,0.0,2022-M-DC-2022-WG2-PO-GRE-JAM-01,Davis Cup WG2 PO: GRE vs JAM,Clay,4,D,2022-03-04,2,6-3 7-5,...,,Blaise Bicknell,21,player_1,2,1,4,1,0.5,0.250000
20894,0.0,0.0,2022-M-DC-2022-WG2-PO-GRE-JAM-01,Davis Cup WG2 PO: GRE vs JAM,Clay,4,D,2022-03-04,4,6-4 6-4,...,,Aristotelis Thanos,20,player_1,1,0,3,1,0.0,0.333333
20895,0.0,0.0,2022-M-DC-2022-WG2-PO-GRE-JAM-01,Davis Cup WG2 PO: GRE vs JAM,Clay,4,D,2022-03-04,5,4-6 6-4 6-4,...,,Petros Tsitsipas,30,player_1,5,2,2,0,0.4,0.000000
20896,0.0,0.0,2022-M-DC-2022-WG2-PO-HKG-BEN-01,Davis Cup WG2 PO: HKG vs BEN,Hard,4,D,2022-03-04,2,6-4 6-4,...,,Coleman Wong,20,player_1,1,0,0,0,0.0,0.000000


In [25]:
# recent form is overall win % plus a "penalty" of log10(1-(overall win % ) + (last 6 months win %))

n = matches2.shape[0]

player_1_recent_form = np.zeros(n)
player_2_recent_form = np.zeros(n)

for i in range(n):
    p1 = matches2.loc[i, 'player_1_name']
    p2 = matches2.loc[i, 'player_2_name']
    
    current_date = matches2.loc[i, 'tourney_date']
    match_number = matches2.loc[i, 'match_num']
    
    mask1 = matches2['tourney_date'] < current_date
    mask2 = (matches2['tourney_date'] == current_date) & (matches2['match_num'] < match_number)
    mask3 = matches2['tourney_date'] >= current_date - timedelta(180)
    
    recent_matches = matches2[(mask1 | mask2) & mask3]
    
    mask4 = (recent_matches['player_1_name'] == p1) | (recent_matches['player_2_name'] == p1)
    mask5 = (recent_matches['winner'] == p1)
    
    p1_matches = recent_matches[mask4].shape[0]
    p1_wins = recent_matches[mask5].shape[0]
    
    if p1_wins >= 10:
        player_1_recent_form[i] = (matches2.loc[i, 'player_1_win_pct']
                                   + np.log10(1 - matches2.loc[1, 'player_1_win_pct']
                                              + winpct(p1_matches, p1_wins)))
    else:
        player_1_recent_form[i] = matches2.loc[i, 'player_1_win_pct']
    
    mask6 = (recent_matches['player_1_name'] == p2) | (recent_matches['player_2_name'] == p2)
    mask7 = (recent_matches['winner'] == p2)
    
    p2_matches = recent_matches[mask6].shape[0]
    p2_wins = recent_matches[mask7].shape[0]
    
    if p2_wins >= 10:
        player_2_recent_form[i] = (matches2.loc[i, 'player_2_win_pct']
                                   + np.log10(1 - matches2.loc[1, 'player_2_win_pct']
                                              + winpct(p2_matches, p2_wins)))
    else:
        player_2_recent_form[i] = matches2.loc[i, 'player_2_win_pct']

In [26]:
matches2['player_1_recent_form'] = player_1_recent_form
matches2['player_2_recent_form'] = player_2_recent_form
matches2

Unnamed: 0,player_1_h2h,player_2_h2h,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,score,...,total_games,target,player_1_matches,player_1_wins,player_2_matches,player_2_wins,player_1_win_pct,player_2_win_pct,player_1_recent_form,player_2_recent_form
0,0.0,0.0,2015-339,Brisbane,Hard,28,A,2015-01-04,1,6-3 6-1,...,16,player_1,0,0,0,0,0.0,0.000000,0.0,0.000000
1,0.0,0.0,2015-339,Brisbane,Hard,28,A,2015-01-04,2,4-6 6-1 6-4,...,27,player_2,0,0,0,0,0.0,0.000000,0.0,0.000000
2,0.0,0.0,2015-339,Brisbane,Hard,28,A,2015-01-04,3,6-2 6-2,...,16,player_2,0,0,0,0,0.0,0.000000,0.0,0.000000
3,0.0,0.0,2015-339,Brisbane,Hard,28,A,2015-01-04,4,6-4 6-4,...,20,player_2,0,0,0,0,0.0,0.000000,0.0,0.000000
4,0.0,0.0,2015-339,Brisbane,Hard,28,A,2015-01-04,5,6-7 7-6 6-1,...,33,player_2,0,0,0,0,0.0,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20893,0.0,0.0,2022-M-DC-2022-WG2-PO-GRE-JAM-01,Davis Cup WG2 PO: GRE vs JAM,Clay,4,D,2022-03-04,2,6-3 7-5,...,21,player_1,2,1,4,1,0.5,0.250000,0.5,0.250000
20894,0.0,0.0,2022-M-DC-2022-WG2-PO-GRE-JAM-01,Davis Cup WG2 PO: GRE vs JAM,Clay,4,D,2022-03-04,4,6-4 6-4,...,20,player_1,1,0,3,1,0.0,0.333333,0.0,0.333333
20895,0.0,0.0,2022-M-DC-2022-WG2-PO-GRE-JAM-01,Davis Cup WG2 PO: GRE vs JAM,Clay,4,D,2022-03-04,5,4-6 6-4 6-4,...,30,player_1,5,2,2,0,0.4,0.000000,0.4,0.000000
20896,0.0,0.0,2022-M-DC-2022-WG2-PO-HKG-BEN-01,Davis Cup WG2 PO: HKG vs BEN,Hard,4,D,2022-03-04,2,6-4 6-4,...,20,player_1,1,0,0,0,0.0,0.000000,0.0,0.000000


In [27]:
print('number of observations with \'surface\' missing:', matches2['surface'].isnull().sum())
# there are 0 matches with the feature 'surface' missing

number of observations with 'surface' missing: 0


In [28]:
# player's win % on surface

n = matches2.shape[0]

p1_matches = matches2[['tourney_date', 'match_num', 'surface', 'player_1_name', 'winner']].copy()
p1_matches.columns = ['tourney_date', 'match_num', 'surface', 'player', 'winner']
p2_matches = matches2[['tourney_date', 'match_num', 'surface', 'player_2_name', 'winner']].copy()
p2_matches.columns = ['tourney_date', 'match_num', 'surface', 'player', 'winner']

p1_matches['winner'] = (p1_matches['winner'] == p1_matches['player']).astype('int')
p2_matches['winner'] = (p2_matches['winner'] == p2_matches['player']).astype('int')

p1_matches['category'] = np.ones(n)
p2_matches['category'] = np.ones(n)*2

all_players = pd.concat([p1_matches, p2_matches], axis = 0)
all_players = all_players.sort_values(by = ['tourney_date', 'match_num'], axis = 0)

groups = all_players.groupby(['surface', 'player'])

df_list = []
for (group, surface), df in groups:
    df['cumulative_matches'] = range(df.shape[0])
    df['cumulative_wins'] = df['winner'].cumsum().apply(lambda x: max(0, x - 1))
    df_list.append(df)
    
all_players = pd.concat(df_list, axis = 0)

for group, df in all_players.groupby('category'):
    df = df.drop(['tourney_date', 'match_num', 'surface', 'player', 'winner', 'category'], axis = 1)
    df.columns = ['player_' + str(int(group)) + '_surface_matches',
                  'player_' + str(int(group)) + '_surface_wins']

    matches2 = pd.merge(matches2, df, left_index = True, right_index = True, sort = True)

In [29]:
matches2['player_1_surface_win_pct'] = winpct_vec(matches2['player_1_surface_matches'],
                                                  matches2['player_1_surface_wins'])
matches2['player_2_surface_win_pct'] = winpct_vec(matches2['player_2_surface_matches'],
                                                  matches2['player_2_surface_wins'])

matches2

Unnamed: 0,player_1_h2h,player_2_h2h,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,score,...,player_1_win_pct,player_2_win_pct,player_1_recent_form,player_2_recent_form,player_1_surface_matches,player_1_surface_wins,player_2_surface_matches,player_2_surface_wins,player_1_surface_win_pct,player_2_surface_win_pct
0,0.0,0.0,2015-339,Brisbane,Hard,28,A,2015-01-04,1,6-3 6-1,...,0.0,0.000000,0.0,0.000000,0,0,0,0,0.000000,0.0
1,0.0,0.0,2015-339,Brisbane,Hard,28,A,2015-01-04,2,4-6 6-1 6-4,...,0.0,0.000000,0.0,0.000000,0,0,0,0,0.000000,0.0
2,0.0,0.0,2015-339,Brisbane,Hard,28,A,2015-01-04,3,6-2 6-2,...,0.0,0.000000,0.0,0.000000,0,0,0,0,0.000000,0.0
3,0.0,0.0,2015-339,Brisbane,Hard,28,A,2015-01-04,4,6-4 6-4,...,0.0,0.000000,0.0,0.000000,0,0,0,0,0.000000,0.0
4,0.0,0.0,2015-339,Brisbane,Hard,28,A,2015-01-04,5,6-7 7-6 6-1,...,0.0,0.000000,0.0,0.000000,0,0,0,0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20893,0.0,0.0,2022-M-DC-2022-WG2-PO-GRE-JAM-01,Davis Cup WG2 PO: GRE vs JAM,Clay,4,D,2022-03-04,2,6-3 7-5,...,0.5,0.250000,0.5,0.250000,0,0,2,1,0.000000,0.5
20894,0.0,0.0,2022-M-DC-2022-WG2-PO-GRE-JAM-01,Davis Cup WG2 PO: GRE vs JAM,Clay,4,D,2022-03-04,4,6-4 6-4,...,0.0,0.333333,0.0,0.333333,0,0,1,0,0.000000,0.0
20895,0.0,0.0,2022-M-DC-2022-WG2-PO-GRE-JAM-01,Davis Cup WG2 PO: GRE vs JAM,Clay,4,D,2022-03-04,5,4-6 6-4 6-4,...,0.4,0.000000,0.4,0.000000,3,2,1,0,0.666667,0.0
20896,0.0,0.0,2022-M-DC-2022-WG2-PO-HKG-BEN-01,Davis Cup WG2 PO: HKG vs BEN,Hard,4,D,2022-03-04,2,6-4 6-4,...,0.0,0.000000,0.0,0.000000,1,0,0,0,0.000000,0.0


In [30]:
matches2['tourney_level'].value_counts()

#different tournament levels:
    #G: grand slam
    #D: fed cup
    #F: Tour finals

tourney_level
A    11425
M     3903
G     3777
D     1628
F      165
Name: count, dtype: int64

In [31]:
#there is an overabundance of the value 0 in many engineered features,
#which will lead to inaccuracy in predictions
#to resolve this, restrict to matches in which both players have played at least 20 matches
#thus, the following predictive models should only be used to predict winner between players who have played
#20 matches in their career at the 5 tournament levels considered in this project

mask = (matches2['player_1_matches'] >= 20) & (matches2['player_2_matches'] >= 20)
matches3 = matches2[mask].copy()

print(matches3.shape[0])
matches3 #we still have 14344 observations

14344


Unnamed: 0,player_1_h2h,player_2_h2h,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,score,...,player_1_win_pct,player_2_win_pct,player_1_recent_form,player_2_recent_form,player_1_surface_matches,player_1_surface_wins,player_2_surface_matches,player_2_surface_wins,player_1_surface_win_pct,player_2_surface_win_pct
873,0.0,0.0,2015-403,Miami Masters,Hard,96,M,2015-03-25,69,6-2 6-2,...,0.826087,0.700000,1.087608,0.943038,23,19,20,14,0.826087,0.700000
881,0.0,1.0,2015-403,Miami Masters,Hard,96,M,2015-03-25,77,6-7 7-6 6-1,...,0.730769,0.800000,0.978554,1.055273,26,19,25,20,0.730769,0.800000
886,1.0,0.0,2015-403,Miami Masters,Hard,96,M,2015-03-25,82,7-6 6-0,...,0.909091,0.666667,1.189918,0.900750,18,16,21,14,0.888889,0.666667
890,0.0,0.0,2015-403,Miami Masters,Hard,96,M,2015-03-25,86,6-4 3-6 6-3,...,0.818182,0.636364,1.077819,0.862143,22,18,22,14,0.818182,0.636364
892,0.0,0.0,2015-403,Miami Masters,Hard,96,M,2015-03-25,89,7-5 7-5,...,0.869565,0.916667,1.151290,1.199213,19,16,24,22,0.842105,0.916667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20814,0.0,0.0,2022-M-DC-2022-WG1-M-ECU-SUI-01,Davis Cup WG1 R1: ECU vs SUI,Hard,4,D,2022-09-17,2,6-4 6-3,...,0.370370,0.394231,0.370370,0.394231,20,6,60,23,0.300000,0.383333
20815,0.0,0.0,2022-M-DC-2022-WG1-M-ECU-SUI-01,Davis Cup WG1 R1: ECU vs SUI,Hard,4,D,2022-09-17,4,6-7 6-2 6-2,...,0.392857,0.416667,0.392857,0.416667,21,7,11,5,0.333333,0.454545
20824,0.0,0.0,2022-M-DC-2022-WG1-M-IND-NOR-01,Davis Cup WG1 R1: IND vs NOR,Hard,4,D,2022-09-16,1,6-1 6-4,...,0.656780,0.243243,0.896003,0.243243,95,55,27,7,0.578947,0.259259
20831,0.0,0.0,2022-M-DC-2022-WG1-M-SVK-ROU-01,Davis Cup WG1 R1: SVK vs ROU,Clay,4,D,2022-09-16,2,7-5 6-1,...,0.557692,0.405172,0.758052,0.405172,26,18,24,3,0.692308,0.125000


In [32]:
p1_missing_rank = matches3[matches3['player_1_rank'].isnull()][['player_1_name', 'player_1_id', 'tourney_name', 'tourney_date']]
p1_missing_rank
#there are still some rankings missing
#these are players that were unranked at the time of the match

Unnamed: 0,player_1_name,player_1_id,tourney_name,tourney_date
14456,Jack Sock,106058,Delray Beach,2020-02-17
14468,Jack Sock,106058,Delray Beach,2020-02-17
14723,Andrey Kuznetsov,105723,Us Open,2020-08-31
14758,Andrey Kuznetsov,105723,Us Open,2020-08-31


In [33]:
p2_missing_rank = matches3[matches3['player_2_rank'].isnull()][['player_2_name', 'player_2_id', 'tourney_name', 'tourney_date']]
p2_missing_rank

Unnamed: 0,player_2_name,player_2_id,tourney_name,tourney_date
7012,Thanasi Kokkinakis,106423,Lyon,2017-05-22
7043,Thanasi Kokkinakis,106423,Roland Garros,2017-05-29
11324,Steve Darcis,104327,Pune,2018-12-31
11327,Steve Darcis,104327,Pune,2018-12-31
11333,Steve Darcis,104327,Pune,2018-12-31
11343,Steve Darcis,104327,Pune,2018-12-31
11476,Janko Tipsarevic,104386,Australian Open,2019-01-14
14165,Yen Hsun Lu,104229,Australian Open,2020-01-20
16482,Yuki Bhambri,106026,Dubai,2021-03-15
17650,Nicolas Kicker,106044,Cordoba,2021-02-22


In [34]:
mask_player1 = matches3['player_1_rank'].isnull()
mask_player2 = matches3['player_2_rank'].isnull()

# Combine the masks using logical OR to identify rows with missing ranks for either player
mask_missing_ranks = mask_player1 | mask_player2

# Drop rows with missing ranks from matches3
matches3 = matches3[~mask_missing_ranks]

In [35]:
matches3.isnull().sum()[matches3.isnull().sum() > 0]

minutes            1048
w_ace               197
w_df                197
w_svpt              197
w_1stIn             197
w_1stWon            197
w_2ndWon            197
w_SvGms             197
w_bpSaved           197
w_bpFaced           197
l_ace               197
l_df                197
l_svpt              197
l_1stIn             197
l_1stWon            197
l_2ndWon            197
l_SvGms             197
l_bpSaved           197
l_bpFaced           197
player_1_entry    12718
player_2_entry    12424
player_1_seed      8442
player_2_seed      8803
dtype: int64

In [36]:
matches3_copy = matches3.copy()

matches3_copy['h2h_diff'] = matches3_copy['player_1_h2h'] - matches3_copy['player_2_h2h']
matches3_copy['recent_form_diff'] = matches3_copy['player_1_recent_form'] - matches3_copy['player_2_recent_form']
matches3_copy['surface_diff'] = matches3_copy['player_1_surface_win_pct'] - matches3_copy['player_2_surface_win_pct']
matches3_copy['level_diff'] = matches3_copy['player_1_win_pct'] - matches3_copy['player_2_win_pct']
matches3_copy['log_player_1_rank'] = np.log(matches3_copy['player_1_rank'])
matches3_copy['log_player_2_rank'] = np.log(matches3_copy['player_2_rank'])
matches3_copy['log_rank_ratio'] = matches3_copy['log_player_1_rank'] - matches3_copy['log_player_2_rank']
matches3_copy['age_diff'] = matches3_copy['player_1_age'] - matches3_copy['player_2_age']

In [37]:
matches3_copy

Unnamed: 0,player_1_h2h,player_2_h2h,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,score,...,player_1_surface_win_pct,player_2_surface_win_pct,h2h_diff,recent_form_diff,surface_diff,level_diff,log_player_1_rank,log_player_2_rank,log_rank_ratio,age_diff
873,0.0,0.0,2015-403,Miami Masters,Hard,96,M,2015-03-25,69,6-2 6-2,...,0.826087,0.700000,0.0,0.144570,0.126087,0.126087,1.609438,3.663562,-2.054124,-3.9
881,0.0,1.0,2015-403,Miami Masters,Hard,96,M,2015-03-25,77,6-7 7-6 6-1,...,0.730769,0.800000,-1.0,-0.076719,-0.069231,-0.069231,3.367296,2.197225,1.170071,-7.1
886,1.0,0.0,2015-403,Miami Masters,Hard,96,M,2015-03-25,82,7-6 6-0,...,0.888889,0.666667,1.0,0.289168,0.222222,0.242424,1.945910,2.639057,-0.693147,2.7
890,0.0,0.0,2015-403,Miami Masters,Hard,96,M,2015-03-25,86,6-4 3-6 6-3,...,0.818182,0.636364,0.0,0.215676,0.181818,0.181818,1.386294,2.833213,-1.446919,-1.0
892,0.0,0.0,2015-403,Miami Masters,Hard,96,M,2015-03-25,89,7-5 7-5,...,0.842105,0.916667,0.0,-0.047923,-0.074561,-0.047101,1.945910,0.000000,1.945910,5.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20814,0.0,0.0,2022-M-DC-2022-WG1-M-ECU-SUI-01,Davis Cup WG1 R1: ECU vs SUI,Hard,4,D,2022-09-17,2,6-4 6-3,...,0.300000,0.383333,0.0,-0.023860,-0.083333,-0.023860,4.718499,4.812184,-0.093685,0.3
20815,0.0,0.0,2022-M-DC-2022-WG1-M-ECU-SUI-01,Davis Cup WG1 R1: ECU vs SUI,Hard,4,D,2022-09-17,4,6-7 6-2 6-2,...,0.333333,0.454545,0.0,-0.023810,-0.121212,-0.023810,4.718499,4.465908,0.252591,4.5
20824,0.0,0.0,2022-M-DC-2022-WG1-M-IND-NOR-01,Davis Cup WG1 R1: IND vs NOR,Hard,4,D,2022-09-16,1,6-1 6-4,...,0.578947,0.259259,0.0,0.652759,0.319688,0.413536,0.693147,5.814131,-5.120983,-9.1
20831,0.0,0.0,2022-M-DC-2022-WG1-M-SVK-ROU-01,Davis Cup WG1 R1: SVK vs ROU,Clay,4,D,2022-09-16,2,7-5 6-1,...,0.692308,0.125000,0.0,0.352880,0.567308,0.152520,3.806662,5.849325,-2.042662,-7.2
