# Notebook

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_auc_score, plot_roc_curve
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import ComplementNB, GaussianNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.impute import SimpleImputer
import numpy as np

In [2]:
# Reading in data
years = [2016, 2017, 2018, 2019, 2020]
dfs = []
for year in years:
    df = pd.read_csv(f'https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_{year}.csv')
    df['year'] = year
    dfs.append(df)
match_df = pd.concat(dfs)

## Data Exploration

In [3]:
match_df.head()

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,year
0,2016-M020,Brisbane,Hard,32,A,20160104,271,105062,,,...,22.0,12.0,10.0,4.0,7.0,65.0,762.0,61.0,781.0,2016
1,2016-M020,Brisbane,Hard,32,A,20160104,272,103285,,PR,...,15.0,8.0,7.0,4.0,8.0,197.0,252.0,76.0,678.0,2016
2,2016-M020,Brisbane,Hard,32,A,20160104,273,106071,7.0,,...,21.0,10.0,9.0,3.0,6.0,18.0,1675.0,71.0,710.0,2016
3,2016-M020,Brisbane,Hard,32,A,20160104,275,104471,,Q,...,22.0,9.0,8.0,3.0,6.0,87.0,636.0,813.0,25.0,2016
4,2016-M020,Brisbane,Hard,32,A,20160104,276,106298,,,...,42.0,30.0,15.0,12.0,15.0,78.0,672.0,117.0,495.0,2016


In [4]:
match_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13016 entries, 0 to 1461
Data columns (total 50 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   tourney_id          13016 non-null  object 
 1   tourney_name        13016 non-null  object 
 2   surface             13016 non-null  object 
 3   draw_size           13016 non-null  int64  
 4   tourney_level       13016 non-null  object 
 5   tourney_date        13016 non-null  int64  
 6   match_num           13016 non-null  int64  
 7   winner_id           13016 non-null  int64  
 8   winner_seed         5440 non-null   float64
 9   winner_entry        1828 non-null   object 
 10  winner_name         13016 non-null  object 
 11  winner_hand         13013 non-null  object 
 12  winner_ht           12906 non-null  float64
 13  winner_ioc          13016 non-null  object 
 14  winner_age          13015 non-null  float64
 15  loser_id            13016 non-null  int64  
 16  loser

## Data Cleaning

### Dropping Columns
The first thing we are going to is drop irrelevant or uneeded columns.

In [5]:
# Tournament id is not going to be helpful with overall predictions but could be for individual tournaments
# Country of origin not needed
# winner_entry and seed are basically same thing so only need one
# score not needed if we know who already won
# hand not really necessary?
match_df = match_df.drop(columns = ['tourney_id', 'tourney_name', 'tourney_level', 'winner_ioc', 'loser_ioc',
                                   'match_num', 'winner_entry', 'loser_entry', 'score', 'winner_hand', 'loser_hand'])

### Dealing with NaN's
We need to clean null values up.

In [6]:
match_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13016 entries, 0 to 1461
Data columns (total 39 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   surface             13016 non-null  object 
 1   draw_size           13016 non-null  int64  
 2   tourney_date        13016 non-null  int64  
 3   winner_id           13016 non-null  int64  
 4   winner_seed         5440 non-null   float64
 5   winner_name         13016 non-null  object 
 6   winner_ht           12906 non-null  float64
 7   winner_age          13015 non-null  float64
 8   loser_id            13016 non-null  int64  
 9   loser_seed          3176 non-null   float64
 10  loser_name          13016 non-null  object 
 11  loser_ht            12728 non-null  float64
 12  loser_age           13016 non-null  float64
 13  best_of             13016 non-null  int64  
 14  round               13016 non-null  object 
 15  minutes             12707 non-null  float64
 16  w_ace

In [7]:
# Going to start with winner and loser seed
# nan values are probably players that are not seeded at tournaments
match_df['winner_seed'].unique()

array([nan,  7.,  6.,  8.,  2.,  4.,  3.,  1.,  5., 28., 14.,  9., 26.,
       27., 19., 15., 12., 24., 29., 30., 23., 13., 21., 25., 31., 18.,
       10., 16., 32., 17., 11., 22., 20., 33.])

In [8]:
# nr stands for not ranked
match_df['winner_seed'] = match_df['winner_seed'].replace(np.nan, '0').replace('WC', '0').replace(
'LL', '0').replace('Q', '0')

In [9]:
# Replacing loser nulls
match_df['loser_seed'] = match_df['loser_seed'].replace(np.nan, '0').replace('WC', '0').replace(
'LL', '0').replace('Q', '0')

In [10]:
# winner_ht, going to replace with mode
# most atp players are about 6' 1"
match_df['winner_ht'].unique()

array([183., 185., 193., 180., 188., 178., 196., 198., 181., 173., 190.,
       175., 172., 191., 206., 203., 170.,  nan, 189., 174., 208., 186.,
       201., 211., 194.])

In [11]:
replace = match_df['winner_ht'].fillna(match_df['winner_ht'].mode()[0], inplace = True)

In [12]:
replace0 = match_df['loser_ht'].fillna(match_df['winner_ht'].mode()[0], inplace = True)

In [13]:
# Winner and loser age
# one null in each so just fill with mode 
replace2 = match_df['winner_age'].fillna(match_df['winner_age'].mode()[0], inplace = True)

In [14]:
replace3 = match_df['loser_age'].fillna(match_df['loser_age'].mode()[0], inplace = True)

We are going to drop null values for game data. We fill it would be inaccurate to try and fill matches with game data based on measures of central tendency.

In [15]:
match_df = match_df.dropna()

## Converting Datatypes to Numeric
We do not have any nulls now. To compare our columns and make models in the future, we need to get our categorical columns to become numeric

In [16]:
# Surface has 3 values so we are going to encode it
match_df["surface"] = match_df["surface"].astype('category')
match_df.dtypes
match_df["surface_cat"] = match_df["surface"].cat.codes

Now, since we have surface_cats as an int, we no longer need 'surface'

In [17]:
match_df = match_df.drop(columns = 'surface')

Next, we are going to change winner and loser seeds to integers

In [18]:
match_df['winner_seed'] = match_df['winner_seed'].astype(int)
match_df['loser_seed'] = match_df['loser_seed'].astype(int)

For our round column, we are going to remove 'R' from values such as 'R32'. We are also going to drop 'B' and 'BR' which stand for bye and bye round. We are also going to get rid of matches with 'RR'. This stands for round robin and these matches typically do not affect rankings and are uneeded.

In [19]:
match_df = match_df.loc[match_df['round'] != 'RR']
match_df = match_df.loc[match_df['round'] != 'B']
match_df = match_df.loc[match_df['round'] != 'BR']

In [20]:
match_df['round'] = match_df['round'].str.replace('R', '')

In [21]:
match_df['round'].unique()

array(['32', '16', 'QF', 'SF', 'F', '128', '64'], dtype=object)

The last thing we are doing for round is replacing QF, F, and SF with the number of players in those rounds.

In [22]:
match_df['round'] = match_df['round'].str.replace('QF', '8').replace('SF', '4').replace('F', '2')

In [23]:
match_df['round'] = match_df['round'].astype(int)

In [24]:
match_df = match_df.reset_index(drop=True)

## Creating fav_win column

This column will allow us to see if the favorite won the match or not.

In [25]:
winners_r = match_df['winner_rank']
losers_r = match_df['loser_rank']
winners_r

0         65.0
1        197.0
2         18.0
3         87.0
4         78.0
         ...  
11631     35.0
11632    528.0
11633     28.0
11634     90.0
11635     38.0
Name: winner_rank, Length: 11636, dtype: float64

In [26]:
favorites = []
for i in list(range(0, 11636)):
    if winners_r[i] > losers_r[i]:             
        favorites.append(False)
    elif winners_r[i] < losers_r[i]:
        favorites.append(True)
favorites

[False,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 False,
 True,
 True,
 True,
 False,
 True,
 True,
 False,
 True,
 False,
 True,
 False,
 True,
 True,
 True,
 False,
 False,
 True,
 False,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 False,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 True,
 False,
 True,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 False,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 False,
 True,
 True,
 False,
 True,
 False,
 True,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 False,
 False,
 True,
 False,
 False,
 True,
 True,
 False,
 True,
 True,
 False,
 True,
 True,
 False,
 True,
 False,
 True,
 False,
 False,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 False,
 True,
 True,
 False,
 T

In [27]:
match_df['fav_win'] = favorites

In [28]:
match_df['fav_win'].value_counts()

True     7462
False    4174
Name: fav_win, dtype: int64

## Train-Test Split

In [29]:
X = match_df.drop(columns='fav_win')
y = match_df['fav_win']
# Preparing our hold out data for train test split
X_cols = X.columns
X_train, X_test, y_train, y_test= train_test_split(X, y, random_state=42, test_size=.25)
X_test, X_hold, y_test, y_hold = train_test_split(X_test, y_test, random_state=42, test_size=.25)

## Gathering Player Averages

By gathering player's averages, we will be able to play to predict future years.

In [30]:
X_train['tourney_date'] = pd.to_datetime(X_train['tourney_date'], format='%Y%m%d')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['tourney_date'] = pd.to_datetime(X_train['tourney_date'], format='%Y%m%d')


In [31]:
X_train['prev_year'] = X_train['tourney_date'].dt.year - 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['prev_year'] = X_train['tourney_date'].dt.year - 1


In [34]:
match_df

Unnamed: 0,draw_size,tourney_date,winner_id,winner_seed,winner_name,winner_ht,winner_age,loser_id,loser_seed,loser_name,...,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,year,surface_cat,fav_win
0,32,20160104,105062,0,Mikhail Kukushkin,183.0,28.0,104797,0,Denis Istomin,...,10.0,4.0,7.0,65.0,762.0,61.0,781.0,2016,3,False
1,32,20160104,103285,0,Radek Stepanek,185.0,37.1,105583,0,Dusan Lajovic,...,7.0,4.0,8.0,197.0,252.0,76.0,678.0,2016,3,False
2,32,20160104,106071,7,Bernard Tomic,193.0,23.2,103917,0,Nicolas Mahut,...,9.0,3.0,6.0,18.0,1675.0,71.0,710.0,2016,3,True
3,32,20160104,104471,0,Ivan Dodig,183.0,31.0,117352,0,Oliver Anderson,...,8.0,3.0,6.0,87.0,636.0,813.0,25.0,2016,3,True
4,32,20160104,106298,0,Lucas Pouille,185.0,21.8,106415,0,Yoshihito Nishioka,...,15.0,12.0,15.0,78.0,672.0,117.0,495.0,2016,3,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11631,32,20201019,105554,0,Daniel Evans,175.0,30.4,106099,0,Salvatore Caruso,...,13.0,3.0,7.0,35.0,1384.0,79.0,802.0,2020,3,True
11632,32,20201019,200267,0,Zizou Bergs,185.0,21.3,105077,0,Albert Ramos,...,12.0,4.0,9.0,528.0,58.0,45.0,1165.0,2020,3,False
11633,32,20201019,126203,7,Taylor Fritz,193.0,22.9,124187,0,Reilly Opelka,...,9.0,3.0,5.0,28.0,1670.0,33.0,1402.0,2020,3,True
11634,32,20201019,144750,0,Lloyd Harris,193.0,23.6,144895,0,Corentin Moutet,...,10.0,7.0,11.0,90.0,748.0,74.0,838.0,2020,3,False


In [35]:
X_train.columns

Index(['draw_size', 'tourney_date', 'winner_id', 'winner_seed', 'winner_name',
       'winner_ht', 'winner_age', 'loser_id', 'loser_seed', 'loser_name',
       'loser_ht', 'loser_age', 'best_of', 'round', 'minutes', 'w_ace', 'w_df',
       'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon', 'w_SvGms', 'w_bpSaved',
       'w_bpFaced', 'l_ace', 'l_df', 'l_svpt', 'l_1stIn', 'l_1stWon',
       'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced', 'winner_rank',
       'winner_rank_points', 'loser_rank', 'loser_rank_points', 'year',
       'surface_cat', 'prev_year'],
      dtype='object')

In [36]:
X_trainW = X_train.groupby(['winner_id', 'year', 'surface_cat']).mean()[['w_df',             
'w_svpt',             
   'w_1stIn',            
   'w_1stWon',           
   'w_2ndWon',           
'w_SvGms',            
  'w_bpSaved',          
  'w_bpFaced',
                                                'w_ace'
    ]]

In [37]:
X_trainW = X_trainW.reset_index()

In [38]:
X_trainL = X_train.groupby(['loser_id', 'year', 'surface_cat']).mean()[['l_ace',              
  'l_df',               
  'l_svpt',             
  'l_1stIn',            
 'l_1stWon',           
  'l_2ndWon',           
  'l_SvGms',            
 'l_bpSaved',          
  'l_bpFaced',
    ]]

In [39]:
match_df = X_train.merge(X_trainW, 
              left_on = ['winner_id', 'prev_year', 'surface_cat'], 
              right_on = ['winner_id', 'year', 'surface_cat'],
              suffixes = ('_match', '_winner'),
              how = 'left')

In [40]:
match_df = match_df.merge(X_trainL, 
              left_on = ['loser_id', 'prev_year', 'surface_cat'], 
              right_on = ['loser_id', 'year', 'surface_cat'],
              suffixes = ('_match', '_loser'),
              how = 'left')

In [41]:
# drop columns 
# base on favorite 

In [42]:
match_df.columns

Index(['draw_size', 'tourney_date', 'winner_id', 'winner_seed', 'winner_name',
       'winner_ht', 'winner_age', 'loser_id', 'loser_seed', 'loser_name',
       'loser_ht', 'loser_age', 'best_of', 'round', 'minutes', 'w_ace_match',
       'w_df_match', 'w_svpt_match', 'w_1stIn_match', 'w_1stWon_match',
       'w_2ndWon_match', 'w_SvGms_match', 'w_bpSaved_match', 'w_bpFaced_match',
       'l_ace_match', 'l_df_match', 'l_svpt_match', 'l_1stIn_match',
       'l_1stWon_match', 'l_2ndWon_match', 'l_SvGms_match', 'l_bpSaved_match',
       'l_bpFaced_match', 'winner_rank', 'winner_rank_points', 'loser_rank',
       'loser_rank_points', 'year_match', 'surface_cat', 'prev_year',
       'year_winner', 'w_df_winner', 'w_svpt_winner', 'w_1stIn_winner',
       'w_1stWon_winner', 'w_2ndWon_winner', 'w_SvGms_winner',
       'w_bpSaved_winner', 'w_bpFaced_winner', 'w_ace_winner', 'l_ace_loser',
       'l_df_loser', 'l_svpt_loser', 'l_1stIn_loser', 'l_1stWon_loser',
       'l_2ndWon_loser', 'l_SvGms_los

These columns are not needed

In [43]:
match_df = match_df.drop(columns = ['w_ace_match',
       'w_df_match', 'w_svpt_match', 'w_1stIn_match', 'w_1stWon_match',
       'w_2ndWon_match', 'w_SvGms_match', 'w_bpSaved_match', 'w_bpFaced_match',
       'l_ace_match', 'l_df_match', 'l_svpt_match', 'l_1stIn_match',
       'l_1stWon_match', 'l_2ndWon_match', 'l_SvGms_match', 'l_bpSaved_match',
       'l_bpFaced_match', ])

In [44]:
match_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8727 entries, 0 to 8726
Data columns (total 41 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   draw_size           8727 non-null   int64         
 1   tourney_date        8727 non-null   datetime64[ns]
 2   winner_id           8727 non-null   int64         
 3   winner_seed         8727 non-null   int32         
 4   winner_name         8727 non-null   object        
 5   winner_ht           8727 non-null   float64       
 6   winner_age          8727 non-null   float64       
 7   loser_id            8727 non-null   int64         
 8   loser_seed          8727 non-null   int32         
 9   loser_name          8727 non-null   object        
 10  loser_ht            8727 non-null   float64       
 11  loser_age           8727 non-null   float64       
 12  best_of             8727 non-null   int64         
 13  round               8727 non-null   int32       

In [45]:
match_df.head()

Unnamed: 0,draw_size,tourney_date,winner_id,winner_seed,winner_name,winner_ht,winner_age,loser_id,loser_seed,loser_name,...,w_ace_winner,l_ace_loser,l_df_loser,l_svpt_loser,l_1stIn_loser,l_1stWon_loser,l_2ndWon_loser,l_SvGms_loser,l_bpSaved_loser,l_bpFaced_loser
0,32,2018-01-01,111577,0,Jared Donaldson,188.0,21.2,111442,0,Jordan Thompson,...,8.285714,5.875,3.5,87.5,51.375,33.75,16.0,13.625,5.5,10.5
1,32,2020-10-12,200175,0,Miomir Kecmanovic,183.0,21.1,105062,0,Mikhail Kukushkin,...,3.545455,2.4,1.6,67.1,41.2,25.1,12.0,10.3,4.4,8.4
2,32,2017-04-24,104678,7,Viktor Troicki,193.0,31.2,105539,0,Evgeny Donskoy,...,4.333333,2.666667,1.333333,70.166667,38.333333,24.5,15.0,10.5,4.5,8.166667
3,128,2019-08-26,136440,0,Dominik Koepfer,180.0,25.3,144719,0,Jaume Munar,...,4.0,3.666667,4.0,82.0,51.666667,35.0,12.166667,12.666667,7.0,11.333333
4,32,2019-10-21,105967,0,Henri Laaksonen,185.0,27.5,105332,8,Benoit Paire,...,6.0,7.083333,6.0,72.25,35.916667,26.166667,12.916667,11.166667,4.333333,8.916667


In [46]:
[col for col in match_df.columns if 'l_' in col]

['l_ace_loser',
 'l_df_loser',
 'l_svpt_loser',
 'l_1stIn_loser',
 'l_1stWon_loser',
 'l_2ndWon_loser',
 'l_SvGms_loser',
 'l_bpSaved_loser',
 'l_bpFaced_loser']

differences = []
for idx, row in match_df.iterrows():
    # if fav_win in row is True
    if row['fav_win'] == True:
        # difference is winner - loser
        row['w_ace_winner'] - row['l_ace_loser']
        
    
when favwin = true
winner - loser 
when favwin = false
loser - winner

In [47]:
pd.set_option('display.max_columns', None)
match_df.head()

Unnamed: 0,draw_size,tourney_date,winner_id,winner_seed,winner_name,winner_ht,winner_age,loser_id,loser_seed,loser_name,loser_ht,loser_age,best_of,round,minutes,winner_rank,winner_rank_points,loser_rank,loser_rank_points,year_match,surface_cat,prev_year,year_winner,w_df_winner,w_svpt_winner,w_1stIn_winner,w_1stWon_winner,w_2ndWon_winner,w_SvGms_winner,w_bpSaved_winner,w_bpFaced_winner,w_ace_winner,l_ace_loser,l_df_loser,l_svpt_loser,l_1stIn_loser,l_1stWon_loser,l_2ndWon_loser,l_SvGms_loser,l_bpSaved_loser,l_bpFaced_loser
0,32,2018-01-01,111577,0,Jared Donaldson,188.0,21.2,111442,0,Jordan Thompson,183.0,23.7,3,32,90.0,54.0,890.0,94.0,593.0,2018,3,2017,2017.0,3.785714,79.571429,44.142857,35.0,18.5,12.428571,3.642857,5.214286,8.285714,5.875,3.5,87.5,51.375,33.75,16.0,13.625,5.5,10.5
1,32,2020-10-12,200175,0,Miomir Kecmanovic,183.0,21.1,105062,0,Mikhail Kukushkin,183.0,32.7,3,32,96.0,41.0,1258.0,88.0,751.0,2020,3,2019,2019.0,1.909091,60.545455,36.0,27.909091,14.090909,9.818182,1.818182,3.090909,3.545455,2.4,1.6,67.1,41.2,25.1,12.0,10.3,4.4,8.4
2,32,2017-04-24,104678,7,Viktor Troicki,193.0,31.2,105539,0,Evgeny Donskoy,185.0,26.9,3,32,53.0,39.0,1055.0,96.0,558.0,2017,1,2016,2016.0,2.0,119.333333,74.666667,51.0,23.0,17.333333,7.0,10.0,4.333333,2.666667,1.333333,70.166667,38.333333,24.5,15.0,10.5,4.5,8.166667
3,128,2019-08-26,136440,0,Dominik Koepfer,180.0,25.3,144719,0,Jaume Munar,183.0,22.3,5,128,214.0,118.0,474.0,97.0,550.0,2019,3,2018,2018.0,2.0,57.0,36.0,32.0,12.0,10.0,0.0,0.0,4.0,3.666667,4.0,82.0,51.666667,35.0,12.166667,12.666667,7.0,11.333333
4,32,2019-10-21,105967,0,Henri Laaksonen,185.0,27.5,105332,8,Benoit Paire,196.0,30.4,3,32,67.0,105.0,541.0,25.0,1528.0,2019,3,2018,2018.0,5.0,57.0,30.0,28.0,12.0,9.0,3.0,3.0,6.0,7.083333,6.0,72.25,35.916667,26.166667,12.916667,11.166667,4.333333,8.916667


In [48]:
winners_r = match_df['winner_rank']

In [49]:
losers_r = match_df['loser_rank']

In [50]:
favorites = []
for i in list(range(0, 8727)):
    if winners_r[i] > losers_r[i]:             
        favorites.append(False)
    elif winners_r[i] < losers_r[i]:
        favorites.append(True)
favorites

[True,
 True,
 True,
 False,
 False,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 False,
 False,
 True,
 True,
 True,
 True,
 False,
 False,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 False,
 True,
 False,
 False,
 False,
 True,
 False,
 True,
 True,
 False,
 False,
 True,
 False,
 True,
 False,
 True,
 True,
 True,
 False,
 True,
 True,
 False,
 True,
 False,
 False,
 True,
 False,
 True,
 False,
 True,
 True,
 True,
 True,
 False,
 False,
 False,
 True,
 False,
 True,
 False,
 True,
 False,
 True,
 True,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,


In [51]:
match_df['fav_win'] = favorites

In [52]:
match_df.columns

Index(['draw_size', 'tourney_date', 'winner_id', 'winner_seed', 'winner_name',
       'winner_ht', 'winner_age', 'loser_id', 'loser_seed', 'loser_name',
       'loser_ht', 'loser_age', 'best_of', 'round', 'minutes', 'winner_rank',
       'winner_rank_points', 'loser_rank', 'loser_rank_points', 'year_match',
       'surface_cat', 'prev_year', 'year_winner', 'w_df_winner',
       'w_svpt_winner', 'w_1stIn_winner', 'w_1stWon_winner', 'w_2ndWon_winner',
       'w_SvGms_winner', 'w_bpSaved_winner', 'w_bpFaced_winner',
       'w_ace_winner', 'l_ace_loser', 'l_df_loser', 'l_svpt_loser',
       'l_1stIn_loser', 'l_1stWon_loser', 'l_2ndWon_loser', 'l_SvGms_loser',
       'l_bpSaved_loser', 'l_bpFaced_loser', 'fav_win'],
      dtype='object')

In [53]:
match_df = match_df.drop(columns = ['winner_seed', 'loser_seed', 'winner_rank_points', 'loser_rank_points', 'minutes'])

In [54]:
fav_win_T = match_df.loc[match_df['fav_win'] == True]

In [55]:
fav_win_T.columns

Index(['draw_size', 'tourney_date', 'winner_id', 'winner_name', 'winner_ht',
       'winner_age', 'loser_id', 'loser_name', 'loser_ht', 'loser_age',
       'best_of', 'round', 'winner_rank', 'loser_rank', 'year_match',
       'surface_cat', 'prev_year', 'year_winner', 'w_df_winner',
       'w_svpt_winner', 'w_1stIn_winner', 'w_1stWon_winner', 'w_2ndWon_winner',
       'w_SvGms_winner', 'w_bpSaved_winner', 'w_bpFaced_winner',
       'w_ace_winner', 'l_ace_loser', 'l_df_loser', 'l_svpt_loser',
       'l_1stIn_loser', 'l_1stWon_loser', 'l_2ndWon_loser', 'l_SvGms_loser',
       'l_bpSaved_loser', 'l_bpFaced_loser', 'fav_win'],
      dtype='object')

In [56]:
import warnings
warnings.filterwarnings('ignore')

In [57]:
fav_win_T['rank_diff'] = fav_win_T['winner_rank'] - fav_win_T['loser_rank']
fav_win_T['bpFaced_diff'] = fav_win_T['w_bpFaced_winner'] - fav_win_T['l_bpFaced_loser']
fav_win_T['bpSaved_diff'] = fav_win_T['w_bpSaved_winner'] - fav_win_T['l_bpSaved_loser']
fav_win_T['SvGms_diff'] = fav_win_T['w_SvGms_winner'] - fav_win_T['l_SvGms_loser']
fav_win_T['2ndWon_diff'] = fav_win_T['w_2ndWon_winner'] - fav_win_T['l_2ndWon_loser']
fav_win_T['1stWon_diff'] = fav_win_T['w_1stWon_winner'] - fav_win_T['l_1stWon_loser']
fav_win_T['1stIn_diff'] = fav_win_T['w_1stIn_winner'] - fav_win_T['l_1stIn_loser']
fav_win_T['svpt_diff'] = fav_win_T['w_svpt_winner'] - fav_win_T['l_svpt_loser']
fav_win_T['df_diff'] = fav_win_T['w_df_winner'] - fav_win_T['l_df_loser']
fav_win_T['ace_diff'] = fav_win_T['w_ace_winner'] - fav_win_T['l_ace_loser']
fav_win_T['age_diff'] = fav_win_T['winner_age'] - fav_win_T['loser_age']
fav_win_T['ht_diff'] = fav_win_T['winner_ht'] - fav_win_T['loser_ht']

In [58]:
fav_win_T.columns

Index(['draw_size', 'tourney_date', 'winner_id', 'winner_name', 'winner_ht',
       'winner_age', 'loser_id', 'loser_name', 'loser_ht', 'loser_age',
       'best_of', 'round', 'winner_rank', 'loser_rank', 'year_match',
       'surface_cat', 'prev_year', 'year_winner', 'w_df_winner',
       'w_svpt_winner', 'w_1stIn_winner', 'w_1stWon_winner', 'w_2ndWon_winner',
       'w_SvGms_winner', 'w_bpSaved_winner', 'w_bpFaced_winner',
       'w_ace_winner', 'l_ace_loser', 'l_df_loser', 'l_svpt_loser',
       'l_1stIn_loser', 'l_1stWon_loser', 'l_2ndWon_loser', 'l_SvGms_loser',
       'l_bpSaved_loser', 'l_bpFaced_loser', 'fav_win', 'rank_diff',
       'bpFaced_diff', 'bpSaved_diff', 'SvGms_diff', '2ndWon_diff',
       '1stWon_diff', '1stIn_diff', 'svpt_diff', 'df_diff', 'ace_diff',
       'age_diff', 'ht_diff'],
      dtype='object')

In [59]:
# dropping not diff columns
fav_win_T = fav_win_T.drop(columns = ['w_df_winner',
       'w_svpt_winner', 'w_1stIn_winner', 'w_1stWon_winner', 'w_2ndWon_winner',
       'w_SvGms_winner', 'w_bpSaved_winner', 'w_bpFaced_winner',
       'w_ace_winner', 'l_ace_loser', 'l_df_loser', 'l_svpt_loser',
       'l_1stIn_loser', 'l_1stWon_loser', 'l_2ndWon_loser', 'l_SvGms_loser',
       'l_bpSaved_loser', 'l_bpFaced_loser', 'winner_ht', 'loser_ht', 'winner_age', 'loser_age'])

In [60]:
pd.set_option('display.max_columns', None)

In [61]:
fav_win_T.head()

Unnamed: 0,draw_size,tourney_date,winner_id,winner_name,loser_id,loser_name,best_of,round,winner_rank,loser_rank,year_match,surface_cat,prev_year,year_winner,fav_win,rank_diff,bpFaced_diff,bpSaved_diff,SvGms_diff,2ndWon_diff,1stWon_diff,1stIn_diff,svpt_diff,df_diff,ace_diff,age_diff,ht_diff
0,32,2018-01-01,111577,Jared Donaldson,111442,Jordan Thompson,3,32,54.0,94.0,2018,3,2017,2017.0,True,-40.0,-5.285714,-1.857143,-1.196429,2.5,1.25,-7.232143,-7.928571,0.285714,2.410714,-2.5,5.0
1,32,2020-10-12,200175,Miomir Kecmanovic,105062,Mikhail Kukushkin,3,32,41.0,88.0,2020,3,2019,2019.0,True,-47.0,-5.309091,-2.581818,-0.481818,2.090909,2.809091,-5.2,-6.554545,0.309091,1.145455,-11.6,0.0
2,32,2017-04-24,104678,Viktor Troicki,105539,Evgeny Donskoy,3,32,39.0,96.0,2017,1,2016,2016.0,True,-57.0,1.833333,2.5,6.833333,8.0,26.5,36.333333,49.166667,0.666667,1.666667,4.3,8.0
5,64,2018-10-29,126207,Frances Tiafoe,103917,Nicolas Mahut,3,64,44.0,169.0,2018,3,2017,2017.0,True,-125.0,-4.380952,-2.0,0.571429,4.761905,2.095238,-5.571429,-6.095238,-0.857143,4.571429,-16.0,-2.0
6,32,2018-02-19,105683,Milos Raonic,106121,Taro Daniel,3,32,32.0,105.0,2018,3,2017,2017.0,True,-73.0,-6.777778,-2.666667,0.0,1.222222,5.555556,-5.222222,-11.444444,-1.666667,14.0,2.1,5.0


In [68]:
fav_win_T = fav_win_T.drop(columns = ['year_winner'])

In [69]:
fav_win_T.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5589 entries, 0 to 8726
Data columns (total 26 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   draw_size     5589 non-null   int64         
 1   tourney_date  5589 non-null   datetime64[ns]
 2   winner_id     5589 non-null   int64         
 3   winner_name   5589 non-null   object        
 4   loser_id      5589 non-null   int64         
 5   loser_name    5589 non-null   object        
 6   best_of       5589 non-null   int64         
 7   round         5589 non-null   int32         
 8   winner_rank   5589 non-null   float64       
 9   loser_rank    5589 non-null   float64       
 10  year_match    5589 non-null   int64         
 11  surface_cat   5589 non-null   int8          
 12  prev_year     5589 non-null   int64         
 13  fav_win       5589 non-null   bool          
 14  rank_diff     5589 non-null   float64       
 15  bpFaced_diff  3277 non-null   float64 

In [65]:
fav_win_F = match_df.loc[match_df['fav_win'] == False]

In [66]:
fav_win_F.columns

Index(['draw_size', 'tourney_date', 'winner_id', 'winner_name', 'winner_ht',
       'winner_age', 'loser_id', 'loser_name', 'loser_ht', 'loser_age',
       'best_of', 'round', 'winner_rank', 'loser_rank', 'year_match',
       'surface_cat', 'prev_year', 'year_winner', 'w_df_winner',
       'w_svpt_winner', 'w_1stIn_winner', 'w_1stWon_winner', 'w_2ndWon_winner',
       'w_SvGms_winner', 'w_bpSaved_winner', 'w_bpFaced_winner',
       'w_ace_winner', 'l_ace_loser', 'l_df_loser', 'l_svpt_loser',
       'l_1stIn_loser', 'l_1stWon_loser', 'l_2ndWon_loser', 'l_SvGms_loser',
       'l_bpSaved_loser', 'l_bpFaced_loser', 'fav_win'],
      dtype='object')

In [72]:
fav_win_F['rank_diff'] =fav_win_F['loser_rank'] - fav_win_F['winner_rank'] 
fav_win_F['bpFaced_diff'] =fav_win_F['l_bpFaced_loser'] - fav_win_F['w_bpFaced_winner']  
fav_win_F['bpSaved_diff'] =fav_win_F['l_bpSaved_loser'] - fav_win_F['w_bpSaved_winner']  
fav_win_F['SvGms_diff'] =fav_win_F['l_SvGms_loser'] - fav_win_F['w_SvGms_winner'] 
fav_win_F['2ndWon_diff'] =fav_win_F['l_2ndWon_loser'] - fav_win_F['w_2ndWon_winner']  
fav_win_F['1stWon_diff'] =fav_win_F['l_1stWon_loser'] - fav_win_F['w_1stWon_winner']  
fav_win_F['1stIn_diff'] =fav_win_F['l_1stIn_loser'] - fav_win_F['w_1stIn_winner'] 
fav_win_F['svpt_diff'] =fav_win_F['l_svpt_loser'] - fav_win_F['w_svpt_winner'] 
fav_win_F['df_diff'] =fav_win_F['l_df_loser'] - fav_win_F['w_df_winner']  
fav_win_F['ace_diff'] =fav_win_F['l_ace_loser'] - fav_win_F['w_ace_winner']  
fav_win_F['age_diff'] =fav_win_F['loser_age'] - fav_win_F['winner_age'] 
fav_win_F['ht_diff'] =fav_win_F['loser_ht'] - fav_win_F['winner_ht'] 

In [75]:
fav_win_F = fav_win_F.drop(columns = ['w_df_winner',
       'w_svpt_winner', 'w_1stIn_winner', 'w_1stWon_winner', 'w_2ndWon_winner',
       'w_SvGms_winner', 'w_bpSaved_winner', 'w_bpFaced_winner',
       'w_ace_winner', 'l_ace_loser', 'l_df_loser', 'l_svpt_loser',
       'l_1stIn_loser', 'l_1stWon_loser', 'l_2ndWon_loser', 'l_SvGms_loser',
       'l_bpSaved_loser', 'l_bpFaced_loser', 'winner_ht', 'loser_ht', 'winner_age', 'loser_age'])

In [83]:
fav_win_F = fav_win_F.drop(columns = 'year_winner')

In [84]:
agg_df = pd.concat([fav_win_T, fav_win_F])

In [85]:
agg_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8727 entries, 0 to 8722
Data columns (total 26 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   draw_size     8727 non-null   int64         
 1   tourney_date  8727 non-null   datetime64[ns]
 2   winner_id     8727 non-null   int64         
 3   winner_name   8727 non-null   object        
 4   loser_id      8727 non-null   int64         
 5   loser_name    8727 non-null   object        
 6   best_of       8727 non-null   int64         
 7   round         8727 non-null   int32         
 8   winner_rank   8727 non-null   float64       
 9   loser_rank    8727 non-null   float64       
 10  year_match    8727 non-null   int64         
 11  surface_cat   8727 non-null   int8          
 12  prev_year     8727 non-null   int64         
 13  fav_win       8727 non-null   bool          
 14  rank_diff     8727 non-null   float64       
 15  bpFaced_diff  5061 non-null   float64 

In [86]:
agg_df.head()

Unnamed: 0,draw_size,tourney_date,winner_id,winner_name,loser_id,loser_name,best_of,round,winner_rank,loser_rank,year_match,surface_cat,prev_year,fav_win,rank_diff,bpFaced_diff,bpSaved_diff,SvGms_diff,2ndWon_diff,1stWon_diff,1stIn_diff,svpt_diff,df_diff,ace_diff,age_diff,ht_diff
0,32,2018-01-01,111577,Jared Donaldson,111442,Jordan Thompson,3,32,54.0,94.0,2018,3,2017,True,-40.0,-5.285714,-1.857143,-1.196429,2.5,1.25,-7.232143,-7.928571,0.285714,2.410714,-2.5,5.0
1,32,2020-10-12,200175,Miomir Kecmanovic,105062,Mikhail Kukushkin,3,32,41.0,88.0,2020,3,2019,True,-47.0,-5.309091,-2.581818,-0.481818,2.090909,2.809091,-5.2,-6.554545,0.309091,1.145455,-11.6,0.0
2,32,2017-04-24,104678,Viktor Troicki,105539,Evgeny Donskoy,3,32,39.0,96.0,2017,1,2016,True,-57.0,1.833333,2.5,6.833333,8.0,26.5,36.333333,49.166667,0.666667,1.666667,4.3,8.0
5,64,2018-10-29,126207,Frances Tiafoe,103917,Nicolas Mahut,3,64,44.0,169.0,2018,3,2017,True,-125.0,-4.380952,-2.0,0.571429,4.761905,2.095238,-5.571429,-6.095238,-0.857143,4.571429,-16.0,-2.0
6,32,2018-02-19,105683,Milos Raonic,106121,Taro Daniel,3,32,32.0,105.0,2018,3,2017,True,-73.0,-6.777778,-2.666667,0.0,1.222222,5.555556,-5.222222,-11.444444,-1.666667,14.0,2.1,5.0


In [87]:
agg_df.to_csv('MR')