In [2]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

In [3]:
file_paths = ["data/atp_matches_2016.csv",
              "data/atp_matches_2017 (1).csv",
              "data/atp_matches_2018.csv",
              "data/atp_matches_2019.csv",
              "data/atp_matches_2020.csv",
              "data/atp_matches_2021.csv",
              "data/atp_matches_2022.csv",
              "data/atp_matches_2023.csv",
              "data/atp_matches_2024.csv"]
dfs=[]
for file in file_paths:
    df = pd.read_csv(file)
    dfs.append(df)

df = pd.concat(dfs,ignore_index=True)

df

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
0,2016-M020,Brisbane,Hard,32,A,20160104,271,105062,,,...,32.0,22.0,12.0,10.0,4.0,7.0,65.0,762.0,61.0,781.0
1,2016-M020,Brisbane,Hard,32,A,20160104,272,103285,,PR,...,25.0,15.0,8.0,7.0,4.0,8.0,197.0,252.0,76.0,678.0
2,2016-M020,Brisbane,Hard,32,A,20160104,273,106071,7.0,,...,29.0,21.0,10.0,9.0,3.0,6.0,18.0,1675.0,71.0,710.0
3,2016-M020,Brisbane,Hard,32,A,20160104,275,104471,,Q,...,30.0,22.0,9.0,8.0,3.0,6.0,87.0,636.0,813.0,25.0
4,2016-M020,Brisbane,Hard,32,A,20160104,276,106298,,,...,64.0,42.0,30.0,15.0,12.0,15.0,78.0,672.0,117.0,495.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24724,2024-M-DC-2024-WG2-PO-URU-MDA-01,Davis Cup WG2 PO: URU vs MDA,Clay,4,D,20240203,5,212051,,,...,30.0,17.0,7.0,6.0,8.0,14.0,1109.0,8.0,740.0,34.0
24725,2024-M-DC-2024-WG2-PO-VIE-RSA-01,Davis Cup WG2 PO: VIE vs RSA,Hard,4,D,20240202,1,122533,,,...,41.0,25.0,6.0,9.0,1.0,4.0,554.0,67.0,748.0,32.0
24726,2024-M-DC-2024-WG2-PO-VIE-RSA-01,Davis Cup WG2 PO: VIE vs RSA,Hard,4,D,20240202,2,144748,,,...,51.0,25.0,7.0,11.0,5.0,12.0,416.0,109.0,,
24727,2024-M-DC-2024-WG2-PO-VIE-RSA-01,Davis Cup WG2 PO: VIE vs RSA,Hard,4,D,20240202,4,122533,,,...,51.0,32.0,17.0,14.0,5.0,9.0,554.0,67.0,416.0,109.0


In [4]:
pd.set_option("display.max_columns",None)
print(df.head())

  tourney_id tourney_name surface  draw_size tourney_level  tourney_date  \
0  2016-M020     Brisbane    Hard         32             A      20160104   
1  2016-M020     Brisbane    Hard         32             A      20160104   
2  2016-M020     Brisbane    Hard         32             A      20160104   
3  2016-M020     Brisbane    Hard         32             A      20160104   
4  2016-M020     Brisbane    Hard         32             A      20160104   

   match_num  winner_id  winner_seed winner_entry        winner_name  \
0        271     105062          NaN          NaN  Mikhail Kukushkin   
1        272     103285          NaN           PR     Radek Stepanek   
2        273     106071          7.0          NaN      Bernard Tomic   
3        275     104471          NaN            Q         Ivan Dodig   
4        276     106298          NaN          NaN      Lucas Pouille   

  winner_hand  winner_ht winner_ioc  winner_age  loser_id  loser_seed  \
0           R      183.0        KAZ  

In [5]:
df = df[['surface','tourney_level','tourney_date','round','winner_id','loser_id','winner_hand','loser_hand','winner_ht','loser_ht','winner_age','loser_age','winner_rank','loser_rank','winner_rank_points','loser_rank_points']]
print(df.head())

  surface tourney_level  tourney_date round  winner_id  loser_id winner_hand  \
0    Hard             A      20160104   R32     105062    104797           R   
1    Hard             A      20160104   R32     103285    105583           R   
2    Hard             A      20160104   R32     106071    103917           R   
3    Hard             A      20160104   R32     104471    117352           R   
4    Hard             A      20160104   R32     106298    106415           R   

  loser_hand  winner_ht  loser_ht  winner_age  loser_age  winner_rank  \
0          R      183.0     185.0        28.0       29.3         65.0   
1          R      185.0     183.0        37.1       25.5        197.0   
2          R      196.0     191.0        23.2       33.9         18.0   
3          R      183.0     175.0        31.0       17.6         87.0   
4          L      185.0     170.0        21.8       20.2         78.0   

   loser_rank  winner_rank_points  loser_rank_points  
0        61.0            

In [6]:
# Shuffle winner and loser randomly for each row to make it symmetric

def restructure(row):
    if np.random.rand() < 0.5:
        return pd.Series({
            'player1_id': row['winner_id'],
            'player2_id': row['loser_id'],
            'player1_hand': row['winner_hand'],
            'player2_hand': row['loser_hand'],
            'player1_ht': row['winner_ht'],
            'player2_ht': row['loser_ht'],
            'player1_age': row['winner_age'],
            'player2_age': row['loser_age'],
            'player1_rank': row['winner_rank'],
            'player2_rank': row['loser_rank'],
            'player1_rank_points': row['winner_rank_points'],
            'player2_rank_points': row['loser_rank_points'],
            'surface': row['surface'],
            'tourney_level': row['tourney_level'],
            'tourney_date': row['tourney_date'],
            'label': 1
        })
    else:
        return pd.Series({
            'player1_id': row['loser_id'],
            'player2_id': row['winner_id'],
            'player1_hand': row['loser_hand'],
            'player2_hand': row['winner_hand'],
            'player1_ht': row['loser_ht'],
            'player2_ht': row['winner_ht'],
            'player1_age': row['loser_age'],
            'player2_age': row['winner_age'],
            'player1_rank': row['loser_rank'],
            'player2_rank': row['winner_rank'],
            'player1_rank_points': row['loser_rank_points'],
            'player2_rank_points': row['winner_rank_points'],
            'surface': row['surface'],
            'tourney_level': row['tourney_level'],
            'tourney_date': row['tourney_date'],
            'label': 0
        })


restructured_df = df.apply(restructure, axis=1)

restructured_df


Unnamed: 0,player1_id,player2_id,player1_hand,player2_hand,player1_ht,player2_ht,player1_age,player2_age,player1_rank,player2_rank,player1_rank_points,player2_rank_points,surface,tourney_level,tourney_date,label
0,104797,105062,R,R,185.0,183.0,29.3,28.0,61.0,65.0,781.0,762.0,Hard,A,20160104,0
1,105583,103285,R,R,183.0,185.0,25.5,37.1,76.0,197.0,678.0,252.0,Hard,A,20160104,0
2,103917,106071,R,R,191.0,196.0,33.9,23.2,71.0,18.0,710.0,1675.0,Hard,A,20160104,0
3,117352,104471,R,R,175.0,183.0,17.6,31.0,813.0,87.0,25.0,636.0,Hard,A,20160104,0
4,106415,106298,L,R,170.0,185.0,20.2,21.8,117.0,78.0,495.0,672.0,Hard,A,20160104,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24724,209943,212051,R,U,188.0,,21.8,18.8,740.0,1109.0,34.0,8.0,Clay,D,20240203,0
24725,122533,202475,R,R,175.0,,26.9,23.2,554.0,748.0,67.0,32.0,Hard,D,20240202,1
24726,144748,144775,R,R,185.0,183.0,27.3,26.4,416.0,,109.0,,Hard,D,20240202,1
24727,122533,144748,R,R,175.0,185.0,26.9,27.3,554.0,416.0,67.0,109.0,Hard,D,20240202,1


In [7]:
restructured_df = restructured_df.dropna()
#restructured_df = restructured_df.drop(columns=['player1_id','player2_id'])
restructured_df


Unnamed: 0,player1_id,player2_id,player1_hand,player2_hand,player1_ht,player2_ht,player1_age,player2_age,player1_rank,player2_rank,player1_rank_points,player2_rank_points,surface,tourney_level,tourney_date,label
0,104797,105062,R,R,185.0,183.0,29.3,28.0,61.0,65.0,781.0,762.0,Hard,A,20160104,0
1,105583,103285,R,R,183.0,185.0,25.5,37.1,76.0,197.0,678.0,252.0,Hard,A,20160104,0
2,103917,106071,R,R,191.0,196.0,33.9,23.2,71.0,18.0,710.0,1675.0,Hard,A,20160104,0
3,117352,104471,R,R,175.0,183.0,17.6,31.0,813.0,87.0,25.0,636.0,Hard,A,20160104,0
4,106415,106298,L,R,170.0,185.0,20.2,21.8,117.0,78.0,495.0,672.0,Hard,A,20160104,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24717,207134,133933,R,R,175.0,183.0,25.0,28.2,569.0,819.0,64.0,24.0,Hard,D,20240203,1
24719,132374,121411,R,R,188.0,178.0,28.8,31.0,900.0,279.0,18.0,205.0,Hard,D,20240202,0
24721,208364,209943,L,R,185.0,188.0,23.9,21.8,616.0,740.0,55.0,34.0,Clay,D,20240203,1
24723,208364,105430,L,R,185.0,175.0,23.9,34.2,616.0,136.0,55.0,489.0,Clay,D,20240203,0


In [8]:
cat_features = ['player1_hand','player2_hand','surface','tourney_level','player1_id','player2_id']
X = restructured_df.drop(columns=['label'])
y=  restructured_df['label']

x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [9]:
train_pool = Pool(x_train,y_train,cat_features=cat_features)
test_pool=  Pool(x_test,cat_features=cat_features)

model = CatBoostClassifier(iterations=1500,learning_rate=0.05,depth=6,verbose=20,early_stopping_rounds=100)
model.fit(train_pool)
y_pred = model.predict(test_pool)

0:	learn: 0.6878386	total: 253ms	remaining: 6m 19s
20:	learn: 0.6396595	total: 2.24s	remaining: 2m 37s
40:	learn: 0.6181101	total: 4.26s	remaining: 2m 31s
60:	learn: 0.6093557	total: 6.24s	remaining: 2m 27s
80:	learn: 0.6030261	total: 8.33s	remaining: 2m 25s
100:	learn: 0.5987844	total: 10.4s	remaining: 2m 24s
120:	learn: 0.5956959	total: 12.6s	remaining: 2m 23s
140:	learn: 0.5933173	total: 14.6s	remaining: 2m 20s
160:	learn: 0.5908475	total: 16.6s	remaining: 2m 18s
180:	learn: 0.5877731	total: 18.8s	remaining: 2m 16s
200:	learn: 0.5849787	total: 20.9s	remaining: 2m 15s
220:	learn: 0.5819970	total: 23s	remaining: 2m 13s
240:	learn: 0.5791664	total: 25.1s	remaining: 2m 11s
260:	learn: 0.5760852	total: 27.2s	remaining: 2m 9s
280:	learn: 0.5734674	total: 29.3s	remaining: 2m 7s
300:	learn: 0.5705574	total: 31.5s	remaining: 2m 5s
320:	learn: 0.5681315	total: 33.6s	remaining: 2m 3s
340:	learn: 0.5658386	total: 35.8s	remaining: 2m 1s
360:	learn: 0.5632951	total: 38s	remaining: 1m 59s
380:	lea

In [10]:
acc=accuracy_score(y_test,y_pred)
print("accuracy",acc)
print(classification_report(y_test,y_pred))


accuracy 0.6677713338856669
              precision    recall  f1-score   support

           0       0.69      0.64      0.66      2457
           1       0.65      0.70      0.67      2371

    accuracy                           0.67      4828
   macro avg       0.67      0.67      0.67      4828
weighted avg       0.67      0.67      0.67      4828



In [11]:
full_pool = Pool(X,y,cat_features = cat_features)
model.fit(full_pool)

0:	learn: 0.6881247	total: 100ms	remaining: 2m 30s
20:	learn: 0.6367671	total: 2.2s	remaining: 2m 34s
40:	learn: 0.6166549	total: 4.21s	remaining: 2m 30s
60:	learn: 0.6087164	total: 6.35s	remaining: 2m 29s
80:	learn: 0.6044274	total: 8.54s	remaining: 2m 29s
100:	learn: 0.6006901	total: 10.7s	remaining: 2m 28s
120:	learn: 0.5976175	total: 12.9s	remaining: 2m 27s
140:	learn: 0.5949891	total: 15.1s	remaining: 2m 25s
160:	learn: 0.5929557	total: 17.3s	remaining: 2m 23s
180:	learn: 0.5906393	total: 19.5s	remaining: 2m 22s
200:	learn: 0.5888370	total: 21.6s	remaining: 2m 19s
220:	learn: 0.5867391	total: 24s	remaining: 2m 18s
240:	learn: 0.5840605	total: 26.2s	remaining: 2m 16s
260:	learn: 0.5814546	total: 28.5s	remaining: 2m 15s
280:	learn: 0.5790055	total: 30.7s	remaining: 2m 13s
300:	learn: 0.5767971	total: 32.9s	remaining: 2m 11s
320:	learn: 0.5746056	total: 35.1s	remaining: 2m 9s
340:	learn: 0.5722790	total: 37.4s	remaining: 2m 7s
360:	learn: 0.5700572	total: 39.6s	remaining: 2m 5s
380:	

<catboost.core.CatBoostClassifier at 0x213540263c0>

In [12]:
model.save_model("tennis_model.cbm")