<a href="https://colab.research.google.com/github/dundurlunka/ml-grand-slam/blob/main/ml_grand_slam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Read the points and matches CSV data (sourced from GitHub)

In [143]:
import pandas as pd
import numpy as np

import sys
import csv

In [144]:
# Raise the csv module's per-field size limit to the largest value the platform allows.
csv.field_size_limit(sys.maxsize)

# Both files are parsed with identical options: quoting disabled and ISO-8859-1 decoding.
_read_csv_options = {'quoting': csv.QUOTE_NONE, 'encoding': 'ISO-8859-1'}
points = pd.read_csv('./charting-m-points-from-2017.csv', **_read_csv_options)
matches = pd.read_csv('./charting-m-matches.csv', **_read_csv_options)

  exec(code_obj, self.user_global_ns, self.user_ns)


Merge the two datasets together by match_id

In [145]:
# Inner join on match_id: only rows whose match_id exists in both tables are kept.
joinedData = matches.merge(points, on='match_id', how='inner')

In [146]:
# Strip surrounding whitespace from every string value in the object-dtype columns.
# Only `str` elements are touched: `Series.str.strip()` replaces any non-string
# element of an object column (e.g. a number in a mixed-type column) with NaN,
# which would silently discard data.
df_obj = joinedData.select_dtypes(['object'])
joinedData[df_obj.columns] = df_obj.apply(
    lambda column: column.map(
        lambda value: value.strip() if isinstance(value, str) else value
    )
)

Select relevant columns from dataframe


In [147]:
# Column headers may carry stray whitespace; normalise them before selecting by name.
joinedData.columns = joinedData.columns.str.strip()

relevant_columns = [
    'Player 1', 'Player 2', 'Pl 1 hand', 'Pl 2 hand',
    'Tournament', 'Surface',
    'Set1', 'Set2', 'Gm1', 'Gm2', 'Pts',
    'Serving', '1st', '2nd', 'isAce', 'isUnret',
]
joinedData = joinedData[relevant_columns]

In [148]:
# A point counts as an ace when either the isAce or the isUnret flag is set;
# the separate isUnret column is no longer needed afterwards.
ace_or_unreturned = joinedData['isAce'] | joinedData['isUnret']
joinedData = joinedData.assign(isAce=ace_or_unreturned).drop(columns='isUnret')

In [149]:
tennis_players = ['Daniil Medvedev', 'Alexander Zverev', 'Roger Federer', 'Dominic Thiem', 'Nick Kyrgios', 'Novak Djokovic', 'Rafael Nadal', 'Stefanos Tsitsipas', 'Casper Ruud', 'Grigor Dimitrov']

# The 'Serving' column is compared against initials, so derive them from the
# full names: first letter of every space-separated part, upper-cased.
tennis_players_initials = [
    ''.join(part[0].upper() for part in name.split(' '))
    for name in tennis_players
]

# Keep a row when (a) at least one of the two players is in the selection and
# (b) the server's initials belong to a selected player.
# NOTE(review): (b) matches on initials only, so an opponent who shares initials
# with any selected player would also pass this filter -- confirm this cannot
# happen in the data.
_involves_selected_player = (
    joinedData['Player 1'].isin(tennis_players)
    | joinedData['Player 2'].isin(tennis_players)
)
_served_by_selected_initials = joinedData['Serving'].isin(tennis_players_initials)
joinedData = joinedData[_involves_selected_player & _served_by_selected_initials]

In [150]:
# Split the 'Pts' score string on '-' into its two halves, then discard the original.
_score_parts = joinedData['Pts'].str.split('-', expand=True)
joinedData[['Pts1', 'Pts2']] = _score_parts
joinedData = joinedData.drop(columns=['Pts'])

Separate the columns into new columns that are specific for Server and Returner.

In [151]:
def _name_initials(full_name):
    """Return the upper-cased first letter of each space-separated part of a name.

    Non-string values (e.g. missing names) and empty parts produced by repeated
    spaces yield no letters instead of raising.
    """
    if not isinstance(full_name, str):
        return ''
    return ''.join(part[0].upper() for part in full_name.split(' ') if part)


# Decide, per row, which player is serving by comparing the 'Serving' initials
# with each player's initials. Player 1 wins ties, as in a first-match-wins scan.
_served_by_p1 = joinedData['Player 1'].map(_name_initials) == joinedData['Serving']
_served_by_p2 = ~_served_by_p1 & (
    joinedData['Player 2'].map(_name_initials) == joinedData['Serving']
)

# Every row must be attributable to one of the two players; otherwise the
# server/returner columns below would be meaningless for that row.
_unmatched = ~(_served_by_p1 | _served_by_p2)
if _unmatched.any():
    raise ValueError(
        f"{int(_unmatched.sum())} row(s) have a 'Serving' value that matches "
        "neither 'Player 1' nor 'Player 2' initials"
    )

# new column -> (source column when Player 1 serves, source column when Player 2 serves)
# NOTE(review): this assumes 'Pts1'/'Pts2' are ordered Player 1 / Player 2 like
# the Gm*/Set* columns -- confirm against the dataset's data dictionary.
_server_returner_sources = {
    'FullNameServer': ('Player 1', 'Player 2'),
    'PointsServer': ('Pts1', 'Pts2'),
    'PointsReturner': ('Pts2', 'Pts1'),
    'GamesServer': ('Gm1', 'Gm2'),
    'GamesReturner': ('Gm2', 'Gm1'),
    'SetsServer': ('Set1', 'Set2'),
    'SetsReturner': ('Set2', 'Set1'),
    'HandReturner': ('Pl 2 hand', 'Pl 1 hand'),
}
for _new_column, (_when_p1_serves, _when_p2_serves) in _server_returner_sources.items():
    # Series.where keeps the first column where Player 1 serves and takes the
    # second column everywhere else.
    joinedData[_new_column] = joinedData[_when_p1_serves].where(
        _served_by_p1, joinedData[_when_p2_serves]
    )

joinedData = joinedData.drop(columns=['Player 1', 'Player 2', 'Pts1', 'Pts2', 'Gm1', 'Gm2', 'Set1', 'Set2', 'Pl 1 hand', 'Pl 2 hand', 'Serving'])

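Combine the first and second serve into one list-valued column so that it can be exploded into one row per serve later. The list always has two entries; a missing serve becomes a placeholder that is filtered out after the explode.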

In [152]:
def _tagged_serves(point):
    """Return [first, second] serve codes of one row, prefixed with 'f' and 's'."""
    first_serve = 'f' + str(point['1st'])
    second_serve = 's' + str(point['2nd'])
    return [first_serve, second_serve]


joinedData['Serves'] = joinedData.apply(_tagged_serves, axis=1)

Explode list of first and second serve into separate rows

In [153]:
# One row per list entry in 'Serves'; the other columns are repeated for each entry.
joinedData = joinedData.explode(column='Serves')

Add new column indicating whether the serve was first or second

In [154]:
# The leading character is the 'f'/'s' marker that tags first and second serves.
joinedData['IsFirstServe'] = joinedData['Serves'].str.get(0).eq('f')

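Remove the 'f' and 's' prefixes (first and second serve markers) that were added before the explode, drop the rows that have no serve recorded, and drop the original '1st' and '2nd' columns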

In [155]:
# Drop the one-character 'f'/'s' marker from the front of every serve code.
joinedData['Serves'] = joinedData['Serves'].str.slice(1)

# Rows whose serve was missing now read 'nan' (stringified NaN) or are empty; discard them.
_has_serve = ~joinedData['Serves'].isin(['nan', ''])
joinedData = joinedData[_has_serve].drop(columns=['1st', '2nd'])

Remove lets (the leading 'c' characters of a serve code)

In [156]:
# Strip every leading 'c' from the serve code.
joinedData['Serves'] = joinedData['Serves'].str.lstrip('c')

In [157]:
# Number of serve rows per server at this point in the pipeline.
joinedData['FullNameServer'].value_counts()

Roger Federer         12724
Novak Djokovic        12549
Daniil Medvedev       12427
Stefanos Tsitsipas    10837
Rafael Nadal          10717
Dominic Thiem         10051
Alexander Zverev       8100
Nick Kyrgios           7470
Casper Ruud            5300
Grigor Dimitrov        5101
Name: FullNameServer, dtype: int64

Extract direction of serve

In [158]:
def extract_serve_direction(row):
  """Classify one serve as 'Fault', 'Out wide', 'Body' or 'Down the T'.

  Parameters
  ----------
  row : mapping
      Must provide the serve code string under the key 'Serves'
      (a DataFrame row or a plain dict both work).

  Returns
  -------
  str
      'Fault' when the second character of the code is one of the fault
      markers; otherwise the label mapped from the first character
      ('4', '5' or '6'). Any other first character, or an empty code,
      falls back to 'Body'.
  """
  serves_directions_dict = {
      '4': 'Out wide',
      '5': 'Body',
      '6': 'Down the T'
  }
  fault_markers = ('x', 'd', 'w', 'n', 'g', '!', 'e')

  serve = row['Serves']
  if len(serve) > 1 and serve[1] in fault_markers:
      return "Fault"

  # The fallback is spelled exactly like the mapped 'Body' label so unknown
  # codes do not create a separate lower-case 'body' category. Slicing instead
  # of indexing keeps an empty code from raising IndexError.
  return serves_directions_dict.get(serve[:1], 'Body')

# Classify every serve row, then replace the raw serve code with its direction label.
_serve_directions = joinedData.apply(extract_serve_direction, axis=1)
joinedData = joinedData.assign(Direction=_serve_directions).drop(columns=['Serves'])

# Start of machine learning

In [160]:
import seaborn as sns

In [159]:
# Allow up to 500 rows to be rendered, then show the first 500 rows of the prepared data.
pd.options.display.max_rows = 500
joinedData.iloc[:500]

Unnamed: 0,Tournament,Surface,isAce,FullNameServer,PointsServer,PointsReturner,GamesServer,GamesReturner,SetsServer,SetsReturner,HandReturner,IsFirstServe,Direction
637,Tour Finals,Hard,False,Novak Djokovic,0,0,0,0,0,0,R,True,Fault
637,Tour Finals,Hard,False,Novak Djokovic,0,0,0,0,0,0,R,False,Out wide
638,Tour Finals,Hard,False,Novak Djokovic,0,15,0,0,0,0,R,True,Fault
638,Tour Finals,Hard,False,Novak Djokovic,0,15,0,0,0,0,R,False,Down the T
639,Tour Finals,Hard,True,Novak Djokovic,15,15,0,0,0,0,R,True,Out wide
640,Tour Finals,Hard,False,Novak Djokovic,30,15,0,0,0,0,R,True,Fault
640,Tour Finals,Hard,False,Novak Djokovic,30,15,0,0,0,0,R,False,Body
641,Tour Finals,Hard,False,Novak Djokovic,40,15,0,0,0,0,R,True,Out wide
642,Tour Finals,Hard,False,Casper Ruud,0,0,0,1,0,0,R,True,Fault
642,Tour Finals,Hard,False,Casper Ruud,0,0,0,1,0,0,R,False,Down the T
