In [141]:
import math
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

import collections
from scipy.stats.stats import pearsonr
import pandas as pd
import os

In [72]:
# Load the datasets from the dataset directory
DATASET_DIR = "dataset" + os.path.sep

# The first CSV column of the matches file is used as the row index
df_tennis = pd.read_csv(os.path.join(DATASET_DIR, 'tennis_matches.csv'), index_col=0)

# index_col=False tells pandas not to use the first column as the index
df_male, df_female = (
    pd.read_csv(os.path.join(DATASET_DIR, file_name), index_col=False)
    for file_name in ('male_players.csv', 'female_players.csv')
)

## Print some records of the datasets

In [21]:
# Preview the first five match records
df_tennis.head(n=5)

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_entry,winner_name,...,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,tourney_spectators,tourney_revenue
0,2019-M020,Brisbane,Hard,32.0,A,20181231.0,300.0,105453.0,,Kei Nishikori,...,20.0,14.0,10.0,15.0,9.0,3590.0,16.0,1977.0,3928.0,742618.69
1,2019-M020,Brisbane,Hard,32.0,A,20181231.0,299.0,106421.0,,Daniil Medvedev,...,7.0,10.0,10.0,13.0,16.0,1977.0,239.0,200.0,3928.0,742618.69
2,2019-M020,Brisbane,Hard,32.0,A,20181231.0,298.0,105453.0,,Kei Nishikori,...,6.0,8.0,1.0,5.0,9.0,3590.0,40.0,1050.0,3928.0,742618.69
3,2019-M020,Brisbane,Hard,32.0,A,20181231.0,297.0,104542.0,PR,Jo-Wilfried Tsonga,...,9.0,11.0,4.0,6.0,239.0,200.0,31.0,1298.0,3928.0,742618.69
4,2019-M020,Brisbane,Hard,32.0,A,20181231.0,296.0,106421.0,,Daniil Medvedev,...,19.0,15.0,2.0,4.0,16.0,1977.0,18.0,1855.0,3928.0,742618.69


In [22]:
# Preview the first five male player records
df_male.head(n=5)

Unnamed: 0,name,surname
0,Gardnar,Mulloy
1,Pancho,Segura
2,Frank,Sedgman
3,Giuseppe,Merlo
4,Richard Pancho,Gonzales


In [23]:
# Preview the first five female player records
df_female.head(n=5)

Unnamed: 0,name,surname
0,Bobby,Riggs
1,X,X
2,Martina,Hingis
3,Mirjana,Lucic
4,Justine,Henin


## Missing values: Null

In [24]:
# Summary of the male players frame: columns, non-null counts, dtypes and memory usage
df_male.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55208 entries, 0 to 55207
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     55031 non-null  object
 1   surname  55165 non-null  object
dtypes: object(2)
memory usage: 862.8+ KB


In [25]:
# Number of missing values in each column of the male players frame
df_male.isna().sum()

name       177
surname     43
dtype: int64

So, the male dataframe has 2 columns: name and surname. It has 55208 entries and 220 null values in total (177 in name and 43 in surname).

In [26]:
# Summary of the female players frame: columns, non-null counts, dtypes and memory usage
df_female.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46172 entries, 0 to 46171
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     44505 non-null  object
 1   surname  46172 non-null  object
dtypes: object(2)
memory usage: 721.6+ KB


In [37]:
# Number of missing values in each column of the female players frame
df_female.isna().sum()

name       1667
surname       0
dtype: int64

So, the female dataframe has 2 columns: name and surname. It has 46172 entries and 1667 null values in name, while surname is never null.

In [27]:
# Summary of the matches frame: columns, non-null counts, dtypes and memory usage
df_tennis.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 186128 entries, 0 to 186127
Data columns (total 49 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   tourney_id          186073 non-null  object 
 1   tourney_name        186103 non-null  object 
 2   surface             185940 non-null  object 
 3   draw_size           186099 non-null  float64
 4   tourney_level       186099 non-null  object 
 5   tourney_date        186100 non-null  float64
 6   match_num           186101 non-null  float64
 7   winner_id           186073 non-null  float64
 8   winner_entry        25827 non-null   object 
 9   winner_name         186101 non-null  object 
 10  winner_hand         186082 non-null  object 
 11  winner_ht           49341 non-null   float64
 12  winner_ioc          186099 non-null  object 
 13  winner_age          183275 non-null  float64
 14  loser_id            186100 non-null  float64
 15  loser_entry         44154 non-null

In [41]:
# For each attribute, check whether it contains at least one missing value
df_tennis.isna().any()

tourney_id            True
tourney_name          True
surface               True
draw_size             True
tourney_level         True
tourney_date          True
match_num             True
winner_id             True
winner_entry          True
winner_name           True
winner_hand           True
winner_ht             True
winner_ioc            True
winner_age            True
loser_id              True
loser_entry           True
loser_name            True
loser_hand            True
loser_ht              True
loser_ioc             True
loser_age             True
score                 True
best_of               True
round                 True
minutes               True
w_ace                 True
w_df                  True
w_svpt                True
w_1stIn               True
w_1stWon              True
w_2ndWon              True
w_SvGms               True
w_bpSaved             True
w_bpFaced             True
l_ace                 True
l_df                  True
l_svpt                True
l

In [44]:
# Since every attribute has missing values, count them per attribute
df_tennis.isna().sum()

tourney_id                55
tourney_name              25
surface                  188
draw_size                 29
tourney_level             29
tourney_date              28
match_num                 27
winner_id                 55
winner_entry          160301
winner_name               27
winner_hand               46
winner_ht             136787
winner_ioc                29
winner_age              2853
loser_id                  28
loser_entry           141974
loser_name                31
loser_hand                98
loser_ht              147780
loser_ioc                 26
loser_age               6538
score                    199
best_of                   29
round                     30
minutes               104468
w_ace                 103818
w_df                  103816
w_svpt                103818
w_1stIn               103818
w_1stWon              103816
w_2ndWon              103819
w_SvGms               103817
w_bpSaved             103813
w_bpFaced             103816
l_ace         

We can notice that some attributes have very few null values, while other attributes have more than 50% null values.

## Duplicate data

In [46]:
# Count the rows of the male players frame that repeat an earlier row
# (the first occurrence of each row is not counted)
df_male.duplicated().sum()

524

In [14]:
# Count the rows of the female players frame that repeat an earlier row
# (the first occurrence of each row is not counted)
df_female.duplicated().sum()

511

So, both the male and the female datasets have rows with the same name and surname. These rows can be duplicates (so they correspond to the same person) or homonyms. In the latter case, we cannot distinguish the matches of one player from those of the other.

In [15]:
# Remove the rows that repeat an earlier row in every column,
# keeping the first occurrence of each
df_male, df_female = df_male.drop_duplicates(), df_female.drop_duplicates()

In [16]:
# Stack the male and female player lists and show the rows that repeat an earlier
# row of the combined frame. Both lists were deduplicated beforehand, so a repeated
# row is a name/surname pair that occurs in both lists.
df_players = pd.concat([df_male, df_female], axis=0)
df_players.loc[df_players.duplicated()]

Unnamed: 0,name,surname
0,Bobby,Riggs
417,Robin,White
2687,Di,Zhao
3660,J,Tobin
3918,Yi,Liu
...,...,...
44222,,Beckert
45538,J,Lambert
45597,J,Young
45658,M,Noble


In [17]:
# Number of rows of the combined players frame that repeat an earlier row
df_players.duplicated().sum()

74

So, 74 (name, surname) pairs appear in both the male and the female datasets.

In [18]:
# Same check restricted explicitly to the name and surname columns
df_players.loc[df_players.duplicated(subset=['name', 'surname'])]

Unnamed: 0,name,surname
0,Bobby,Riggs
417,Robin,White
2687,Di,Zhao
3660,J,Tobin
3918,Yi,Liu
...,...,...
44222,,Beckert
45538,J,Lambert
45597,J,Young
45658,M,Noble


In [19]:
# Count the match rows that repeat an earlier row in every column
df_tennis.duplicated().sum()

309

In [20]:
# Fully repeated match rows carry no additional information:
# keep only the first occurrence of each
df_tennis = df_tennis.drop_duplicates(keep='first')

## Noise, outliers and data quality (data visualization here)

In [21]:
# Print the distinct values of every column stored with the object dtype
for column in df_tennis.select_dtypes(include="object").columns:
    print(f"Distinct Values in {column}: \n", df_tennis[column].unique(), "\n")

Distinct Values in tourney_id: 
 ['2019-M020' '2019-0451' '2019-0891' ... '2020-W-ITF-USA-47A-2020'
 '2020-W-ITF-USA-48A-2020' nan] 

Distinct Values in tourney_name: 
 ['Brisbane' 'Doha' 'Pune' ... 'W100 Nicholasville KY' 'W25 Las Vegas NV'
 nan] 

Distinct Values in surface: 
 ['Hard' 'Clay' 'Grass' 'Carpet' nan] 

Distinct Values in tourney_level: 
 ['A' 'P' 'G' 'I' 'M' 'PM' 'F' 'D' 'C' '15' '25' '60' '100' '80' '10' '50'
 '75' 'O' 'W' nan] 

Distinct Values in winner_entry: 
 [nan 'PR' 'Q' 'WC' 'Alt' 'LL' 'SE' 'ALT' 'SR' 'JE' 'A' 'ITF' 'P' 'I' 'IR'
 'JR'] 

Distinct Values in winner_name: 
 ['Kei Nishikori' 'Daniil Medvedev' 'Jo-Wilfried Tsonga' ... 'Sultan Gonen'
 'Viktoria Veleva' nan] 

Distinct Values in winner_hand: 
 ['R' 'L' 'U' nan] 

Distinct Values in winner_ioc: 
 ['JPN' 'RUS' 'FRA' 'AUS' 'CAN' 'BUL' 'GBR' 'SRB' 'USA' 'LAT' 'CZE' 'EST'
 'UKR' 'NED' 'CRO' 'BLR' 'CHI' 'SUI' 'POL' 'GER' 'LUX' 'ESP' 'ITA' 'GEO'
 'HUN' 'LTU' 'ARG' 'CYP' 'BIH' 'RSA' 'BEL' 'TUN' 'IND' 'BRA' 'AU

## Missing values: default value

## Correlation

In [45]:
# Pairwise correlation (pandas default method) between the numeric columns only.
# The frame also holds string columns (e.g. surface, winner_name). Since pandas 2.0,
# DataFrame.corr() no longer drops such columns silently and raises on them instead,
# so the numeric columns are selected explicitly; this also works on older pandas.
# NOTE(review): winner_id / loser_id look like identifiers, so their coefficients are
# probably not meaningful - confirm before interpreting them.
df_tennis.select_dtypes(include="number").corr()

Unnamed: 0,draw_size,tourney_date,match_num,winner_id,winner_ht,winner_age,loser_id,loser_ht,loser_age,best_of,...,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,tourney_spectators,tourney_revenue
draw_size,1.0,0.051022,0.270213,-0.211249,0.018759,0.199777,-0.218734,-0.0072,0.206213,0.378014,...,0.095602,0.079869,0.067465,0.09314,-0.317519,0.329775,-0.316871,0.284922,0.929342,0.842169
tourney_date,0.051022,1.0,0.085142,0.160846,-0.013548,0.004799,0.149017,-0.005079,0.062022,-0.006724,...,-0.002783,0.026564,0.0189,0.020774,0.04865,0.022022,-0.016706,0.039919,0.066579,0.070467
match_num,0.270213,0.085142,1.0,-0.285763,0.06932,0.162903,-0.289177,0.049183,0.177079,0.032165,...,-0.003143,-0.000268,0.002265,0.004627,-0.206575,0.127164,-0.193372,0.129871,0.271361,0.183233
winner_id,-0.211249,0.160846,-0.285763,1.0,-0.503485,-0.495656,0.896243,-0.541209,-0.347701,-0.195736,...,-0.158258,-0.12653,0.050753,0.168187,0.383267,-0.174511,0.291999,-0.131044,-0.197485,-0.180254
winner_ht,0.018759,-0.013548,0.06932,-0.503485,1.0,0.118012,-0.510127,0.412717,0.131299,0.133758,...,0.168725,0.144348,-0.055027,-0.181758,-0.046908,0.032934,-0.026052,0.000771,0.018208,0.021137
winner_age,0.199777,0.004799,0.162903,-0.495656,0.118012,1.0,-0.355623,0.125827,0.23537,0.120443,...,0.047297,0.049812,0.0068,-0.012786,-0.389687,0.249021,-0.28794,0.191483,0.187513,0.169052
loser_id,-0.218734,0.149017,-0.289177,0.896243,-0.510127,-0.355623,1.0,-0.531784,-0.504599,-0.19517,...,-0.170572,-0.144286,0.046723,0.169612,0.374293,-0.169673,0.346377,-0.151154,-0.204755,-0.186972
loser_ht,-0.0072,-0.005079,0.049183,-0.541209,0.412717,0.125827,-0.531784,1.0,0.114431,0.128677,...,0.161488,0.162459,-0.052793,-0.169487,-0.006033,0.001513,0.001487,-0.015248,-0.007844,-0.002837
loser_age,0.206213,0.062022,0.177079,-0.347701,0.131299,0.23537,-0.504599,0.114431,1.0,0.112163,...,0.045393,0.058043,0.002361,-0.029868,-0.341424,0.212625,-0.393909,0.243347,0.194749,0.177392
best_of,0.378014,-0.006724,0.032165,-0.195736,0.133758,0.120443,-0.19517,0.128677,0.112163,1.0,...,0.256062,0.376298,0.135856,0.162483,-0.140153,0.25346,-0.128482,0.15692,0.344593,0.32527


## Statistics/distributions

In [22]:
#some statistics
#df_tennis.describe()

## Test check wikipedia-iso-country-codes

In [108]:
# Load the country-code reference table and preview its first five rows
df_countrycode = pd.read_csv(
    os.path.join(DATASET_DIR, 'wikipedia-iso-country-codes.csv'),
    index_col=False,
)
df_countrycode.head(n=5)

Unnamed: 0,Englishshortnamelowercase,Alpha2code,Alpha3code,Numericcode,ISO3166-2
0,Afghanistan,AF,AFG,4,ISO 3166-2:AF
1,Åland Islands,AX,ALA,248,ISO 3166-2:AX
2,Albania,AL,ALB,8,ISO 3166-2:AL
3,Algeria,DZ,DZA,12,ISO 3166-2:DZ
4,American Samoa,AS,ASM,16,ISO 3166-2:AS


In [157]:
# For every match row, flag with True the winner country codes that do not appear in
# the Alpha3code column of the reference table. The result is indexed by the code
# itself; missing codes (NaN) are flagged True as well.
# NOTE(review): the column name and values such as 'GER', 'SUI' or 'NED' suggest IOC
# codes, which differ from ISO 3166-1 alpha-3 for some countries; those codes would be
# flagged here too - confirm that this is the intended check.
check_cc = pd.Series(
    data=~df_tennis['winner_ioc'].isin(df_countrycode['Alpha3code']).to_numpy(),
    index=df_tennis['winner_ioc'].to_numpy(),
)
check_cc

JPN    False
RUS    False
JPN    False
FRA    False
RUS    False
       ...  
NaN     True
NaN     True
NaN     True
RUS    False
AUS    False
Length: 186128, dtype: bool