In [1]:
# Standard library
import math
import os
from collections import defaultdict

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
# pearsonr is imported from the public `scipy.stats` namespace:
# `scipy.stats.stats` is a deprecated private module.
from scipy.stats import pearsonr
from sklearn.decomposition import PCA

In [2]:
# Read the raw CSV files from the dataset folder
DATASET_DIR = "dataset" + os.path.sep

# Matches: the first CSV column is used as the DataFrame index
df_tennis = pd.read_csv(DATASET_DIR + 'tennis_matches.csv', sep=',', index_col=0)

# Players: index_col=False tells pandas not to use the first column as the index
df_male, df_female = (
    pd.read_csv(DATASET_DIR + players_csv, sep=',', index_col=False)
    for players_csv in ('male_players.csv', 'female_players.csv')
)

## Print some records of the datasets

In [3]:
# Show the first five rows of the matches dataset
df_tennis.head(n=5)

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_entry,winner_name,...,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,tourney_spectators,tourney_revenue
0,2019-M020,Brisbane,Hard,32.0,A,20181231.0,300.0,105453.0,,Kei Nishikori,...,20.0,14.0,10.0,15.0,9.0,3590.0,16.0,1977.0,3928.0,742618.69
1,2019-M020,Brisbane,Hard,32.0,A,20181231.0,299.0,106421.0,,Daniil Medvedev,...,7.0,10.0,10.0,13.0,16.0,1977.0,239.0,200.0,3928.0,742618.69
2,2019-M020,Brisbane,Hard,32.0,A,20181231.0,298.0,105453.0,,Kei Nishikori,...,6.0,8.0,1.0,5.0,9.0,3590.0,40.0,1050.0,3928.0,742618.69
3,2019-M020,Brisbane,Hard,32.0,A,20181231.0,297.0,104542.0,PR,Jo-Wilfried Tsonga,...,9.0,11.0,4.0,6.0,239.0,200.0,31.0,1298.0,3928.0,742618.69
4,2019-M020,Brisbane,Hard,32.0,A,20181231.0,296.0,106421.0,,Daniil Medvedev,...,19.0,15.0,2.0,4.0,16.0,1977.0,18.0,1855.0,3928.0,742618.69


In [4]:
# Show the first five rows of the male players dataset
df_male.head(n=5)

Unnamed: 0,name,surname
0,Gardnar,Mulloy
1,Pancho,Segura
2,Frank,Sedgman
3,Giuseppe,Merlo
4,Richard Pancho,Gonzales


In [5]:
# Show the first five rows of the female players dataset
df_female.head(n=5)

Unnamed: 0,name,surname
0,Bobby,Riggs
1,X,X
2,Martina,Hingis
3,Mirjana,Lucic
4,Justine,Henin


## Missing values: Null

In [6]:
# Summary of the male players dataframe: index range, columns,
# non-null counts, dtypes and memory usage
df_male.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55208 entries, 0 to 55207
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     55031 non-null  object
 1   surname  55165 non-null  object
dtypes: object(2)
memory usage: 862.8+ KB


In [7]:
# Count the missing values in each column of the male players dataframe
df_male.isna().sum()

name       177
surname     43
dtype: int64

So, the male dataframe has 2 columns: name and surname. It has 55208 entries and 220 null values in total (177 in name and 43 in surname).

In [8]:
# Summary of the female players dataframe: index range, columns,
# non-null counts, dtypes and memory usage
df_female.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46172 entries, 0 to 46171
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     44505 non-null  object
 1   surname  46172 non-null  object
dtypes: object(2)
memory usage: 721.6+ KB


In [9]:
# Count the missing values in each column of the female players dataframe
df_female.isna().sum()

name       1667
surname       0
dtype: int64

So, the female dataframe has 2 columns: name and surname. It has 46172 entries and 1667 null values in the name column, while the surname is never null.

In [10]:
# Columns, non-null counts and dtypes of the matches dataframe
df_tennis.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 186128 entries, 0 to 186127
Data columns (total 49 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   tourney_id          186073 non-null  object 
 1   tourney_name        186103 non-null  object 
 2   surface             185940 non-null  object 
 3   draw_size           186099 non-null  float64
 4   tourney_level       186099 non-null  object 
 5   tourney_date        186100 non-null  float64
 6   match_num           186101 non-null  float64
 7   winner_id           186073 non-null  float64
 8   winner_entry        25827 non-null   object 
 9   winner_name         186101 non-null  object 
 10  winner_hand         186082 non-null  object 
 11  winner_ht           49341 non-null   float64
 12  winner_ioc          186099 non-null  object 
 13  winner_age          183275 non-null  float64
 14  loser_id            186100 non-null  float64
 15  loser_entry         44154 non-null

In [11]:
# Flag, column by column, whether the matches dataframe contains at least one missing value
df_tennis.isna().any(axis=0)

tourney_id            True
tourney_name          True
surface               True
draw_size             True
tourney_level         True
tourney_date          True
match_num             True
winner_id             True
winner_entry          True
winner_name           True
winner_hand           True
winner_ht             True
winner_ioc            True
winner_age            True
loser_id              True
loser_entry           True
loser_name            True
loser_hand            True
loser_ht              True
loser_ioc             True
loser_age             True
score                 True
best_of               True
round                 True
minutes               True
w_ace                 True
w_df                  True
w_svpt                True
w_1stIn               True
w_1stWon              True
w_2ndWon              True
w_SvGms               True
w_bpSaved             True
w_bpFaced             True
l_ace                 True
l_df                  True
l_svpt                True
l

In [12]:
# Count the missing values in each column of the matches dataframe
df_tennis.isna().sum()

tourney_id                55
tourney_name              25
surface                  188
draw_size                 29
tourney_level             29
tourney_date              28
match_num                 27
winner_id                 55
winner_entry          160301
winner_name               27
winner_hand               46
winner_ht             136787
winner_ioc                29
winner_age              2853
loser_id                  28
loser_entry           141974
loser_name                31
loser_hand                98
loser_ht              147780
loser_ioc                 26
loser_age               6538
score                    199
best_of                   29
round                     30
minutes               104468
w_ace                 103818
w_df                  103816
w_svpt                103818
w_1stIn               103818
w_1stWon              103816
w_2ndWon              103819
w_SvGms               103817
w_bpSaved             103813
w_bpFaced             103816
l_ace         

We can notice that some attributes have very few null values, while other attributes have more than 50% null values.

## Duplicate data

In [13]:
# Number of male player rows that repeat an earlier (name, surname) row;
# the first occurrence of each pair is not counted (default keep='first')
df_male.duplicated().sum()

524

In [14]:
# Number of female player rows that repeat an earlier (name, surname) row;
# the first occurrence of each pair is not counted (default keep='first')
df_female.duplicated().sum()

511

So, both the male and the female datasets have rows with the same name and surname. These rows can be duplicates (i.e. they refer to the same person) or homonyms. In the latter case we cannot distinguish the matches of one player from those of the other player.

In [15]:
# Keep a single copy of every repeated (name, surname) row in both player dataframes
df_male, df_female = df_male.drop_duplicates(), df_female.drop_duplicates()

In [16]:
# Stack the two player lists and show the rows that repeat an earlier one,
# i.e. the (name, surname) pairs present in both the male and the female list
df_players = pd.concat([df_male, df_female])
df_players.loc[df_players.duplicated(keep='first')]

Unnamed: 0,name,surname
0,Bobby,Riggs
417,Robin,White
2687,Di,Zhao
3660,J,Tobin
3918,Yi,Liu
...,...,...
44222,,Beckert
45538,J,Lambert
45597,J,Young
45658,M,Noble


In [17]:
# Number of rows in the combined player list that repeat an earlier row
df_players.duplicated().sum()

74

So, there are 74 (name, surname) pairs that appear in both the male and the female dataset.

In [18]:
# Rows of the combined player list whose (name, surname) pair already appeared earlier
df_players.loc[df_players.duplicated(subset=['name', 'surname'], keep='first')]

Unnamed: 0,name,surname
0,Bobby,Riggs
417,Robin,White
2687,Di,Zhao
3660,J,Tobin
3918,Yi,Liu
...,...,...
44222,,Beckert
45538,J,Lambert
45597,J,Young
45658,M,Noble


In [19]:
# Number of match rows that are exact copies of an earlier row
df_tennis.duplicated().sum()

309

In [20]:
# Repeated match rows carry no extra information: keep only the first copy of each
df_tennis = df_tennis.drop_duplicates(keep='first')

## Noise, outliers and data quality (data visualization here)

### Invalid characters in names and surnames 

In [76]:
# List the male players whose surname is not made only of ASCII letters and
# spaces: a row is selected when the number of allowed characters differs
# from the surname length.
df_male.loc[
    lambda players: players['surname'].str.count('[a-zA-Z ]')
    != players['surname'].str.len()
]

Unnamed: 0,name,surname
385,Jose,Garcia G003
457,Pedro,Gonzalez G162
858,Robert,Casey C100
1134,Robert,Phillips P239
1301,Ola,Jonsson J092
1538,Alberto,Gutierrez G230
1677,Alexander,Lindholm L174
1826,Jamie,Turturici T225
1894,Michael,Brown B395
2290,Paul,Robinson R261


In [77]:
# List the male players whose name is not made only of ASCII letters and
# spaces; rows with a missing name are selected too, as the output shows.
df_male.loc[
    lambda players: players['name'].str.count('[a-zA-Z ]')
    != players['name'].str.len()
]

Unnamed: 0,name,surname
10429,,
16955,??,Baillie
17067,,Rf Le Sueur
17091,,A Riches
17093,,N Zaher
...,...,...
49692,,Benroi
49699,,Bradley
51999,,I Georgiadis
52103,,Ae Browne


In [79]:
# List the female players whose surname is not made only of ASCII letters and
# spaces: a row is selected when the number of allowed characters differs
# from the surname length.
df_female.loc[
    lambda players: players['surname'].str.count('[a-zA-Z ]')
    != players['surname'].str.len()
]

Unnamed: 0,name,surname
21238,Ekaterina,Makarova 1996
31618,Habiba,El_Anany


In [81]:
# List the female players whose name is not made only of ASCII letters and
# spaces; rows with a missing name are selected too, as the output shows.
df_female.loc[
    lambda players: players['name'].str.count('[a-zA-Z ]')
    != players['name'].str.len()
]

Unnamed: 0,name,surname
3763,,Marine
16643,,Brown
16699,,Mckinney
16701,,Munsell
16746,,Raimo
...,...,...
45980,,Lloyd
45989,,Valtier
46027,,Keppel
46072,,Bavinger


So, in the male and female datasets we have some invalid names and surnames because of numbers, special characters or null values.

NOTE: the row "40071 	Jason "Jj",Belan 	NaN" is a parsing error: Belan is the surname, but pandas has not split the name from the surname, so both ended up in the name column and the surname is null.

### Categorical data (tennis dataset)

In [82]:
# Number of columns of each dtype in the matches dataframe
df_tennis.dtypes.value_counts()

float64    35
object     14
dtype: int64

In [None]:
# Print the distinct values of every column stored with dtype object
for column in df_tennis.select_dtypes(include="object").columns:
    print("Distinct Values in "+str(column)+": \n", df_tennis[column].unique(), "\n")

### Numeric data

## Missing values: default value

## Correlation

## Statistics/distributions

In [None]:
# Summary statistics of the matches dataframe (call currently disabled)
#df_tennis.describe()