In [7]:
import collections
import datetime
import math
import os
from datetime import date

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.stats.stats import pearsonr
from sklearn.decomposition import PCA

In [8]:
# Read the three CSV sources from the dataset directory.
DATASET_DIR = "dataset" + os.path.sep

# Match records: the first CSV column becomes the row index.
df_tennis = pd.read_csv(os.path.join(DATASET_DIR, 'tennis_matches.csv'), sep=',', index_col=0)

# Player lists: index_col=False tells pandas not to use the first column as the index.
df_male, df_female = (
    pd.read_csv(os.path.join(DATASET_DIR, file_name), sep=',', index_col=False)
    for file_name in ('male_players.csv', 'female_players.csv')
)

## Print some records of the datasets

In [None]:
# Preview the first rows of the match table.
df_tennis.head()

In [None]:
# Preview the first rows of the male player table.
df_male.head()

In [None]:
# Preview the first rows of the female player table.
df_female.head()

## Missing values: Null

In [None]:
# Summary of the male player table: columns, non-null counts and dtypes.
df_male.info()

In [None]:
# Number of null values in each column of the male player table.
df_male.isnull().sum(axis = 0)

So, the male dataframe has 2 columns: name and surname. It has 55208 entries and about 200 null values in total.

In [None]:
# Summary of the female player table: columns, non-null counts and dtypes.
df_female.info()

In [None]:
# Number of null values in each column of the female player table.
df_female.isnull().sum(axis = 0)

So, the female dataframe has 2 columns: name and surname. It has 46172  entries and 1667 null values in the name, but the surname is never null.

In [None]:
# Summary of the match table: columns, non-null counts and dtypes.
df_tennis.info()

In [None]:
# For each attribute, report whether it contains at least one null value.
df_tennis.isnull().any()

In [None]:
# Since all the attributes have missing values, count the nulls per column.
df_tennis.isnull().sum(axis = 0)

We can notice that some attributes have very few null values, while other attributes have more than 50% null values.

## Duplicate data

In [15]:
# Count the rows of the male table that repeat an earlier row
# (the first occurrence of each row is not counted).
male_repeat_mask = df_male.duplicated(keep='first')
male_repeat_mask.sum()

524

In [16]:
# Count the rows of the female table that repeat an earlier row
# (the first occurrence of each row is not counted).
female_repeat_mask = df_female.duplicated(keep='first')
female_repeat_mask.sum()

511

So, both the male and the female datasets have rows with the same name and surname. These rows can be duplicates (so they correspond to the same person) or they can be homonyms. In the latter case we cannot distinguish the matches of one player from those of the other player.

In [17]:
# Remove only the fully duplicated rows, keeping the first occurrence of each in both player tables.
df_male, df_female = df_male.drop_duplicates(), df_female.drop_duplicates()

In [18]:
# Stack the male and female tables and list the rows that repeat an earlier row of the combined table.
# If each table was already deduplicated, such rows are names present in both lists.
df_players = pd.concat([df_male, df_female])
cross_list_mask = df_players.duplicated(keep='first')
df_players[cross_list_mask]

Unnamed: 0,name,surname
0,Bobby,Riggs
417,Robin,White
2687,Di,Zhao
3660,J,Tobin
3918,Yi,Liu
...,...,...
44222,,Beckert
45538,J,Lambert
45597,J,Young
45658,M,Noble


In [19]:
# Number of rows in the combined player table that repeat an earlier row.
df_players.duplicated(keep='first').sum()

74

So, 74 name-surname pairs appear in both the male and the female player lists.

In [20]:
# Same listing as before, but comparing explicitly on the name and surname columns only.
same_full_name = df_players.duplicated(subset=['name', 'surname'], keep='first')
df_players[same_full_name]

Unnamed: 0,name,surname
0,Bobby,Riggs
417,Robin,White
2687,Di,Zhao
3660,J,Tobin
3918,Yi,Liu
...,...,...
44222,,Beckert
45538,J,Lambert
45597,J,Young
45658,M,Noble


In [21]:
# Count the match rows that are exact repeats of an earlier row (the row index is not part of the comparison).
df_tennis.duplicated(keep='first').sum()

309

In [24]:
# Remove these duplicated rows because they carry the same information; the first occurrence is kept.
df_tennis = df_tennis.drop_duplicates()

## Noise, outliers and data quality (data visualization here)

### Invalid characters in names and surnames 

In [None]:
# Check that names and surnames are valid (ASCII letters and spaces only).
# A surname is flagged when the number of allowed characters differs from its length.
# Null surnames are flagged too, because NaN != NaN is True.
male_surnames = df_male['surname']
df_male[male_surnames.str.count('[a-zA-Z ]').ne(male_surnames.str.len())]

In [None]:
# Male rows whose name is null or holds characters other than ASCII letters and spaces.
male_names = df_male['name']
df_male[male_names.str.count('[a-zA-Z ]').ne(male_names.str.len())]

In [None]:
# Female rows whose surname is null or holds characters other than ASCII letters and spaces.
female_surnames = df_female['surname']
df_female[female_surnames.str.count('[a-zA-Z ]').ne(female_surnames.str.len())]

In [None]:
# Female rows whose name is null or holds characters other than ASCII letters and spaces.
female_names = df_female['name']
df_female[female_names.str.count('[a-zA-Z ]').ne(female_names.str.len())]

So, in the male and female datasets we have some invalid names and surnames because of numbers, special characters or null values.

NOTE: the row "40071 	Jason "Jj",Belan 	NaN" is a parsing error: Belan is the surname, but pandas did not split the name from the surname (presumably because of the quotes inside the name field).

### Categorical data in tennis dataset

In [25]:
# How many columns of the match table have each dtype.
df_tennis.dtypes.value_counts()

float64    35
object     14
dtype: int64

In [14]:
# Print the distinct values found in every column of dtype object.
object_columns = df_tennis.select_dtypes(include=['object']).columns
for column in object_columns:
    distinct_values = df_tennis[column].unique()
    print(f"Distinct Values in {column}: \n", distinct_values, "\n")

Distinct Values in tourney_id: 
 ['2019-M020' '2019-0451' '2019-0891' ... '2020-W-ITF-USA-47A-2020'
 '2020-W-ITF-USA-48A-2020' nan] 

Distinct Values in tourney_name: 
 ['Brisbane' 'Doha' 'Pune' ... 'W100 Nicholasville KY' 'W25 Las Vegas NV'
 nan] 

Distinct Values in surface: 
 ['Hard' 'Clay' 'Grass' 'Carpet' nan] 

Distinct Values in tourney_level: 
 ['A' 'P' 'G' 'I' 'M' 'PM' 'F' 'D' 'C' '15' '25' '60' '100' '80' '10' '50'
 '75' 'O' 'W' nan] 

Distinct Values in winner_entry: 
 [nan 'PR' 'Q' 'WC' 'Alt' 'LL' 'SE' 'ALT' 'SR' 'JE' 'A' 'ITF' 'P' 'I' 'IR'
 'JR'] 

Distinct Values in winner_name: 
 ['Kei Nishikori' 'Daniil Medvedev' 'Jo-Wilfried Tsonga' ... 'Sultan Gonen'
 'Viktoria Veleva' nan] 

Distinct Values in winner_hand: 
 ['R' 'L' 'U' nan] 

Distinct Values in winner_ioc: 
 ['JPN' 'RUS' 'FRA' 'AUS' 'CAN' 'BUL' 'GBR' 'SRB' 'USA' 'LAT' 'CZE' 'EST'
 'UKR' 'NED' 'CRO' 'BLR' 'CHI' 'SUI' 'POL' 'GER' 'LUX' 'ESP' 'ITA' 'GEO'
 'HUN' 'LTU' 'ARG' 'CYP' 'BIH' 'RSA' 'BEL' 'TUN' 'IND' 'BRA' 'AU

#### tourney_id

In [9]:
# For non-null ids the first 4 characters should be the year:
# count the rows whose 4-character prefix is not entirely numeric.
# Null ids give NaN, and NaN == False is False, so they are not counted.
id_prefix = df_tennis['tourney_id'].str[:4]
len(df_tennis[id_prefix.str.isnumeric() == False])

0

In [10]:
# Count the rows whose id year lies in the future (invalid).
# Null ids are mapped to 0 and therefore never counted.
id_year = pd.to_numeric(df_tennis['tourney_id'].str[:4]).fillna(0).astype('int')
len(df_tennis[id_year > date.today().year])

0

In [11]:
# Count the rows whose id year is too far in the past (earlier than 1874).
# Null ids are mapped to the current year and therefore never counted.
current_year = date.today().year
id_year = pd.to_numeric(df_tennis['tourney_id'].str[:4]).fillna(current_year).astype('int')
len(df_tennis[id_year < 1874])

0

So, the first 4 characters of the tourney_id are always a valid year when the tourney_id is not null.

In [12]:
# Check that every tournament has more than one match:
# count the rows whose tourney_id occurs exactly once in the table.
id_occurs_again = df_tennis['tourney_id'].duplicated(keep=False)
len(df_tennis[~id_occurs_again])

0

In [13]:
# Number of distinct non-null tournament ids.
df_tennis["tourney_id"].nunique()

4853

In [26]:
# Count how many matches are recorded for each tournament
# (rows with a non-null match_num, grouped by tourney_id).
# The call is active again so that the output shown below is reproducible on Restart & Run All.
df_tennis.groupby(['tourney_id'])["match_num"].count()

tourney_id
2016-0083                  54
2016-0091                  59
2016-0213                  59
2016-0221                  47
2016-0228                  57
                           ..
2021-W-ITF-USA-12A-2021    71
2021-W-ITF-USA-13A-2021    46
2021-W-ITF-USA-14A-2021    55
2021-W-ITF-USA-15A-2021    54
2021-W-ITF-USA-16A-2021    55
Name: match_num, Length: 4853, dtype: int64

So, every tourney_id appears more than once.

#### tourney_name

In [None]:
# TODO (open question: is this useful?): check that the same tourney_id always has the same tourney_name.
# The disabled expression below compares the number of (id, name) groups with the number of id groups.
#len(df_tennis.groupby(['tourney_id','tourney_name']).size())-len(df_tennis.groupby(['tourney_id']).size())

#### tourney_level

In [None]:
#there are no records for S and T1, nor for E, J, T (but for these 3 they will appear (see pdf))
#the data contains O and W, which are not in the pdf

#apart from this, at most I would add a check like the one done for the hand

**tourney_date**

In [None]:
# The dates are stored as floats, so they need to be converted to date objects.
# First count how many of them are null.
df_tennis['tourney_date'].isnull().sum()

In [28]:
# Convert tourney_date (parsed with the YYYYMMDD format) to datetimes,
# then count the dates that are later than today.
df_tennis['tourney_date'] = pd.to_datetime(df_tennis['tourney_date'], format='%Y%m%d')
today = pd.to_datetime(datetime.date.today())

# Vectorised comparison instead of a Python loop. The old loop variable was named `date`,
# which overwrote the `date` class imported from datetime that other cells call via date.today().
# Missing dates (NaT) compare as False, so they are never counted.
invalid_data = int((df_tennis['tourney_date'] > today).sum())
print(invalid_data)

0


There are no invalid dates: none of them lies in the future.

#### winner_name and loser_name

In [None]:
    # in the ranges we should consider also [-.']???

In [36]:
# Check that winner names are valid: list those that are null or hold characters
# other than ASCII letters, spaces, apostrophes, commas, periods and hyphens.
# Null names are listed because NaN != NaN is True.
winner_names = df_tennis['winner_name']
winner_name_invalid = winner_names.str.count("[a-zA-Z ',.-]").ne(winner_names.str.len())
df_tennis.loc[winner_name_invalid, 'winner_name']

70116     Alejandro Gomez Gb42
74122     Alejandro Gomez Gb42
74136     Alejandro Gomez Gb42
74160     Alejandro Gomez Gb42
74169     Alejandro Gomez Gb42
                  ...         
186117                     NaN
186118                     NaN
186120                     NaN
186121                     NaN
186123                     NaN
Name: winner_name, Length: 105, dtype: object

In [30]:
# Loser names that are null or hold characters other than ASCII letters,
# spaces, apostrophes, commas, periods and hyphens.
loser_names = df_tennis['loser_name']
loser_name_invalid = loser_names.str.count("[a-zA-Z ',.-]").ne(loser_names.str.len())
df_tennis.loc[loser_name_invalid, 'loser_name']

1            Jo-Wilfried Tsonga
59        Christopher O'Connell
277       Christopher O'Connell
350          Jo-Wilfried Tsonga
982          Jo-Wilfried Tsonga
                  ...          
186119                      NaN
186123                      NaN
186125                      NaN
186126                      NaN
186127                      NaN
Name: loser_name, Length: 306, dtype: object

There are invalid characters in the names of some winners and some losers

#### winner_hand and loser_hand

In [None]:
# Count the rows whose winner hand is not one of the accepted codes.
# Nulls are replaced by 'U' and the comparison is done on upper-cased values.
hand = ['R','L','U']
winner_hand_codes = df_tennis['winner_hand'].fillna('U').str.upper()
len(df_tennis[~winner_hand_codes.isin(hand)])

In [None]:
# Same check for the loser hand; relies on the `hand` list defined in the winner-hand cell.
loser_hand_codes = df_tennis['loser_hand'].fillna('U').str.upper()
len(df_tennis[~loser_hand_codes.isin(hand)])

There are no invalid entries for the hand of the winner or the loser.

#### winner_ioc and loser_ioc

In [None]:
# Load the country-code reference table and preview its first rows.
country_codes_path = os.path.join(DATASET_DIR, 'country-codes_csv.csv')
df_countrycode = pd.read_csv(country_codes_path, sep=',', index_col=False)
df_countrycode.head()

In [None]:
# Flag every match whose winner_ioc is not listed in the IOC column of the country-code table.
# check_cc is a boolean Series indexed by the winner_ioc values themselves.
ioc_not_listed = ~df_tennis['winner_ioc'].isin(df_countrycode['IOC'])
check_cc = pd.Series(ioc_not_listed.values, index=df_tennis['winner_ioc'].values)

# Occurrences of each unlisted code. The leftover `type(...)` debug expression was removed:
# its value was discarded and never displayed.
check_cc[check_cc].index.value_counts()

In [None]:
#I would also add the check using the other code that you had seen @reny, so that we get an idea of how many of
#these are actually wrong and how many are just in the wrong format

#### score
https://www.wikihow.it/Tenere-il-Punteggio-a-Tennis

In [None]:
#What are the numbers between the brackets ()???

In [None]:
#a tennis match must end with a lead of at least 2 points (e.g. 7-5, 6-4). at most we could check this
#but I am not sure it is useful information. if not, we can skip it.
#it seems that the scores with () do not respect this rule.

**best_of**

In [27]:
# Check whether there are values other than 3 or 5 (nulls are included in the counts).
df_tennis['best_of'].value_counts(dropna = False)

3.0    182617
5.0      3173
NaN        29
Name: best_of, dtype: int64

There are no values other than 3 and 5, apart from some null values.

### Numeric data

In [None]:
# Print the distinct values found in every float64 column.
# The column list is computed once; the earlier bare expression produced no output and was dropped.
float_columns = df_tennis.select_dtypes(include=['float64']).columns
for column in float_columns:
    print("Distinct Values in "+str(column)+": \n", df_tennis[column].unique(), "\n")

In [None]:
# TODO: unfinished cell. It held only the truncated name `df_te`, which raises
# NameError on Restart & Run All; complete the intended numeric-data check here.

## Missing values: default value

## Correlation

In [None]:
# Keep only the float64 columns of the match table for the correlation analysis.
df_numeric = df_tennis.select_dtypes(include=['float64'])

In [None]:
# Pairwise correlation matrix of the numeric columns.
df_numeric.corr()

TODO: add the discussion of the correlation results here.

## Statistics/distributions

In [None]:
# Summary statistics for every column, non-numeric ones included.
# The original line was a disabled `describe()` call followed by a stray `all` token;
# it is read here as describe(include='all').
df_tennis.describe(include='all')