In [None]:
import math
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

import collections
from scipy.stats.stats import pearsonr
import pandas as pd
import os
from datetime import date

In [None]:
# Load the raw data: the match records plus the male and female player registries.
DATASET_DIR = os.path.join("dataset", "")  # keeps the trailing separator so file names can be appended

# The first column of the matches file is used as the row index.
df_tennis = pd.read_csv(DATASET_DIR + 'tennis_matches.csv', sep=',', index_col=0)

# index_col=False tells pandas not to use the first column as the index.
df_male, df_female = (
    pd.read_csv(DATASET_DIR + csv_name, sep=',', index_col=False)
    for csv_name in ('male_players.csv', 'female_players.csv')
)

## Print some records of the datasets

In [None]:
df_tennis.head()

In [None]:
df_male.head()

In [None]:
df_female.head()

## Missing values: Null

In [None]:
#info about data that we have for male
df_male.info()

In [None]:
#number of null in the columns
df_male.isnull().sum(axis = 0)

So, the male dataframe has 2 columns: name and surname. It has 55208 entries and about 200 null values in total.

In [None]:
#info about data that we have for female
df_female.info()

In [None]:
#number of null in the columns
df_female.isnull().sum(axis = 0)

So, the female dataframe has 2 columns: name and surname. It has 46172  entries and 1667 null values in the name, but the surname is never null.

In [None]:
df_tennis.info()

In [None]:
#we see if the attributes have some null values
df_tennis.isnull().any()

In [None]:
#since all the attributes has missing values, we count them
df_tennis.isnull().sum(axis = 0)

We can notice that some attributes have very few null values, while other attributes have more than 50% of null values.

## Duplicate data

In [None]:
#we see if there are duplicates in the dataset male and female
df_male.duplicated(keep='first').sum()

In [None]:
df_female.duplicated(keep='first').sum()

So, both the male and the female datasets have rows with the same name and surname. These rows can be duplicates (they correspond to the same person) or homonyms. In the latter case we cannot distinguish the matches of one player from those of the other.

In [None]:
# Keep the first occurrence of every row and discard the exact copies that follow it.
df_male_no_dup = df_male[~df_male.duplicated(keep='first')]
df_female_no_dup = df_female[~df_female.duplicated(keep='first')]

In [None]:
# Stack the de-duplicated male and female players, then list the rows that repeat
# an earlier row: since each input has no duplicates, these are rows found in both.
df_players = pd.concat([df_male_no_dup, df_female_no_dup])
df_players.loc[df_players.duplicated(keep='first')]

In [None]:
df_players.duplicated(keep='first').sum()

So, there are 74 (name, surname) pairs that appear in both the male and the female players datasets.

In [None]:
# Rows whose (name, surname) pair repeats an earlier row of the combined players frame.
df_players.loc[df_players.duplicated(subset=['name', 'surname'], keep='first')]

In [None]:
#check if there are duplicated rows in the dataset
df_tennis.duplicated(keep='first').sum()

## Noise, outliers and data quality (data visualization here)

|  Categorical  |   Ordinal   |      Numerical     | Ratio-Scaled |
|:-------------:|:-----------:|:------------------:|:------------:|
|   tourney_id  |  match_num  |      draw_size     |   winner_ht  |
|  tourney_name | winner_rank |       minutes      |  winner_age  |
|    surface    |             |     winner_ace     |              |
| tourney_level |             |      winner_df     |              |
|   winner_id   |             |     winner_svpt    |              |
|   winner_ioc  |             |    winner_1stIn    |              |
|  winner_hand  |             |    winner_1stWon   |              |
|  winner_entry |             |    winner_2ndWon   |              |
|    best_of    |             |       w_SvGms      |              |
|               |             | winner_rank_points |              |
|               |             |      w_bpSaved     |              |
|               |             |      w_bpFaced     |              |

For each winner attribute there is the loser counterpart

### Invalid characters in names and surnames 

In [None]:
#see if all the names and surnames are valid (only letters)
df_male[df_male['surname'].str.count("[a-zA-Z '.-]")!=df_male['surname'].str.len()]

In [None]:
df_male[df_male['name'].str.count("[a-zA-Z ',.-]")!=df_male['name'].str.len()]

In [None]:
df_female[df_female['surname'].str.count("[a-zA-Z '.-]")!=df_female['surname'].str.len()]

In [None]:
df_female[df_female['name'].str.count("[a-zA-Z ',.-]")!=df_female['name'].str.len()]

So, in the male and female datasets we have some invalid names and surnames because of numbers, special characters or null values.

NOTE: the row "40071 	Jason "Jj",Belan 	NaN" is a parsing error: Belan is the surname, but pandas did not split it from the name (probably because of the quotes around "Jj"), so the surname is NaN.

### Analysis of tennis dataset

In [None]:
df_tennis.dtypes.value_counts()

In [None]:
#Values in the columns with type object
for column in df_tennis.select_dtypes(include=['object']).columns:
    print("Distinct Values in "+str(column)+": \n", df_tennis[column].unique(), "\n")

#### tourney_id

In [None]:
# For non-null ids the first 4 characters should be the year:
# count the rows whose 4-character prefix is not numeric (null ids are not counted).
(df_tennis['tourney_id'].str[:4].str.isnumeric() == False).sum()

In [None]:
# Count the ids whose year prefix is in the future (hence invalid).
# Null ids are replaced by 0 so that they are never counted.
(
    pd.to_numeric(df_tennis['tourney_id'].str[:4])
    .fillna(0)
    .astype('int')
    .gt(date.today().year)
    .sum()
)

In [None]:
# Count the ids whose year prefix is too far in the past (before 1874).
# Null ids are replaced by the current year so that they are never counted.
(
    pd.to_numeric(df_tennis['tourney_id'].str[:4])
    .fillna(date.today().year)
    .astype('int')
    .lt(1874)
    .sum()
)

So, the first 4 characters of the tourney_id are always a valid year when the tourney_id is not null.

In [None]:
# Every tourney should have more than one match: count the ids that appear exactly once.
(~df_tennis['tourney_id'].duplicated(keep=False)).sum()

So, every tourney_id appears more than once.

In [None]:
# Number of distinct tourneys in the dataset (null ids are not counted).
df_tennis["tourney_id"].nunique()

In [None]:
#count how many match are played for each tournamnet
#df_tennis.groupby(['tourney_id'])["match_num"].count()

#### tourney_name

In [None]:
# Check that a tourney_id always has the same name: difference between the number of
# distinct (tourney_id, tourney_name) pairs and the number of distinct tourney_ids.
df_tennis.groupby(['tourney_id', 'tourney_name']).ngroups - df_tennis.groupby(['tourney_id']).ngroups

In [None]:
# Show only the tourney_ids associated with more than one tourney_name, together with
# the number of names, instead of printing every (tourney_id, tourney_name) group.
names_per_tourney_id = df_tennis.groupby('tourney_id')['tourney_name'].nunique()
names_per_tourney_id[names_per_tourney_id > 1]

In [None]:
# Difference between the number of distinct (tourney_name, tourney_id) pairs
# and the number of distinct tourney_names.
df_tennis.groupby(['tourney_name', 'tourney_id']).ngroups - df_tennis.groupby(['tourney_name']).ngroups

For a given tourney_id we may have multiple tourney_names, and for a given tourney_name we may have multiple tourney_ids.

In [None]:
# Number of distinct tourney names, NaN excluded
# (len(df_tennis["tourney_name"].unique()) would count the NaN as well).
df_tennis["tourney_name"].nunique()

#### tourney_level

In [None]:
levels_man = ['G', 'M', 'A', 'C', 'S', 'F', 'D']

In [None]:
# Women's codes: the men's codes plus the women-only ones.
levels_woman = levels_man + ['P', 'PM', 'I', 'T1']
levels_woman_man = ['E','J','T'] #not present in the dataset yet (documentation)
# levels_woman already contains levels_man, so levels_man is not added again:
# every documented code appears exactly once in all_levels.
all_levels = levels_woman + levels_woman_man

In [None]:
# Rows whose level is not one of the documented codes (rows with a null level are included).
df_other_levels = df_tennis.loc[~df_tennis['tourney_level'].isin(all_levels)]
# Among them, the distinct purely numeric codes (the codes about the prize money).
df_other_levels.loc[df_other_levels['tourney_level'].str.isnumeric() == True, 'tourney_level'].unique()

These are the numeric codes that appear in the dataset

In [None]:
# Distinct non-numeric codes that are not cited in the documentation (so they are not prize codes).
df_other_levels.loc[df_other_levels['tourney_level'].str.isnumeric() == False, 'tourney_level'].unique()

There are 2 more codes in addition to the ones explicitly indicated by the documentation.

In [None]:
# Documented codes that never occur among the non-numeric levels of the dataset.
list(
    set(all_levels)
    - set(df_tennis.loc[df_tennis['tourney_level'].str.isnumeric() == False, 'tourney_level'].unique())
)

For these codes there are no rows in the dataset

In [None]:
#get the occurrenes of each level
df_tennis["tourney_level"].value_counts()

#### winner_name and loser_name

In [None]:
#check that names are valid
df_tennis[df_tennis['winner_name'].str.count("[a-zA-Z ',.-]")!=df_tennis['winner_name'].str.len()]['winner_name']

In [None]:
df_tennis[df_tennis['loser_name'].str.count("[a-zA-Z ',.-]")!=df_tennis['loser_name'].str.len()]['loser_name']

There are invalid characters in the names of some winners and some losers

#### winner_hand and loser_hand

In [None]:
# Count the winner_hand values outside the accepted codes.
# Nulls are replaced by 'U' and the comparison ignores case.
hand = ['R','L','U']
(~df_tennis['winner_hand'].fillna('U').str.upper().isin(hand)).sum()

In [None]:
# Same check for the loser: nulls are replaced by 'U' and the comparison ignores case.
(~df_tennis['loser_hand'].fillna('U').str.upper().isin(hand)).sum()

There are no invalid entries for the hand of the winner or of the loser.

#### winner_ioc and loser_ioc, International Olympic Code validity check

In [None]:
df_countrycode = pd.read_csv(DATASET_DIR + 'country-codes_csv.csv', sep=',', index_col=False) 

##### Wrong codes winner_ioc

In [None]:
# Boolean series indexed by the winner codes: True where the code is missing from the IOC list.
w_check_cc = pd.Series(
    data=~df_tennis.winner_ioc.isin(df_countrycode.IOC).values,
    index=df_tennis.winner_ioc.values,
)
# Keep only the codes flagged as wrong and count how often each one occurs.
w_check = w_check_cc.index[w_check_cc.values]
w_check.value_counts()

##### Wrong codes loser_ioc

In [None]:
# Boolean series indexed by the loser codes: True where the code is missing from the IOC list.
l_check_cc = pd.Series(
    data=~df_tennis.loser_ioc.isin(df_countrycode.IOC).values,
    index=df_tennis.loser_ioc.values,
)
# Keep only the codes flagged as wrong and count how often each one occurs.
l_check = l_check_cc.index[l_check_cc.values]
l_check.value_counts()

We can check whether the codes flagged as incorrect IOC codes were mistakenly written in ISO 3166-1 alpha-3 format.

In [None]:
# For every winner code missing from the IOC list, report whether that exact code
# is present among the ISO 3166-1 alpha-3 codes.
iso_alpha3 = df_countrycode["ISO3166-1-Alpha-3"]
for c in w_check.unique():
    # Exact comparison with the current code (not with a fixed code, and not a substring match).
    exist = iso_alpha3.eq(c).any()
    # str(c) keeps the report working when the code is null (NaN).
    print(str(c) + " " + str(exist))

In [None]:
# For every loser code missing from the IOC list, report whether that exact code
# is present among the ISO 3166-1 alpha-3 codes.
iso_alpha3 = df_countrycode["ISO3166-1-Alpha-3"]
for c in l_check.unique():
    # Exact comparison with the current code (not with a fixed code, and not a substring match).
    exist = iso_alpha3.eq(c).any()
    # str(c) keeps the report working when the code is null (NaN).
    print(str(c) + " " + str(exist))

**best_of**

In [None]:
#check if there are different values form 3 or 5
df_tennis['best_of'].value_counts(dropna = False)

There are no values other than 3 and 5, apart from some null values.

#### score
https://www.wikihow.it/Tenere-il-Punteggio-a-Tennis

If the **match** is best of 3, a player must win 2 sets to win the match. If instead it is best of 5, the player must win 3 sets.

Every set is composed of **games**. The winner of a set is the first player to win 6 games with at least 2 games of advantage (for example 6-4, 6-3, ..., but not 6-5).

In the case of 6-5, the leading player wins the set if they win the following game (7-5).

In case of 6-6 the **Tie-Break** is played. The Tie-Break is won by the first player to reach 7 points with an advantage of at least 2 (so, for example, 7-5, 7-4, ...). If both players reach 6 points, the winner is the first to get 2 points ahead of the opponent (for example 8-6, 9-7, 10-8, ...).

In [None]:
# The score validity checks only consider the matches whose score is not null.
df_tennis_score = df_tennis[df_tennis['score'].notna()]

Walkover ("WO" or "w/o")- Unopposed victory. A walkover is awarded when the opponent fails to start the match for any reason, such as injury.

Retirement ("ret") - Player's withdrawal during a match, causing the player to forfeit the tournament. Usually this happens due to injury

Default :def - Disqualification of a player in a match by the chair umpire after the player has received four code violation warnings, generally for their conduct on court. A default can occur with less than four code violations warnings if the code violation is judged severe enough to warrant it. A double default occurs when both players are disqualified. Defaults also occur when a player misses a match with no valid excuse. Defaults are considered losses.

Bye :bye - Automatic advancement of a player to the next round of a tournament without facing an opponent. Byes are often awarded in the first round to the top-seeded players in a tournament

In [None]:
def Retirement(s):
    """Return True when the score token ``s`` is a retirement marker.

    The comparison ignores case and accepts exactly "ret", "ret.",
    "retirement" and "retirement.".

    The token is compared against a set of whole spellings: a substring test
    against "retirement." would also accept unrelated fragments such as "e",
    "men" or ".".
    """
    return s.lower() in {"ret", "ret.", "retirement", "retirement."}
    
def Walkover(s):
    """Return True when the score token ``s`` is a walkover marker.

    The comparison ignores case and accepts exactly "w/o", "wo" and "walkover".

    The token is compared against a set of whole spellings: a substring test
    against "walkover" would also accept unrelated fragments such as "w",
    "over" or "alk".
    """
    return s.lower() in {"w/o", "wo", "walkover"}
    
def Default(s):
    """Return True when the score token ``s`` is a default (disqualification) marker.

    The comparison ignores case and accepts exactly "def", "def.",
    "default" and "default.".

    The token is compared against a set of whole spellings: a substring test
    against "default." would also accept unrelated fragments such as "a",
    "fault" or ".".
    """
    return s.lower() in {"def", "def.", "default", "default."}
    
def Bye(s):
    """Return True when the score token ``s`` is a bye marker.

    The comparison ignores case and accepts exactly "bye" and "bye.".

    The token is compared against a set of whole spellings: a substring test
    against "bye." would also accept unrelated fragments such as "b", "ye"
    or ".".
    """
    return s.lower() in {"bye", "bye."}

Checking on best_of = 3

In [None]:
#analysis on the best of 3

#matches declared best of 3 whose score has too many sets (probably best of 5)
best_5 = 0
#errors: score with fewer than 2 sets and no valid reason
count_less_2 = 0

#number of walkovers
walkover = 0
#errors: the whole score is a retirement marker (RET used instead of WO)
wrong_walkover = 0
#number of defaults
default = 0
#number of byes
bye = 0

#number of sets with an invalid result
invalid_set = 0

for match in df_tennis_score[df_tennis_score['best_of'] == 3]['score']:
    sets = match.split()
    #a score made of a single token can be one of the special markers
    if len(sets)==1 and Walkover(sets[0]):
        walkover+=1
        continue
    if len(sets)==1 and Retirement(sets[0]):
        wrong_walkover+=1
        continue
    if len(sets)==1 and Default(sets[0]):
        default+=1
        continue
    if len(sets)==1 and Bye(sets[0]):
        bye+=1
        continue
    if len(sets)<2:
        count_less_2+=1
        continue
    #we cannot have more than 3 sets unless there is a "ret" or "def" at the end
    #probably these are best of 5 and not best of 3
    if (len(sets)==4 and not Retirement(sets[3]) and not Default(sets[3])) or \
        (len(sets)==5 and not Retirement(sets[3]) and not Retirement(sets[4]) and \
        not Default(sets[3]) and not Default(sets[4])) or len(sets)>5:
        best_5+=1
        continue

    #here we have a best of 3 (the length can be > 3 because of "ret" and "def")
    #if len > 3 the extra tokens are "ret" or "def", which are skipped when checking the scores

    #each set x-y must be 7-6 (tie-break), 7-5, or 6-y with y <= 4
    #the loop variable is named set_score so that the builtin `set` is not overwritten
    for set_score in sets:
        if not Retirement(set_score) and not Default(set_score):
            if "[" in set_score or "]" in set_score:
                set_score = set_score.replace("[", "")
                set_score = set_score.replace("]", "")
            if "(" in set_score or ")" in set_score:
                #set with the tie-break points in brackets: the games must be 7-6 or 6-7
                points = set_score[0:set_score.index("(")].split("-")
                if "6" not in points or "7" not in points:
                    print(sets)
                    invalid_set+=1
            else:
                if set_score.strip()=="6-6":
                    invalid_set+=1
                else:
                    #NOTE(review): int() raises ValueError when a token is not of the form x-y
                    points = set_score.split("-")
                    x = int(points[0])
                    y = int(points[1])
                    if max(x,y) == 7:
                        #without tie-break a set ends at 7 only as 7-5
                        if min(x,y)!=5:
                            invalid_set+=1
                    elif max(x,y)!= 6:
                        invalid_set+=1
                    elif max(x,y)==6:
                        #a set ends at 6 only with at least 2 games of advantage
                        if min(x,y)>4:
                            invalid_set+=1



    #TODO: if len == 2 then the comparison x>y must be always true or always false for every set x-y; moreover |x-y|>2 except for 6-7


    #TODO: if len == 3 (or more) then the comparison x>y must be true twice and false once (or vice versa)
    #moreover |x-y|>2 except for 6-7
    
    

Checking on best_of = 5

In [None]:
# Summary of the counters collected while scanning the best-of-3 scores.
# best_5 and count_less_2 count sets per match, so the labels say "sets".
print('walkover', walkover)
print('wrong_walkover', wrong_walkover)
print('default', default)
print('bye', bye)
print('more than 3 sets: probably best of 5:', best_5)
print('errors: less than 2 sets', count_less_2)

In [None]:
print('Invalid results for set', invalid_set)

#### match_num

In [None]:
df_tennis['match_num'].isnull().sum()

#### winner_id and loser_id

### Numeric data in tennis dataset

In [None]:
# Print the distinct values of every float64 column of the matches dataset.
float_columns = df_tennis.select_dtypes(include=['float64']).columns
for column in float_columns:
    print("Distinct Values in "+str(column)+": \n", df_tennis[column].unique(), "\n")

#### draw_size

In [None]:
# Count the rows with an invalid draw size (negative or less than 2); nulls are not counted.
(df_tennis['draw_size'] < 2).sum()

All the numbers are valid (we consider only the numbers and not the nulls)

**tourney_date**

In [None]:
#the date are in float so they need to be converted in date object 
df_tennis['tourney_date'].isnull().sum()

In [None]:
#check whether there are dates later than today
import datetime
# The dates are stored as numbers in YYYYMMDD form: convert them to datetime.
df_tennis['tourney_date'] = pd.to_datetime(df_tennis['tourney_date'], format='%Y%m%d')
today = pd.to_datetime(datetime.date.today())
# Vectorised comparison: missing dates (NaT) compare as False, so they are not counted.
# No loop variable named `date` is used, so the `date` class imported at the top of the
# notebook (and used by the tourney_id year checks) is not overwritten.
invalid_data = int((df_tennis['tourney_date'] > today).sum())
print(invalid_data)

There aren't invalid dates

#### winner_ht and loser_ht

#### winner_age and loser_age

#### w_df

#### minutes

In [None]:
# Number of matches whose duration is zero or negative (null durations are not counted).
(df_tennis['minutes'] <= 0).sum()

In [None]:
df_tennis['minutes'].mean()

There are 128 entries with a match duration equal to 0. A tennis match lasts about 40 minutes on average, while the mean in our data is 97.67 minutes.

In [None]:
#TO DO:
#check that the matches with 0 minutes are BYE and WALKOVER matches (in those cases the match is not played)

#### w_ace, w_df and w_svpt

#### w_1stIn

#### w_1stWon and w_2ndWon

#### w_SvGms,  w_bpSaved and w_bpFaced

#### l_ace, l_df and l_svpt

#### l_1stIn

## Missing values: default value

## Correlation

In [None]:
# Keep only the float64 columns of the matches dataset for the correlation analysis.
df_numeric = df_tennis.select_dtypes(include=['float64'])

In [None]:
#start with a filter on the correlation of |0.90|

In [None]:
df_numeric.corr()

TODO: add the reasoning about the correlation.

## Statistics/distributions

In [None]:
#some statistics
#df_tennis.describe()