In [2]:
import math
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

import collections
from scipy.stats.stats import pearsonr
import pandas as pd
import os

In [3]:
# Load the raw CSV files from the dataset folder.
# DATASET_DIR is the folder name followed by the OS path separator, so file
# names can be appended to it directly.
DATASET_DIR = os.path.join("dataset", "")

# The matches file uses its first column as the row index.
df_tennis = pd.read_csv(f"{DATASET_DIR}tennis_matches.csv", sep=',', index_col=0)

# index_col=False tells pandas not to use the first column as the index.
df_male, df_female = (
    pd.read_csv(DATASET_DIR + players_file, sep=',', index_col=False)
    for players_file in ('male_players.csv', 'female_players.csv')
)

## Print some records of the datasets

In [4]:
# Preview the first rows of the matches dataframe.
df_tennis.head()

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_entry,winner_name,...,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,tourney_spectators,tourney_revenue
0,2019-M020,Brisbane,Hard,32.0,A,20181231.0,300.0,105453.0,,Kei Nishikori,...,20.0,14.0,10.0,15.0,9.0,3590.0,16.0,1977.0,3928.0,742618.69
1,2019-M020,Brisbane,Hard,32.0,A,20181231.0,299.0,106421.0,,Daniil Medvedev,...,7.0,10.0,10.0,13.0,16.0,1977.0,239.0,200.0,3928.0,742618.69
2,2019-M020,Brisbane,Hard,32.0,A,20181231.0,298.0,105453.0,,Kei Nishikori,...,6.0,8.0,1.0,5.0,9.0,3590.0,40.0,1050.0,3928.0,742618.69
3,2019-M020,Brisbane,Hard,32.0,A,20181231.0,297.0,104542.0,PR,Jo-Wilfried Tsonga,...,9.0,11.0,4.0,6.0,239.0,200.0,31.0,1298.0,3928.0,742618.69
4,2019-M020,Brisbane,Hard,32.0,A,20181231.0,296.0,106421.0,,Daniil Medvedev,...,19.0,15.0,2.0,4.0,16.0,1977.0,18.0,1855.0,3928.0,742618.69


In [5]:
# Preview the first rows of the male players dataframe.
df_male.head()

Unnamed: 0,name,surname
0,Gardnar,Mulloy
1,Pancho,Segura
2,Frank,Sedgman
3,Giuseppe,Merlo
4,Richard Pancho,Gonzales


In [6]:
# Preview the first rows of the female players dataframe.
df_female.head()

Unnamed: 0,name,surname
0,Bobby,Riggs
1,X,X
2,Martina,Hingis
3,Mirjana,Lucic
4,Justine,Henin


## Missing values: Null

In [None]:
# Structural summary of the male players dataframe: columns, non-null counts,
# dtypes and memory usage, as printed by DataFrame.info().
df_male.info()

In [None]:
# Count of missing values in each column of the male players dataframe.
df_male.isna().sum()

So, the male dataframe has 2 columns: name and surname. It has 55208 entries and about 200 null values in total.

In [None]:
# Structural summary of the female players dataframe: columns, non-null counts,
# dtypes and memory usage, as printed by DataFrame.info().
df_female.info()

In [None]:
# Count of missing values in each column of the female players dataframe.
df_female.isna().sum()

So, the female dataframe has 2 columns: name and surname. It has 46172 entries and 1667 null values in the name column, but the surname is never null.

In [None]:
# Structural summary of the matches dataframe: columns, non-null counts,
# dtypes and memory usage, as printed by DataFrame.info().
df_tennis.info()

In [None]:
# For each attribute, report whether it contains at least one missing value.
df_tennis.isna().any(axis=0)

In [None]:
# Since all the attributes have missing values, count them per attribute.
df_tennis.isna().sum()

We can notice that some attributes have very few null values, while other attributes have more than 50% null values.

## Duplicate data

In [None]:
# Number of rows in the male dataset that repeat an earlier row
# (the first occurrence of each row is not counted).
df_male.duplicated().sum()

In [None]:
# Number of rows in the female dataset that repeat an earlier row
# (the first occurrence of each row is not counted).
df_female.duplicated().sum()

So, both the male and the female datasets have rows with the same name and surname. These rows can be true duplicates (they correspond to the same person) or homonyms (different people sharing a name). In the latter case we cannot distinguish the matches of one player from those of the other.

In [None]:
# Keep only the first occurrence of each fully identical row in both player lists.
df_male = df_male[~df_male.duplicated()]
df_female = df_female[~df_female.duplicated()]

In [None]:
# Stack the male and female player lists and show the rows that repeat an
# earlier row of the combined frame, i.e. names found more than once after stacking.
df_players = pd.concat([df_male, df_female])
df_players[df_players.duplicated()]

In [None]:
# Number of rows in the combined player list that repeat an earlier row.
df_players.duplicated().sum()

So, 74 (name, surname) pairs appear in both the male and the female datasets.

In [None]:
# Rows of the combined player list whose (name, surname) pair already appeared earlier.
df_players[df_players.duplicated(subset=['name', 'surname'])]

In [None]:
# Number of match rows that are exact copies of an earlier row.
df_tennis.duplicated().sum()

In [None]:
# Exact copies carry no extra information: keep only the first occurrence of each row.
df_tennis = df_tennis[~df_tennis.duplicated()]

## Noise, outliers and data quality (data visualization here)

### Invalid characters in names and surnames 

In [None]:
# Male rows whose surname is not made only of ASCII letters and spaces: the
# number of allowed characters differs from the string length. Missing surnames
# are returned too, because NaN never compares equal to NaN.
df_male[df_male['surname'].str.count('[a-zA-Z ]').ne(df_male['surname'].str.len())]

In [None]:
# Male rows whose name is not made only of ASCII letters and spaces
# (missing names are returned too, because NaN never compares equal to NaN).
df_male[df_male['name'].str.count('[a-zA-Z ]').ne(df_male['name'].str.len())]

In [None]:
# Female rows whose surname is not made only of ASCII letters and spaces
# (missing surnames are returned too, because NaN never compares equal to NaN).
df_female[df_female['surname'].str.count('[a-zA-Z ]').ne(df_female['surname'].str.len())]

In [None]:
# Female rows whose name is not made only of ASCII letters and spaces
# (missing names are returned too, because NaN never compares equal to NaN).
df_female[df_female['name'].str.count('[a-zA-Z ]').ne(df_female['name'].str.len())]

So, in the male and female datasets we have some invalid names and surnames because of numbers, special characters or null values.

NOTE: "40071 	Jason "Jj",Belan 	NaN" is a parsing error: Belan is the surname, but pandas has not split the name from the surname.

### Categorical data (tennis dataset)

In [None]:
# Number of columns of the matches dataframe for each dtype.
df_tennis.dtypes.value_counts()

In [None]:
# Print the distinct values of every column stored with the object dtype.
for column, column_dtype in df_tennis.dtypes.items():
    if column_dtype != "object":
        continue
    print("Distinct Values in "+str(column)+": \n", df_tennis[column].unique(), "\n")

### Numeric data

## Missing values: default value

## Correlation

## Statistics/distributions

In [None]:
#some statistics
#df_tennis.describe(include='all')

## Test check wikipedia-iso-country-codes

In [9]:
# Load the country-code reference table (without using its first column as the
# index) and preview its first rows.
country_codes_path = DATASET_DIR + 'country-codes_csv.csv'
df_countrycode = pd.read_csv(country_codes_path, sep=',', index_col=False)
df_countrycode.head()

Unnamed: 0,FIFA,Dial,ISO3166-1-Alpha-3,MARC,is_independent,ISO3166-1-numeric,GAUL,FIPS,WMO,ISO3166-1-Alpha-2,...,Sub-region Name,official_name_ru,Global Name,Capital,Continent,TLD,Languages,Geoname ID,CLDR display name,EDGAR
0,TPE,886,TWN,ch,Yes,158.0,925,TW,,TW,...,,,,Taipei,AS,.tw,"zh-TW,zh,nan,hak",1668284.0,Taiwan,
1,AFG,93,AFG,af,Yes,4.0,1,AF,AF,AF,...,Southern Asia,Афганистан,World,Kabul,AS,.af,"fa-AF,ps,uz-AF,tk",1149361.0,Afghanistan,B2
2,ALB,355,ALB,aa,Yes,8.0,3,AL,AB,AL,...,Southern Europe,Албания,World,Tirana,EU,.al,"sq,el",783754.0,Albania,B3
3,ALG,213,DZA,ae,Yes,12.0,4,AG,AL,DZ,...,Northern Africa,Алжир,World,Algiers,AF,.dz,ar-DZ,2589581.0,Algeria,B4
4,ASA,1-684,ASM,as,Territory of US,16.0,5,AQ,,AS,...,Polynesia,Американское Самоа,World,Pago Pago,OC,.as,"en-AS,sm,to",5880801.0,American Samoa,B5


In [30]:
# Flag every match whose winner_ioc code is missing from the IOC column of the
# country-code table. The Series is indexed by the code itself, so selecting the
# True entries and counting the index values gives the number of matches per
# unmatched code.
# NOTE(review): some of the unmatched codes look like ISO alpha-3 codes rather
# than IOC codes - consider also checking the ISO3166-1-Alpha-3 column; confirm.
check_cc = pd.Series(
    ~df_tennis.winner_ioc.isin(df_countrycode.IOC).values,
    index=df_tennis.winner_ioc.values,
)
check_cc[check_cc].index.value_counts()

MNE    198
TRI     21
PHL     18
UNK      8
SGP      8
DEU      8
POC      5
GRC      3
NLD      2
NGA      1
dtype: int64