In [None]:
import math
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

import collections
from scipy.stats.stats import pearsonr
import pandas as pd
import os
from datetime import date

In [None]:
# Load the datasets.
# DATASET_DIR keeps a trailing path separator so file names can be appended to it directly.
DATASET_DIR = os.path.join("dataset", "")

# Matches: the first CSV column becomes the row index.
df_tennis = pd.read_csv(DATASET_DIR + 'tennis_matches.csv', index_col=0)

# Players: index_col=False tells pandas not to use the first column as the index.
df_male, df_female = (
    pd.read_csv(DATASET_DIR + players_csv, index_col=False)
    for players_csv in ('male_players.csv', 'female_players.csv')
)

## Print some records of the datasets

In [None]:
# Preview the first rows of the matches dataframe.
df_tennis.head()

In [None]:
# Preview the first rows of the male players dataframe.
df_male.head()

In [None]:
# Preview the first rows of the female players dataframe.
df_female.head()

## Missing values: Null

In [None]:
# Summary of the male players dataframe: number of entries, columns,
# non-null count per column and dtypes.
df_male.info()

In [None]:
# Number of missing values in each column of the male players dataframe.
df_male.isna().sum()

So, the male dataframe has 2 columns: name and surname. It has 55208 entries and about 200 null values in total.

In [None]:
# Summary of the female players dataframe: number of entries, columns,
# non-null count per column and dtypes.
df_female.info()

In [None]:
# Number of missing values in each column of the female players dataframe.
df_female.isna().sum()

So, the female dataframe has 2 columns: name and surname. It has 46172 entries and 1667 null values in the name column, while the surname is never null.

In [None]:
# Summary of the matches dataframe: number of entries, columns,
# non-null count per column and dtypes.
df_tennis.info()

In [None]:
# For each attribute, tell whether it contains at least one missing value.
df_tennis.isna().any(axis=0)

In [None]:
# Since all the attributes have missing values, count them per attribute.
df_tennis.isna().sum()

We can notice that some attributes have very few null values, while other attributes have more than 50% null values.

## Duplicate data

In [None]:
# Count the fully duplicated rows in the male dataset
# (every repetition after the first occurrence is counted).
df_male.duplicated().sum()

In [None]:
# Count the fully duplicated rows in the female dataset
# (every repetition after the first occurrence is counted).
df_female.duplicated().sum()

So, both the male and the female datasets have rows with the same name and surname. These rows can be duplicates (they correspond to the same person) or homonyms (different people with the same name). In the latter case we cannot distinguish the matches of one player from those of the other.

In [None]:
# Remove only the rows that exactly repeat an earlier row (the first occurrence is kept).
df_male, df_female = (
    players.drop_duplicates(keep='first') for players in (df_male, df_female)
)

In [None]:
# See whether a name can be both male and female: stack the two player
# dataframes and show the rows that repeat a row seen earlier in the stack.
df_players = pd.concat([df_male, df_female], axis=0)
df_players[df_players.duplicated()]

In [None]:
# Number of rows of the stacked players dataframe that repeat an earlier row.
df_players.duplicated().sum()

So, there are 74 names that appear in both the male and the female player datasets.

In [None]:
# Rows whose (name, surname) pair already appeared earlier in the stacked players dataframe.
repeated_name = df_players.duplicated(subset=['name', 'surname'])
df_players[repeated_name]

In [None]:
# Count the fully duplicated rows in the matches dataset
# (every repetition after the first occurrence is counted).
df_tennis.duplicated().sum()

In [None]:
# Duplicated matches carry the same information, so keep only the first occurrence of each.
df_tennis = df_tennis.drop_duplicates(keep='first')

## Noise, outliers and data quality (data visualization here)

### Invalid characters in names and surnames 

In [None]:
# Check that all names and surnames are valid (only ASCII letters and spaces).
# A surname is flagged when it contains at least one character outside [a-zA-Z ];
# na=True makes missing surnames show up as invalid as well.
df_male[df_male['surname'].str.contains('[^a-zA-Z ]', na=True)]

In [None]:
# Male players whose name contains at least one character outside [a-zA-Z ];
# na=True makes missing names show up as invalid as well.
df_male[df_male['name'].str.contains('[^a-zA-Z ]', na=True)]

In [None]:
# Female players whose surname contains at least one character outside [a-zA-Z ];
# na=True makes missing surnames show up as invalid as well.
df_female[df_female['surname'].str.contains('[^a-zA-Z ]', na=True)]

In [None]:
# Female players whose name contains at least one character outside [a-zA-Z ];
# na=True makes missing names show up as invalid as well.
df_female[df_female['name'].str.contains('[^a-zA-Z ]', na=True)]

So, in the male and female datasets we have some invalid names and surnames because of numbers, special characters or null values.

NOTE: the row "40071 	Jason "Jj",Belan 	NaN" is a parsing error: Belan is the surname, but pandas did not split the name from the surname (presumably because of the quotes inside the name).

### Categorical data in tennis dataset

In [None]:
# Count how many columns of the matches dataframe have each dtype.
df_tennis.dtypes.value_counts()

In [None]:
# Print the distinct values found in every column of dtype object.
object_columns = df_tennis.select_dtypes(include=['object'])
for column, values in object_columns.items():
    print("Distinct Values in "+str(column)+": \n", values.unique(), "\n")

#### tourney_id

In [None]:
# For non-null tourney_id values the first 4 characters should be the year:
# count the rows whose first 4 characters are not numeric.
year_prefix = df_tennis['tourney_id'].str[:4]
# Null ids give NaN here, and NaN == False is False, so they are not counted.
len(df_tennis[year_prefix.str.isnumeric() == False])

In [None]:
# Check whether some years lie in the future (and are therefore invalid); nulls are ignored.
current_year = date.today().year
tourney_years = pd.to_numeric(df_tennis['tourney_id'].str[:4])
# Nulls are replaced by 0 so they can never compare as being in the future.
in_future = tourney_years.fillna(0).astype('int') > current_year
len(df_tennis[in_future])

In [None]:
# Check whether some years are invalid because they lie too far in the past; nulls are ignored.
tourney_years = pd.to_numeric(df_tennis['tourney_id'].str[:4])
# Nulls are replaced by the current year so they can never fall below the 1874 lower bound.
too_old = tourney_years.fillna(date.today().year).astype('int') < 1874
len(df_tennis[too_old])

So, the first 4 characters of the tourney_id are always a valid year when the tourney_id is not null.

In [None]:
# Check that every tourney has more than one match (every id appears more than once):
# count the rows whose tourney_id occurs exactly once.
appears_once = ~df_tennis['tourney_id'].duplicated(keep=False)
len(df_tennis[appears_once])

So, every tourney_id appears more than once.

#### tourney_name

In [None]:
#check that for the same tourney_id we always have the same name: USEFUL?
#len(df_tennis.groupby(['tourney_id','tourney_name']).size())-len(df_tennis.groupby(['tourney_id']).size())

#### tourney_level

In [None]:
#there are no records for S and T1, and for E, J, T (but for these 3 they will appear (see pdf))
#there are, in the data, O and W that are not in the pdf

#apart from this, at most I would add a check like the one done for the hand

#### winner_name and loser_name

In [None]:
# in the ranges we should consider also [-.']???

In [None]:
# Check that names are valid (only ASCII letters and spaces).
# A winner name is flagged when it contains at least one character outside [a-zA-Z ];
# na=True makes missing names show up as invalid as well.
# .loc selects rows and column in one step instead of chaining two [] lookups.
df_tennis.loc[df_tennis['winner_name'].str.contains('[^a-zA-Z ]', na=True), 'winner_name']

In [None]:
# Loser names containing at least one character outside [a-zA-Z ];
# na=True makes missing names show up as invalid as well.
# .loc selects rows and column in one step instead of chaining two [] lookups.
df_tennis.loc[df_tennis['loser_name'].str.contains('[^a-zA-Z ]', na=True), 'loser_name']

There are invalid characters in the names of some winners and some losers

#### winner_hand and loser_hand

In [None]:
# Check that no invalid hand code is reported for winners.
# Nulls are replaced by 'U', so they count as valid and are ignored.
hand = ['R', 'L', 'U']
winner_hand_ok = df_tennis['winner_hand'].fillna('U').str.upper().isin(hand)
len(df_tennis[~winner_hand_ok])

In [None]:
# Same check for losers: count the rows whose hand code is not in `hand`.
# Nulls are replaced by 'U', so they count as valid and are ignored.
loser_hand_ok = df_tennis['loser_hand'].fillna('U').str.upper().isin(hand)
len(df_tennis[~loser_hand_ok])

There are no invalid entries for the hand of the winner or the loser.

#### winner_ioc and loser_ioc

In [None]:
# Load the country-code reference table (index_col=False: do not use the
# first column as the index) and preview its first rows.
country_codes_path = DATASET_DIR + 'country-codes_csv.csv'
df_countrycode = pd.read_csv(country_codes_path, index_col=False)
df_countrycode.head()

In [None]:
# Flag every match whose winner_ioc is not among the IOC codes of the country-code table.
# check_cc is a boolean Series indexed by the winner_ioc value itself (True = code not found).
ioc_not_found = ~df_tennis['winner_ioc'].isin(df_countrycode['IOC'])
check_cc = pd.Series(ioc_not_found.values, index=df_tennis['winner_ioc'].values)
# Count how many times each unrecognised code occurs.
check_cc[check_cc].index.value_counts()

In [None]:
#I would also add the check using the other code that you had found @reny, to get an idea of how many of
#these are actually wrong and how many are just in the wrong format

#### score

In [None]:
#What are the numbers between the brackets ()???

In [None]:
#a tennis match must end with a lead of at least 2 points (e.g. 7-5, 6-4). at most we could check this,
#but I am not sure it is useful information. if not, we can also skip it.
#it seems that the scores with () do not respect this rule.

### Numeric data

In [None]:
# Print the distinct values found in every float64 column of the matches dataframe.
# The column selection is computed once and reused by the loop.
float_columns = df_tennis.select_dtypes(include=['float64']).columns
for column in float_columns:
    print("Distinct Values in "+str(column)+": \n", df_tennis[column].unique(), "\n")

## Missing values: default value

## Correlation

In [None]:
# Keep only the float64 columns of the matches dataframe for the correlation analysis.
df_numeric = df_tennis.select_dtypes(include=['float64'])

In [None]:
# Pairwise Pearson correlation between the numeric attributes.
df_numeric.corr(method='pearson')

TODO: add the reasoning about the correlation results.

## Statistics/distributions

In [None]:
#some statistics
#df_tennis.describe()all