# Athletes

In [None]:
import pandas as pd

In [154]:
pd.set_option('display.max_rows', None)

In [155]:
df = pd.read_csv(
    '../data/marathon.csv',
    sep=',',
    header=None,
    names=[
        'Rank',
        'Time',
        'Name',
        'Country',
        'Date of Birth',
        'Place',
        'City',
        'Date',
        'Gender',
    ]
)

In [160]:
df = df.sort_values('Time')

In [161]:
df['Date of Birth'] = pd.to_datetime(df['Date of Birth'], format='%d.%m.%y', errors='coerce')
df['Date'] = pd.to_datetime(df['Date'], format='%d.%m.%Y', errors='coerce')

In [162]:
df.head()

Unnamed: 0,Rank,Time,Name,Country,Date of Birth,Place,City,Date,Gender
0,1,2:00:35,Kelvin Kiptum,KEN,1999-12-02,1,Chicago,2023-10-08,Men
1,2,2:01:09,Eliud Kipchoge,KEN,1984-11-05,1,Berlin,2022-09-25,Men
2,3,2:01:25,Kelvin Kiptum,KEN,1999-12-02,1,London,2023-04-23,Men
3,4,2:01:39,Eliud Kipchoge,KEN,1984-11-05,1,Berlin,2018-09-16,Men
4,5,2:01:41,Kenenisa Bekele,ETH,1982-06-13,1,Berlin,2019-09-29,Men


How many entries do we have?

In [163]:
df.shape[0]

9410

How many athletes are there?

In [164]:
df['Name'].nunique()

2822

How many nationalities?

In [165]:
df['Country'].nunique()

78

What are the top nationalities?

In [166]:
df.groupby('Country')\
    .size()\
    .sort_values(ascending=False)

Country
KEN    3064
ETH    2252
JPN    1005
USA     305
RUS     266
CHN     200
ITA     163
GER     143
MAR     139
GBR     120
ESP     107
ERI     106
POR     101
AUS      95
RSA      86
POL      84
BRN      79
FRA      66
UGA      64
MEX      62
NED      60
TAN      60
ROU      59
PRK      54
KOR      53
UKR      53
BEL      51
BRA      43
NOR      36
ISR      33
CAN      30
BLR      30
NZL      22
SUI      21
SWE      21
LAT      20
NAM      19
TUR      19
IRL      17
PER      15
LTU      12
ECU      12
KGZ      11
CRO      10
FIN      10
EST      10
DJI       9
HUN       9
MGL       9
QAT       8
ALG       8
BDI       7
SLO       7
CZE       6
ZIM       5
COD       5
ARG       5
FRG       5
SRB       4
KAZ       4
COL       4
LES       3
GDR       3
DEN       3
MDA       3
BOL       2
RWA       2
CYP       1
SVK       1
PAR       1
CHI       1
PAN       1
UAE       1
BEl       1
LUX       1
AUT       1
UZB       1
MRI       1
dtype: int64

Who are the top runners?

In [167]:
df.groupby('Gender').head(5)\
    [['Name', 'Time', 'Gender']]

Unnamed: 0,Name,Time,Gender
0,Kelvin Kiptum,2:00:35,Men
1,Eliud Kipchoge,2:01:09,Men
2,Kelvin Kiptum,2:01:25,Men
3,Eliud Kipchoge,2:01:39,Men
4,Kenenisa Bekele,2:01:41,Men
4909,Tigist Assefa,2:11:53,Women
4910,Sifan Hassan,2:13:44,Women
4911,Brigid Kosgei,2:14:04,Women
4912,Ruth Chepngetich,2:14:18,Women
4913,Amane Beriso,2:14:58,Women


What is the best running time by country?

In [172]:
df.groupby(['Country'])['Time'].min()\
    .sort_values(ascending=True)

Country
KEN     2:00:35
ETH     2:01:41
TAN     2:03:00
BEL     2:03:36
TUR     2:04:16
ERI     2:04:35
BRN     2:04:43
UGA     2:04:48
BRA     2:04:51
JPN     2:04:56
NED     2:04:56
GER     2:04:58
USA    2:04:58a
SUI     2:05:10
GBR     2:05:11
MAR     2:05:12
FRA     2:05:22
ISR     2:05:33
CAN     2:05:36
NOR     2:05:48
ESP     2:06:25
RSA     2:06:33
POR     2:06:36
DJI     2:06:43
ZIM     2:06:48
BDI     2:07:13
UKR     2:07:15
ITA     2:07:16
QAT     2:07:19
MEX    2:07:19a
KOR     2:07:20
CHN     2:07:30
AUS     2:07:31
POL     2:07:39
PER     2:07:40
BOL     2:07:49
RWA     2:08:18
NZL     2:08:19
MDA     2:08:32
COD     2:08:40
NAM     2:08:40
MGL     2:08:50
EST     2:08:53
RUS     2:09:07
IRL    2:09:15a
FRG     2:09:23
ARG     2:09:36
DEN     2:09:43
LES     2:09:47
SWE     2:09:47
PAN     2:09:49
ECU     2:09:49
SVK     2:09:53
ALG     2:09:54
GDR     2:09:55
AUT     2:10:06
PAR     2:10:11
BEl     2:10:17
CYP     2:10:20
CHI     2:10:26
ROU     2:19:27
LAT     2:22:56
