# Assumptions (potential problems)
 - ath_name == '' means the race was cancelled (checked for a few events)
 - ['season', 'venue', 'event', 'date', 'gender'] uniquely identify a race
     - with this key we do not get the same number of races as https://en.wikipedia.org/wiki/FIS_Alpine_Ski_World_Cup

In [1]:
import numpy as np
import pandas as pd

In [11]:
def parseCSV(isMale=True):
    """Load one results CSV and return it as a cleaned DataFrame.

    Reads ``../data/wcm.csv`` when ``isMale`` is truthy and
    ``../data/wcf.csv`` otherwise (paths are relative to the current
    working directory).

    The returned frame has:
      * every NaN replaced by the empty string,
      * the ``date`` column parsed as datetimes using ``%Y-%m-%d``,
      * an added ``gender`` column holding ``'m'`` or ``'f'`` on every row.
    """
    if isMale:
        gender = 'm'
    else:
        gender = 'f'

    frame = pd.read_csv(f'../data/wc{gender}.csv')
    # Missing cells become '' so later code can compare against '' directly.
    frame = frame.replace(np.nan, '', regex=True)
    frame['date'] = pd.to_datetime(frame['date'], format='%Y-%m-%d')
    frame['gender'] = gender
    return frame

In [3]:
def remove_cancelled_race(df):
    """Return only the rows of ``df`` whose ``ath_name`` is not ``''``.

    The original index labels of the surviving rows are kept.
    """
    has_athlete = df['ath_name'] != ''
    return df.loc[has_athlete]

In [15]:
def number_season(df):
    """Return how many distinct values the ``season`` column holds."""
    # dropna=False keeps a missing value counted as its own entry.
    return df['season'].nunique(dropna=False)

def number_races(df):
    """Return the number of distinct races in ``df``.

    A race is one distinct combination of the ``season``, ``venue``,
    ``event``, ``date`` and ``gender`` columns.
    """
    race_key = ['season', 'venue', 'event', 'date', 'gender']
    races = df[race_key].drop_duplicates()
    return races.shape[0]

def number_athletes(df):
    """Return how many distinct values the ``ath_name`` column holds."""
    # dropna=False keeps a missing value counted as its own entry.
    return df['ath_name'].nunique(dropna=False)

def number_country(df):
    """Return how many distinct values the ``ath_country`` column holds."""
    # dropna=False keeps a missing value counted as its own entry.
    return df['ath_country'].nunique(dropna=False)

def number_races_events(df):
    """Count distinct races per event.

    Rows are first reduced to one per distinct
    (``season``, ``venue``, ``event``, ``date``, ``gender``) combination.
    The result is a DataFrame indexed by ``event`` with a single ``venue``
    column holding the number of non-null venues, i.e. the race count.
    """
    race_key = ['season', 'venue', 'event', 'date', 'gender']
    races = df[race_key].drop_duplicates()
    venue_by_event = races[['venue', 'event']]
    return venue_by_event.groupby('event').count()

In [16]:
def show_basic_stats(df):
    """Print summary counts for ``df`` followed by the per-event race table.

    Prints, in order, the number of seasons, races, athletes and countries,
    then the DataFrame returned by ``number_races_events``.
    """
    labelled_counters = (
        ('# seasons  :', number_season),
        ('# races    :', number_races),
        ('# athletes :', number_athletes),
        ('# country  :', number_country),
    )
    for label, counter in labelled_counters:
        print(label, counter(df))
    print(number_races_events(df))

In [17]:
# Load the men's and women's results, dropping rows with an empty athlete
# name (treated in this notebook as cancelled races).
dfm = remove_cancelled_race(parseCSV())
dff = remove_cancelled_race(parseCSV(False))
# Stack both frames so the same statistics can be computed over everything.
both = pd.concat([dfm, dff])
print('-----  World Cup Men -----')
show_basic_stats(dfm)

print('-----  World Cup Women -----')
show_basic_stats(dff)

# This banner used to repeat "Women" although the stats are for both genders.
print('-----  World Cup Men + Women -----')
show_basic_stats(both)

-----  World Cup Men -----
# seasons  : 54
# races    : 1797
# athletes : 1730
# country  : 69
              venue
event              
Combined        134
Downhill        495
Giant Slalom    423
Parallel         24
Slalom          504
Super G         217
-----  World Cup Women -----
# seasons  : 54
# races    : 1680
# athletes : 1407
# country  : 57
              venue
event              
Combined        106
Downhill        417
Giant Slalom    418
Parallel         23
Slalom          478
Super G         238
-----  World Cup Women -----
# seasons  : 54
# races    : 3477
# athletes : 3137
# country  : 77
              venue
event              
Combined        240
Downhill        912
Giant Slalom    841
Parallel         47
Slalom          982
Super G         455


In [14]:
# Display the women's results frame as the cell output.
dff

Unnamed: 0,season,date,venue,country,event,ath_rank,ath_name,ath_country,ath_time_run_1,ath_time_run_2,ath_time,ath_time_diff,ath_ski,ath_id,gender
0,1967,1967-01-07,Oberstaufen,GER,Slalom,1,Nancy Greene,CAN,0,0,7933,0,,nancy_greene_can_wgrnna,f
1,1967,1967-01-07,Oberstaufen,GER,Slalom,2,Fernande Bochatay,SUI,0,0,7943,10,,fernande_bochatay_sui_wbchf,f
2,1967,1967-01-07,Oberstaufen,GER,Slalom,3,Annie Famose,FRA,0,0,8069,136,,annie_famose_fra_wfmsan,f
3,1967,1967-01-07,Oberstaufen,GER,Slalom,4,Florence Steurer,FRA,0,0,8097,164,,florence_steurer_fra_wstrfl,f
4,1967,1967-01-07,Oberstaufen,GER,Slalom,5,Christine Goitschel,FRA,0,0,8114,181,,christine_goitschel_fra_wgtsc,f
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55276,2020,2020-02-29,La Thuile,ITA,Super G,AB,Fabiana Dorigo,GER,0,0,0,0,,fabiana_dorigo_ger_206793,f
55277,2020,2020-02-29,La Thuile,ITA,Super G,AB,Tiffany Gauthier,FRA,0,0,0,0,,tiffany_gauthier_fra_197383,f
55278,2020,2020-02-29,La Thuile,ITA,Super G,AB,Jasmina Suter,SUI,0,0,0,0,,jasmina_suter_sui_516394,f
55279,2020,2020-02-29,La Thuile,ITA,Super G,AB,Kira Weidle,GER,0,0,0,0,,kira_weidle_ger_206668,f
