In [1]:
import io
import json

import numpy as np
import pandas as pd
from toolz.curried import *
import missingno
from matplotlib import pyplot as plt

In [6]:
def load_ranking_df():
    """Read the rankings CSV and return it without its leftover column and row.

    The file ``data/rankings.csv`` is read with pandas, the ``'Unnamed: 0'``
    column is removed, and the row whose index label is ``0`` is dropped.

    Returns:
        pandas.DataFrame: the remaining ranking rows, keeping the index labels
        assigned by ``read_csv``.
    """
    raw_rankings = pd.read_csv('data/rankings.csv')
    # Column first, then the row labelled 0 -- same order as the two-step original.
    return raw_rankings.drop(columns=['Unnamed: 0']).drop(0)

def load_athlete_df():
    """Load and clean the athlete table stored in ``data/athlete_data.csv``.

    Cleaning steps, in order:

    1. Index the frame by the ``id`` column and rename that index to ``ID``.
    2. Drop the ``age`` column.
    3. Set every column to NaN on each row whose ``birth_year`` is present
       but is not one of the integers 1910..2016. Rows whose ``birth_year``
       is missing are left untouched.
    4. Remove the last 3 characters of ``height`` and the last 2 characters
       of ``weight`` (presumably unit suffixes -- confirm against the raw
       file) and convert both columns to float.
    5. Rename ``first_name``/``last_name`` to ``First Name``/``Last Name``.

    Returns:
        pandas.DataFrame: the cleaned athletes, indexed by ``ID``.
    """
    # Original author's note: "When loading, you may want to change the years
    # to ints" -- the reason was not recorded.
    filename = 'data/athlete_data.csv'
    clean_athletes = (
        pd.read_csv(filename)
        .set_index('id')
        .rename_axis('ID')
        .drop(columns='age')
    )

    # Accepted birth years. np.arange excludes its stop value, so the range is
    # 1910..2016 inclusive. NOTE(review): the bounds look like "at most 80
    # years old in 1990, at least 3 years old in 2020" -- confirm.
    valid_birth_years = np.arange(1990 - 80, 2020 - 3)
    birth_year = clean_athletes['birth_year']
    # Select implausible years directly instead of slicing NaN off the front
    # of a unique() array: that only worked when NaN happened to be the first
    # out-of-range value, and otherwise blanked the wrong rows.
    bad_ages_mask = birth_year.notna() & ~birth_year.isin(valid_birth_years)
    clean_athletes.loc[bad_ages_mask] = np.nan

    clean_athletes['height'] = clean_athletes['height'].str.slice(0, -3).astype(float)
    clean_athletes['weight'] = clean_athletes['weight'].str.slice(0, -2).astype(float)
    clean_athletes = clean_athletes.rename(
        columns={'first_name': 'First Name', 'last_name': 'Last Name'}
    )

    # Disabled filter kept for reference: get rid of really short and light
    # athletes (probably kids).
    # clean_athletes.loc[clean_athletes['height'] < 100] = np.nan
    # clean_athletes.loc[clean_athletes['weight'] < 20] = np.nan

    return clean_athletes

events = None  # Module-level cache; filled by load_event_df() on its first call.

def load_event_df():
    """Return the athlete-by-competition results table, loading it once.

    On the first call the file ``data/athlete_comps.json`` is read. It is
    expected to map each athlete id to a list of records that each have a
    ``'comp'`` and a ``'result'`` key. Every athlete's list is collapsed into
    one ``{comp: result}`` mapping (a later record wins when a competition
    name repeats) and the outer mapping is parsed by ``pd.read_json`` with
    ``orient='index'``, i.e. one row per athlete and one column per
    competition.

    The resulting frame is stored in the module-level ``events`` variable and
    the same object is returned by later calls, so callers that want to
    modify it should work on a copy.

    Returns:
        pandas.DataFrame: results indexed by athlete id.
    """
    global events
    # Compare against None explicitly: truth-testing a DataFrame raises
    # ValueError, so `not events` would fail once the cache is filled.
    if events is None:
        with open('data/athlete_comps.json') as f:
            data = json.load(f)
        # A plain dict comprehension also yields {} for an athlete with no
        # competitions, which a curried merge() called with no arguments does not.
        athlete_comp_result = {
            athlete_id: {comp['comp']: comp['result'] for comp in comps}
            for athlete_id, comps in data.items()
        }
        # read_json expects a path or file-like object; wrap the JSON text
        # instead of passing it as a literal string.
        events = pd.read_json(io.StringIO(json.dumps(athlete_comp_result)), orient='index')
    return events

In [13]:
# Load the cleaned rankings and print the first and last ten rows.
rankings = load_ranking_df()
print(rankings.head(10))
print()
print(rankings.tail(10))

       ID  Rank    Last Name First Name  Points Event Gender  Year
1    8372     1        Ondra       Adam   300.0  lead    MEN  2019
2   56609     2  Ginés López    Alberto   256.0  lead    MEN  2019
3    5089     3       McColl       Sean   206.0  lead    MEN  2019
4   14023     4       Harada        Kai   195.0  lead    MEN  2019
5    8323     5     Ghisolfi    Stefano   190.0  lead    MEN  2019
6   11392     6      Shimizu     Hiroto   180.0  lead    MEN  2019
7   10172     7        Fujii     Kokoro   165.0  lead    MEN  2019
8    9058     8        Megos  Alexander   165.0  lead    MEN  2019
9   10775     9       Bailey       Sean   158.0  lead    MEN  2019
10  10951    10      Lehmann     Sascha   152.0  lead    MEN  2019

        ID  Rank       Last Name First Name  Points Event Gender  Year
8622  1047    73            Dunn    Claudie     3.0  lead  WOMEN  1991
8623  1324    74          Farmer     Rachel     3.0  lead  WOMEN  1991
8624  1095    75         Koliada      Elena     3

In [15]:
# Load the cleaned athlete table; the bare name on the last line makes the
# notebook display the frame as the cell's output.
athletes = load_athlete_df()
athletes

Unnamed: 0_level_0,First Name,Last Name,country,birth_year,height,weight
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
51001,Moritz,Simmet,GER,1999.0,,
51002,Martin,Eisensteger,GER,1997.0,,
51004,Vinzenz,Kreuzer,GER,1996.0,,
51005,Joschua,Gosda,GER,1995.0,,
51007,Lukas,Achermann,SUI,2003.0,176.0,66.0
...,...,...,...,...,...,...
61870,Kevin,Mattiazzo,SUI,2004.0,152.0,36.0
61874,Maja,Scharnweber,GER,2008.0,,
61982,Sten Liam,Lengsfeld,SUI,2002.0,,
61994,Paula,Kaan,GER,2003.0,,
