#### Things I've Cleaned
 - Removed the extra column and row from the rankings
 - Dropped the age column because we already have birth years
 - Fixed impossible birth years
 - Stripped the unit suffixes from heights and weights and converted them to floats
 
 - NOTE: The data includes youth athletes, which might be tricky to handle
 - NOTE: We might want the names of the events

#### Things We Might Want
 - We should check that every country code maps to a real country
 - We might want a smaller dataset containing only the athletes with the most data

In [22]:
import json

import numpy as np
import pandas as pd
from toolz.curried import *
import missingno

In [8]:
def load_ranking_df(filename='../data/rankings.csv'):
    """Load the rankings table from a CSV file.

    Parameters
    ----------
    filename : str or path-like, optional
        Location of the rankings CSV. Defaults to ``'../data/rankings.csv'``,
        the path this loader has always read, so existing zero-argument
        calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pd.read_csv``. No cleaning is done
        here.
    """
    return pd.read_csv(filename)

def load_athlete_df(filename='../data/athlete_data.csv'):
    """Load the raw athlete table from a CSV file.

    Parameters
    ----------
    filename : str or path-like, optional
        Location of the athlete CSV. Defaults to
        ``'../data/athlete_data.csv'``, the path this loader has always
        read, so existing zero-argument calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pd.read_csv``. No cleaning or dtype
        conversion is done here.
    """
    # NOTE: after loading, you may want to change the year columns to ints.
    return pd.read_csv(filename)

def load_event_df(filename='../data/athlete_comps.json'):
    """Load per-athlete competition results into a wide DataFrame.

    The JSON file must map each athlete id to a list of records, each with
    a ``'comp'`` key (the competition) and a ``'result'`` key. The records
    are pivoted so the returned frame has one row per athlete and one column
    per competition.

    Parameters
    ----------
    filename : str or path-like, optional
        Location of the JSON file. Defaults to
        ``'../data/athlete_comps.json'``, the path this loader has always
        read, so existing zero-argument calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        Results keyed by athlete (rows) and competition (columns), as parsed
        by ``pd.read_json(..., orient='index')``.
    """
    # Local import keeps this function self-contained; the file's import
    # cell does not bring in ``io``.
    import io

    with open(filename) as f:
        data = json.load(f)

    # One {competition: result} dict per athlete. A plain comprehension also
    # copes with an empty competition list, where the curried toolz ``merge``
    # called with no arguments returns a partial instead of a dict. If a
    # competition appears twice, the later record wins, as it did with merge.
    athlete_comp_result = {
        athlete_id: {comp['comp']: comp['result'] for comp in comps}
        for athlete_id, comps in data.items()
    }

    # Keep the JSON round trip so read_json still applies its usual axis and
    # dtype conversion. The text is wrapped in StringIO because recent pandas
    # deprecates passing a literal JSON string.
    json_text = json.dumps(athlete_comp_result)
    return pd.read_json(io.StringIO(json_text), orient='index')

In [68]:
# Load the raw rankings, then remove the 'Unnamed: 0' column and the row
# labelled 0 (the extra column and row mentioned in the cleaning notes).
rankings = load_ranking_df()
clean_rankings = rankings.drop(columns=['Unnamed: 0']).drop(index=0)

# Optional diagnostics -- uncomment to inspect dtypes and missing values:
# print('dtypes:\n', clean_rankings.dtypes)
# print()
# print('Missing values:\n', clean_rankings.isna().sum())
# missingno.matrix(clean_rankings)

In [196]:
# Load the raw athlete table and index it by athlete id.
athletes = load_athlete_df()
clean_athletes = athletes.set_index('id')
# The age column is redundant because birth_year is available.
clean_athletes = clean_athletes.drop(columns='age')

# Birth years outside 1910..2016 (np.arange excludes its stop value) are
# treated as impossible. NaN also fails the isin() test, so the mask flags
# rows with a missing birth year too.
bad_ages_mask = ~clean_athletes['birth_year'].isin(np.arange(1990-80, 2020-3))
# The distinct impossible years, with NaN removed wherever it appears.
# Slicing off element 0 would drop a real bad year whenever NaN is absent
# or not first.
bad_ages = clean_athletes.loc[bad_ages_mask, 'birth_year'].dropna().unique()
# Blank only the birth_year cell. Assigning to .loc[mask] without a column
# would overwrite every column of the athlete's row with NaN.
clean_athletes.loc[bad_ages_mask, 'birth_year'] = np.nan

# Height: drop the last three characters (presumably a unit suffix -- TODO
# confirm against the raw data) and convert to float. Values below 100 are
# treated as invalid; only the height cell is blanked.
clean_athletes['height'] = clean_athletes['height'].str.slice(0, -3).astype(float)
clean_athletes.loc[clean_athletes['height'] < 100, 'height'] = np.nan

# Weight: drop the last two characters (presumably a unit suffix -- TODO
# confirm against the raw data) and convert to float. Values below 20 are
# treated as invalid; only the weight cell is blanked.
clean_athletes['weight'] = clean_athletes['weight'].str.slice(0, -2).astype(float)
clean_athletes.loc[clean_athletes['weight'] < 20, 'weight'] = np.nan

# List the distinct country codes so unmapped or odd ones can be spotted.
print(clean_athletes['country'].unique())

['GER' 'SUI' 'AUT' 'LUX' 'HKG' 'SWE' 'INA' 'NOR' 'CHN' 'POL' 'ESP' 'BEL'
 'CZE' 'HUN' 'USA' 'FRA' 'SVK' 'GBR' 'ITA' 'IRI' 'RUS' 'UKR' 'SLO' 'CAM'
 'AUS' 'RSA' 'ROU' 'CAN' 'CHI' 'ECU' 'IRL' 'BRA' 'KOR' 'TUR' 'JPN' 'LTU'
 'POR' 'NZL' 'GRE' 'BLR' 'THA' 'MGL' 'PAK' 'ISL' 'CRO' 'GEO' 'TPE' 'NED'
 'KAZ' 'UZB' 'MEX' 'BUL' 'FIN' 'DEN' 'IND' 'LAT' 'NC' 'GUA' 'ARG' 'SGP'
 'VEN' 'MKD' 'ISR' 'COL' nan 'KGZ' 'CYP' 'SRB' 'JOR' 'HON' 'MAS' 'LIB'
 'IRQ' 'PHI' 'AZE' 'EST' 'MAC' 'XYZ' 'AND' 'CRC' 'LKA' 'NEP' 'BRN' 'PER'
 'BIH' 'ESA' 'ALB' 'MON' 'BOL' 'HRV']


In [144]:
# Check birth year
# Change height to float
# Change weight to float?

[nan '']


Unnamed: 0,5378,5301,5300,5297,5246,5244,5243,9541,8181,8178,...,1543,1542,1434,1515,1514,1559,6246,856,836,785
51001,21.0,8.0,13.0,9.0,12.0,11.0,15.0,,,,...,,,,,,,,,,
51004,25.0,,16.0,,18.0,,20.0,,,,...,,,,,,,,,,
51012,17.0,4.0,5.0,6.0,13.0,12.0,7.0,,,,...,,,,,,,,,,
51027,14.0,6.0,,,,11.0,18.0,,,,...,,,,,,,,,,
51213,7.0,,3.0,5.0,,,12.0,,,,...,,,,,,,,,,
