## Import Dependencies

In [1]:
# importing modules
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os


## Read athlete height dataset

In [2]:
# Load the raw athlete-events CSV from the project's data folder
# (path is relative to this notebook's working directory).
athlete_events_file = os.path.join(
    "..",
    "data",
    "rawData",
    "athlete_events.csv",
)
athlete_events_df = pd.read_csv(
    athlete_events_file,
    encoding="ISO-8859-1",
)

# Preview the first five rows.
athlete_events_df.head(5)


Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


## Remove Winter Olympics rows

In [13]:
# Keep only the rows whose Season value is 'Summer'.
summer_df = athlete_events_df[athlete_events_df['Season'].eq('Summer')]

# Number of rows remaining after the filter.
summer_df.shape[0]

222552

## Find rows where athlete height is missing

In [14]:
# Summer rows whose Height value is null.
missing_df = summer_df[summer_df['Height'].isna()]

# Number of Summer rows without a height.
missing_df.shape[0]

51857

## Verify the missing-height row count

In [15]:
# Per-year count of non-null IDs among the rows that lack a height.
null_height_group = missing_df['ID'].groupby(missing_df['Year']).count()

# Summing the per-year counts gives the total number of missing-height rows
# that have an ID; compare it with the length of missing_df as a sanity check.
null_height_sum = null_height_group.sum()
null_height_sum



51857

## Get the full row count by year to compare against missing heights

In [16]:
# Total Summer rows per year (count of non-null IDs).
year_count = summer_df.groupby('Year')['ID'].count()

# Percentage of each year's rows that lack a height.
# null_height_group only contains years with at least one missing height, so
# it is first aligned to every year in year_count with a count of 0 for absent
# years. Without this, a year with no missing heights would show NaN rather
# than 0% after the index-aligned division.
missing_percent = 100 * (
    null_height_group.reindex(year_count.index, fill_value=0) / year_count
)

missing_percent

Year
1896    87.894737
1900    94.008264
1904    83.627978
1906    85.170225
1908    84.682361
1912    82.153465
1920    82.129543
1924    83.088095
1928    82.912660
1932    65.746042
1936    83.768829
1948    83.559719
1952    75.356711
1956    54.320265
1960     4.692696
1964     3.531550
1968     1.106195
1972     2.775621
1976     4.143039
1980     3.198442
1984     4.463719
1988     2.641854
1992    19.449796
1996    13.577649
2000     0.889950
2004     0.267797
2008     1.110131
2012     1.300310
2016     1.285798
Name: ID, dtype: float64

In [23]:
# Restrict to Games held in 1960 or later; the per-year percentages above show
# the share of missing heights dropping sharply from 1960 onward.
summer_clean = summer_df[summer_df['Year'].ge(1960)]

# len(summer_clean) 166,267
# Drop the remaining rows that still have no Height value.
df = summer_clean.dropna(subset=['Height'])
df.head(5)


Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
31,12,Jyri Tapani Aalto,M,31.0,172.0,70.0,Finland,FIN,2000 Summer,2000,Summer,Sydney,Badminton,Badminton Men's Singles,
32,13,Minna Maarit Aalto,F,30.0,159.0,55.5,Finland,FIN,1996 Summer,1996,Summer,Atlanta,Sailing,Sailing Women's Windsurfer,
33,13,Minna Maarit Aalto,F,34.0,159.0,55.5,Finland,FIN,2000 Summer,2000,Summer,Sydney,Sailing,Sailing Women's Windsurfer,


## Look at the distribution of sports by year

In [41]:
# Row count for every (Sport, Year) combination present in df.
sportsByYear = df.groupby(['Sport','Year'])['Year'].count()

df2 = pd.DataFrame(sportsByYear)

# For each sport, the number of distinct years in which it has any rows.
group2 = df2.groupby('Sport')['Year'].count()

sportList = pd.DataFrame(group2)

# Keep only sports that appear in every year present in df. The number of
# years is derived from the data instead of being hard-coded (it was 15), so
# the filter stays correct if the year cut-off or the dataset changes.
keepSports = sportList.loc[sportList['Year']==df['Year'].nunique()]

# Inner-join back to the athlete rows so only the retained sports remain.
# Both frames carry a 'Year' column, so the merge suffixes them:
#   Year_x -> number of years the sport appears in (from keepSports)
#   Year_y -> the actual Games year of the row (from df)
reallyCleanData = pd.merge(keepSports, df, on='Sport', how='inner')

reallyCleanData.head()


Unnamed: 0,Sport,Year_x,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year_y,Season,City,Event,Medal
0,Athletics,15,18,Timo Antero Aaltonen,M,31.0,189.0,130.0,Finland,FIN,2000 Summer,2000,Summer,Sydney,Athletics Men's Shot Put,
1,Athletics,15,34,Jamale (Djamel-) Aarrass (Ahrass-),M,30.0,187.0,76.0,France,FRA,2012 Summer,2012,Summer,London,"Athletics Men's 1,500 metres",
2,Athletics,15,49,Moonika Aava,F,24.0,168.0,65.0,Estonia,EST,2004 Summer,2004,Summer,Athina,Athletics Women's Javelin Throw,
3,Athletics,15,49,Moonika Aava,F,28.0,168.0,65.0,Estonia,EST,2008 Summer,2008,Summer,Beijing,Athletics Women's Javelin Throw,
4,Athletics,15,52,Patrick Abada,M,22.0,189.0,80.0,France,FRA,1976 Summer,1976,Summer,Montreal,Athletics Men's Pole Vault,


In [7]:
# Load the NOC-code-to-region lookup table from the raw data folder.
regions_file = os.path.join(
    "..",
    "data",
    "rawData",
    "noc_regions.csv",
)
regions_df = pd.read_csv(
    regions_file,
    encoding="ISO-8859-1",
)

# Preview the first five rows.
regions_df.head(5)

Unnamed: 0,NOC,region,notes
0,AFG,Afghanistan,
1,AHO,Curacao,Netherlands Antilles
2,ALB,Albania,
3,ALG,Algeria,
4,AND,Andorra,
