In [None]:
import pandas as pd
import numpy as np
import re 
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the raw life-expectancy CSV and display it.
life_exp = pd.read_csv(filepath_or_buffer='../data/life-expectancy.csv')
life_exp

In [None]:
# Replace the source column headers with the shorter names used later on.
life_exp = life_exp.rename(
    mapper={
        "Entity": "Country Name",
        "Code": "Country Code",
        "Period life expectancy at birth - Sex: total - Age: 0": "Life Expectancy",
    },
    axis="columns",
)
life_exp

In [None]:
# Keep only rows coded 'USA' with Year >= 1960, then renumber the index from zero.
usa_life_exp = life_exp[
    life_exp['Country Code'].eq('USA') & life_exp['Year'].ge(1960)
].reset_index(drop=True)
usa_life_exp

In [None]:
# Write the USA life-expectancy subset to disk, omitting the index column.
usa_life_exp.to_csv(path_or_buf='usa_life_exp.csv', index=False)

In [None]:
# Read the GDP-per-capita export and discard its 'Series Code' column in one step.
gdp_per_capita = pd.read_csv('../data/gdp_per_cap_Data.csv').drop('Series Code', axis=1)
gdp_per_capita

In [None]:
# Keep only the rows coded 'USA' and renumber the index from zero.
usa_gdp = gdp_per_capita[gdp_per_capita['Country Code'].eq('USA')].reset_index(drop=True)
usa_gdp

In [None]:
# Columns to unpivot in the melt step: every column except the identifier
# columns that the melt passes as id_vars. The previous `[0:]` slice was a
# no-op, so the identifiers were also listed as value columns.
col_list = [
    col for col in usa_gdp.columns
    if col not in ('Series Name', 'Country Name', 'Country Code')
]

In [None]:
# Unpivot the columns in col_list into long form, then label the column that
# holds the former headers as 'Year'.
usa_gdp = (
    usa_gdp
    .melt(id_vars=['Series Name', 'Country Name', 'Country Code'], value_vars=col_list)
    .rename(columns={'variable': 'Year'})
)
usa_gdp

In [None]:
# Keep only the first four characters of each 'Year' label.
usa_gdp['Year'] = usa_gdp['Year'].str.slice(stop=4)
usa_gdp

In [None]:
# Pull out the 'Population, total' series, drop the now-constant series label,
# and name the value column 'Population'.
usa_pop = (
    usa_gdp[usa_gdp['Series Name'].eq('Population, total')]
    .drop(columns=['Series Name'])
    .rename(columns={'value': 'Population'})
    .reset_index(drop=True)
)
usa_pop

In [None]:
# Narrow usa_gdp to the 'GDP per capita (current US$)' series, drop the series
# label, and name the value column 'GDP Per Capita'.
usa_gdp = (
    usa_gdp[usa_gdp['Series Name'].eq('GDP per capita (current US$)')]
    .drop('Series Name', axis=1)
    .rename(columns={'value': 'GDP Per Capita'})
)
usa_gdp

In [None]:
# Write the USA GDP-per-capita table to disk, omitting the index column.
usa_gdp.to_csv(path_or_buf='usa_gdp_per_cap.csv', index=False)

In [None]:
# NOC = National Olympic Committee 3-letter code
noc_regions = pd.read_csv(filepath_or_buffer='../data/noc_regions.csv')
noc_regions

In [None]:
# Rename 'region' to 'Country Name' and 'notes' to 'Region'.
noc_regions = noc_regions.rename(
    mapper={'region': 'Country Name', 'notes': 'Region'},
    axis='columns',
)
noc_regions

In [None]:
# Units: Height is in cm, Weight is in kg.
ath_events = pd.read_csv(filepath_or_buffer='../data/athlete_events.csv')
ath_events

In [None]:
# Derive 'Birth Year' as the row's Year minus its recorded Age.
ath_events['Birth Year'] = ath_events['Year'].sub(ath_events['Age'])
ath_events

In [None]:
# Keep only rows with NOC 'USA' and Year >= 1984, then renumber the index.
usa_df = ath_events[
    ath_events['NOC'].eq('USA') & ath_events['Year'].ge(1984)
].reset_index(drop=True)
usa_df

In [None]:
# Collapse rows that share team, Games, sport, event and medal into one, then
# tally the Medal column (missing values included in the tally).
usa_df.drop_duplicates(
    subset=['Team', 'Games', 'Sport', 'Event', 'Medal']
)['Medal'].value_counts(dropna=False)

Who were the top 5 athletes to earn the most medals overall (gold, silver, and bronze)?

In [None]:
# Keep only rows whose Medal is Gold, Silver or Bronze.
medalists = usa_df[usa_df['Medal'].isin(['Gold', 'Silver', 'Bronze'])].reset_index(drop=True)
medalists

In [None]:
# Five names with the most medal rows.
medalists['Name'].value_counts().iloc[:5]

In [None]:
# Select every medal row belonging to the five athletes with the most medals.
# The names come from the medal ranking itself rather than a hand-copied list,
# so the selection follows the data if the upstream filters change.
# Ties at the cutoff are resolved by the order value_counts returns.
top5_names = medalists['Name'].value_counts().head(5).index
top5 = medalists.loc[medalists['Name'].isin(top5_names)].reset_index(drop=True)
top5

In [None]:
# Count rows per (athlete, medal type) pair.
top5.value_counts(subset=['Name', 'Medal'])

In [None]:
# One row per athlete: (name, gold, silver, bronze). Values are entered by
# hand and then reshaped into the column-oriented dict the DataFrame is built from.
athlete_rows = [
    ('Michael Fred Phelps, II', 23, 3, 2),
    ('Jennifer Elisabeth "Jenny" Thompson (-Cumpelik)', 8, 3, 1),
    ('Dara Grace Torres (-Hoffman, -Minas)', 4, 4, 4),
    ('Ryan Steven Lochte', 6, 3, 3),
    ('Natalie Anne Coughlin (-Hall)', 3, 4, 5),
]
data = {
    header: list(values)
    for header, values in zip(('Name', 'Gold', 'Silver', 'Bronze'), zip(*athlete_rows))
}

top5_medal_count = pd.DataFrame.from_dict(data)

top5_medal_count

In [None]:
# Stacked bar chart: one bar per athlete, one segment per medal column.
ax = top5_medal_count.plot.bar(x='Name', stacked=True, color=['yellow', 'silver', 'brown'])

ax.set_xlabel('Athlete')
ax.set_ylabel('Medal Tally')
ax.set_title('Top 5 Athletes w/ Most Medals')
# Anchor the legend just outside the right edge of the axes.
ax.legend(bbox_to_anchor=(1, 0.8), loc='upper left')
plt.show()

How have the Olympic Summer and Winter Games evolved over time in terms of events and athlete performance from 1984-2016 for team USA? 

In [None]:
# Winter-season medal rows from 1984 onward, selected with one combined mask.
usa_winter = usa_df[
    usa_df['Season'].eq('Winter')
    & usa_df['Medal'].isin(['Gold', 'Silver', 'Bronze'])
    & usa_df['Year'].ge(1984)
].reset_index(drop=True)
usa_winter

In [None]:
# Summer-season medal rows from 1984 onward, selected with one combined mask
# (the original row index is kept).
usa_summer = usa_df[
    usa_df['Season'].eq('Summer')
    & usa_df['Medal'].isin(['Gold', 'Silver', 'Bronze'])
    & usa_df['Year'].ge(1984)
]
usa_summer

In [None]:
# Row counts per (Year, Medal) pair for the Winter Games, shown in year order.
winter_medals = usa_winter.value_counts(subset=['Year', 'Medal']).to_frame()
winter_medals.sort_values(by='Year')

In [None]:
# Total medal rows per Winter Games year, computed from usa_winter rather than
# summed by hand from the Year/Medal breakdown, so the totals cannot drift
# from the data. Columns: 'Year', 'Medal Total'.
winter_tally = usa_winter.groupby('Year').size().reset_index(name='Medal Total')
winter_tally

In [None]:
# Row counts per (Year, Medal) pair for the Summer Games, shown in year order.
summer_medals = usa_summer.value_counts(subset=['Year', 'Medal']).to_frame()
summer_medals.sort_values(by='Year')

In [None]:
# Total medal rows per Summer Games year, computed from usa_summer rather than
# added up by hand from the Year/Medal breakdown, so the totals cannot drift
# from the data. Columns: 'Year', 'Medal Total'.
summer_tally = usa_summer.groupby('Year').size().reset_index(name='Medal Total')
summer_tally

In [None]:
# Side-by-side Winter/Summer totals. From the fourth entry on, the label joins
# the two Games years with '|'.
year_labels = [1984, 1988, 1992, '1994|1996', '1998|2000', '2002|2004', '2006|2008', '2010|2012', '2014|2016']
winter_totals = [9, 3, 14, 19, 34, 84, 52, 97, 64]
# Each Summer total is the sum of three hand-entered counts.
summer_parts = [
    (186, 50, 116), (54, 87, 66), (50, 89, 85),
    (159, 52, 48), (130, 51, 61), (75, 117, 71),
    (80, 110, 127), (57, 145, 46), (71, 54, 139),
]
data = {
    'Year': year_labels,
    'Winter': winter_totals,
    'Summer': [sum(parts) for parts in summer_parts],
}
olympic_tally = pd.DataFrame.from_dict(data)
olympic_tally

In [None]:
# Rebuild the Winter/Summer totals with the Games labels as the index so the
# line plot uses them on the x-axis.
games_index = [1984, 1988, 1992, '1994|1996', '1998|2000', '2002|2004', '2006|2008', '2010|2012', '2014|2016']
olympic_tally = pd.DataFrame(
    {
        'Winter': [9, 3, 14, 19, 34, 84, 52, 97, 64],
        'Summer': [
            186 + 50 + 116, 54 + 87 + 66, 50 + 89 + 85,
            159 + 52 + 48, 130 + 51 + 61, 75 + 117 + 71,
            80 + 110 + 127, 57 + 145 + 46, 71 + 54 + 139,
        ],
    },
    index=games_index,
)
trend_lines = olympic_tally.plot.line()
trend_lines.set_xlabel('Year')
trend_lines.set_ylabel('Medal Tally')
trend_lines.set_title('Changes Over Time')
# Anchor the legend just outside the right edge of the axes.
trend_lines.legend(bbox_to_anchor=(1, 0.8), loc='upper left')
plt.xticks(rotation=45)
plt.show()

In which sport does Team USA earn the most medals? Within that sport, what are Team USA's top 3 events?

In [None]:
# Stack the Winter and Summer medal rows into one frame with a fresh 0..n-1 index.
frames = [usa_winter, usa_summer]
olympics = pd.concat(frames, ignore_index=True)
olympics

In [None]:
# Count medal rows per sport, split by medal type.
medals_by_sport = olympics.loc[:, ['Sport', 'Medal']]
grouped = medals_by_sport.groupby(by='Sport')
sport_medals = grouped.Medal.value_counts()
print(sport_medals)

In [None]:
# Flatten the per-sport medal counts into a plain DataFrame. Rebuilding from
# `grouped` instead of from `sport_medals` itself keeps this cell re-runnable:
# the previous self-referential form raised on a second run because a
# DataFrame has no `to_frame`.
sport_medals = grouped['Medal'].value_counts().to_frame().reset_index()

In [None]:
# Show the 25 largest (sport, medal) counts.
sport_medals.sort_values('count', ascending=False).iloc[:25]

In [None]:
# Show the medal rows for Swimming and Athletics.
sport_medals[sport_medals['Sport'].isin(['Swimming', 'Athletics'])]

In [None]:
# Hand-entered medal counts by type, plotted as a bar chart.
data = {'Medal': ['Gold', 'Silver', 'Bronze'], 'value': [343, 136, 83]}

medal_ax = sns.barplot(data=data, x='Medal', y='value')
medal_ax.set(
    title='Top Sport by Medals',
    xlabel='Medal Type',
    ylabel='Total Medals',
)

plt.show()

In [None]:
# Count medal rows per Swimming event, split by medal type, and show the 25
# largest counts.
medals_by_event = olympics.loc[olympics['Sport'].eq('Swimming'), ['Event', 'Medal']]
grouped2 = medals_by_event.groupby(by='Event')
event_medals = grouped2.Medal.value_counts().to_frame().reset_index()
event_medals.sort_values('count', ascending=False).iloc[:25]

In [None]:
# Total medal rows for this event. The sibling cells show a total per event;
# this one previously displayed the raw per-medal rows instead.
event_medals.loc[event_medals['Event'] == "Swimming Men's 4 x 100 metres Medley Relay", 'count'].sum()

In [None]:
# Total medal rows for this event. Sum only the 'count' column: summing the
# whole frame also concatenated the 'Event' and 'Medal' strings.
event_medals.loc[event_medals['Event'] == "Swimming Women's 4 x 100 metres Medley Relay", 'count'].sum()

In [None]:
# Total medal rows for this event. Sum only the 'count' column: summing the
# whole frame also concatenated the 'Event' and 'Medal' strings.
event_medals.loc[event_medals['Event'] == "Swimming Men's 4 x 200 metres Freestyle Relay", 'count'].sum()

In [None]:
# Total medal rows for this event. Sum only the 'count' column: summing the
# whole frame also concatenated the 'Event' and 'Medal' strings.
event_medals.loc[event_medals['Event'] == "Swimming Men's 4 x 100 metres Freestyle Relay", 'count'].sum()

In [None]:
# Total medal rows for this event. Sum only the 'count' column: summing the
# whole frame also concatenated the 'Event' and 'Medal' strings.
event_medals.loc[event_medals['Event'] == "Swimming Women's 4 x 200 metres Freestyle Relay", 'count'].sum()

In [None]:
# Hand-entered (event, total) pairs, split into the label and value lists the
# pie chart takes.
event_totals = [
    ("Swimming Women's 4 x 100 metres Medley Relay", 71),
    ("Swimming Men's 4 x 100 metres Medley Relay", 69),
    ("Swimming Men's 4 x 100 metres Freestyle Relay", 60),
]
keys, data = (list(column) for column in zip(*event_totals))

# autopct prints each slice's share as a whole-number percentage.
plt.pie(x=data, labels=keys, colors=['blue', 'yellow', 'orange'], autopct='%.0f%%')
plt.show()

Are there any trends during this range of time for team USA’s performance in the Winter and the Summer Games? 

In [None]:
# Distinct athlete names, keeping the first row on which each name appears.
usa_df.drop_duplicates(subset='Name')['Name']

From 1984 to 2016, who were the top 5 performers in the sport in which Team USA earned the most medals? What are the life expectancy for the overall top-performing athlete and the GDP per capita for the year in which that athlete earned the most medals?

In [None]:
# Order the (sport, medal) counts from largest to smallest with a fresh index.
sport_medals = sport_medals.sort_values('count', ascending=False, ignore_index=True)
sport_medals

In [None]:
# First 25 rows of the sorted counts.
sport_medals.iloc[:25]

In [None]:
# Total medal rows for Swimming across all medal types.
sport_medals.loc[sport_medals['Sport'].eq('Swimming'), 'count'].sum()

In [None]:
# Total medal rows for Athletics across all medal types.
sport_medals.loc[sport_medals['Sport'].eq('Athletics'), 'count'].sum()

In [None]:
# Total medal rows for Basketball across all medal types.
sport_medals.loc[sport_medals['Sport'].eq('Basketball'), 'count'].sum()

In [None]:
# Hand-entered medal totals for three sports, drawn as labelled bars and saved
# to 'top_sports.png' with a transparent background.
custom_palette = ["#0081C8", "#FCB131", "#00A651"]
data = {
    'Sport': ['Swimming', 'Athletics', 'Basketball'],
    'Total Medals': [562, 369, 215],
}

ax = sns.barplot(
    data=data,
    x='Sport',
    y='Total Medals',
    hue='Sport',
    palette=custom_palette,
    legend=False,
)
# Label the first three bar containers with their values.
for idx in range(3):
    ax.bar_label(ax.containers[idx])
plt.title('Top Sports by Medals')
plt.xlabel('Sport Type')
plt.ylabel('Total Medals')
# Save before plt.show().
plt.savefig('top_sports.png', transparent=True)

plt.show()