## Data Exploration - 120 years of Olympics

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the athlete-level results table used by the cells below.
athletes_csv_path = './data/athlete_events.csv'
df_athletes = pd.read_csv(athletes_csv_path)

In [None]:
# Summarise the table: the row count, then one bullet per column name.
n_entries = len(df_athletes)
print('Number of entries {}'.format(n_entries))
print('Features:')
for column_name in df_athletes.columns:
    print('\t *\t{}'.format(column_name))

In [None]:
# Preview the first five rows of the athletes table.
df_athletes.head(n=5)

### Countries with the most medals

In [None]:
# Count medal-winning entries per (Sex, NOC, Medal).
#
# Only rows without a medal are discarded. A bare dropna() would also discard
# medal rows that have a missing value in *any* other column, which silently
# undercounts medals. The number of rows per group is stored in a single
# column named 'ID', which is the column the medal plots read.
medals = (
    df_athletes
    .dropna(subset=['Medal'])
    .groupby(['Sex', 'NOC', 'Medal'])
    .size()
    .reset_index(name='ID')
)

In [None]:
# One stacked bar chart per medal type: the 20 NOCs with the highest medal
# count, split by sex. The three charts share the y axis so they are comparable.
f, ax = plt.subplots(1, 3, figsize=(20, 5), sharey=True)
for idx, med in enumerate(['Gold', 'Silver', 'Bronze']):
    # Pivot the 'ID' count only and sum it explicitly. Without values/aggfunc,
    # pivot_table averages every remaining column (including the text 'Medal'
    # column) and the wanted columns can then only be picked by position.
    comitee = pd.pivot_table(
        medals[medals.Medal == med],
        index='NOC',
        columns='Sex',
        values='ID',
        aggfunc='sum',
    ).fillna(0)
    # Guarantee both sex columns exist, in a fixed order, even if one is absent.
    comitee = comitee.reindex(columns=['F', 'M'], fill_value=0)
    comitee['Total'] = comitee['F'] + comitee['M']
    # sort_values is ascending, so the last 20 rows are the 20 largest totals.
    top_comitee = comitee.sort_values('Total').iloc[-20:]
    top_comitee[['F', 'M']].plot(
        kind='bar',
        stacked=True,
        ax=ax[idx],
        title='Comitee with highest number of {} medals'.format(med),
    )

In [None]:
def _event_without_sport_and_gender(event, sport):
    """Return *event* with its leading sport name and gender word removed.

    Presumably events are formatted as '<Sport> <gender> <event name>' —
    confirm against the data. When the event starts with the words of
    *sport*, those words plus the following (gender) word are dropped, so
    multi-word sport names are handled. Otherwise the first two words are
    dropped, which matches the previous behaviour.
    """
    event_words = event.split()
    sport_words = sport.split()
    if event_words[:len(sport_words)] == sport_words:
        n_skip = len(sport_words) + 1
    else:
        n_skip = 2
    return ' '.join(event_words[n_skip:])


# Event name stripped of the sport and gender prefix, one value per row.
df_athletes['type_event'] = [
    _event_without_sport_and_gender(event, sport)
    for event, sport in zip(df_athletes['Event'], df_athletes['Sport'])
]

### Number of athletes per year

In [None]:
# Bar chart of the number of distinct athlete names at each Summer Games.
summer_entries = df_athletes[df_athletes.Season == 'Summer']
athletes_per_year = summer_entries.groupby('Year').agg({'Name': 'nunique'})

f, ax = plt.subplots(figsize=(20, 5))
athletes_per_year.plot(kind='bar', ax=ax, title='Number of athletes by year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of athletes')
plt.show()

In [None]:
# Stacked bar chart of the number of athletes per Summer Games, split by sex.
#
# Athletes are counted as distinct names (nunique). A plain row count would
# count an athlete once per event entered, which does not match the
# "Number of athletes" title.
summer_entries = df_athletes[df_athletes.Season == 'Summer']
athletes_by_sex = (
    summer_entries
    .groupby(['Year', 'Sex'])['Name']
    .nunique()
    .unstack('Sex')
    # Fix the column order so the legend labels below always match the bars.
    .reindex(columns=['F', 'M'])
)

f, ax = plt.subplots(figsize=(20, 5))
athletes_by_sex.plot(kind='bar', ax=ax, stacked=True, title='Number of athletes by year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of athletes')
ax.legend(['Female', 'Male'])
plt.show()

In [None]:
# Headline counts of distinct athlete names and host cities, overall and per season.
summer_rows = df_athletes[df_athletes.Season == 'Summer']
winter_rows = df_athletes[df_athletes.Season == 'Winter']

print('Number of unique athletes: {}'.format(df_athletes.Name.nunique()))
print('Number of unique athletes (Summer): {}'.format(summer_rows.Name.nunique()))
print('Number of unique athletes (Winter): {}'.format(winter_rows.Name.nunique()))
print('Number of unique host cities: {}'.format(df_athletes.City.nunique()))
print('Number of unique host cities (Summer): {}'.format(summer_rows.City.nunique()))
print('Number of unique host cities (Winter): {}'.format(winter_rows.City.nunique()))

In [None]:
# Names that appear in both Summer and Winter rows, and the sports those names competed in.
print('Athletes participating in both summer and winter games: ')
a = set(df_athletes.loc[df_athletes.Season == 'Summer', 'Name'].unique())
b = set(df_athletes.loc[df_athletes.Season == 'Winter', 'Name'].unique())
names_in_both = a & b
print(len(names_in_both))
# Last expression of the cell: displayed as the cell output.
df_athletes[df_athletes.Name.isin(list(names_in_both))].Sport.unique()

### Host cities

In [None]:
# Load the host-city lookup table and preview its first five rows.
host_cities_csv_path = './data/host_cities.csv'
df_cities = pd.read_csv(host_cities_csv_path)
df_cities.head(n=5)

## Summer Olympics

### Sports by year

In [None]:
# Series indexed by Year whose values are the set of sports at that Summer Games.
summer_year_sport = df_athletes.loc[df_athletes.Season == 'Summer', ['Year', 'Sport']]
sport_by_year = summer_year_sport.groupby('Year')['Sport'].apply(set)

In [None]:
# Bar chart of the number of distinct sports at each Summer Games.
summer_entries = df_athletes[df_athletes.Season == 'Summer']
sports_per_year = summer_entries.groupby('Year').agg({'Sport': 'nunique'})

f, ax = plt.subplots(figsize=(20, 5))
sports_per_year.plot(kind='bar', ax=ax, title='Number of sports by year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of different sports')
plt.show()

In [None]:
# For every Summer Games after the first, compare its set of sports with the
# previous Games: sports that vanished and sports that were added.
# Each entry is a (year, list_of_sports) tuple. The lists are sorted so the
# output is reproducible; iterating a set directly gives an arbitrary order.
years = sorted(df_athletes[df_athletes.Season == 'Summer'].Year.unique())
disappearing_sports = [
    (current, sorted(sport_by_year[previous] - sport_by_year[current], key=str))
    for previous, current in zip(years, years[1:])
]
new_sports = [
    (current, sorted(sport_by_year[current] - sport_by_year[previous], key=str))
    for previous, current in zip(years, years[1:])
]

In [None]:
# adding the first year: every sport is "new" and nothing can have disappeared.
# The guards make the cell safe to re-run: without them each run would prepend
# another first-year entry and the lists would no longer line up with `years`.
if not new_sports or new_sports[0][0] != years[0]:
    new_sports.insert(0, (years[0], sorted(sport_by_year[years[0]], key=str)))
if not disappearing_sports or disappearing_sports[0][0] != years[0]:
    disappearing_sports.insert(0, (years[0], []))

In [None]:
# Print, for each year, the sports that were added and the sports that were dropped.
for (year, added), (_, removed) in zip(new_sports, disappearing_sports):
    print('Year: ', year)
    print(' New sports: ', added)
    print('Disappearing sports:')
    print(removed, '\n')

In [None]:
# Same information as a table: one row per year with the added and dropped sports.
sport_change_rows = [
    (year, added[1], removed[1])
    for year, added, removed in zip(years, new_sports, disappearing_sports)
]
pd.DataFrame(sport_change_rows, columns=['Year', 'New', 'Disappearing'])

In [None]:
# Sport x Year matrix of entry counts for the Summer Games; combinations that
# never occurred are filled with 0.
summer_ids = df_athletes.loc[df_athletes.Season == 'Summer', ['Year', 'Sport', 'ID']]
entries_per_year_sport = summer_ids.groupby(['Year', 'Sport']).count().reset_index()
sport_year = pd.pivot_table(
    entries_per_year_sport,
    index='Sport',
    columns='Year',
).fillna(0)

In [None]:
# Bar chart of how many distinct Summer Games years each sport appeared in,
# ordered from fewest to most.
summer_year_sport = df_athletes.loc[df_athletes.Season == 'Summer', ['Year', 'Sport']]
years_per_sport = summer_year_sport.groupby(['Sport']).nunique().sort_values('Year')

f, ax = plt.subplots(figsize=(20, 5))
years_per_sport.plot(kind='bar', y='Year', ax=ax)
plt.show()

In [None]:
# Winter sports in the summer games:
# Summer rows whose sport also appears in some Winter row, summarised as
# distinct-value counts per (Sport, Year).
winter_sport_names = df_athletes[df_athletes.Season == 'Winter'].Sport.unique()
is_summer_row = df_athletes.Season == 'Summer'
is_winter_sport = df_athletes.Sport.isin(winter_sport_names)
df_athletes[is_summer_row & is_winter_sport].groupby(['Sport', 'Year']).nunique()

### Countries by year

In [None]:
# Number of distinct NOCs at each Summer Games, shown as a bar chart.
summer_entries = df_athletes[df_athletes.Season == 'Summer']
country_per_year = summer_entries.groupby(['Year'])['NOC'].nunique()

f, ax = plt.subplots(figsize=(20, 5))
country_per_year.plot(kind='bar', title='Number of countries per year', ax=ax)
ax.set_xlabel('Year')
ax.set_ylabel('Number of different countries')
plt.show()

In [None]:
# For every Summer Games after the first, compare its set of NOCs with the
# previous Games: NOCs that are no longer present and NOCs that are new.
# Each entry is a (year, list_of_nocs) tuple. The lists are sorted so the
# output is reproducible; iterating a set directly gives an arbitrary order.
years = sorted(df_athletes[df_athletes.Season == 'Summer'].Year.unique())
noc_per_year = (
    df_athletes.loc[df_athletes.Season == 'Summer', ['Year', 'NOC']]
    .groupby('Year')['NOC']
    .apply(set)
)

disappearing_noc = [
    (current, sorted(noc_per_year[previous] - noc_per_year[current], key=str))
    for previous, current in zip(years, years[1:])
]
new_noc = [
    (current, sorted(noc_per_year[current] - noc_per_year[previous], key=str))
    for previous, current in zip(years, years[1:])
]

In [None]:
# adding the first year: every NOC is "new" and nothing can have disappeared.
# The guards make the cell safe to re-run: without them each run would prepend
# another first-year entry and the two lists would drift out of step with `years`.
if not new_noc or new_noc[0][0] != years[0]:
    new_noc.insert(0, (years[0], sorted(noc_per_year[years[0]], key=str)))
if not disappearing_noc or disappearing_noc[0][0] != years[0]:
    disappearing_noc.insert(0, (years[0], []))

In [None]:
# Load the NOC-to-region lookup table and preview its first five rows.
noc_regions_csv_path = './data/noc_regions.csv'
df_noc = pd.read_csv(noc_regions_csv_path)
df_noc.head(n=5)

In [None]:
# Mapping from NOC code to region name, built from the lookup table.
noc_dict = df_noc.set_index('NOC')['region'].to_dict()

# 'SGP' is set by hand; per the original note it is missing from the table.
noc_dict['SGP'] = 'Singapore'

In [None]:
def _region_name(noc):
    """Return the region name for *noc*, or the code itself when unknown.

    Falls back to the NOC code when it is absent from ``noc_dict`` (which
    would otherwise raise KeyError) or when the stored region is not a string
    (for example a missing value read from the CSV).
    """
    region = noc_dict.get(noc)
    return region if isinstance(region, str) else noc


# Print, for each year, the regions that joined and the regions that dropped out.
for (year, added), (_, removed) in zip(new_noc, disappearing_noc):
    print('Year: ', year)
    print('\n\t New country (NOC): ', [_region_name(i) for i in added])
    print('\n\t Disappearing country (NOC):', [_region_name(i) for i in removed], '\n\n')

### Host cities

In [None]:
# One row per distinct (City, Year) pair found in the athletes table.
host_columns = ['City', 'Year']
df_host_cities = df_athletes[host_columns].drop_duplicates().reset_index(drop=True)

In [None]:
# Attach the host-city table's columns to each (City, Year) row, matching on City.
# NOTE(review): re-running this cell after a successful run will presumably
# fail on overlapping column names — confirm before re-executing.
city_details = df_cities.set_index('City')
df_host_cities = df_host_cities.join(city_details, on='City')

In [None]:
# Display the host-city table for visual inspection (cell output only).
df_host_cities

In [None]:
# Bar chart of how many Summer host rows each country has, in ascending order.
# NOTE(review): 'Season' and 'Country' are presumably columns brought in from
# the host-city table — confirm.
summer_hosts = df_host_cities[df_host_cities['Season'] == 'Summer']
games_per_country = summer_hosts.groupby('Country')['City'].count().sort_values()

f, ax = plt.subplots(figsize=(20, 5))
games_per_country.plot(kind='bar', ax=ax, title='Years helding the Summer Olympic Games')
ax.set_ylabel('Years helding the Games')
# Integer ticks 0..4; the upper bound is hard-coded.
ax.set_yticks(range(0, 5))
plt.show()

In [None]:
# Bar chart of how many Summer host rows each continent has, in ascending order.
# NOTE(review): 'Season' and 'Continent' are presumably columns brought in
# from the host-city table — confirm.
summer_hosts = df_host_cities[df_host_cities['Season'] == 'Summer']
games_per_continent = summer_hosts.groupby('Continent')['City'].count().sort_values()

f, ax = plt.subplots(figsize=(20, 5))
games_per_continent.plot(kind='bar', ax=ax, title='Years helding the Summer Olympic Games')
ax.set_ylabel('Years helding the Games')
# Even ticks 0..18; the upper bound is hard-coded.
ax.set_yticks(range(0, 19, 2))
plt.show()