# Exercises

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime
from dateutil.relativedelta import relativedelta

from pylab import rcParams
rcParams['figure.figsize'] = 10, 5

In [2]:
% matplotlib inline

In [3]:
# Read in the athletes data
df = pd.read_csv('athletes.csv')

## 1. What was the average age in male and female athletes?

In [4]:
df.dtypes

id               int64
name            object
nationality     object
sex             object
dob             object
height         float64
weight         float64
sport           object
gold             int64
silver           int64
bronze           int64
dtype: object

In [5]:
df.head()

Unnamed: 0,id,name,nationality,sex,dob,height,weight,sport,gold,silver,bronze
0,736041664,A Jesus Garcia,ESP,male,10/17/69,1.72,64.0,athletics,0,0,0
1,532037425,A Lam Shin,KOR,female,9/23/86,1.68,56.0,fencing,0,0,0
2,435962603,Aaron Brown,CAN,male,5/27/92,1.98,79.0,athletics,0,0,1
3,521041435,Aaron Cook,MDA,male,1/2/91,1.83,80.0,taekwondo,0,0,0
4,33922579,Aaron Gate,NZL,male,11/26/90,1.81,71.0,cycling,0,0,0


In [6]:
# Need to convert dob to a datetime
# Check how datetime works
dt = datetime.strptime('10/17/69', '%m/%d/%y')
print(dt)

1969-10-17 00:00:00


In [7]:
# Set opening day for 2016 olympics
Odt = datetime.strptime('08/05/16', '%m/%d/%y')
print(Odt)

2016-08-05 00:00:00


In [8]:
relativedelta(Odt, dt).years

46

In [9]:
def Age(row):
    try:
        dt = datetime.strptime(row['dob'], '%m/%d/%y')
        return relativedelta(Odt, dt).years
    except:
        pass

In [10]:
df['age'] = df.apply(lambda x: Age(x), axis=1)
df.head()

Unnamed: 0,id,name,nationality,sex,dob,height,weight,sport,gold,silver,bronze,age
0,736041664,A Jesus Garcia,ESP,male,10/17/69,1.72,64.0,athletics,0,0,0,46.0
1,532037425,A Lam Shin,KOR,female,9/23/86,1.68,56.0,fencing,0,0,0,29.0
2,435962603,Aaron Brown,CAN,male,5/27/92,1.98,79.0,athletics,0,0,1,24.0
3,521041435,Aaron Cook,MDA,male,1/2/91,1.83,80.0,taekwondo,0,0,0,25.0
4,33922579,Aaron Gate,NZL,male,11/26/90,1.81,71.0,cycling,0,0,0,25.0


In [11]:
dfM = df[df['sex'] == 'male']
dfF = df[df['sex'] == 'female']

print('The average age among male athletes is: {}'.format(dfM['age'].mean()))
print('The average age among female athletes is: {}'.format(dfF['age'].mean()))

The average age among male athletes is: 26.024320909665192
The average age among female athletes is: 25.44418828049952


## 2. What are the most common Dates of Birth?

*To clarify - day, month, year*

In [12]:
# This answers the question correct and was my first answer
df['dob'].value_counts()

3/5/88      9
2/18/93     9
12/20/90    9
3/1/89      8
4/3/88      8
12/14/89    8
6/19/91     8
5/2/90      8
6/9/89      8
3/3/93      8
7/30/93     8
4/29/88     8
4/5/89      7
1/13/90     7
4/10/90     7
10/25/94    7
2/23/93     7
12/6/91     7
7/16/91     7
8/17/92     7
4/10/92     7
8/27/90     7
4/5/94      7
7/11/90     7
3/7/94      7
6/20/94     7
4/20/94     7
3/1/90      7
7/2/86      7
1/10/92     7
           ..
2/27/94     1
3/17/83     1
6/5/73      1
4/25/85     1
11/18/97    1
11/20/79    1
12/19/96    1
11/17/95    1
9/29/90     1
11/26/94    1
10/1/89     1
5/22/84     1
4/14/87     1
5/13/84     1
7/31/88     1
7/11/86     1
11/29/93    1
1/15/81     1
8/27/95     1
4/20/86     1
6/9/95      1
8/19/82     1
1/9/79      1
12/27/78    1
9/7/87      1
8/14/76     1
11/6/86     1
7/28/84     1
12/12/85    1
2/4/97      1
Name: dob, dtype: int64

In [13]:
# This is the solution videos answer
dfDOB = df.groupby('dob')['id'].count().reset_index()
dfDOB.columns = ['dob', 'count']
dfDOB = dfDOB.sort_values(by='count', ascending=False)

In [14]:
dfDOB.head()

Unnamed: 0,dob,count
2699,3/5/88,9
1997,2/18/93,9
1573,12/20/90,9
1472,12/14/89,8
4169,6/9/89,8


## 3. How about the most common birthdays?

*To clarify - day, month*

In [15]:
def Date(row):
    try:
        return str(row['dob'].split('/')[:2])
    except:
        pass

In [17]:
dummy = df.apply(lambda x: Date(x), axis=1)
dummy = pd.DataFrame(dummy)
dummy.columns = ['date']
dummy.head()

Unnamed: 0,date
0,"['10', '17']"
1,"['9', '23']"
2,"['5', '27']"
3,"['1', '2']"
4,"['11', '26']"


In [19]:
dummy['date'].value_counts().reset_index().head()

Unnamed: 0,index,date
0,"['1', '1']",58
1,"['2', '5']",51
2,"['2', '10']",48
3,"['9', '19']",47
4,"['1', '20']",47


## 4. What are the Countries with more than 100 medals?

In [20]:
df.head()

Unnamed: 0,id,name,nationality,sex,dob,height,weight,sport,gold,silver,bronze,age
0,736041664,A Jesus Garcia,ESP,male,10/17/69,1.72,64.0,athletics,0,0,0,46.0
1,532037425,A Lam Shin,KOR,female,9/23/86,1.68,56.0,fencing,0,0,0,29.0
2,435962603,Aaron Brown,CAN,male,5/27/92,1.98,79.0,athletics,0,0,1,24.0
3,521041435,Aaron Cook,MDA,male,1/2/91,1.83,80.0,taekwondo,0,0,0,25.0
4,33922579,Aaron Gate,NZL,male,11/26/90,1.81,71.0,cycling,0,0,0,25.0


## 5. Create a bar or pie chart for the results of the previous exercise.

## 6. Male weightlifting competitions are divided into 8 weight classes. Can you estimate these weight classes by looking at the data? Hint: Create a scatter plot with Body weight on the x-axis and choose height as y.

## 7. Generate a histogram of male and female height distribution among all participants.

## 8. Using the Seaborn package create a box plot for male and female height distribution among all participants.

## 9. _Optional_: What else would you try?