# Olympic Data

## Setup

In [65]:
# Import dependencies
import pandas as pd
from sqlalchemy import create_engine 

## Import CSVs into dataframes

In [81]:
# Read each source CSV into its own dataframe. The file names are listed in
# the same order as the variables they are unpacked into.
athlete, regions, summer, winter, wdi = (
    pd.read_csv(csv_name)
    for csv_name in (
        'athlete_events.csv',
        'noc_regions.csv',
        'summer.csv',
        'winter.csv',
        'WDIData.csv',
    )
)

## Print heads of tables

In [34]:
# Preview the first five rows of the raw athlete table
athlete.head()

### PLAN FOR THIS TABLE ###
# - Drop the Games column (in the preview it just repeats Year and Season).
# - Split the Name column into first-name and last-name columns.

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


In [35]:
# Preview the first five rows of the raw regions table
regions.head()

### PLAN FOR THIS TABLE ###
# - Drop the notes column.
# - Add a country code column.

Unnamed: 0,NOC,region,notes
0,AFG,Afghanistan,
1,AHO,Curacao,Netherlands Antilles
2,ALB,Albania,
3,ALG,Algeria,
4,AND,Andorra,


In [36]:
# Preview the first five rows of the raw summer table
summer.head()

### PLAN FOR THIS TABLE ###
# - Split the Athlete column ("LAST, First") into first-name and last-name columns.

Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
0,1896,Athens,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100M Freestyle,Gold
1,1896,Athens,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100M Freestyle,Silver
2,1896,Athens,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100M Freestyle For Sailors,Bronze
3,1896,Athens,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100M Freestyle For Sailors,Gold
4,1896,Athens,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100M Freestyle For Sailors,Silver


In [37]:
# Preview the first five rows of the raw winter table
winter.head()

### PLAN FOR THIS TABLE ###
# - Split the Athlete column ("LAST, First") into first-name and last-name
#   columns (same as for the summer table).

Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
0,1924,Chamonix,Biathlon,Biathlon,"BERTHET, G.",FRA,Men,Military Patrol,Bronze
1,1924,Chamonix,Biathlon,Biathlon,"MANDRILLON, C.",FRA,Men,Military Patrol,Bronze
2,1924,Chamonix,Biathlon,Biathlon,"MANDRILLON, Maurice",FRA,Men,Military Patrol,Bronze
3,1924,Chamonix,Biathlon,Biathlon,"VANDELLE, André",FRA,Men,Military Patrol,Bronze
4,1924,Chamonix,Biathlon,Biathlon,"AUFDENBLATTEN, Adolf",SUI,Men,Military Patrol,Gold


## Athletes Table

In [38]:
# Drop the Games column. The result is rebound (no inplace=True) and
# errors='ignore' makes a second execution of this cell a no-op instead of a
# KeyError, so the cell is safe to re-run.
athlete = athlete.drop(columns='Games', errors='ignore')

In [39]:
# Rename column headers to be lowercased. Renaming by label (str.lower is
# applied to each existing header) rather than assigning a positional list
# means the cell cannot silently attach the wrong label to a column when the
# frame's column order differs, e.g. if it is re-run after the columns have
# been reordered.
athlete = athlete.rename(columns=str.lower)

# Add a last name column which is empty to start; it is appended as the final
# column and filled in when the name column is split.
athlete['last_name'] = ''

In [40]:
# Split the full name into a last name (the final whitespace-separated token)
# and a first name (every token before it, re-joined with single spaces).
#
# Using the .str accessor instead of str(name) / list.pop keeps a missing name
# missing (rather than storing the literal string 'nan'), and a blank name
# produces a missing last name instead of raising IndexError on an empty
# token list.
name_tokens = athlete['name'].str.split()

# Reassign the columns in the dataframe. Both values are derived from
# name_tokens, which was computed before 'name' is overwritten. A single-token
# name ends up entirely in last_name with an empty first name.
athlete['last_name'] = name_tokens.str[-1]
athlete['name'] = name_tokens.str[:-1].str.join(' ')

In [41]:
# Reorder the columns so that last_name sits directly before name
athlete = athlete[[
    'id',
    'last_name',
    'name',
    'sex',
    'age',
    'height',
    'weight',
    'team',
    'noc',
    'year',
    'season',
    'city',
    'sport',
    'event',
    'medal',
]]

In [42]:
# The name column is relabelled as first_name. The result is rebound instead
# of using inplace=True: athlete was produced by a column selection, and an
# in-place change on such a frame can be flagged with SettingWithCopyWarning.
athlete = athlete.rename(columns={'name': 'first_name'})

In [43]:
# Preview the first five rows of the athlete table after the clean-up steps
athlete.head()

Unnamed: 0,id,last_name,first_name,sex,age,height,weight,team,noc,year,season,city,sport,event,medal
0,1,Dijiang,A,M,24.0,180.0,80.0,China,CHN,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,Lamusi,A,M,23.0,170.0,60.0,China,CHN,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Aaby,Gunnar Nielsen,M,24.0,,,Denmark,DEN,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Aabye,Edgar Lindenau,M,34.0,,,Denmark/Sweden,DEN,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Aaftink,Christine Jacoba,F,21.0,185.0,82.0,Netherlands,NED,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


## Regions Table

In [44]:
# Drop the notes column. The result is rebound (no inplace=True) and
# errors='ignore' makes a second execution of this cell a no-op instead of a
# KeyError, so the cell is safe to re-run.
regions = regions.drop(columns='notes', errors='ignore')

In [45]:
# Preview the first five rows to check that the notes column is gone
regions.head()

Unnamed: 0,NOC,region
0,AFG,Afghanistan
1,AHO,Curacao
2,ALB,Albania
3,ALG,Algeria
4,AND,Andorra


In [46]:
# Relabel two columns in place: 'NOC' becomes lower-case 'noc', and 'region'
# becomes 'name'.
regions.rename(
    columns=dict(NOC='noc', region='name'),
    inplace=True,
)

In [47]:
# Preview the first five rows to confirm the renamed column headers
regions.head()

Unnamed: 0,noc,name
0,AFG,Afghanistan
1,AHO,Curacao
2,ALB,Albania
3,ALG,Algeria
4,AND,Andorra


In [48]:
# Keep only the country name and country code columns of the WDI table
wdi = wdi.loc[:, ['Country Name', 'Country Code']]

In [49]:
# Give the two remaining columns short snake_case labels, then collapse the
# repeated (name, country_code) rows down to one row each.
wdi = (
    wdi
    .rename(columns={'Country Name': 'name', 'Country Code': 'country_code'})
    .drop_duplicates()
)

In [50]:
# Combine the trimmed wdi dataframe with the regions dataframe by matching on
# their shared 'name' column, and call the result the new regions dataframe.
# The outer join keeps unmatched rows from both sides: WDI-only names get a
# missing noc, and regions-only names get a missing country_code.
# NOTE(review): confirm the WDI-only rows are wanted here; a left join from
# regions would keep only rows that have a noc.
regions = wdi.merge(regions, how='outer', on='name')

# Put noc first and relabel 'name' as 'country_name'
regions = regions[['noc', 'country_code', 'name']].rename(columns={'name': 'country_name'})
regions.head()

Unnamed: 0,noc,country_code,country_name
0,,ARB,Arab World
1,,CSS,Caribbean small states
2,,CEB,Central Europe and the Baltics
3,,EAR,Early-demographic dividend
4,,EAS,East Asia & Pacific


## Summer Table

In [82]:
# Relabel the nine columns positionally: lower-case names throughout, with
# Athlete -> athlete_name, Country -> country_code and Gender -> sex.
summer.columns = [
    'year',
    'city',
    'sport',
    'discipline',
    'athlete_name',
    'country_code',
    'sex',
    'event',
    'medal',
]

In [83]:
# Add an empty first-name column so the names can be split, then reorder the
# columns so it sits directly after athlete_name.
summer['athlete_first_name'] = ''

# .copy() makes the reordered frame an independent object. Later cells assign
# new column values into it, and doing that on a bare column selection can be
# flagged with SettingWithCopyWarning.
summer = summer[[
    'year',
    'city',
    'sport',
    'discipline',
    'athlete_name',
    'athlete_first_name',
    'country_code',
    'sex',
    'event',
    'medal',
]].copy()

In [84]:
# Split "LAST, First" at the first comma into a last name and a first name.
#
# str.partition returns three columns: 0 = text before the first comma,
# 1 = the comma itself, 2 = text after it. Compared with str(name).split(','):
#   - a missing name stays missing instead of becoming the string 'nan';
#   - a name with more than one comma keeps everything after the first comma
#     intact instead of losing the comma and gaining a double space;
#   - a name with no comma still yields an empty first name.
name_parts = summer['athlete_name'].str.partition(',')

# Reassign the columns in the dataframe. Only leading whitespace is removed
# from the first name, i.e. the space that follows the comma.
summer['athlete_name'] = name_parts[0]
summer['athlete_first_name'] = name_parts[2].str.lstrip()

In [85]:
# Rename the name column to refer to last name. The result is rebound instead
# of using inplace=True: summer was produced by a column selection, and an
# in-place change on such a frame can be flagged with SettingWithCopyWarning.
summer = summer.rename(columns={'athlete_name': 'athlete_last_name'})

In [86]:
# Number the rows 1..N in their current order to serve as an id column
summer['id'] = list(range(1, len(summer['sport']) + 1))

In [87]:
# Move the id column to the front; the other columns keep their relative order
summer = summer[[
    'id',
    'year',
    'city',
    'sport',
    'discipline',
    'athlete_last_name',
    'athlete_first_name',
    'country_code',
    'sex',
    'event',
    'medal',
]]

In [88]:
# Preview the first five rows of the summer table after the clean-up steps
summer.head()

Unnamed: 0,id,year,city,sport,discipline,athlete_last_name,athlete_first_name,country_code,sex,event,medal
0,1,1896,Athens,Aquatics,Swimming,HAJOS,Alfred,HUN,Men,100M Freestyle,Gold
1,2,1896,Athens,Aquatics,Swimming,HERSCHMANN,Otto,AUT,Men,100M Freestyle,Silver
2,3,1896,Athens,Aquatics,Swimming,DRIVAS,Dimitrios,GRE,Men,100M Freestyle For Sailors,Bronze
3,4,1896,Athens,Aquatics,Swimming,MALOKINIS,Ioannis,GRE,Men,100M Freestyle For Sailors,Gold
4,5,1896,Athens,Aquatics,Swimming,CHASAPIS,Spiridon,GRE,Men,100M Freestyle For Sailors,Silver


## Winter Table

In [89]:
# Relabel the nine columns positionally: lower-case names throughout, with
# Athlete -> athlete_name, Country -> country_code and Gender -> sex.
winter.columns = [
    'year',
    'city',
    'sport',
    'discipline',
    'athlete_name',
    'country_code',
    'sex',
    'event',
    'medal',
]

In [90]:
# Add an empty first-name column so the names can be split, then reorder the
# columns so it sits directly after athlete_name.
winter['athlete_first_name'] = ''

# .copy() makes the reordered frame an independent object. Later cells assign
# new column values into it, and doing that on a bare column selection can be
# flagged with SettingWithCopyWarning.
winter = winter[[
    'year',
    'city',
    'sport',
    'discipline',
    'athlete_name',
    'athlete_first_name',
    'country_code',
    'sex',
    'event',
    'medal',
]].copy()

In [91]:
# Split "LAST, First" at the first comma into a last name and a first name.
#
# str.partition returns three columns: 0 = text before the first comma,
# 1 = the comma itself, 2 = text after it. Compared with str(name).split(','):
#   - a missing name stays missing instead of becoming the string 'nan';
#   - a name with more than one comma keeps everything after the first comma
#     intact instead of losing the comma and gaining a double space;
#   - a name with no comma still yields an empty first name.
name_parts = winter['athlete_name'].str.partition(',')

# Reassign the columns in the dataframe. Only leading whitespace is removed
# from the first name, i.e. the space that follows the comma.
winter['athlete_name'] = name_parts[0]
winter['athlete_first_name'] = name_parts[2].str.lstrip()

In [92]:
# Rename the name column to refer to last name. The result is rebound instead
# of using inplace=True: winter was produced by a column selection, and an
# in-place change on such a frame can be flagged with SettingWithCopyWarning.
winter = winter.rename(columns={'athlete_name': 'athlete_last_name'})

In [93]:
# Number the rows 1..N in their current order to serve as an id column
winter['id'] = list(range(1, len(winter['sport']) + 1))

In [94]:
# Move the id column to the front; the other columns keep their relative order
winter = winter[[
    'id',
    'year',
    'city',
    'sport',
    'discipline',
    'athlete_last_name',
    'athlete_first_name',
    'country_code',
    'sex',
    'event',
    'medal',
]]

In [95]:
# Preview the first five rows of the winter table after the clean-up steps
winter.head()

Unnamed: 0,id,year,city,sport,discipline,athlete_last_name,athlete_first_name,country_code,sex,event,medal
0,1,1924,Chamonix,Biathlon,Biathlon,BERTHET,G.,FRA,Men,Military Patrol,Bronze
1,2,1924,Chamonix,Biathlon,Biathlon,MANDRILLON,C.,FRA,Men,Military Patrol,Bronze
2,3,1924,Chamonix,Biathlon,Biathlon,MANDRILLON,Maurice,FRA,Men,Military Patrol,Bronze
3,4,1924,Chamonix,Biathlon,Biathlon,VANDELLE,André,FRA,Men,Military Patrol,Bronze
4,5,1924,Chamonix,Biathlon,Biathlon,AUFDENBLATTEN,Adolf,SUI,Men,Military Patrol,Gold
