In [4]:
from urllib.parse import quote

import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import inspect

from secret import username, password

# Extract CSV into DataFrame

In [7]:
# Relative path of the athlete-events CSV that the next cell reads
file = "Resources/athlete_events.csv"

In [8]:
# Load the CSV into a DataFrame, then preview its first five rows
olympics_info = pd.read_csv(filepath_or_buffer=file)
olympics_info.head(n=5)

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


# Transform DataFrame


#    1) Create "Sport" DataFrame and Clean

In [9]:
# Build the "Sport" frame in one chain: select the wanted column(s), take an
# independent copy, and rename the header to lower case.
sports_cols = ["Sport"]
sport_df = (
    olympics_info.loc[:, sports_cols]
    .copy()
    .rename(columns={"Sport": "sport"})
)

# Preview the first rows
sport_df.head()

Unnamed: 0,sport
0,Basketball
1,Judo
2,Football
3,Tug-Of-War
4,Speed Skating


In [10]:
# Stats: print the index range, per-column non-null counts and dtypes, and memory usage
sport_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271116 entries, 0 to 271115
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   sport   271116 non-null  object
dtypes: object(1)
memory usage: 2.1+ MB


In [11]:
# Select every row that repeats an earlier row (the first occurrence of each
# value is not flagged by duplicated()).
duplicateSportDF = sport_df.loc[sport_df.duplicated()]

# Report: header line, a 20-row sample, then the per-column non-null count
print(
    "Duplicate Rows except first occurrence based on all columns are :",
    duplicateSportDF.head(20),
    duplicateSportDF.count(),
    sep="\n",
)

Duplicate Rows except first occurrence based on all columns are :
                   sport
5          Speed Skating
6          Speed Skating
7          Speed Skating
8          Speed Skating
9          Speed Skating
11  Cross Country Skiing
12  Cross Country Skiing
13  Cross Country Skiing
14  Cross Country Skiing
15  Cross Country Skiing
16  Cross Country Skiing
17  Cross Country Skiing
18  Cross Country Skiing
19  Cross Country Skiing
20  Cross Country Skiing
21  Cross Country Skiing
22  Cross Country Skiing
23  Cross Country Skiing
24  Cross Country Skiing
25  Cross Country Skiing
sport    271050
dtype: int64


In [12]:
# Keep only the first occurrence of each sport; surviving rows keep their
# original row labels.
sport_df = sport_df.drop_duplicates(subset="sport", keep="first")

sport_df

Unnamed: 0,sport
0,Basketball
1,Judo
2,Football
3,Tug-Of-War
4,Speed Skating
...,...
21488,Jeu De Paume
29994,Roque
30323,Alpinism
50275,Basque Pelota


In [13]:
# Clean df stats: non-null count per column after the duplicates were dropped
sport_df.count()

sport    66
dtype: int64

#   2) Create "Event" DataFrame and Clean

In [14]:
# Create a filtered dataframe "Events" from specific columns.
# The column list selects from olympics_info and .copy() is called on the
# resulting DataFrame, so event_df owns its data and the later clean-up steps
# cannot touch olympics_info. Selecting by label also raises KeyError if
# "Event" is missing, instead of silently building an all-NaN column.
event_cols = ["Event"]
event_df = olympics_info[event_cols].copy()

# Rename the column header
event_df = event_df.rename(columns={"Event": "event_name"})

event_df.head()

Unnamed: 0,event_name
0,Basketball Men's Basketball
1,Judo Men's Extra-Lightweight
2,Football Men's Football
3,Tug-Of-War Men's Tug-Of-War
4,Speed Skating Women's 500 metres


In [15]:
# Stats: DataFrame.info() prints its report itself and returns None, so it is
# called directly; wrapping it in print() only appends a stray "None" line.
event_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271116 entries, 0 to 271115
Data columns (total 1 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   event_name  271116 non-null  object
dtypes: object(1)
memory usage: 2.1+ MB
None


In [16]:
# Select the rows whose event_name repeats an earlier row, then order them by
# event name so identical events sit next to each other.
duplicateEventDF = event_df.loc[event_df.duplicated(subset=['event_name'])]
sort_by_event = duplicateEventDF.sort_values(by='event_name')

# Report: header line, a five-row sample, then the per-column non-null count
print(
    "Duplicate Rows except first occurrence based the Event column are :",
    sort_by_event.head(),
    sort_by_event.count(),
    sep="\n",
)

Duplicate Rows except first occurrence based the Event column are :
                          event_name
85467   Alpine Skiing Men's Combined
70241   Alpine Skiing Men's Combined
58962   Alpine Skiing Men's Combined
154995  Alpine Skiing Men's Combined
70243   Alpine Skiing Men's Combined
event_name    270351
dtype: int64


In [17]:
# Keep only the first occurrence of each event name. The index is not reset:
# surviving rows keep their original row labels.
event_df = event_df.drop_duplicates(subset="event_name", keep="first")

event_df.head()

Unnamed: 0,event_name
0,Basketball Men's Basketball
1,Judo Men's Extra-Lightweight
2,Football Men's Football
3,Tug-Of-War Men's Tug-Of-War
4,Speed Skating Women's 500 metres


In [18]:
# Order the unique events by name, ascending, and display the result
event_df = event_df.sort_values(by="event_name", ascending=True)
event_df

Unnamed: 0,event_name
214105,Aeronautics Mixed Aeronautics
67,Alpine Skiing Men's Combined
59,Alpine Skiing Men's Downhill
61,Alpine Skiing Men's Giant Slalom
62,Alpine Skiing Men's Slalom
...,...
10885,"Wrestling Women's Flyweight, Freestyle"
1552,"Wrestling Women's Heavyweight, Freestyle"
1285,"Wrestling Women's Light-Heavyweight, Freestyle"
1667,"Wrestling Women's Lightweight, Freestyle"


In [19]:
# Clean df stats: non-null count per column of the de-duplicated, sorted events
print(event_df.count())

event_name    765
dtype: int64


# Create database connection

In [22]:
# Build the PostgreSQL URL for olympics_db on localhost:5432 and create the engine.
# The credentials are percent-encoded first: a raw '@', ':', '/' or '%' in the
# username or password would otherwise be parsed as URL structure, so the
# connection would be mis-parsed or fail to authenticate.
# NOTE(review): assumes secret.py stores the credentials un-encoded — confirm.
safe_username = quote(username, safe="")
safe_password = quote(password, safe="")
connection_string = f'{safe_username}:{safe_password}@localhost:5432/olympics_db'
engine = create_engine(f'postgresql://{connection_string}')

In [23]:
# Confirm tables.
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector lists the table names and works on both older and newer versions.
inspect(engine).get_table_names()

['country',
 'athlete',
 'event',
 'sport',
 'olympic_season',
 'athlete_event',
 'country_stats']

# Load DataFrames into database

In [24]:
# Append the unique sports to the "sport" table; the DataFrame index is not written.
# NOTE: because if_exists="append", re-running this cell inserts the same rows again.
sport_df.to_sql("sport", engine, if_exists="append", index=False)

In [25]:
# Append the unique event names to the "event" table; the DataFrame index is not written.
# NOTE: because if_exists="append", re-running this cell inserts the same rows again.
event_df.to_sql("event", engine, if_exists="append", index=False)

# Confirm DataFrames loaded correctly

In [26]:
# Read the sport table back from the database and preview its first rows
pd.read_sql_query(sql='select * from sport', con=engine).head()

Unnamed: 0,id,sport
0,1,Basketball
1,2,Judo
2,3,Football
3,4,Tug-Of-War
4,5,Speed Skating


In [27]:
# Read the event table back from the database and preview its first rows
pd.read_sql_query(sql='select * from event', con=engine).head()

Unnamed: 0,id,event_name,athlete_id,sport_id,olympic_season_id
0,1,Aeronautics Mixed Aeronautics,,,
1,2,Alpine Skiing Men's Combined,,,
2,3,Alpine Skiing Men's Downhill,,,
3,4,Alpine Skiing Men's Giant Slalom,,,
4,5,Alpine Skiing Men's Slalom,,,
