## Load Dependencies

In [2]:
import os
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine

## Extract

### Store CSVs into DataFrames

In [3]:
# Path to the raw athlete/event CSV (relative, so it is resolved against the
# working directory the notebook is run from).
athlete_events = "Resources/athlete_events.csv"

# Parse the CSV into a DataFrame and preview its first five rows.
athlete_events_df = pd.read_csv(filepath_or_buffer=athlete_events)
athlete_events_df.head(n=5)

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


## Transform

### Clean athlete events data 

In [8]:
# Replace every missing value, in every column, with 0 and rebind the name to
# the filled frame.
# NOTE(review): this also writes the integer 0 into the text column "Medal"
# (see the preview), so that column ends up mixing numbers and strings.
# Likewise a 0 in Age/Height/Weight means "unknown", not a real measurement.
athlete_events_df = athlete_events_df.fillna(value=0)
athlete_events_df.head(n=5)

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,0
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,0
2,3,Gunnar Nielsen Aaby,M,24.0,0.0,0.0,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,0
3,4,Edgar Lindenau Aabye,M,34.0,0.0,0.0,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,0


### Check Data Types

In [4]:
# Print each column's dtype and non-null count.
# NOTE(review): the saved output of this cell still reports missing values in
# Age/Height/Weight/Medal, i.e. it was captured (execution count 4) before the
# fillna cell (execution count 8) ran. Restart the kernel and run all cells to
# refresh it.
athlete_events_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271116 entries, 0 to 271115
Data columns (total 15 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   ID      271116 non-null  int64  
 1   Name    271116 non-null  object 
 2   Sex     271116 non-null  object 
 3   Age     261642 non-null  float64
 4   Height  210945 non-null  float64
 5   Weight  208241 non-null  float64
 6   Team    271116 non-null  object 
 7   NOC     271116 non-null  object 
 8   Games   271116 non-null  object 
 9   Year    271116 non-null  int64  
 10  Season  271116 non-null  object 
 11  City    271116 non-null  object 
 12  Sport   271116 non-null  object 
 13  Event   271116 non-null  object 
 14  Medal   39783 non-null   object 
dtypes: float64(3), int64(2), object(10)
memory usage: 31.0+ MB


### Convert Date to datetime

In [8]:
# Reference snippet only: `bitcoin_df` is not defined in the cells shown here,
# and the info() listing for athlete_events_df has no 'Date' column (only the
# integer 'Year'), so nothing is converted in this step.
# bitcoin_df['Date']=pd.to_datetime(bitcoin_df['Date']) For reference


### Convert float64 to int

In [10]:
# Cast the float measurement columns to integers.
# NOTE(review): astype(int) raises if a column still contains NaN, so this
# relies on the fillna(0) cell having been run first. Any fractional values
# would be truncated, not rounded.
numeric_columns = ['Age', 'Height', 'Weight']
athlete_events_df[numeric_columns] = athlete_events_df[numeric_columns].astype(int)


### Select DF Columns to Import


In [12]:
# Columns to keep for the database load, in output order.
# NOTE(review): "ID" and "Age" are not in this list even though "Age" was
# cleaned and cast above - confirm that dropping them is intended.
import_columns = [
    "Name", "Sex", "Height", "Weight", "Team", "NOC", "Games",
    "Year", "Season", "City", "Sport", "Event", "Medal",
]
athlete_events_df2 = athlete_events_df[import_columns]
athlete_events_df2.head(n=5)

Unnamed: 0,Name,Sex,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,A Dijiang,M,180,80,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,0
1,A Lamusi,M,170,60,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,0
2,Gunnar Nielsen Aaby,M,0,0,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,0
3,Edgar Lindenau Aabye,M,0,0,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,Christine Jacoba Aaftink,F,185,82,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,0


In [13]:
# Verify dtypes and non-null counts of the column subset before loading it.
athlete_events_df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271116 entries, 0 to 271115
Data columns (total 13 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   Name    271116 non-null  object
 1   Sex     271116 non-null  object
 2   Height  271116 non-null  int32 
 3   Weight  271116 non-null  int32 
 4   Team    271116 non-null  object
 5   NOC     271116 non-null  object
 6   Games   271116 non-null  object
 7   Year    271116 non-null  int64 
 8   Season  271116 non-null  object
 9   City    271116 non-null  object
 10  Sport   271116 non-null  object
 11  Event   271116 non-null  object
 12  Medal   271116 non-null  object
dtypes: int32(2), int64(1), object(10)
memory usage: 24.8+ MB


## Load

### Connect to local database

In [19]:
# Build the SQLAlchemy engine for the local Postgres database.
# NOTE: create_engine() only configures the engine; it does not create the
# database, so "Olympics_Project" must already exist on the server.
#
# Connection settings are read from environment variables so credentials do
# not have to be committed with the notebook. The fallbacks reproduce the
# original local-development values.
pg_user = os.environ.get("POSTGRES_USER", "postgres")
pg_password = os.environ.get("POSTGRES_PASSWORD", "postgres")
pg_host = os.environ.get("POSTGRES_HOST", "localhost")
pg_port = os.environ.get("POSTGRES_PORT", "5432")
pg_database = os.environ.get("POSTGRES_DB", "Olympics_Project")

# URL-escape user and password so characters such as "@" or "/" cannot break
# the connection URL.
rds_connection_string = (
    f"{quote_plus(pg_user)}:{quote_plus(pg_password)}"
    f"@{pg_host}:{pg_port}/{pg_database}"
)
engine = create_engine(f'postgresql://{rds_connection_string}')

### Check for tables

In [None]:
# Need to create SQL database first
# NOTE(review): Engine.table_names() is deprecated since SQLAlchemy 1.4 and
# removed in 2.0. The inspector equivalent is
#   sqlalchemy.inspect(engine).get_table_names()
#engine.table_names()

### Use pandas to load csv converted DataFrame into database

In [24]:
# Need to create SQL database first
# NOTE(review): with if_exists='append', every re-run of this cell inserts all
# rows again into an existing table. Use if_exists='replace' (or truncate the
# table first) if the load should be repeatable.
# athlete_events_df2.to_sql(name='Athlete_Events', con=engine, if_exists='append', index=False)


### Confirm data has been added by querying the tables

In [14]:
# Need to create SQL database first
# NOTE(review): PostgreSQL folds unquoted identifiers to lower case, so
# `select * from Athlete_Events` looks for a table named athlete_events.
# A table created by to_sql(name='Athlete_Events') keeps its mixed-case name,
# so the identifier is double-quoted in the query below.
# pd.read_sql_query('select * from "Athlete_Events"', con=engine).head()