In [12]:
import os
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from sqlalchemy import create_engine, inspect

### Extract CSVs into DataFrames

In [2]:
# Read the Tokyo 2021 medal table; the path is relative to the notebook's
# working directory.
olympic_medal = "Tokyo 2021 dataset.csv"
olympic_medal_df = pd.read_csv(filepath_or_buffer=olympic_medal)

# Preview the first five rows to check that the file parsed as expected.
olympic_medal_df.head(5)

Unnamed: 0,Rank,Team/NOC,Gold Medal,Silver Medal,Bronze Medal,Total,Rank by Total,NOCCode
0,1,United States of America,39,41,33,113,1,USA
1,2,People's Republic of China,38,32,18,88,2,CHN
2,3,Japan,27,14,17,58,5,JPN
3,4,Great Britain,22,21,22,65,4,GBR
4,5,ROC,20,28,23,71,3,ROC


In [13]:
# Read the per-country height table; the path is relative to the notebook's
# working directory.
country_height = "Height of Male and Female by Country 2022.csv"
country_height_df = pd.read_csv(filepath_or_buffer=country_height)

# Preview the first five rows to check that the file parsed as expected.
country_height_df.head(5)

Unnamed: 0,Rank,Country Name,Male Height in Cm,Female Height in Cm,Male Height in Ft,Female Height in Ft
0,1,Netherlands,183.78,170.36,6.03,5.59
1,2,Montenegro,183.3,169.96,6.01,5.58
2,3,Estonia,182.79,168.66,6.0,5.53
3,4,Bosnia and Herzegovina,182.47,167.47,5.99,5.49
4,5,Iceland,182.1,168.91,5.97,5.54


### Transform Olympic DataFrame

In [15]:
# Source columns to keep from the raw medal table.
olympic_col = ["Team/NOC", "Gold Medal", "Silver Medal", "Bronze Medal", "Total"]

# Source header -> snake_case column name used from here on.
olympic_renames = {
    "Team/NOC": "country",
    "Gold Medal": "gold_count",
    "Silver Medal": "silver_count",
    "Bronze Medal": "bronze_count",
    "Total": "total_count",
}

# Select, rename, keep the first row per country, then index by country.
# Each step returns a new frame, so the raw `olympic_medal_df` is untouched.
olympic_transformed = (
    olympic_medal_df
    .loc[:, olympic_col]
    .rename(columns=olympic_renames)
    .drop_duplicates(subset="country")
    .set_index("country")
)

olympic_transformed.head()

Unnamed: 0_level_0,gold_count,silver_count,bronze_count,total_count
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
United States of America,39,41,33,113
People's Republic of China,38,32,18,88
Japan,27,14,17,58
Great Britain,22,21,22,65
ROC,20,28,23,71


In [16]:


# Move `country` out of the index and back into a regular column so its
# values can be edited in the next step.
#
# The guard keeps this cell repeatable: an unconditional reset_index on a
# frame that has already been reset would insert a spurious `index` column.
if "country" not in olympic_transformed.columns:
    olympic_transformed = olympic_transformed.reset_index()

olympic_transformed.head()

Unnamed: 0,country,gold_count,silver_count,bronze_count,total_count
0,United States of America,39,41,33,113
1,People's Republic of China,38,32,18,88
2,Japan,27,14,17,58
3,Great Britain,22,21,22,65
4,ROC,20,28,23,71


In [17]:
# Official team name as it appears in the medal table -> short country name.
# NOTE(review): presumably chosen to match the names used in the height
# table; confirm against that dataset.
country_renames = {
    "United States of America": "United States",
    "People's Republic of China": "China",
    "Great Britain": "United Kingdom",
    "ROC": "Russia",
    "Republic of Korea": "South Korea",
    "Islamic Republic of Iran": "Iran",
    "Chinese Taipei": "Taiwan",
    "Hong Kong, China": "Hong Kong",
    # The first spelling is a mis-encoded form of the second; both are mapped
    # so the row is normalised whichever form the data contains.
    "CÃ´te d'Ivoire": "Ivory Coast",
    "Côte d'Ivoire": "Ivory Coast",
    "Republic of Moldova": "Moldova",
    "Syrian Arab Republic": "Syria",
}

# This cell ends by moving `country` into the index, so on a re-run the
# column is no longer there. Restore it first so the cell can be executed
# repeatedly without raising KeyError.
if "country" not in olympic_transformed.columns:
    olympic_transformed = olympic_transformed.reset_index()

# Replace exact matches only, then index by the normalised country name.
olympic_transformed = (
    olympic_transformed
    .assign(country=lambda frame: frame["country"].replace(country_renames))
    .set_index("country")
)

olympic_transformed

Unnamed: 0_level_0,gold_count,silver_count,bronze_count,total_count
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
United States,39,41,33,113
China,38,32,18,88
Japan,27,14,17,58
United Kingdom,22,21,22,65
Russia,20,28,23,71
...,...,...,...,...
Ghana,0,0,1,1
Grenada,0,0,1,1
Kuwait,0,0,1,1
Moldova,0,0,1,1


### Transform Country DataFrame

In [18]:
# Source columns to keep from the raw height table (centimetre values only).
country_col = ["Country Name", "Male Height in Cm", "Female Height in Cm"]

# Source header -> snake_case column name used from here on.
height_renames = {
    "Country Name": "country",
    "Male Height in Cm": "male_height_cm",
    "Female Height in Cm": "female_height_cm",
}

# Select, rename, keep the first row per country, index by country, then add
# the midpoint of the male and female heights as `average_height_cm`.
country_transformed = (
    country_height_df
    .loc[:, country_col]
    .rename(columns=height_renames)
    .drop_duplicates(subset="country")
    .set_index("country")
    .assign(
        average_height_cm=lambda frame: (
            frame["male_height_cm"] + frame["female_height_cm"]
        ) / 2
    )
)

country_transformed.head()

Unnamed: 0_level_0,male_height_cm,female_height_cm,average_height_cm
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Netherlands,183.78,170.36,177.07
Montenegro,183.3,169.96,176.63
Estonia,182.79,168.66,175.725
Bosnia and Herzegovina,182.47,167.47,174.97
Iceland,182.1,168.91,175.505


### Create database connection

In [19]:
# Read the connection settings from the environment so that no credentials
# are stored in the notebook. The variable names follow the standard libpq
# ones; the non-secret settings default to the values this notebook used
# before.
db_user = os.environ.get("PGUSER", "postgres")
db_password = os.environ.get("PGPASSWORD")
db_host = os.environ.get("PGHOST", "localhost")
db_port = os.environ.get("PGPORT", "5432")
db_name = os.environ.get("PGDATABASE", "etl-project")

if db_password is None:
    raise RuntimeError(
        "Set the PGPASSWORD environment variable before running this cell."
    )

# URL-escape the user and password so characters such as '@' or '/' cannot
# break the connection URL.
connection_string = (
    f"{quote_plus(db_user)}:{quote_plus(db_password)}"
    f"@{db_host}:{db_port}/{db_name}"
)
engine = create_engine(f"postgresql://{connection_string}")

In [20]:
# Confirm that the target tables exist in the database.
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector is the supported way to list tables.
inspect(engine).get_table_names()

  engine.table_names()


['olympics_country', 'country_height']

### Load DataFrames into database

In [23]:
# Append the medal rows to the `olympics_country` table; the `country` index
# is written out as a column.
# NOTE(review): with if_exists="append", re-running this cell inserts the
# same rows again - confirm that is intended.
olympic_transformed.to_sql(
    name="olympics_country",
    con=engine,
    index=True,
    if_exists="append",
)

In [24]:
# Append the height rows to the `country_height` table; the `country` index
# is written out as a column.
# NOTE(review): with if_exists="append", re-running this cell inserts the
# same rows again - confirm that is intended.
country_transformed.to_sql(
    name="country_height",
    con=engine,
    index=True,
    if_exists="append",
)