In [1]:
import pandas as pd
from sqlalchemy import create_engine
from secret import username, password
import numpy as np

# Extract CSV into DataFrame

In [2]:
file = 'Resources/athlete_events.csv'
p_file = 'Resources/population_GDP.csv'

In [3]:
olympics_info = pd.read_csv(file)
olympics_info.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


In [4]:
population_data = pd.read_csv(p_file)
population_data.head()

Unnamed: 0,Country,Code,Population,GDP per Capita
0,Afghanistan,AFG,32526562.0,594.323081
1,Albania,ALB,2889167.0,3945.217582
2,Algeria,ALG,39666519.0,4206.031232
3,American Samoa*,ASA,55538.0,
4,Andorra,AND,70473.0,


# Transform DataFrame


#    1) Create "Sport" DataFrame and Clean

In [5]:
# Create a filtered dataframe "Sport" from specific columns
sports_cols = ["Sport", "Event"]
sport_df= olympics_info[sports_cols].copy()

# Rename the column headers
# sport_df = sport_df.rename(columns={"Sport": "sport", "Event": "event"})



sport_df.head()

Unnamed: 0,Sport,Event
0,Basketball,Basketball Men's Basketball
1,Judo,Judo Men's Extra-Lightweight
2,Football,Football Men's Football
3,Tug-Of-War,Tug-Of-War Men's Tug-Of-War
4,Speed Skating,Speed Skating Women's 500 metres


In [6]:
# Stats 
sport_df.count()

Sport    271116
Event    271116
dtype: int64

In [7]:
# Find duplicates, if any
duplicateSportDF = sport_df[sport_df.duplicated("Sport")]

print("Duplicate Rows except first occurrence based on all columns are :")
print(duplicateSportDF)
print(duplicateSportDF.count())

Duplicate Rows except first occurrence based on all columns are :
                Sport                                     Event
5       Speed Skating        Speed Skating Women's 1,000 metres
6       Speed Skating          Speed Skating Women's 500 metres
7       Speed Skating        Speed Skating Women's 1,000 metres
8       Speed Skating          Speed Skating Women's 500 metres
9       Speed Skating        Speed Skating Women's 1,000 metres
...               ...                                       ...
271111           Luge                Luge Mixed (Men)'s Doubles
271112    Ski Jumping  Ski Jumping Men's Large Hill, Individual
271113    Ski Jumping        Ski Jumping Men's Large Hill, Team
271114      Bobsleigh                      Bobsleigh Men's Four
271115      Bobsleigh                      Bobsleigh Men's Four

[271050 rows x 2 columns]
Sport    271050
Event    271050
dtype: int64


In [8]:
# Clean the data by dropping duplicates 
sport_table = sport_df.drop_duplicates("Sport")
sport_table.head()

Unnamed: 0,Sport,Event
0,Basketball,Basketball Men's Basketball
1,Judo,Judo Men's Extra-Lightweight
2,Football,Football Men's Football
3,Tug-Of-War,Tug-Of-War Men's Tug-Of-War
4,Speed Skating,Speed Skating Women's 500 metres


In [9]:
sport_table.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 66 entries, 0 to 214105
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Sport   66 non-null     object
 1   Event   66 non-null     object
dtypes: object(2)
memory usage: 1.5+ KB


In [10]:
# Sort Event ascending order
sport_table = sport_table.sort_values('Event')
sport_table

Unnamed: 0,Sport,Event
214105,Aeronautics,Aeronautics Mixed Aeronautics
59,Alpine Skiing,Alpine Skiing Men's Downhill
30323,Alpinism,Alpinism Mixed Alpinism
287,Archery,Archery Women's Individual
58,Art Competitions,"Art Competitions Mixed Sculpturing, Unknown Event"
...,...,...
3,Tug-Of-War,Tug-Of-War Men's Tug-Of-War
291,Volleyball,Volleyball Men's Volleyball
88,Water Polo,Water Polo Men's Water Polo
80,Weightlifting,Weightlifting Women's Super-Heavyweight


In [11]:
sport_table.insert(0, 'id', range(1, 1 + len(sport_table)))

sport_table.head()

Unnamed: 0,id,Sport,Event
214105,1,Aeronautics,Aeronautics Mixed Aeronautics
59,2,Alpine Skiing,Alpine Skiing Men's Downhill
30323,3,Alpinism,Alpinism Mixed Alpinism
287,4,Archery,Archery Women's Individual
58,5,Art Competitions,"Art Competitions Mixed Sculpturing, Unknown Event"


In [12]:
# Clean df stats
print(sport_table.count())

id       66
Sport    66
Event    66
dtype: int64


In [13]:
# Create a new filtered dataframe "Sport" from specific columns
sports_cols = ["Sport"]
sport_final = olympics_info[sports_cols].copy()

# Rename the column headers
sport_final = sport_final.rename(columns={"Sport": "sport"})

# Delete duplicates
sport_final = sport_final.drop_duplicates("sport")

sport_final.head()






Unnamed: 0,sport
0,Basketball
1,Judo
2,Football
3,Tug-Of-War
4,Speed Skating


In [14]:
# Sort Event ascending order
sport_final = sport_final.sort_values('sport')
sport_final

Unnamed: 0,sport
214105,Aeronautics
59,Alpine Skiing
30323,Alpinism
287,Archery
58,Art Competitions
...,...
3,Tug-Of-War
291,Volleyball
88,Water Polo
80,Weightlifting


In [15]:
sport_final.insert(0, 'id', range(1, 1 + len(sport_final)))

sport_final.head()

Unnamed: 0,id,sport
214105,1,Aeronautics
59,2,Alpine Skiing
30323,3,Alpinism
287,4,Archery
58,5,Art Competitions


In [16]:
sport_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 66 entries, 214105 to 81
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      66 non-null     int32 
 1   sport   66 non-null     object
dtypes: int32(1), object(1)
memory usage: 1.3+ KB


#   2) Create "Event" DataFrame and Clean

In [23]:
# Create a filtered dataframe "Events" from specific columns
event_df = pd.DataFrame(olympics_info, columns= ["Event", "Name", "City", "Year", "Season"].copy())

event_df.head()

Unnamed: 0,Event,Name,City,Year,Season
0,Basketball Men's Basketball,A Dijiang,Barcelona,1992,Summer
1,Judo Men's Extra-Lightweight,A Lamusi,London,2012,Summer
2,Football Men's Football,Gunnar Nielsen Aaby,Antwerpen,1920,Summer
3,Tug-Of-War Men's Tug-Of-War,Edgar Lindenau Aabye,Paris,1900,Summer
4,Speed Skating Women's 500 metres,Christine Jacoba Aaftink,Calgary,1988,Winter


In [24]:
# Stats
print(event_df.count())

Event     271116
Name      271116
City      271116
Year      271116
Season    271116
dtype: int64


In [25]:
# Find duplicates, if any based on the Event column
duplicateEventDF = event_df[event_df.duplicated(['Event'])]
sort_by_event = duplicateEventDF.sort_values('Event')

print("Duplicate Rows except first occurrence based the Event column are :")
print(sort_by_event.head())
print(sort_by_event.count())

Duplicate Rows except first occurrence based the Event column are :
                               Event                          Name  \
85467   Alpine Skiing Men's Combined               Jrg Grnenfelder   
70241   Alpine Skiing Men's Combined                Chad Fleischer   
58962   Alpine Skiing Men's Combined  Philippe Marie Eugne d'Ursel   
154995  Alpine Skiing Men's Combined  Robert Lloyd "Barney" McLean   
70243   Alpine Skiing Men's Combined                Chad Fleischer   

                City  Year  Season  
85467         Nagano  1998  Winter  
70241    Lillehammer  1994  Winter  
58962   Sankt Moritz  1948  Winter  
154995  Sankt Moritz  1948  Winter  
70243         Nagano  1998  Winter  
Event     270351
Name      270351
City      270351
Year      270351
Season    270351
dtype: int64


In [26]:
# Clean the data by dropping duplicates and setting the index
event_df.drop_duplicates("Event", inplace=True)

event_df.head()

Unnamed: 0,Event,Name,City,Year,Season
0,Basketball Men's Basketball,A Dijiang,Barcelona,1992,Summer
1,Judo Men's Extra-Lightweight,A Lamusi,London,2012,Summer
2,Football Men's Football,Gunnar Nielsen Aaby,Antwerpen,1920,Summer
3,Tug-Of-War Men's Tug-Of-War,Edgar Lindenau Aabye,Paris,1900,Summer
4,Speed Skating Women's 500 metres,Christine Jacoba Aaftink,Calgary,1988,Winter


In [27]:
event_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 765 entries, 0 to 214105
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Event   765 non-null    object
 1   Name    765 non-null    object
 2   City    765 non-null    object
 3   Year    765 non-null    int64 
 4   Season  765 non-null    object
dtypes: int64(1), object(4)
memory usage: 35.9+ KB


In [28]:
# Sort Event ascending order
event_df = event_df.sort_values('Event')
event_df

Unnamed: 0,Event,Name,City,Year,Season
214105,Aeronautics Mixed Aeronautics,Hermann Schreiber,Berlin,1936,Summer
67,Alpine Skiing Men's Combined,Kjetil Andr Aamodt,Lillehammer,1994,Winter
59,Alpine Skiing Men's Downhill,Kjetil Andr Aamodt,Albertville,1992,Winter
61,Alpine Skiing Men's Giant Slalom,Kjetil Andr Aamodt,Albertville,1992,Winter
62,Alpine Skiing Men's Slalom,Kjetil Andr Aamodt,Albertville,1992,Winter
...,...,...,...,...,...
10885,"Wrestling Women's Flyweight, Freestyle",Haley Ruth Augello,Rio de Janeiro,2016,Summer
1552,"Wrestling Women's Heavyweight, Freestyle",Yasemin Adar,Rio de Janeiro,2016,Summer
1285,"Wrestling Women's Light-Heavyweight, Freestyle",Mara Jos Acosta Acosta,Rio de Janeiro,2016,Summer
1667,"Wrestling Women's Lightweight, Freestyle",Aminat Oluwafunmilayo Adeniyi,Rio de Janeiro,2016,Summer


In [29]:
event_df.insert(0, 'id', range(1, 1 + len(event_df)))
# event_df.set_index("id", inplace=True)

event_df.head()

Unnamed: 0,id,Event,Name,City,Year,Season
214105,1,Aeronautics Mixed Aeronautics,Hermann Schreiber,Berlin,1936,Summer
67,2,Alpine Skiing Men's Combined,Kjetil Andr Aamodt,Lillehammer,1994,Winter
59,3,Alpine Skiing Men's Downhill,Kjetil Andr Aamodt,Albertville,1992,Winter
61,4,Alpine Skiing Men's Giant Slalom,Kjetil Andr Aamodt,Albertville,1992,Winter
62,5,Alpine Skiing Men's Slalom,Kjetil Andr Aamodt,Albertville,1992,Winter


# Create dataframe  for table Olympic Season

In [30]:
#Create dataframe from specific columns for table Olympic Season
oly_season_columns = ["Year", "City", "Season"]
new_oly_season_data_df = olympics_info[oly_season_columns].copy()

                                                      

#Clean the data by dropping duplicates and setting index
new_oly_season_data_df.drop_duplicates("Year", inplace=True)
# new_oly_season_data_df["id"]= new_oly_season_data_df.index+1
#new_oly_season_data_df.set_index("id", inplace=True)
new_oly_season_data_df.head()



Unnamed: 0,Year,City,Season
0,1992,Barcelona,Summer
1,2012,London,Summer
2,1920,Antwerpen,Summer
3,1900,Paris,Summer
4,1988,Calgary,Winter


In [31]:
new_oly_season_data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35 entries, 0 to 3079
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Year    35 non-null     int64 
 1   City    35 non-null     object
 2   Season  35 non-null     object
dtypes: int64(1), object(2)
memory usage: 1.1+ KB


In [32]:
new_oly_season_data_df.insert(0, 'id', range(1, 1 + len(new_oly_season_data_df)))

new_oly_season_data_df.head()

Unnamed: 0,id,Year,City,Season
0,1,1992,Barcelona,Summer
1,2,2012,London,Summer
2,3,1920,Antwerpen,Summer
3,4,1900,Paris,Summer
4,5,1988,Calgary,Winter


# Create dataframe for table Athlete

In [33]:
# Create dataframe from specific columns for table Athlete
athlete_columns = ["Name", "Sex", "NOC"]
new_athlete_data_df = olympics_info[athlete_columns].copy()



#Rename the columns headers
# new_athlete_data_df = new_athlete_data_df.rename(columns={"Name": "name",
#                                                           "Sex": "sex",
#                                                           "NOC":"country_code"})

new_athlete_data_df.drop_duplicates("Name", inplace=True)
    
# new_athlete_data_df.insert(0, 'id', range(1, 1 + len(new_athlete_data_df)))



new_athlete_data_df.head()


Unnamed: 0,Name,Sex,NOC
0,A Dijiang,M,CHN
1,A Lamusi,M,CHN
2,Gunnar Nielsen Aaby,M,DEN
3,Edgar Lindenau Aabye,M,DEN
4,Christine Jacoba Aaftink,F,NED


In [34]:
new_athlete_data_df.insert(0, 'id', range(1, 1 + len(new_athlete_data_df)))
# new_athlete_data_df.drop_duplicates(subset='name')

new_athlete_data_df

Unnamed: 0,id,Name,Sex,NOC
0,1,A Dijiang,M,CHN
1,2,A Lamusi,M,CHN
2,3,Gunnar Nielsen Aaby,M,DEN
3,4,Edgar Lindenau Aabye,M,DEN
4,5,Christine Jacoba Aaftink,F,NED
...,...,...,...,...
271108,134728,Aleksandr Viktorovich Zyuzin,M,RUS
271110,134729,Olga Igorevna Zyuzkova,F,BLR
271111,134730,Andrzej ya,M,POL
271112,134731,Piotr ya,M,POL


# Create country df

In [None]:
#create country df

country_pd = population_data[['Code','Country']]

country_rename = country_pd.rename(columns={'Code':'country_code', 'Country':'country'})

country_rename['id'] = ""

country_rename['id'] = country_rename['id'].index + 1

country = country_rename[['id','country_code','country']]

country.head()

#  Joins

In [35]:
event_athlete_df = pd.merge(event_df, new_athlete_data_df, how='inner', on='Name')
event_athlete_df


Unnamed: 0,id_x,Event,Name,City,Year,Season,id_y,Sex,NOC
0,1,Aeronautics Mixed Aeronautics,Hermann Schreiber,Berlin,1936,Summer,106890,M,SUI
1,2,Alpine Skiing Men's Combined,Kjetil Andr Aamodt,Lillehammer,1994,Winter,20,M,NOR
2,3,Alpine Skiing Men's Downhill,Kjetil Andr Aamodt,Albertville,1992,Winter,20,M,NOR
3,4,Alpine Skiing Men's Giant Slalom,Kjetil Andr Aamodt,Albertville,1992,Winter,20,M,NOR
4,5,Alpine Skiing Men's Slalom,Kjetil Andr Aamodt,Albertville,1992,Winter,20,M,NOR
...,...,...,...,...,...,...,...,...,...
760,761,"Wrestling Women's Flyweight, Freestyle",Haley Ruth Augello,Rio de Janeiro,2016,Summer,5926,F,USA
761,762,"Wrestling Women's Heavyweight, Freestyle",Yasemin Adar,Rio de Janeiro,2016,Summer,861,F,TUR
762,763,"Wrestling Women's Light-Heavyweight, Freestyle",Mara Jos Acosta Acosta,Rio de Janeiro,2016,Summer,719,F,VEN
763,764,"Wrestling Women's Lightweight, Freestyle",Aminat Oluwafunmilayo Adeniyi,Rio de Janeiro,2016,Summer,921,F,NGR


In [36]:
#Rename the columns headers
event_athlete_df = event_athlete_df.rename(columns={"id_x": "id", "id_y": "athlete_id"})
event_athlete_df

Unnamed: 0,id,Event,Name,City,Year,Season,athlete_id,Sex,NOC
0,1,Aeronautics Mixed Aeronautics,Hermann Schreiber,Berlin,1936,Summer,106890,M,SUI
1,2,Alpine Skiing Men's Combined,Kjetil Andr Aamodt,Lillehammer,1994,Winter,20,M,NOR
2,3,Alpine Skiing Men's Downhill,Kjetil Andr Aamodt,Albertville,1992,Winter,20,M,NOR
3,4,Alpine Skiing Men's Giant Slalom,Kjetil Andr Aamodt,Albertville,1992,Winter,20,M,NOR
4,5,Alpine Skiing Men's Slalom,Kjetil Andr Aamodt,Albertville,1992,Winter,20,M,NOR
...,...,...,...,...,...,...,...,...,...
760,761,"Wrestling Women's Flyweight, Freestyle",Haley Ruth Augello,Rio de Janeiro,2016,Summer,5926,F,USA
761,762,"Wrestling Women's Heavyweight, Freestyle",Yasemin Adar,Rio de Janeiro,2016,Summer,861,F,TUR
762,763,"Wrestling Women's Light-Heavyweight, Freestyle",Mara Jos Acosta Acosta,Rio de Janeiro,2016,Summer,719,F,VEN
763,764,"Wrestling Women's Lightweight, Freestyle",Aminat Oluwafunmilayo Adeniyi,Rio de Janeiro,2016,Summer,921,F,NGR


In [37]:
event_sport_df = pd.merge(event_athlete_df, sport_table, how='inner', on='Event')
event_sport_df

Unnamed: 0,id_x,Event,Name,City,Year,Season,athlete_id,Sex,NOC,id_y,Sport
0,1,Aeronautics Mixed Aeronautics,Hermann Schreiber,Berlin,1936,Summer,106890,M,SUI,1,Aeronautics
1,3,Alpine Skiing Men's Downhill,Kjetil Andr Aamodt,Albertville,1992,Winter,20,M,NOR,2,Alpine Skiing
2,12,Alpinism Mixed Alpinism,Charles Granville Bruce,Chamonix,1924,Winter,15615,M,GBR,3,Alpinism
3,39,Archery Women's Individual,Khadija Abbouda,Beijing,2008,Summer,153,F,MAR,4,Archery
4,69,"Art Competitions Mixed Sculpturing, Unknown Event",Win Valdemar Aaltonen,London,1948,Summer,19,M,FIN,5,Art Competitions
...,...,...,...,...,...,...,...,...,...,...,...
61,710,Tug-Of-War Men's Tug-Of-War,Edgar Lindenau Aabye,Paris,1900,Summer,4,M,DEN,62,Tug-Of-War
62,711,Volleyball Men's Volleyball,Mahmoud Abd El-Kader,Beijing,2008,Summer,155,M,EGY,63,Volleyball
63,713,Water Polo Men's Water Polo,Johan Aantjes,Los Angeles,1984,Summer,27,M,NED,64,Water Polo
64,735,Weightlifting Women's Super-Heavyweight,Andreea Aanei,Rio de Janeiro,2016,Summer,22,F,ROU,65,Weightlifting


In [38]:
event_sport_df = event_sport_df.rename(columns={"id_x": "id", "id_y": "sport_id"})
event_sport_df

Unnamed: 0,id,Event,Name,City,Year,Season,athlete_id,Sex,NOC,sport_id,Sport
0,1,Aeronautics Mixed Aeronautics,Hermann Schreiber,Berlin,1936,Summer,106890,M,SUI,1,Aeronautics
1,3,Alpine Skiing Men's Downhill,Kjetil Andr Aamodt,Albertville,1992,Winter,20,M,NOR,2,Alpine Skiing
2,12,Alpinism Mixed Alpinism,Charles Granville Bruce,Chamonix,1924,Winter,15615,M,GBR,3,Alpinism
3,39,Archery Women's Individual,Khadija Abbouda,Beijing,2008,Summer,153,F,MAR,4,Archery
4,69,"Art Competitions Mixed Sculpturing, Unknown Event",Win Valdemar Aaltonen,London,1948,Summer,19,M,FIN,5,Art Competitions
...,...,...,...,...,...,...,...,...,...,...,...
61,710,Tug-Of-War Men's Tug-Of-War,Edgar Lindenau Aabye,Paris,1900,Summer,4,M,DEN,62,Tug-Of-War
62,711,Volleyball Men's Volleyball,Mahmoud Abd El-Kader,Beijing,2008,Summer,155,M,EGY,63,Volleyball
63,713,Water Polo Men's Water Polo,Johan Aantjes,Los Angeles,1984,Summer,27,M,NED,64,Water Polo
64,735,Weightlifting Women's Super-Heavyweight,Andreea Aanei,Rio de Janeiro,2016,Summer,22,F,ROU,65,Weightlifting


In [39]:
event_olympics_season_df = pd.merge(event_sport_df, new_oly_season_data_df, how='inner', on='Year')
event_olympics_season_df.head()

Unnamed: 0,id_x,Event,Name,City_x,Year,Season_x,athlete_id,Sex,NOC,sport_id,Sport,id_y,City_y,Season_y
0,1,Aeronautics Mixed Aeronautics,Hermann Schreiber,Berlin,1936,Summer,106890,M,SUI,1,Aeronautics,27,Berlin,Summer
1,419,Modern Pentathlon Men's Individual,Silvano Abba,Berlin,1936,Summer,103,M,ITA,37,Modern Pentathlon,27,Berlin,Summer
2,561,"Shooting Men's Small-Bore Rifle, Prone, 50 metres",Hakon Aasns,Berlin,1936,Summer,45,M,NOR,48,Shooting,27,Berlin,Summer
3,3,Alpine Skiing Men's Downhill,Kjetil Andr Aamodt,Albertville,1992,Winter,20,M,NOR,2,Alpine Skiing,1,Barcelona,Summer
4,160,Basketball Men's Basketball,A Dijiang,Barcelona,1992,Summer,1,M,CHN,9,Basketball,1,Barcelona,Summer


In [40]:
#Rename the columns headers
event_table_final = event_olympics_season_df.rename(columns={"id_x": "id", "id_y": "olympic_season_id", "Event": "event_name"})
event_table_final.head()

Unnamed: 0,id,event_name,Name,City_x,Year,Season_x,athlete_id,Sex,NOC,sport_id,Sport,olympic_season_id,City_y,Season_y
0,1,Aeronautics Mixed Aeronautics,Hermann Schreiber,Berlin,1936,Summer,106890,M,SUI,1,Aeronautics,27,Berlin,Summer
1,419,Modern Pentathlon Men's Individual,Silvano Abba,Berlin,1936,Summer,103,M,ITA,37,Modern Pentathlon,27,Berlin,Summer
2,561,"Shooting Men's Small-Bore Rifle, Prone, 50 metres",Hakon Aasns,Berlin,1936,Summer,45,M,NOR,48,Shooting,27,Berlin,Summer
3,3,Alpine Skiing Men's Downhill,Kjetil Andr Aamodt,Albertville,1992,Winter,20,M,NOR,2,Alpine Skiing,1,Barcelona,Summer
4,160,Basketball Men's Basketball,A Dijiang,Barcelona,1992,Summer,1,M,CHN,9,Basketball,1,Barcelona,Summer


In [41]:
event_table_final.drop(['Name', 'City_x', 'Year', 'Season_x', 'Sex', 'NOC', 'Sport', 'City_y', 'Season_y'], axis=1, inplace=True)
event_table_final

Unnamed: 0,id,event_name,athlete_id,sport_id,olympic_season_id
0,1,Aeronautics Mixed Aeronautics,106890,1,27
1,419,Modern Pentathlon Men's Individual,103,37,27
2,561,"Shooting Men's Small-Bore Rifle, Prone, 50 metres",45,48,27
3,3,Alpine Skiing Men's Downhill,20,2,1
4,160,Basketball Men's Basketball,1,9,1
...,...,...,...,...,...
61,463,Rugby Sevens Women's Rugby Sevens,440,46,20
62,706,Trampolining Men's Individual,561,60,20
63,709,Triathlon Women's Olympic Distance,577,61,20
64,735,Weightlifting Women's Super-Heavyweight,22,65,20


# Scratchpad


In [None]:
joined_df = pd.merge(new_athlete_data_df,country, how='inner', on='country_code')
joined_df 

In [None]:
athlete_df = joined_df.drop(['country_code', 'country'], axis=1)
athlete_df = athlete_df.rename(columns={'id_x':'id', 'id_y':'nationality'})
athlete_df

In [None]:
event_joined_df = pd.merge(new_athlete_data_df,country, how='inner', on='country_code')
joined_df 

In [None]:
athlete_id = country_pd.rename(columns={'Code':'country_code', 'Country':'country'})

# Create database connection

In [42]:
connection_string = f'{username}:{password}@localhost:5432/olympics_db'
engine = create_engine(f'postgresql://{connection_string}')

In [43]:
# Confirm tables
engine.table_names()

['country',
 'athlete',
 'event',
 'sport',
 'olympic_season',
 'athlete_event',
 'country_stats']

# Load DataFrames into database

In [21]:
sport_final.to_sql(name='sport', con=engine, if_exists='append', index=False)

In [45]:
event_table_final.to_sql(name='event', con=engine, if_exists='append', index=False)

# Confirm data uploaded correctly

In [22]:
pd.read_sql_query('select * from sport', con=engine).head()

Unnamed: 0,id,sport
0,1,Aeronautics
1,2,Alpine Skiing
2,3,Alpinism
3,4,Archery
4,5,Art Competitions
