## Preprocess Data to perform Linear Regression
---

In [1]:
import pandas as pd

In [2]:
# Load the raw Olympics records from the CSV that sits next to the notebook.
olympics_data = pd.read_csv(filepath_or_buffer='olympics_data.csv')

# Report (rows, columns), then preview the first five rows as the cell output.
print(olympics_data.shape)
olympics_data.head(n=5)

(271116, 16)


Unnamed: 0,sno,ID,Name,Sex,Age,Height,Weight,Country,NOC,Games,Year,Season,City,Sport,Event,Medal
0,0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


### Summer Olympics Data
---

In [3]:
# Restrict the records to rows whose 'Season' is exactly 'Summer'.
summer_df = olympics_data.loc[olympics_data['Season'].eq('Summer')]

# Shape check followed by a five-row preview.
print(summer_df.shape)
summer_df.head(n=5)

(222552, 16)


Unnamed: 0,sno,ID,Name,Sex,Age,Height,Weight,Country,NOC,Games,Year,Season,City,Sport,Event,Medal
0,0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
26,26,8,"Cornelia ""Cor"" Aalten (-Strannood)",F,18.0,168.0,,Netherlands,NED,1932 Summer,1932,Summer,Los Angeles,Athletics,Athletics Women's 100 metres,


### Host Country Data
---

In [5]:
# Load the per-year host-country table and display it.
host_df = pd.read_csv(filepath_or_buffer='summer_host_country.csv')
host_df

Unnamed: 0,Year,Host_Country
0,1896,Greece
1,1900,France
2,1904,USA
3,1906,Greece
4,1908,UK
5,1912,Sweden
6,1920,Belgium
7,1924,France
8,1928,Netherlands
9,1932,USA


### Merge summer_df and host_df
---

In [6]:
# Attach the host country to every summer record by matching on 'Year'.
# A left join keeps every row of summer_df even when a year has no host entry.
olympicData = summer_df.merge(host_df, on='Year', how='left')

print(olympicData.shape)
olympicData.head(n=5)

(222552, 17)


Unnamed: 0,sno,ID,Name,Sex,Age,Height,Weight,Country,NOC,Games,Year,Season,City,Sport,Event,Medal,Host_Country
0,0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,,Spain
1,1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,,UK
2,2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,,Belgium
3,3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold,France
4,26,8,"Cornelia ""Cor"" Aalten (-Strannood)",F,18.0,168.0,,Netherlands,NED,1932 Summer,1932,Summer,Los Angeles,Athletics,Athletics Women's 100 metres,,USA


### Count of Athletes, Sports and Events
---

In [7]:
# Keep only the columns used to count athletes, sports and events.
athlete_df = olympicData.loc[
    :, ['Year', 'Host_Country', 'NOC', 'Country', 'Name', 'Sport', 'Event']
]

print(athlete_df.shape)
athlete_df.head(n=5)

(222552, 7)


Unnamed: 0,Year,Host_Country,NOC,Country,Name,Sport,Event
0,1992,Spain,CHN,China,A Dijiang,Basketball,Basketball Men's Basketball
1,2012,UK,CHN,China,A Lamusi,Judo,Judo Men's Extra-Lightweight
2,1920,Belgium,DEN,Denmark,Gunnar Nielsen Aaby,Football,Football Men's Football
3,1900,France,DEN,Denmark,Edgar Lindenau Aabye,Tug-Of-War,Tug-Of-War Men's Tug-Of-War
4,1932,USA,NED,Netherlands,"Cornelia ""Cor"" Aalten (-Strannood)",Athletics,Athletics Women's 100 metres


In [8]:
# Order rows by 'Year' and then 'Country' (both ascending) and renumber the index.
athlete_df = (
    athlete_df
    .sort_values(by=['Year', 'Country'], ascending=True)
    .reset_index(drop=True)
)

print(athlete_df.shape)
athlete_df.head(n=10)

(222552, 7)


Unnamed: 0,Year,Host_Country,NOC,Country,Name,Sport,Event
0,1896,Greece,AUS,Australia,"Edwin Harold ""Teddy"" Flack",Tennis,Tennis Men's Singles
1,1896,Greece,AUS,Australia,"Edwin Harold ""Teddy"" Flack",Tennis,Tennis Men's Doubles
2,1896,Greece,AUS,Australia,"Edwin Harold ""Teddy"" Flack",Athletics,Athletics Men's 800 metres
3,1896,Greece,AUS,Australia,"Edwin Harold ""Teddy"" Flack",Athletics,"Athletics Men's 1,500 metres"
4,1896,Greece,AUS,Australia,"Edwin Harold ""Teddy"" Flack",Athletics,Athletics Men's Marathon
5,1896,Greece,AUT,Austria,Otto Herschmann,Swimming,Swimming Men's 100 metres Freestyle
6,1896,Greece,AUT,Austria,Paul Neumann (-Newman),Swimming,Swimming Men's 500 metres Freestyle
7,1896,Greece,AUT,Austria,Paul Neumann (-Newman),Swimming,"Swimming Men's 1,200 metres Freestyle"
8,1896,Greece,AUT,Austria,Felix Adolf Schmal,Cycling,Cycling Men's 333 metres Time Trial
9,1896,Greece,AUT,Austria,Felix Adolf Schmal,Fencing,"Fencing Men's Sabre, Individual"


In [9]:
# Collapse to one row per (Year, Host_Country, Country, NOC), holding the number
# of distinct names, sports and events in each group.
# NOTE(review): athletes are counted by distinct 'Name'; two different athletes
# sharing a name within the same group would be counted once — confirm acceptable.
athlete_df = (
    athlete_df
    .groupby(by=['Year', 'Host_Country', 'Country', 'NOC'])[['Name', 'Sport', 'Event']]
    .nunique()
    .reset_index()
)

print(athlete_df.shape)
athlete_df.head(n=10)

(2810, 7)


Unnamed: 0,Year,Host_Country,Country,NOC,Name,Sport,Event
0,1896,Greece,Australia,AUS,1,2,5
1,1896,Greece,Austria,AUT,3,3,8
2,1896,Greece,Denmark,DEN,3,5,12
3,1896,Greece,France,FRA,12,6,18
4,1896,Greece,Germany,GER,19,6,27
5,1896,Greece,Greece,GRE,102,9,39
6,1896,Greece,Hungary,HUN,7,6,14
7,1896,Greece,Italy,ITA,1,1,1
8,1896,Greece,Sweden,SWE,1,2,5
9,1896,Greece,Switzerland,SUI,3,2,5


In [10]:
# The grouped columns now hold counts, so rename them to plural, count-style names.
athlete_df = (
    athlete_df
    .rename(columns={'Name': 'Athletes', 'Sport': 'Sports', 'Event': 'Events'})
    .reset_index(drop=True)
)

print(athlete_df.shape)
athlete_df.head(n=10)

(2810, 7)


Unnamed: 0,Year,Host_Country,Country,NOC,Athletes,Sports,Events
0,1896,Greece,Australia,AUS,1,2,5
1,1896,Greece,Austria,AUT,3,3,8
2,1896,Greece,Denmark,DEN,3,5,12
3,1896,Greece,France,FRA,12,6,18
4,1896,Greece,Germany,GER,19,6,27
5,1896,Greece,Greece,GRE,102,9,39
6,1896,Greece,Hungary,HUN,7,6,14
7,1896,Greece,Italy,ITA,1,1,1
8,1896,Greece,Sweden,SWE,1,2,5
9,1896,Greece,Switzerland,SUI,3,2,5


In [11]:
# Within each year, list countries from the largest to the smallest 'Athletes'
# count ('Year' ascending, 'Athletes' descending), then renumber the index.
athlete_df = (
    athlete_df
    .sort_values(by=['Year', 'Athletes'], ascending=[True, False])
    .reset_index(drop=True)
)

print(athlete_df.shape)
athlete_df.head(n=10)

(2810, 7)


Unnamed: 0,Year,Host_Country,Country,NOC,Athletes,Sports,Events
0,1896,Greece,Greece,GRE,102,9,39
1,1896,Greece,Germany,GER,19,6,27
2,1896,Greece,USA,USA,14,3,16
3,1896,Greece,France,FRA,12,6,18
4,1896,Greece,UK,GBR,10,7,19
5,1896,Greece,Hungary,HUN,7,6,14
6,1896,Greece,Austria,AUT,3,3,8
7,1896,Greece,Denmark,DEN,3,5,12
8,1896,Greece,Switzerland,SUI,3,2,5
9,1896,Greece,Australia,AUS,1,2,5


### Medal Count
---

In [12]:
# Keep only the columns needed to count medals per country per Games.
medals_df = olympicData[['Year', 'Host_Country', 'NOC', 'Country', 'Event', 'Medal']]

# Drop rows that carry no medal. The check is restricted to the 'Medal' column
# so that a medal row is not discarded merely because another column is missing.
medals_df = medals_df.dropna(subset=['Medal'])

# A team event produces one row per team member; keep a single row per
# (Country, Year, Event, Medal) so that a team medal is counted once.
# NOTE(review): this also collapses identical medals won by the same country in
# the same event (e.g. ties), and keys on 'Country' rather than 'NOC' — confirm
# this matches the intended counting rule.
medals_df = medals_df.drop_duplicates(['Country', 'Year', 'Event', 'Medal'])

# Rename 'Medal' to 'Medals' and renumber the index after the row drops.
medals_df = (
    medals_df
    .rename(columns={'Medal': 'Medals'})
    .reset_index(drop=True)
)

print(medals_df.shape)
medals_df.head(10)

(16052, 6)


Unnamed: 0,Year,Host_Country,NOC,Country,Event,Medals
0,1900,France,DEN,Denmark,Tug-Of-War Men's Tug-Of-War,Gold
1,1920,Belgium,FIN,Finland,Swimming Men's 200 metres Breaststroke,Bronze
2,1920,Belgium,FIN,Finland,Swimming Men's 400 metres Breaststroke,Bronze
3,1948,UK,FIN,Finland,Gymnastics Men's Individual All-Around,Bronze
4,1948,UK,FIN,Finland,Gymnastics Men's Team All-Around,Gold
5,1948,UK,FIN,Finland,Gymnastics Men's Horse Vault,Gold
6,1948,UK,FIN,Finland,Gymnastics Men's Pommelled Horse,Gold
7,1952,Finland,FIN,Finland,Gymnastics Men's Team All-Around,Bronze
8,2008,China,NOR,Norway,Handball Women's Handball,Gold
9,1920,Belgium,NOR,Norway,"Gymnastics Men's Team All-Around, Free System",Silver


In [13]:
# Drop 'Event' (no longer needed after de-duplication) by selecting the remaining
# columns, then order by 'Year' and 'Country' and renumber the index.
medals_df = (
    medals_df
    .loc[:, ['Year', 'Host_Country', 'NOC', 'Country', 'Medals']]
    .sort_values(by=['Year', 'Country'], ascending=True)
    .reset_index(drop=True)
)

print(medals_df.shape)
medals_df.head(n=10)

(16052, 5)


Unnamed: 0,Year,Host_Country,NOC,Country,Medals
0,1896,Greece,AUS,Australia,Bronze
1,1896,Greece,AUS,Australia,Gold
2,1896,Greece,AUS,Australia,Gold
3,1896,Greece,AUT,Austria,Silver
4,1896,Greece,AUT,Austria,Gold
5,1896,Greece,AUT,Austria,Bronze
6,1896,Greece,AUT,Austria,Bronze
7,1896,Greece,AUT,Austria,Gold
8,1896,Greece,DEN,Denmark,Silver
9,1896,Greece,DEN,Denmark,Gold


In [14]:
# Add one 0/1 indicator column per medal type (in the order Gold, Silver, Bronze):
# 1 where the row's 'Medals' value equals that type, 0 otherwise.
for medal_type in ('Gold', 'Silver', 'Bronze'):
    medals_df[medal_type] = medals_df['Medals'].eq(medal_type).astype('int64')

print(medals_df.shape)
medals_df.head(n=10)

(16052, 8)


Unnamed: 0,Year,Host_Country,NOC,Country,Medals,Gold,Silver,Bronze
0,1896,Greece,AUS,Australia,Bronze,0,0,1
1,1896,Greece,AUS,Australia,Gold,1,0,0
2,1896,Greece,AUS,Australia,Gold,1,0,0
3,1896,Greece,AUT,Austria,Silver,0,1,0
4,1896,Greece,AUT,Austria,Gold,1,0,0
5,1896,Greece,AUT,Austria,Bronze,0,0,1
6,1896,Greece,AUT,Austria,Bronze,0,0,1
7,1896,Greece,AUT,Austria,Gold,1,0,0
8,1896,Greece,DEN,Denmark,Silver,0,1,0
9,1896,Greece,DEN,Denmark,Gold,1,0,0


In [15]:
# The text 'Medals' column is now encoded by the indicator columns; remove it.
medals_df = medals_df.drop(columns=['Medals'])

print(medals_df.shape)
medals_df.head(n=10)

(16052, 7)


Unnamed: 0,Year,Host_Country,NOC,Country,Gold,Silver,Bronze
0,1896,Greece,AUS,Australia,0,0,1
1,1896,Greece,AUS,Australia,1,0,0
2,1896,Greece,AUS,Australia,1,0,0
3,1896,Greece,AUT,Austria,0,1,0
4,1896,Greece,AUT,Austria,1,0,0
5,1896,Greece,AUT,Austria,0,0,1
6,1896,Greece,AUT,Austria,0,0,1
7,1896,Greece,AUT,Austria,1,0,0
8,1896,Greece,DEN,Denmark,0,1,0
9,1896,Greece,DEN,Denmark,1,0,0


In [16]:
# Sum the remaining columns within each (Year, Host_Country, NOC, Country) group.
# sort=False keeps the groups in their order of first appearance.
medals_df = (
    medals_df
    .groupby(by=['Year', 'Host_Country', 'NOC', 'Country'], sort=False)
    .sum()
    .reset_index()
)

print(medals_df.shape)
medals_df.head(n=10)

(1275, 7)


Unnamed: 0,Year,Host_Country,NOC,Country,Gold,Silver,Bronze
0,1896,Greece,AUS,Australia,2,0,1
1,1896,Greece,AUT,Austria,2,1,2
2,1896,Greece,DEN,Denmark,1,2,3
3,1896,Greece,FRA,France,5,4,2
4,1896,Greece,GER,Germany,7,5,2
5,1896,Greece,GRE,Greece,10,17,17
6,1896,Greece,HUN,Hungary,2,1,3
7,1896,Greece,SUI,Switzerland,1,2,0
8,1896,Greece,GBR,UK,3,3,3
9,1896,Greece,USA,USA,11,6,2


In [17]:
# Overall medal tally per row: Gold + Silver + Bronze.
medals_df['Medals'] = (
    medals_df['Gold']
    .add(medals_df['Silver'])
    .add(medals_df['Bronze'])
)

print(medals_df.shape)
medals_df.head(n=10)

(1275, 8)


Unnamed: 0,Year,Host_Country,NOC,Country,Gold,Silver,Bronze,Medals
0,1896,Greece,AUS,Australia,2,0,1,3
1,1896,Greece,AUT,Austria,2,1,2,5
2,1896,Greece,DEN,Denmark,1,2,3,6
3,1896,Greece,FRA,France,5,4,2,11
4,1896,Greece,GER,Germany,7,5,2,14
5,1896,Greece,GRE,Greece,10,17,17,44
6,1896,Greece,HUN,Hungary,2,1,3,6
7,1896,Greece,SUI,Switzerland,1,2,0,3
8,1896,Greece,GBR,UK,3,3,3,9
9,1896,Greece,USA,USA,11,6,2,19


In [18]:
# Order by 'Year' (ascending), then 'Medals' (descending), with 'Country'
# (ascending) as the tie-breaker; renumber the index afterwards.
medals_df = (
    medals_df
    .sort_values(by=['Year', 'Medals', 'Country'], ascending=[True, False, True])
    .reset_index(drop=True)
)

print(medals_df.shape)
medals_df.head(n=10)

(1275, 8)


Unnamed: 0,Year,Host_Country,NOC,Country,Gold,Silver,Bronze,Medals
0,1896,Greece,GRE,Greece,10,17,17,44
1,1896,Greece,USA,USA,11,6,2,19
2,1896,Greece,GER,Germany,7,5,2,14
3,1896,Greece,FRA,France,5,4,2,11
4,1896,Greece,GBR,UK,3,3,3,9
5,1896,Greece,DEN,Denmark,1,2,3,6
6,1896,Greece,HUN,Hungary,2,1,3,6
7,1896,Greece,AUT,Austria,2,1,2,5
8,1896,Greece,AUS,Australia,2,0,1,3
9,1896,Greece,SUI,Switzerland,1,2,0,3


### Merging Athlete_count & Medals_count
---

In [19]:
# Left-join the medal tallies onto the athlete counts so that rows of athlete_df
# with no match in medals_df are kept. The join keys are spelled out instead of
# being inferred from whichever column names the two frames happen to share, and
# validate= makes the merge fail fast if either side has more than one row per
# key (which would otherwise silently duplicate rows).
final_df = athlete_df.merge(
    medals_df,
    how='left',
    on=['Year', 'Host_Country', 'Country', 'NOC'],
    validate='one_to_one',
)

# Rows without a match have NaN medal counts: treat them as zero medals. Only
# the medal columns are filled, so a NaN elsewhere is never masked as 0.
final_df = final_df.fillna({'Gold': 0, 'Silver': 0, 'Bronze': 0, 'Medals': 0})

# Order by 'Year' (ascending), 'Medals' (descending), then 'Country' (ascending).
final_df = final_df.sort_values(['Year', 'Medals', 'Country'], ascending=[True, False, True])

print(final_df.shape)
final_df.head(10)

(2810, 11)


Unnamed: 0,Year,Host_Country,Country,NOC,Athletes,Sports,Events,Gold,Silver,Bronze,Medals
0,1896,Greece,Greece,GRE,102,9,39,10.0,17.0,17.0,44.0
2,1896,Greece,USA,USA,14,3,16,11.0,6.0,2.0,19.0
1,1896,Greece,Germany,GER,19,6,27,7.0,5.0,2.0,14.0
3,1896,Greece,France,FRA,12,6,18,5.0,4.0,2.0,11.0
4,1896,Greece,UK,GBR,10,7,19,3.0,3.0,3.0,9.0
7,1896,Greece,Denmark,DEN,3,5,12,1.0,2.0,3.0,6.0
5,1896,Greece,Hungary,HUN,7,6,14,2.0,1.0,3.0,6.0
6,1896,Greece,Austria,AUT,3,3,8,2.0,1.0,2.0,5.0
9,1896,Greece,Australia,AUS,1,2,5,2.0,0.0,1.0,3.0
8,1896,Greece,Switzerland,SUI,3,2,5,1.0,2.0,0.0,3.0


In [20]:
# The preceding left merge + fill left the medal columns as floats; convert the
# four of them to integers in a single call.
final_df = final_df.astype({'Medals': int, 'Gold': int, 'Silver': int, 'Bronze': int})

print(final_df.shape)
final_df.head(n=10)

(2810, 11)


Unnamed: 0,Year,Host_Country,Country,NOC,Athletes,Sports,Events,Gold,Silver,Bronze,Medals
0,1896,Greece,Greece,GRE,102,9,39,10,17,17,44
2,1896,Greece,USA,USA,14,3,16,11,6,2,19
1,1896,Greece,Germany,GER,19,6,27,7,5,2,14
3,1896,Greece,France,FRA,12,6,18,5,4,2,11
4,1896,Greece,UK,GBR,10,7,19,3,3,3,9
7,1896,Greece,Denmark,DEN,3,5,12,1,2,3,6
5,1896,Greece,Hungary,HUN,7,6,14,2,1,3,6
6,1896,Greece,Austria,AUT,3,3,8,2,1,2,5
9,1896,Greece,Australia,AUS,1,2,5,2,0,1,3
8,1896,Greece,Switzerland,SUI,3,2,5,1,2,0,3


In [21]:
# Flag host nations: 'Host' is 1 where 'Country' equals 'Host_Country', else 0.
final_df['Host'] = final_df['Country'].eq(final_df['Host_Country']).astype('int64')

print(final_df.shape)
final_df.head(n=10)

(2810, 12)


Unnamed: 0,Year,Host_Country,Country,NOC,Athletes,Sports,Events,Gold,Silver,Bronze,Medals,Host
0,1896,Greece,Greece,GRE,102,9,39,10,17,17,44,1
2,1896,Greece,USA,USA,14,3,16,11,6,2,19,0
1,1896,Greece,Germany,GER,19,6,27,7,5,2,14,0
3,1896,Greece,France,FRA,12,6,18,5,4,2,11,0
4,1896,Greece,UK,GBR,10,7,19,3,3,3,9,0
7,1896,Greece,Denmark,DEN,3,5,12,1,2,3,6,0
5,1896,Greece,Hungary,HUN,7,6,14,2,1,3,6,0
6,1896,Greece,Austria,AUT,3,3,8,2,1,2,5,0
9,1896,Greece,Australia,AUS,1,2,5,2,0,1,3,0
8,1896,Greece,Switzerland,SUI,3,2,5,1,2,0,3,0


In [22]:
# Keep the ten columns written to the output file, in this order.
final_df = final_df.loc[
    :,
    [
        'Year', 'Country', 'Host',
        'Athletes', 'Sports', 'Events',
        'Gold', 'Silver', 'Bronze', 'Medals',
    ],
]

print(final_df.shape)
final_df.head(n=10)

(2810, 10)


Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals
0,1896,Greece,1,102,9,39,10,17,17,44
2,1896,USA,0,14,3,16,11,6,2,19
1,1896,Germany,0,19,6,27,7,5,2,14
3,1896,France,0,12,6,18,5,4,2,11
4,1896,UK,0,10,7,19,3,3,3,9
7,1896,Denmark,0,3,5,12,1,2,3,6
5,1896,Hungary,0,7,6,14,2,1,3,6
6,1896,Austria,0,3,3,8,2,1,2,5
9,1896,Australia,0,1,2,5,2,0,1,3
8,1896,Switzerland,0,3,2,5,1,2,0,3


In [23]:
# Write the final table to CSV without the DataFrame index.
final_df.to_csv(path_or_buf='summer_athlete_medals_count.csv', index=False)