## Linear Regression  to predict Olympic Medal count for top 25 countries
---

In [1]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt

### Top 25 countries that won most medals
---

In [2]:
top_df = pd.read_csv('top_25.csv')
print(top_df.shape)
top_df.head()

(25, 9)


Unnamed: 0,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals
0,USA,4,10362,565,4341,1035,802,707,2544
1,Russia,1,6323,404,3191,592,498,487,1577
2,Germany,2,8471,510,3766,442,457,490,1389
3,UK,3,7634,525,3665,278,316,298,892
4,France,2,7023,540,3479,233,255,282,770


In [3]:
# Assigning Id to each Country
top_df['CountryId'] = [(x+1) for x in range(25)]

print(top_df.shape)
top_df.head()

(25, 10)


Unnamed: 0,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals,CountryId
0,USA,4,10362,565,4341,1035,802,707,2544,1
1,Russia,1,6323,404,3191,592,498,487,1577,2
2,Germany,2,8471,510,3766,442,457,490,1389,3
3,UK,3,7634,525,3665,278,316,298,892,4
4,France,2,7023,540,3479,233,255,282,770,5


In [4]:
top_df = top_df[['CountryId','Country']]
print(top_df.shape)
top_df.head()

(25, 2)


Unnamed: 0,CountryId,Country
0,1,USA
1,2,Russia
2,3,Germany
3,4,UK
4,5,France


### Summer Olympic data with athlete, sports, events and medals count
---

In [5]:
df = pd.read_csv('summer_athlete_medals_count.csv')
print(df.shape)
df.head()

(2790, 10)


Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals
0,1896,Greece,1,102,9,39,10,17,17,44
1,1896,USA,0,14,3,16,11,6,2,19
2,1896,Germany,0,19,6,27,7,5,2,14
3,1896,France,0,12,6,18,5,4,2,11
4,1896,UK,0,10,7,19,3,3,3,9


In [6]:
# Adding country_id column to summer df
df = pd.merge(df,top_df, how = 'left', on = 'Country')
print(df.shape)
df.head()

(2790, 11)


Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals,CountryId
0,1896,Greece,1,102,9,39,10,17,17,44,25.0
1,1896,USA,0,14,3,16,11,6,2,19,1.0
2,1896,Germany,0,19,6,27,7,5,2,14,3.0
3,1896,France,0,12,6,18,5,4,2,11,5.0
4,1896,UK,0,10,7,19,3,3,3,9,4.0


In [7]:
# Dropping the countries that are not in top 25
df = df.dropna()
print(df.shape)
df.head()

(634, 11)


Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals,CountryId
0,1896,Greece,1,102,9,39,10,17,17,44,25.0
1,1896,USA,0,14,3,16,11,6,2,19,1.0
2,1896,Germany,0,19,6,27,7,5,2,14,3.0
3,1896,France,0,12,6,18,5,4,2,11,5.0
4,1896,UK,0,10,7,19,3,3,3,9,4.0


In [8]:
df['Athletes per sport'] = round(df['Athletes']/df['Sports'], 2)
print(df.shape)
df.head()

(634, 12)


Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals,CountryId,Athletes per sport
0,1896,Greece,1,102,9,39,10,17,17,44,25.0,11.33
1,1896,USA,0,14,3,16,11,6,2,19,1.0,4.67
2,1896,Germany,0,19,6,27,7,5,2,14,3.0,3.17
3,1896,France,0,12,6,18,5,4,2,11,5.0,2.0
4,1896,UK,0,10,7,19,3,3,3,9,4.0,1.43


In [9]:
# Bringing 'CountryId' column to the front
col = df.pop('CountryId')
df.insert(0,'CountryId', col)

print(df.shape)
df.head()

(634, 12)


Unnamed: 0,CountryId,Year,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals,Athletes per sport
0,25.0,1896,Greece,1,102,9,39,10,17,17,44,11.33
1,1.0,1896,USA,0,14,3,16,11,6,2,19,4.67
2,3.0,1896,Germany,0,19,6,27,7,5,2,14,3.17
3,5.0,1896,France,0,12,6,18,5,4,2,11,2.0
4,4.0,1896,UK,0,10,7,19,3,3,3,9,1.43


In [10]:
df = df.sort_values(['Year', 'Medals', 'Country'], ascending = [True, False, True])
print(df.shape)
df.head()

(634, 12)


Unnamed: 0,CountryId,Year,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals,Athletes per sport
0,25.0,1896,Greece,1,102,9,39,10,17,17,44,11.33
1,1.0,1896,USA,0,14,3,16,11,6,2,19,4.67
2,3.0,1896,Germany,0,19,6,27,7,5,2,14,3.17
3,5.0,1896,France,0,12,6,18,5,4,2,11,2.0
4,4.0,1896,UK,0,10,7,19,3,3,3,9,1.43


In [11]:
df = pd.get_dummies(df, columns = ['Country'])
print(df.shape)
df.head()

(634, 36)


Unnamed: 0,CountryId,Year,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals,...,Country_Netherlands,Country_Norway,Country_Poland,Country_Romania,Country_Russia,Country_South Korea,Country_Sweden,Country_Switzerland,Country_UK,Country_USA
0,25.0,1896,1,102,9,39,10,17,17,44,...,0,0,0,0,0,0,0,0,0,0
1,1.0,1896,0,14,3,16,11,6,2,19,...,0,0,0,0,0,0,0,0,0,1
2,3.0,1896,0,19,6,27,7,5,2,14,...,0,0,0,0,0,0,0,0,0,0
3,5.0,1896,0,12,6,18,5,4,2,11,...,0,0,0,0,0,0,0,0,0,0
4,4.0,1896,0,10,7,19,3,3,3,9,...,0,0,0,0,0,0,0,0,1,0


## Predicting medals for 2020
---

In [12]:
predict_year = 2020

### Train data
---

In [13]:
train_df = df[df['Year'] < predict_year]
print(train_df.shape)
train_df.head()

(634, 36)


Unnamed: 0,CountryId,Year,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals,...,Country_Netherlands,Country_Norway,Country_Poland,Country_Romania,Country_Russia,Country_South Korea,Country_Sweden,Country_Switzerland,Country_UK,Country_USA
0,25.0,1896,1,102,9,39,10,17,17,44,...,0,0,0,0,0,0,0,0,0,0
1,1.0,1896,0,14,3,16,11,6,2,19,...,0,0,0,0,0,0,0,0,0,1
2,3.0,1896,0,19,6,27,7,5,2,14,...,0,0,0,0,0,0,0,0,0,0
3,5.0,1896,0,12,6,18,5,4,2,11,...,0,0,0,0,0,0,0,0,0,0
4,4.0,1896,0,10,7,19,3,3,3,9,...,0,0,0,0,0,0,0,0,1,0


In [14]:
X = train_df.drop(['Gold', 'Silver', 'Bronze', 'Medals'], axis=1)

y1 = train_df['Gold'].values.reshape(-1, 1)
y2 = train_df['Silver'].values.reshape(-1, 1)
y3 = train_df['Bronze'].values.reshape(-1, 1)
y4 = train_df['Medals'].values.reshape(-1, 1)

print(X.shape, y1.shape, y2.shape, y3.shape, y4.shape)

(634, 32) (634, 1) (634, 1) (634, 1) (634, 1)


### 2020 Test data

#### Read athlete sport data for the year 2020

In [15]:
df_2020 = pd.read_csv('2020_athlete_sport_count.csv')
print(df_2020.shape)
df_2020.head()

(25, 7)


Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,CountryId
0,2020,USA,0,340,25,191,1
1,2020,Russia,0,169,18,82,2
2,2020,Germany,0,208,21,104,3
3,2020,UK,0,247,18,94,4
4,2020,France,0,220,21,114,5


In [16]:
df_2020['Athletes per sport'] = round(df_2020['Athletes']/df_2020['Sports'], 2)
print(df_2020.shape)
df_2020.head()

(25, 8)


Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,CountryId,Athletes per sport
0,2020,USA,0,340,25,191,1,13.6
1,2020,Russia,0,169,18,82,2,9.39
2,2020,Germany,0,208,21,104,3,9.9
3,2020,UK,0,247,18,94,4,13.72
4,2020,France,0,220,21,114,5,10.48


In [17]:
# Bringing 'CountryId' column to the front
col = df_2020.pop('CountryId')
df_2020.insert(0,'CountryId', col)

print(df_2020.shape)
df_2020.head()

(25, 8)


Unnamed: 0,CountryId,Year,Country,Host,Athletes,Sports,Events,Athletes per sport
0,1,2020,USA,0,340,25,191,13.6
1,2,2020,Russia,0,169,18,82,9.39
2,3,2020,Germany,0,208,21,104,9.9
3,4,2020,UK,0,247,18,94,13.72
4,5,2020,France,0,220,21,114,10.48


In [18]:
df_2020 = pd.get_dummies(df_2020, columns = ['Country'])
print(df_2020.shape)
df_2020.head()

(25, 32)


Unnamed: 0,CountryId,Year,Host,Athletes,Sports,Events,Athletes per sport,Country_Australia,Country_Belgium,Country_Bulgaria,...,Country_Netherlands,Country_Norway,Country_Poland,Country_Romania,Country_Russia,Country_South Korea,Country_Sweden,Country_Switzerland,Country_UK,Country_USA
0,1,2020,0,340,25,191,13.6,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2,2020,0,169,18,82,9.39,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,3,2020,0,208,21,104,9.9,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,2020,0,247,18,94,13.72,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,5,2020,0,220,21,114,10.48,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Test Train Split & Standard Scaler
---

In [19]:
# Use train_test_split to create training and testing data

from sklearn.model_selection import train_test_split

# Gold
X1_train, X1_test, y1_train, y1_test = train_test_split(X, y1, random_state=2)

# Silver
X2_train, X2_test, y2_train, y2_test = train_test_split(X, y2, random_state=2)

# Bronze
X3_train, X3_test, y3_train, y3_test = train_test_split(X, y3, random_state=2)

# Total Medals
X4_train, X4_test, y4_train, y4_test = train_test_split(X, y4, random_state=2)

In [20]:
print(X1_train.shape, y1_train.shape, X1_test.shape,  y1_test.shape)
print(X2_train.shape, y2_train.shape, X2_test.shape,  y2_test.shape)
print(X3_train.shape, y3_train.shape, X3_test.shape,  y3_test.shape)
print(X4_train.shape, y4_train.shape, X4_test.shape,  y4_test.shape)

(475, 32) (475, 1) (159, 32) (159, 1)
(475, 32) (475, 1) (159, 32) (159, 1)
(475, 32) (475, 1) (159, 32) (159, 1)
(475, 32) (475, 1) (159, 32) (159, 1)


## Linear Regression Model
---

In [21]:
from sklearn.linear_model import LinearRegression

### Gold
---

In [22]:
model1 = LinearRegression()

model1.fit(X1_train, y1_train)

training_score1 = model1.score(X1_train, y1_train)
testing_score1 = model1.score(X1_test, y1_test)

print('Gold Medals:')
print(f"Gold Training Score: {training_score1}")
print(f"Gold Testing Score: {testing_score1}")

Gold Medals:
Gold Training Score: 0.769302651944956
Gold Testing Score: 0.7513879821564176


### Silver
---

In [23]:
model2 = LinearRegression()

model2.fit(X2_train, y2_train)

training_score2 = model1.score(X2_train, y2_train)
testing_score2 = model1.score(X2_test, y2_test)

print('Silver Medals:')
print(f"Silver Training Score: {training_score2}")
print(f"Silver Testing Score: {testing_score2}")

Silver Medals:
Silver Training Score: 0.7426667887063532
Silver Testing Score: 0.7047764327585497


### Bronze
---

In [24]:
model3 = LinearRegression()

model3.fit(X3_train, y3_train)

training_score3 = model3.score(X3_train, y3_train)
testing_score3 = model3.score(X3_test, y3_test)

print('Bronze Medals:')
print(f"Bronze Training Score: {training_score3}")
print(f"Bronze Testing Score: {testing_score3}")

Bronze Medals:
Bronze Training Score: 0.8011844979516292
Bronze Testing Score: 0.719818759648077


### Total Medals
---

In [25]:
model4 = LinearRegression()

model4.fit(X4_train, y4_train)

training_score4 = model4.score(X4_train, y4_train)
testing_score4 = model4.score(X4_test, y4_test)

print('Total Medals Medals:')
print(f"Total Medals Training Score: {training_score4}")
print(f"Total Medals Testing Score: {testing_score4}")

Total Medals Medals:
Total Medals Training Score: 0.8372685664251086
Total Medals Testing Score: 0.7704399769927465


### Using the models to predict medals for 2020
---

In [26]:
test_data = df_2020

In [27]:
gold_predictions = model1.predict(test_data)
gold_predictions = np.ravel(gold_predictions)
gold_predictions = np.around(gold_predictions, decimals =0).astype(int)

gold_predictions

array([32, 19,  8,  6,  5,  7, 13, 12,  1,  3, 20,  1,  3,  0,  4,  1,  3,
       -1,  0, -1, -1, -2,  0,  0, -2])

In [28]:
silver_predictions = model2.predict(test_data)
silver_predictions = np.ravel(silver_predictions)
silver_predictions = np.around(silver_predictions, decimals =0).astype(int)

silver_predictions

array([24, 15, 10,  9,  6,  5,  8, 13,  2,  2, 16,  0,  5,  1,  6,  2,  4,
        1,  0,  1,  1, -2,  1,  0, -1])

In [29]:
bronze_predictions = model3.predict(test_data)
bronze_predictions = np.ravel(bronze_predictions)
bronze_predictions = np.around(bronze_predictions, decimals =0).astype(int)

bronze_predictions

array([21, 13, 11,  7,  7,  5,  8, 12,  2,  1, 14,  1,  5,  1,  5,  5,  3,
        0, -1,  0, -1, -2,  0, -1, -1])

In [30]:
total_medals_predictions = model4.predict(test_data)
total_medals_predictions = np.ravel(total_medals_predictions)
total_medals_predictions = np.around(total_medals_predictions, decimals =0).astype(int)

total_medals_predictions

array([76, 47, 29, 23, 18, 17, 29, 38,  5,  6, 50,  2, 13,  2, 15,  8,  9,
        0, -1, -1, -1, -6,  1, -1, -4])

### Creating a Dataframe to show all the predictions
---

In [31]:
top_df['Gold Predicted'] = gold_predictions
top_df['Silver Predicted'] = silver_predictions
top_df['Bronze Predicted'] = bronze_predictions

# Not using total_medals_predictions as the below option gave slightly better results
#top_df['Total Medals Predicted'] = total_medals_predictions

top_df['Total Medals Predicted'] = top_df['Gold Predicted'] + \
                                    top_df['Silver Predicted'] + \
                                     top_df['Silver Predicted']

top_df

Unnamed: 0,CountryId,Country,Gold Predicted,Silver Predicted,Bronze Predicted,Total Medals Predicted
0,1,USA,32,24,21,80
1,2,Russia,19,15,13,49
2,3,Germany,8,10,11,28
3,4,UK,6,9,7,24
4,5,France,5,6,7,17
5,6,Italy,7,5,5,17
6,7,China,13,8,8,29
7,8,Australia,12,13,12,38
8,9,Sweden,1,2,2,5
9,10,Hungary,3,2,1,7


In [32]:
# Rearranging the columns
top_df = top_df[['Country',\
                 'Gold Predicted',\
                 'Silver Predicted',\
                 'Bronze Predicted',\
                 #'Total Medals Predicted',\
                 'Total Medals Predicted' ]]

top_df

Unnamed: 0,Country,Gold Predicted,Silver Predicted,Bronze Predicted,Total Medals Predicted
0,USA,32,24,21,80
1,Russia,19,15,13,49
2,Germany,8,10,11,28
3,UK,6,9,7,24
4,France,5,6,7,17
5,Italy,7,5,5,17
6,China,13,8,8,29
7,Australia,12,13,12,38
8,Sweden,1,2,2,5
9,Hungary,3,2,1,7


In [33]:
top_df = top_df.sort_values(['Total Medals Predicted'], ascending = [False])\
                .reset_index(drop=True)
top_df

Unnamed: 0,Country,Gold Predicted,Silver Predicted,Bronze Predicted,Total Medals Predicted
0,USA,32,24,21,80
1,Japan,20,16,14,52
2,Russia,19,15,13,49
3,Australia,12,13,12,38
4,China,13,8,8,29
5,Germany,8,10,11,28
6,UK,6,9,7,24
7,France,5,6,7,17
8,Italy,7,5,5,17
9,Netherlands,4,6,5,16


In [34]:
top_df.to_csv('Predictions-2020.csv', index = False)

In [35]:
top_df['Gold Actual'] = ''
top_df['Silver Actual'] = ''
top_df['Bronze Actual'] = ''
top_df['Total Medals Actual'] = ''
top_df

Unnamed: 0,Country,Gold Predicted,Silver Predicted,Bronze Predicted,Total Medals Predicted,Gold Actual,Silver Actual,Bronze Actual,Total Medals Actual
0,USA,32,24,21,80,,,,
1,Japan,20,16,14,52,,,,
2,Russia,19,15,13,49,,,,
3,Australia,12,13,12,38,,,,
4,China,13,8,8,29,,,,
5,Germany,8,10,11,28,,,,
6,UK,6,9,7,24,,,,
7,France,5,6,7,17,,,,
8,Italy,7,5,5,17,,,,
9,Netherlands,4,6,5,16,,,,


In [36]:
# Rearranging the columns
top_df = top_df[['Country',\
                 'Gold Actual', 'Gold Predicted',\
                 'Silver Actual','Silver Predicted',\
                 'Bronze Actual', 'Bronze Predicted',\
                 'Total Medals Actual', 'Total Medals Predicted']]
top_df

Unnamed: 0,Country,Gold Actual,Gold Predicted,Silver Actual,Silver Predicted,Bronze Actual,Bronze Predicted,Total Medals Actual,Total Medals Predicted
0,USA,,32,,24,,21,,80
1,Japan,,20,,16,,14,,52
2,Russia,,19,,15,,13,,49
3,Australia,,12,,13,,12,,38
4,China,,13,,8,,8,,29
5,Germany,,8,,10,,11,,28
6,UK,,6,,9,,7,,24
7,France,,5,,6,,7,,17
8,Italy,,7,,5,,5,,17
9,Netherlands,,4,,6,,5,,16


In [37]:
top_df.to_csv('Predictions-2020-copy.csv', index = False)