## Machine Learning - Linear Regression: Athlete, Sports, Events count

In [1]:
import pandas as pd


# Data Pre-Processing

In [2]:
# Load Data 

df = pd.read_csv('Resources/summer_athlete_medals_count.csv')
country_df = df


In [3]:
# Filtering for USA
country_df = df[df['Country'] == 'USA']


# #  Filter for the following countries below:
# # 'USA', 'Russia', 'Germany', 'UK', 'France', 'Italy', 'China',
# # 'Australia', 'Sweden', 'Hungary', 'Japan', 'Finlad', 'Canada',
# # 'Romania', 'Netherlands', 'Poland', 'South Korea', 'Bulgaria',
# # 'Cuba', 'Denmark', 'Switzerland', 'Czech Repbulic', 'Belgium',
# # 'Norway', 'Greece'

# countries_25 = ['USA', 'Russia', 'Germany', 'UK', 'France', 'Italy', 'China',
# 'Australia', 'Sweden', 'Hungary', 'Japan', 'Finlad', 'Canada',
# 'Romania', 'Netherlands', 'Poland', 'South Korea', 'Bulgaria',
# 'Cuba', 'Denmark', 'Switzerland', 'Czech Repbulic', 'Belgium',
# 'Norway', 'Greece']

# country_df = df[df.Country.isin(countries_25)]
# country_df.head(3)

In [4]:
#  Create test data
test_data = country_df[country_df['Year'] > 1988]
test_data = pd.DataFrame(test_data)

# Source Link for Athlete, Sport, Event count: https://en.wikipedia.org/wiki/United_States_at_the_2020_Summer_Olympics
# Adding 2020 to prep for future prediction after hyperparameter tuning and training
new_row = pd.Series(data={'Year':2020, 'Country':'USA', 'Athletes':0, 
                          'Host':0, 'Sports':0, 'Events':0, 'Medals':0,'Gold':0,'Silver':0, 'Bronze':0}, name='0')
test_data = test_data.append(new_row, ignore_index=False)
# test_data

# Save into a csv file
# test_data.to_csv('Resources/test.csv', index=False)

In [5]:
#  Create train data
train_data = country_df[country_df['Year'] <= 1988]
train_data = pd.DataFrame(train_data)
# train_data

# Save into a csv file
# train_data.to_csv('Resources/train.csv', index=False)

In [6]:
#  View what columns actually exist
train_data.columns

Index(['Year', 'Country', 'Host', 'Athletes', 'Sports', 'Events', 'Gold',
       'Silver', 'Bronze', 'Medals'],
      dtype='object')

In [7]:
# Assign X and y for test and training data
y_train_Athletes = train_data["Athletes"]
y_train_Sports = train_data["Sports"]
y_train_Events = train_data["Events"]



y_test_Athletes = test_data["Athletes"]
y_test_Sports = test_data["Sports"]
y_test_Events = test_data["Events"]



X_train_features = train_data[['Year','Host', 'Gold', 'Silver', 'Bronze', 'Medals']]
X_train = X_train_features.values


X_test_features = test_data[['Year','Host', 'Gold', 'Silver', 'Bronze', 'Medals']]
X_test = X_test_features.values


# Create and Train(Fit) Model

In [8]:
 # Create a LinearRegression model and fit it to the training data

from sklearn.linear_model import LinearRegression
model_Athletes = LinearRegression()
model_Athletes.fit(X_train, y_train_Athletes)

model_Sports = LinearRegression()
model_Sports.fit(X_train, y_train_Sports)

model_Events = LinearRegression()
model_Events.fit(X_train, y_train_Events)



LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [9]:
print(f"Training Data Score for Medals: {model_Athletes.score(X_train, y_train_Athletes)}")
print(f"Training Data Score for Gold: {model_Sports.score(X_train, y_train_Sports)}")
print(f"Training Data Score for Silver: {model_Events.score(X_train, y_train_Events)}")


print(" ")

print(f"R2 for Medals: {model_Athletes.score(X_test, y_test_Athletes)}")
print(f"R2 for Gold: {model_Sports.score(X_test, y_test_Sports)}")
print(f"R2 for Silver: {model_Events.score(X_test, y_test_Events)}")


Training Data Score for Medals: 0.8866958738378045
Training Data Score for Gold: 0.8098864608056964
Training Data Score for Silver: 0.968155731081799
 
R2 for Medals: 0.2671673576859327
R2 for Gold: 0.23285306438226383
R2 for Silver: -0.07971755664961666


# Hyperparameter Tuning - Ordinary Least Squares

In [10]:
# Source Link: https://www.datarobot.com/blog/ordinary-least-squares-in-python/
# Source Link: https://heartbeat.fritz.ai/implementing-ordinary-least-squares-ols-using-statsmodels-in-python-b1f4dee09419


import statsmodels.api as sm
import numpy as np

ols_Athletes = sm.OLS(y_train_Athletes, X_train_features).fit()

ols_Sports = sm.OLS(y_train_Sports, X_train_features).fit()
ols_Events = sm.OLS(y_train_Events, X_train_features).fit()



# Make Predictions

In [11]:
# # Make predictions with model
# predictions = model.predict(X_test_scaled)

# Make predictions with the hypertuned model
predictions_Athletes = ols_Athletes.predict(X_test)

predictions_Sports = ols_Sports.predict(X_test)
predictions_Events = ols_Events.predict(X_test)


In [12]:
# # Model score

# # Source Link: https://stats.stackexchange.com/questions/311384/is-there-a-way-to-calculate-r-squared-in-ols-without-computing-the-coefficients
# # Given a multiple regression, there is no way to compute R-squared while avoiding the bulk of the other computations.

# # Source Link: https://www.ibm.com/support/knowledgecenter/SSLVMB_24.0.0/spss/tutorials/curveest_modelsummary_virus.html
# # In OLS Model Summary: R, the multiple correlation coefficient, is the linear correlation between 
# # the observed and model-predicted values of the dependent variable. Its large value indicates a strong relationship.



# model_details_Athletes = ols_Athletes.summary()
# print("Model Summary for Athletes")
# print(model_details_Athletes)

# model_details_Sports = ols_Sports.summary()
# print("Model Summary for Sports")
# print(model_details_Sports)

# model_details_Events = ols_Events.summary()
# print("Model Summary for Events")
# print(model_details_Events)


In [13]:
# Predicting - Athletes
output_Athletes = pd.DataFrame({'Year': test_data.Year, 'Country': test_data.Country, 'Athletes Predictions': predictions_Athletes})
# output_Athletes

# Predicting - Sports
output_Sports = pd.DataFrame({'Year': test_data.Year, 'Country': test_data.Country, 'Sports Predictions': predictions_Sports})
# output_Sports

# Predicting - Events
output_Events = pd.DataFrame({'Year': test_data.Year, 'Country': test_data.Country, 'Events Predictions': predictions_Events})
# output_Events



# Prediction table

In [14]:
# Combining Medals, Gold, Silver, Bronze Predictions

combined_predictions_df = pd.DataFrame({'Year': test_data.Year, 'Country': test_data.Country, 'Host':test_data.Host,
                                        'Athletes': test_data.Athletes, 'Sports': test_data.Sports, 'Events': test_data.Events,
                                        'Athletes Predictions': predictions_Athletes.round(), 'Athletes Error': test_data.Athletes- predictions_Athletes.round(),
                                        'Sports Predictions': predictions_Sports.round(),'Sports Error': test_data.Sports- predictions_Sports.round(),
                                        'Events Predictions': predictions_Events.round(),'Events Error': test_data.Events- predictions_Events.round()})

combined_predictions_df

Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,Athletes Predictions,Athletes Error,Sports Predictions,Sports Error,Events Predictions,Events Error
1415,1992,USA,0,545,28,248,358.0,187.0,18.0,10.0,133.0,115.0
1582,1996,USA,1,648,31,263,275.0,373.0,10.0,21.0,46.0,217.0
1778,2000,USA,0,586,31,265,277.0,309.0,15.0,16.0,105.0,160.0
1977,2004,USA,0,533,31,254,425.0,108.0,23.0,8.0,186.0,68.0
2177,2008,USA,0,588,32,258,408.0,180.0,21.0,11.0,160.0,98.0
2380,2012,USA,0,529,31,245,330.0,199.0,19.0,12.0,145.0,100.0
2584,2016,USA,0,555,33,245,395.0,160.0,21.0,12.0,159.0,86.0
0,2020,USA,0,0,0,0,75.0,-75.0,6.0,-6.0,28.0,-28.0


## Predicting for certain years

In [15]:
#  Predicting for certain year (trained data up to 1988)

year_predictions_df = combined_predictions_df[combined_predictions_df['Year'] == 1992]
year_predictions_df

Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,Athletes Predictions,Athletes Error,Sports Predictions,Sports Error,Events Predictions,Events Error
1415,1992,USA,0,545,28,248,358.0,187.0,18.0,10.0,133.0,115.0


In [15]:
#  Predicting for certain year (trained data up to 1992)

year_predictions_df = combined_predictions_df[combined_predictions_df['Year'] == 1996]
year_predictions_df

Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,Athletes Predictions,Athletes Error,Sports Predictions,Sports Error,Events Predictions,Events Error
1582,1996,USA,1,648,31,263,257.0,391.0,9.0,22.0,35.0,228.0


In [15]:
#  Predicting for certain year (trained data up to 1996)

year_predictions_df = combined_predictions_df[combined_predictions_df['Year'] == 2000]
year_predictions_df

Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,Athletes Predictions,Athletes Error,Sports Predictions,Sports Error,Events Predictions,Events Error
1778,2000,USA,0,586,31,265,311.0,275.0,17.0,14.0,125.0,140.0


In [15]:
#  Predicting for certain year (trained data up to 2000)

year_predictions_df = combined_predictions_df[combined_predictions_df['Year'] == 2004]
year_predictions_df

Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,Athletes Predictions,Athletes Error,Sports Predictions,Sports Error,Events Predictions,Events Error
1977,2004,USA,0,533,31,254,353.0,180.0,19.0,12.0,147.0,107.0


In [15]:
#  Predicting for certain year (trained data up to 2004)

year_predictions_df = combined_predictions_df[combined_predictions_df['Year'] == 2008]
year_predictions_df

Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,Athletes Predictions,Athletes Error,Sports Predictions,Sports Error,Events Predictions,Events Error
2177,2008,USA,0,588,32,258,425.0,163.0,22.0,10.0,171.0,87.0


In [15]:
#  Predicting for certain year (trained data up to 2008)

year_predictions_df = combined_predictions_df[combined_predictions_df['Year'] == 2012]
year_predictions_df

Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,Athletes Predictions,Athletes Error,Sports Predictions,Sports Error,Events Predictions,Events Error
2380,2012,USA,0,529,31,245,359.0,170.0,21.0,10.0,160.0,85.0


In [15]:
#  Predicting for certain year (trained data up to 2012)

year_predictions_df = combined_predictions_df[combined_predictions_df['Year'] == 2016]
year_predictions_df

Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,Athletes Predictions,Athletes Error,Sports Predictions,Sports Error,Events Predictions,Events Error
2584,2016,USA,0,555,33,245,455.0,100.0,24.0,9.0,191.0,54.0


In [15]:
#  Predicting for certain year (trained data up to 2016)

year_predictions_df = combined_predictions_df[combined_predictions_df['Year'] == 2020]
year_predictions_df

Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,Athletes Predictions,Athletes Error,Sports Predictions,Sports Error,Events Predictions,Events Error
0,2020,USA,0,0,0,0,127.0,-127.0,9.0,-9.0,60.0,-60.0
