# Machine Learning - Linear Regression

In [1]:
import pandas as pd


# Data Pre-Processing

In [2]:
# Read the summer medal-count table and keep only the rows whose Country is 'USA'.
country_df = (
    pd.read_csv('Resources/summer_athlete_medals_count.csv')
    .loc[lambda medals: medals['Country'] == 'USA']
)

# country_df

In [3]:
# Create test data: every row with Year after 1988 is held out from training.
test_data = country_df[country_df['Year'] > 1988]
test_data = pd.DataFrame(test_data)


# Add a 2020 row so the fitted models can produce a forward-looking prediction.
# Only the feature values are supplied; any other column of test_data
# (e.g. the medal counts) is filled with NaN for this row by the concat below.
new_row = pd.Series(data={'Year': 2020, 'Country': 'USA', 'Athletes': 340,
                          'Host': 0, 'Sports': 25, 'Events': 191}, name='0')

# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so the
# row is attached with pd.concat instead. The Series is turned into a one-row frame
# (its name '0' becomes the index label) and infer_objects() restores numeric dtypes
# that the mixed-type Series had collapsed to object.
test_data = pd.concat([test_data, new_row.to_frame().T.infer_objects()])
# test_data

# Save into a csv file
# test_data.to_csv('Resources/test.csv', index=False)

In [4]:
# Training split: rows with Year up to and including 1988.
is_training_year = country_df['Year'] <= 1988
train_data = pd.DataFrame(country_df.loc[is_training_year])
# train_data

# Save into a csv file
# train_data.to_csv('Resources/train.csv', index=False)

In [5]:
# Build the feature matrices and targets. Total Medals, Gold, Silver and Bronze
# each get their own target vector because each one has its own model.

feature_columns = ['Year', 'Host', 'Athletes', 'Sports', 'Events']

# Features: keep the labelled frame (used by statsmodels) and a plain array (used by sklearn).
X_train_features = train_data[feature_columns]
X_train = X_train_features.to_numpy()

X_test_features = test_data[feature_columns]
X_test = X_test_features.to_numpy()

# Targets, one per medal category.
y_train_medals, y_train_gold, y_train_silver, y_train_bronze = (
    train_data["Medals"],
    train_data["Gold"],
    train_data["Silver"],
    train_data["Bronze"],
)


# Create and Train(Fit) Model

In [6]:
 # Create a LinearRegression model and fit it to the training data for Total Medals, Gold, Silver, and Bronze

from sklearn.linear_model import LinearRegression

# One independent LinearRegression per target; fit() returns the estimator itself,
# so construction and fitting are chained.
model_medals = LinearRegression().fit(X_train, y_train_medals)   # Total Medals
model_gold = LinearRegression().fit(X_train, y_train_gold)       # Gold
model_silver = LinearRegression().fit(X_train, y_train_silver)   # Silver
model_bronze = LinearRegression().fit(X_train, y_train_bronze)   # Bronze

# Keep the last fitted estimator as the cell's final expression so its repr is displayed.
model_bronze

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [7]:
# Print each model's score on the data it was trained on (in-sample fit only).
for label, fitted_model, target in (
    ("Medals", model_medals, y_train_medals),
    ("Gold", model_gold, y_train_gold),
    ("Silver", model_silver, y_train_silver),
    ("Bronze", model_bronze, y_train_bronze),
):
    print(f"Training Data Score for {label}: {fitted_model.score(X_train, target)}")


Training Data Score for Medals: 0.8097440218000025
Training Data Score for Gold: 0.7992558308506773
Training Data Score for Silver: 0.8402738162822352
Training Data Score for Bronze: 0.7323432508920644


# Ordinary Least Squares with statsmodels (second fit of the same features; no hyperparameters are tuned)

In [8]:
# Source Link: https://www.datarobot.com/blog/ordinary-least-squares-in-python/
# Source Link: https://heartbeat.fritz.ai/implementing-ordinary-least-squares-ols-using-statsmodels-in-python-b1f4dee09419


import statsmodels.api as sm
import numpy as np

# Fit one statsmodels OLS model per target on the labelled training features.
# Order of the results: Total Medals, Gold, Silver, Bronze.
# NOTE(review): the features are passed as-is, with no constant column added, so
# these fits appear to have no intercept term (unlike the LinearRegression models
# fitted earlier) - confirm this is intended.
ols, ols_gold, ols_silver, ols_bronze = (
    sm.OLS(target, X_train_features).fit()
    for target in (y_train_medals, y_train_gold, y_train_silver, y_train_bronze)
)


# Make Predictions

In [9]:
# Predict on the held-out feature matrix with each fitted OLS model.
# Order of the results: Total Medals, Gold, Silver, Bronze.
predictions, predictions_gold, predictions_silver, predictions_bronze = (
    fitted.predict(X_test)
    for fitted in (ols, ols_gold, ols_silver, ols_bronze)
)

In [13]:
# # Model score

# # Source Link: https://stats.stackexchange.com/questions/311384/is-there-a-way-to-calculate-r-squared-in-ols-without-computing-the-coefficients
# # Given a multiple regression, there is no way to compute only R-squared while avoiding the bulk of the other computations.

# # Source Link: https://www.ibm.com/support/knowledgecenter/SSLVMB_24.0.0/spss/tutorials/curveest_modelsummary_virus.html
# # In OLS Model Summary: R, the multiple correlation coefficient, is the linear correlation between 
# # the observed and model-predicted values of the dependent variable. Its large value indicates a strong relationship.

# # Total Medals
# model_details = ols.summary()
# print("Model Summary for Medals")
# print(model_details)

# # Gold
# model_details_gold = ols_gold.summary()
# print("Model Summary for Gold")
# print(model_details_gold)

# # Silver
# model_details_silver = ols_silver.summary()
# print("Model Summary for Silver")
# print(model_details_silver)

# # Bronze
# model_details_bronze = ols_bronze.summary()
# print("Model Summary for Bronze")
# print(model_details_bronze)


In [11]:
# Each output frame pairs the Year/Country of the test rows with one set of predictions.
id_columns = {'Year': test_data['Year'], 'Country': test_data['Country']}

# Total medals
output_medals = pd.DataFrame({**id_columns, 'Medals Predictions': predictions})
# output_medals

# Gold
output_gold = pd.DataFrame({**id_columns, 'Gold Predictions': predictions_gold})
# output_gold

# Silver
output_silver = pd.DataFrame({**id_columns, 'Silver Predictions': predictions_silver})
# output_silver

# Bronze
output_bronze = pd.DataFrame({**id_columns, 'Bronze Predictions': predictions_bronze})
# output_bronze

# Prediction table

In [12]:
# Side-by-side table of the test rows: inputs, recorded medal counts, and the
# rounded predictions for Gold, Silver, Bronze and total Medals.

observed_columns = ['Year', 'Country', 'Host', 'Athletes', 'Sports', 'Events',
                    'Gold', 'Silver', 'Bronze', 'Medals']

rounded_predictions = {
    'Gold Predictions': predictions_gold.round(),
    'Silver Predictions': predictions_silver.round(),
    'Bronze Predictions': predictions_bronze.round(),
    'Medals Predictions': predictions.round(),
}

combined_predictions_df = pd.DataFrame({
    **{column: test_data[column] for column in observed_columns},
    **rounded_predictions,
})

combined_predictions_df

Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals,Gold Predictions,Silver Predictions,Bronze Predictions,Medals Predictions
1415,1992,USA,0,545,28,248,37.0,34.0,37.0,108.0,47.0,37.0,31.0,115.0
1582,1996,USA,1,648,31,263,44.0,32.0,25.0,101.0,86.0,65.0,37.0,187.0
1778,2000,USA,0,586,31,265,36.0,24.0,31.0,91.0,52.0,40.0,31.0,123.0
1977,2004,USA,0,533,31,254,36.0,39.0,26.0,101.0,53.0,37.0,24.0,115.0
2177,2008,USA,0,588,32,258,36.0,39.0,35.0,110.0,54.0,41.0,32.0,127.0
2380,2012,USA,0,529,31,245,46.0,28.0,29.0,103.0,54.0,37.0,25.0,116.0
2584,2016,USA,0,555,33,245,46.0,37.0,38.0,121.0,57.0,40.0,28.0,124.0
0,2020,USA,0,340,25,191,,,,,47.0,25.0,9.0,82.0


## Predicting for certain years

In [21]:
# Show the 1992 row of the combined prediction table
# (original note: trained data up to 1988).

is_selected_year = combined_predictions_df['Year'].eq(1992)
year_predictions_df = combined_predictions_df.loc[is_selected_year]
year_predictions_df

Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals,Gold Predictions,Silver Predictions,Bronze Predictions,Medals Predictions
1415,1992,USA,0,545,28,248,37,34,37,108.0,47.0,37.0,31.0,115.0


In [19]:
# Show the 1996 row of the combined prediction table.
# NOTE(review): the original note says "trained data up to 1992"; this cell does not
# set the training cutoff itself - confirm the split used before relying on the output.

is_selected_year = combined_predictions_df['Year'].eq(1996)
year_predictions_df = combined_predictions_df.loc[is_selected_year]
year_predictions_df

Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals,Gold Predictions,Silver Predictions,Bronze Predictions,Medals Predictions
1582,1996,USA,1,648,31,263,44,32,25,101.0,84.0,64.0,37.0,186.0


In [19]:
# Show the 2000 row of the combined prediction table.
# NOTE(review): the original note says "trained data up to 1996"; this cell does not
# set the training cutoff itself - confirm the split used before relying on the output.

is_selected_year = combined_predictions_df['Year'].eq(2000)
year_predictions_df = combined_predictions_df.loc[is_selected_year]
year_predictions_df

Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals,Gold Predictions,Silver Predictions,Bronze Predictions,Medals Predictions
1778,2000,USA,0,586,31,265,36,24,31,91.0,44.0,35.0,32.0,111.0


In [19]:
# Show the 2004 row of the combined prediction table.
# NOTE(review): the original note says "trained data up to 2000"; this cell does not
# set the training cutoff itself - confirm the split used before relying on the output.

is_selected_year = combined_predictions_df['Year'].eq(2004)
year_predictions_df = combined_predictions_df.loc[is_selected_year]
year_predictions_df

Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals,Gold Predictions,Silver Predictions,Bronze Predictions,Medals Predictions
1977,2004,USA,0,533,31,254,36,39,26,101.0,41.0,27.0,23.0,92.0


In [19]:
# Show the 2008 row of the combined prediction table.
# NOTE(review): the original note says "trained data up to 2004"; this cell does not
# set the training cutoff itself - confirm the split used before relying on the output.

is_selected_year = combined_predictions_df['Year'].eq(2008)
year_predictions_df = combined_predictions_df.loc[is_selected_year]
year_predictions_df

Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals,Gold Predictions,Silver Predictions,Bronze Predictions,Medals Predictions
2177,2008,USA,0,588,32,258,36,39,35,110.0,43.0,36.0,32.0,111.0


In [19]:
# Show the 2012 row of the combined prediction table.
# NOTE(review): the original note says "trained data up to 2008"; this cell does not
# set the training cutoff itself - confirm the split used before relying on the output.

is_selected_year = combined_predictions_df['Year'].eq(2012)
year_predictions_df = combined_predictions_df.loc[is_selected_year]
year_predictions_df

Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals,Gold Predictions,Silver Predictions,Bronze Predictions,Medals Predictions
2380,2012,USA,0,529,31,245,46,28,29,103.0,39.0,31.0,25.0,96.0


In [18]:
# Show the 2016 row of the combined prediction table.
# NOTE(review): the original note says "trained data up to 2012"; this cell does not
# set the training cutoff itself - confirm the split used before relying on the output.

is_selected_year = combined_predictions_df['Year'].eq(2016)
year_predictions_df = combined_predictions_df.loc[is_selected_year]
year_predictions_df

Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals,Gold Predictions,Silver Predictions,Bronze Predictions,Medals Predictions
2584,2016,USA,0,555,33,245,46.0,37.0,38.0,121.0,43.0,33.0,28.0,104.0


In [19]:
# Show the 2020 row of the combined prediction table (the manually added future row).
# NOTE(review): the original note says "trained data up to 2012", the same text as the
# 2016 lookup - possibly meant 2016. This cell does not set the training cutoff
# itself; confirm the split used before relying on the output.

is_selected_year = combined_predictions_df['Year'].eq(2020)
year_predictions_df = combined_predictions_df.loc[is_selected_year]
year_predictions_df

Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals,Gold Predictions,Silver Predictions,Bronze Predictions,Medals Predictions
0,2020,USA,0,340,25,191,,,,,34.0,17.0,8.0,59.0
