## Machine Learning - Linear Regression

In [1]:
import pandas as pd


# Data Pre-Processing

In [2]:
# Load the medal-count CSV (path is relative to the notebook's working directory).
df = pd.read_csv('Resources/summer_athlete_medals_count.csv')

# The 25-country filter in the next cell is commented out, so every row is kept.
# country_df is therefore just another name for df (no copy is made).
country_df = df

# Preview goes last: a notebook cell only renders its final expression, so a
# bare df.head(3) in the middle of the cell would be silently discarded.
df.head(3)

In [3]:
# NOTE(review): this whole cell is disabled, so country_df currently holds every
# country. Before re-enabling it, check 'Finlad' and 'Czech Repbulic' against the
# values in df.Country — they look like misspellings of 'Finland' and
# 'Czech Republic', and isin() would silently drop those countries if so.
# #  Filter for the following countries below:
# # 'USA', 'Russia', 'Germany', 'UK', 'France', 'Italy', 'China',
# # 'Australia', 'Sweden', 'Hungary', 'Japan', 'Finlad', 'Canada',
# # 'Romania', 'Netherlands', 'Poland', 'South Korea', 'Bulgaria',
# # 'Cuba', 'Denmark', 'Switzerland', 'Czech Repbulic', 'Belgium',
# # 'Norway', 'Greece'

# countries_25 = ['USA', 'Russia', 'Germany', 'UK', 'France', 'Italy', 'China',
# 'Australia', 'Sweden', 'Hungary', 'Japan', 'Finlad', 'Canada',
# 'Romania', 'Netherlands', 'Poland', 'South Korea', 'Bulgaria',
# 'Cuba', 'Denmark', 'Switzerland', 'Czech Repbulic', 'Belgium',
# 'Norway', 'Greece']

# country_df = df[df.Country.isin(countries_25)]
# country_df.head(3)

In [4]:
# Create test data: every row after the 2012 Games is held out from training.
# .copy() makes test_data an independent frame rather than a view of country_df.
test_data = country_df[country_df['Year'] > 2012].copy()

# Placeholder row for 2020 so the fitted models can be asked for a future
# prediction later on. No 'Medals' key is supplied, so that column is NaN for
# this row. name='0' becomes the row's index label once it is added.
new_row = pd.Series(data={'Year': 2020, 'Country': 'USA', 'Athletes': 0,
                          'Host': 0, 'Sports': 0, 'Events': 0,
                          'Gold': 0, 'Silver': 0, 'Bronze': 0}, name='0')

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# to_frame().T turns the Series into a one-row frame (index label '0') and
# infer_objects() restores integer dtypes so the numeric columns are not
# upcast to object when the frames are stacked.
test_data = pd.concat([test_data, new_row.to_frame().T.infer_objects()])

# Save into a csv file (the index, including the '0' label, is not written).
test_data.to_csv('Resources/test.csv', index=False)

In [5]:
# Training split: every row from the 2012 Games or earlier.
is_train_year = country_df['Year'] <= 2012
train_data = pd.DataFrame(country_df[is_train_year])

# Persist the split next to the source data; the row index is not written.
train_data.to_csv('Resources/train.csv', index=False)

In [6]:
#  View what columns actually exist
# NOTE: describe() summarises only the numeric columns of a mixed-type frame by
# default, so non-numeric columns such as Country do not appear in this list.
train_data.describe().columns

Index(['Year', 'Host', 'Athletes', 'Sports', 'Events', 'Gold', 'Silver',
       'Bronze', 'Medals'],
      dtype='object')

In [7]:
# Targets: one Series per medal column, all taken from the training frame.
y_train_medals, y_train_gold, y_train_silver, y_train_bronze = (
    train_data[target] for target in ("Medals", "Gold", "Silver", "Bronze")
)

# Features: the same five columns, in the same order, for both splits.
feature_columns = ['Year', 'Host', 'Athletes', 'Sports', 'Events']
X_train_features = train_data[feature_columns]
X_test_features = test_data[feature_columns]

# Array versions of the feature frames.
X_train = X_train_features.values
X_test = X_test_features.values


In [8]:
# # Transform the training and testing data using the X_scaler
# NOTE(review): this scaling step is disabled; the models in this notebook are
# fitted on the unscaled X_train / X_train_features.

# from sklearn.preprocessing import StandardScaler
# X_scaler = StandardScaler().fit(X_train)


# X_train_scaled = X_scaler.transform(X_train)
# X_test_scaled = X_scaler.transform(X_test)
# X_test_features_scaled = X_scaler.transform(X_train_features)
# NOTE(review): the line above stores the transformed *training* features under
# an X_test_* name — confirm which frame was intended before re-enabling.



# Create and Train (Fit) Model

In [9]:
 # Create a LinearRegression model and fit it to the training data

from sklearn.linear_model import LinearRegression

# One independent estimator per target, all sharing the same feature matrix.
model_medals, model_gold, model_silver, model_bronze = (
    LinearRegression() for _ in range(4)
)

for estimator, y_target in (
    (model_medals, y_train_medals),
    (model_gold, y_train_gold),
    (model_silver, y_train_silver),
    (model_bronze, y_train_bronze),
):
    estimator.fit(X_train, y_target)

# Final expression so the cell still displays the last fitted estimator.
model_bronze

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [10]:
# Report each model's score() on the data it was trained on
# (for LinearRegression this is the R^2 of the fit).
for label, fitted_model, y_true in (
    ("Medals", model_medals, y_train_medals),
    ("Gold", model_gold, y_train_gold),
    ("Silver", model_silver, y_train_silver),
    ("Bronze", model_bronze, y_train_bronze),
):
    print(f"Training Data Score for {label}: {fitted_model.score(X_train, y_true)}")


Training Data Score for Medals: 0.7500986803147793
Training Data Score for Gold: 0.6534392769900923
Training Data Score for Silver: 0.7380399872002958
Training Data Score for Bronze: 0.7567591319806977


# Hyperparameter Tuning - Ordinary Least Squares

In [11]:
# Source Link: https://www.datarobot.com/blog/ordinary-least-squares-in-python/
# Source Link: https://heartbeat.fritz.ai/implementing-ordinary-least-squares-ols-using-statsmodels-in-python-b1f4dee09419


import statsmodels.api as sm
import numpy as np

# Fit one statsmodels OLS regression per medal target on the training features.
# NOTE(review): sm.OLS does not add an intercept column on its own (see
# sm.add_constant), so these fits have no constant term, unlike the
# LinearRegression models fitted earlier, which fit an intercept by default.
# Confirm that is intended; adding a constant here would also require adding it
# to the matrix passed to predict().
ols, ols_gold, ols_silver, ols_bronze = (
    sm.OLS(y_target, X_train_features).fit()
    for y_target in (y_train_medals, y_train_gold, y_train_silver, y_train_bronze)
)


In [12]:
# model_details = ols.summary()
# print("Model Summary for Medals")
# print(model_details)

# model_details_gold = ols_gold.summary()
# print("Model Summary for Gold")
# print(model_details_gold)

# model_details_silver = ols_silver.summary()
# print("Model Summary for Silver")
# print(model_details_silver)

# model_details_bronze = ols_bronze.summary()
# print("Model Summary for Bronze")
# print(model_details_bronze)


# Make Predictions

In [13]:
# # Make predictions with model
# predictions = model.predict(X_test_scaled)

# Predict on the held-out feature matrix with each statsmodels OLS fit.
# Order matches the targets: total medals, gold, silver, bronze.
predictions, predictions_gold, predictions_silver, predictions_bronze = (
    fitted.predict(X_test) for fitted in (ols, ols_gold, ols_silver, ols_bronze)
)

In [14]:
# Total-medal predictions next to the year and country of each test row
output_medals = pd.DataFrame({
    'Year': test_data['Year'],
    'Country': test_data['Country'],
    'Medals Predictions': predictions,
})
# output_medals

In [15]:
# Gold-medal predictions next to the year and country of each test row
output_gold = pd.DataFrame({
    'Year': test_data['Year'],
    'Country': test_data['Country'],
    'Gold Predictions': predictions_gold,
})
# output_gold

In [16]:
# Silver-medal predictions next to the year and country of each test row
output_silver = pd.DataFrame({
    'Year': test_data['Year'],
    'Country': test_data['Country'],
    'Silver Predictions': predictions_silver,
})
# output_silver

In [17]:
# Bronze-medal predictions next to the year and country of each test row
output_bronze = pd.DataFrame({
    'Year': test_data['Year'],
    'Country': test_data['Country'],
    'Bronze Predictions': predictions_bronze,
})
# output_bronze

# Prediction table

In [18]:
# One table with the observed test columns followed by the rounded predictions
# for gold, silver, bronze and total medals.
observed_columns = ['Year', 'Country', 'Host', 'Athletes', 'Sports', 'Events',
                    'Gold', 'Silver', 'Bronze', 'Medals']

combined_columns = {column: test_data[column] for column in observed_columns}
combined_columns.update({
    'Gold Predictions': predictions_gold.round(),
    'Silver Predictions': predictions_silver.round(),
    'Bronze Predictions': predictions_bronze.round(),
    'Medals Predictions': predictions.round(),
})

combined_predictions_df = pd.DataFrame(combined_columns)

combined_predictions_df

Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals,Gold Predictions,Silver Predictions,Bronze Predictions,Medals Predictions
2584,2016,USA,0,555,33,245,46,37,38,121.0,30.0,28.0,28.0,87.0
2585,2016,China,0,427,32,211,26,18,26,70.0,20.0,19.0,20.0,59.0
2586,2016,UK,0,360,26,198,27,23,17,67.0,17.0,16.0,17.0,50.0
2587,2016,Russia,0,284,29,181,19,17,20,56.0,10.0,10.0,11.0,30.0
2588,2016,France,0,392,32,202,10,18,14,42.0,18.0,17.0,17.0,52.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2786,2016,"Virgin Islands, US",0,7,4,7,0,0,0,0.0,-0.0,-0.0,-0.0,-1.0
2787,2016,Yemen,0,3,3,3,0,0,0,0.0,-0.0,-0.0,-0.0,-0.0
2788,2016,Zambia,0,7,4,7,0,0,0,0.0,-0.0,-0.0,-0.0,-1.0
2789,2016,Zimbabwe,0,30,7,13,0,0,0,0.0,0.0,1.0,1.0,1.0


In [19]:
# Keep only the rows of the combined table whose Year is 2016
is_selected_year = combined_predictions_df['Year'] == 2016
year_predictions_df = combined_predictions_df[is_selected_year]
year_predictions_df

Unnamed: 0,Year,Country,Host,Athletes,Sports,Events,Gold,Silver,Bronze,Medals,Gold Predictions,Silver Predictions,Bronze Predictions,Medals Predictions
2584,2016,USA,0,555,33,245,46,37,38,121.0,30.0,28.0,28.0,87.0
2585,2016,China,0,427,32,211,26,18,26,70.0,20.0,19.0,20.0,59.0
2586,2016,UK,0,360,26,198,27,23,17,67.0,17.0,16.0,17.0,50.0
2587,2016,Russia,0,284,29,181,19,17,20,56.0,10.0,10.0,11.0,30.0
2588,2016,France,0,392,32,202,10,18,14,42.0,18.0,17.0,17.0,52.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2785,2016,"Virgin Islands, British",0,4,2,4,0,0,0,0.0,0.0,0.0,0.0,1.0
2786,2016,"Virgin Islands, US",0,7,4,7,0,0,0,0.0,-0.0,-0.0,-0.0,-1.0
2787,2016,Yemen,0,3,3,3,0,0,0,0.0,-0.0,-0.0,-0.0,-0.0
2788,2016,Zambia,0,7,4,7,0,0,0,0.0,-0.0,-0.0,-0.0,-1.0
