Note: Machine Learning - Linear Regression - Currently Training for Total

In [4]:
import pandas as pd


# Data Pre-Processing

In [5]:
# Load the summer Olympics athlete/medal counts from the Resources folder
medals_csv_path = 'Resources/summer_athlete_medals_count.csv'
df = pd.read_csv(medals_csv_path)

# df.head(3)

In [6]:
# Keep only the rows whose Country column equals 'USA'
is_usa_row = df['Country'] == 'USA'
country_df = df[is_usa_row]
# country_df.head(3)

In [7]:
#  Create test data: every Games after the 1988 training cutoff.
#  .copy() replaces the redundant pd.DataFrame(...) re-wrap of the filtered frame.
test_data = country_df[country_df['Year'] > 1988].copy()

# Dropping Total Medal count to prep for predictions
# test_data = test_data.drop(["Total_Medals","Gold","Silver","Bronze"], axis=1)
# test_data = test_data.drop(["Medals"], axis=1)


# Adding 2020 to prep for future prediction after hyperparameter tuning and training.
# Every numeric value below is a zero placeholder; no 'Medals' value is supplied,
# so that column is NaN for this row.
new_row = pd.Series(data={'Year':2020, 'Country':'USA', 'Athletes':0,
                          'Host':0, 'Sports':0, 'Events':0, 'Gold':0,'Silver':0, 'Bronze':0}, name='0')

# DataFrame.append was removed in pandas 2.0, so build a one-row frame and use
# pd.concat instead. Going through a dict lets pandas infer numeric dtypes for
# the new row, and index=[new_row.name] keeps the '0' label that
# append(..., ignore_index=False) used to produce.
new_row_frame = pd.DataFrame([new_row.to_dict()], index=[new_row.name])
test_data = pd.concat([test_data, new_row_frame])
# test_data

#  Save into a csv file
test_data.to_csv('Resources/test.csv', index=False)

In [8]:
#  Create train data: Games up to and including 1988
is_training_year = country_df['Year'] <= 1988
train_data = pd.DataFrame(country_df[is_training_year])
# train_data
#  Save into a csv file (row index labels are not written)
train_data.to_csv(path_or_buf='Resources/train.csv', index=False)

In [9]:
#  List the columns that describe() summarises (the numeric ones; 'Country' is absent)
numeric_summary = train_data.describe()
numeric_summary.columns

Index(['Year', 'Host', 'Athletes', 'Sports', 'Events', 'Gold', 'Silver',
       'Bronze', 'Medals'],
      dtype='object')

In [10]:
# Assign X and y for test and training data.
# The same five predictor columns are selected from both frames.
feature_columns = ['Year', 'Host', 'Athletes', 'Sports', 'Events']

# Training target and predictors
y_train = train_data["Medals"]
X_train_features = train_data[feature_columns]
X_train = X_train_features.values  # predictors as a plain NumPy array

# Test predictors (no y_test is defined in this cell)
X_test_features = test_data[feature_columns]
X_test = X_test_features.values


In [11]:
# Feature scaling is currently disabled: every line in this cell is commented out,
# so X_scaler, X_train_scaled and X_test_scaled are never defined.
# # Transform the training and testing data using the X_scaler

# from sklearn.preprocessing import StandardScaler
# X_scaler = StandardScaler().fit(X_train)


# X_train_scaled = X_scaler.transform(X_train)
# X_test_scaled = X_scaler.transform(X_test)
# NOTE(review): the next line names its result X_test_features_scaled but transforms
# X_train_features - confirm which frame was intended before re-enabling this cell.
# X_test_features_scaled = X_scaler.transform(X_train_features)



# Create and Train(Fit) Model

In [12]:
 # Create a LinearRegression model and fit it to the training data

from sklearn.linear_model import LinearRegression
# Baseline: scikit-learn linear regression with its default settings
model = LinearRegression()
# Kept as the cell's last expression so the fitted estimator's repr is displayed
model.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [13]:
# score() on a scikit-learn regressor returns R^2, evaluated here on the training rows
training_r2 = model.score(X_train, y_train)
print(f"Training Data Score: {training_r2}")


Training Data Score: 0.8097440218000025


# Hyperparameter Tuning - Ordinary Least Squares

In [14]:
# Source Link: https://www.datarobot.com/blog/ordinary-least-squares-in-python/
# Source Link: https://heartbeat.fritz.ai/implementing-ordinary-least-squares-ols-using-statsmodels-in-python-b1f4dee09419


import statsmodels.api as sm
import numpy as np

# Fit ordinary least squares with statsmodels on the same training target and predictors.
# NOTE(review): no constant column is added to the predictors, so this model has no
# intercept and its summary reports an *uncentered* R-squared. That figure is not
# directly comparable to the score of the scikit-learn model, which fits an intercept.
ols_model = sm.OLS(endog=y_train, exog=X_train_features)
ols = ols_model.fit()


In [15]:
# Build the statsmodels regression summary and emit it as plain text
model_details = ols.summary()
print(str(model_details))

                                 OLS Regression Results                                
Dep. Variable:                 Medals   R-squared (uncentered):                   0.953
Model:                            OLS   Adj. R-squared (uncentered):              0.939
Method:                 Least Squares   F-statistic:                              65.60
Date:                Sat, 01 Aug 2020   Prob (F-statistic):                    4.34e-10
Time:                        05:28:59   Log-Likelihood:                         -93.749
No. Observations:                  21   AIC:                                      197.5
Df Residuals:                      16   BIC:                                      202.7
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

# Make Predictions

In [18]:
# # Make predictions with model
# predictions = model.predict(X_test_scaled)

# Predict with the fitted statsmodels OLS results, using the unscaled test predictors
predictions = ols.predict(exog=X_test)

In [19]:
# predictions

In [20]:
# Predicting - currently looking at 1988 only - displaying all years for prepping future predictions

# Pair each test row's Year with its predicted medal total
prediction_columns = {
    'Year': test_data['Year'],
    'Total_Medals_Predictions': predictions,
}
output = pd.DataFrame(prediction_columns)
output

Unnamed: 0,Year,Total_Medals_Predictions
1415,1992,114.621085
1582,1996,186.978359
1778,2000,123.175725
1977,2004,114.773688
2177,2008,126.756469
2380,2012,116.24481
2584,2016,124.445836
0,2020,23.829327


In [22]:
#  Creating actual column: one hand-entered total per row of output, in row order
actual_data = [
    108,  # 1992
    101,  # 1996
    91,   # 2000
    101,  # 2004
    110,  # 2008
    103,  # 2012
    121,  # 2016
    0,    # 2020 placeholder row
]

output['Actual'] = actual_data
output

Unnamed: 0,Year,Total_Medals_Predictions,Actual
1415,1992,114.621085,108
1582,1996,186.978359,101
1778,2000,123.175725,91
1977,2004,114.773688,101
2177,2008,126.756469,110
2380,2012,116.24481,103
2584,2016,124.445836,121
0,2020,23.829327,0


In [23]:
#  Numbers for 1988 - predicting 1992
# Signed error: prediction minus actual, so positive values are over-predictions
predicted_totals = output['Total_Medals_Predictions'].values
actual_totals = output['Actual'].values
error_data = predicted_totals - actual_totals
output['Error'] = error_data
# output_1988 is a second name for the same DataFrame object, not a copy
output_1988 = output
output_1988

# Accuracy before OLS: 0.809
# Accuracy after OLS: 0.953

Unnamed: 0,Year,Total_Medals_Predictions,Actual,Error
1415,1992,114.621085,108,6.621085
1582,1996,186.978359,101,85.978359
1778,2000,123.175725,91,32.175725
1977,2004,114.773688,101,13.773688
2177,2008,126.756469,110,16.756469
2380,2012,116.24481,103,13.24481
2584,2016,124.445836,121,3.445836
0,2020,23.829327,0,23.829327


In [60]:
# #  Numbers for 1992 - predicting 1996
# error_data = output.Total_Medals_Predictions.values - output.Actual.values
# output['Error'] = error_data
# output_1996 = output
# output_1996

# # Accuracy before OLS: 0.804
# # Accuracy after OLS:  0.956

Unnamed: 0,Year,Total_Medals_Predictions,Actual,Error
0,1996,186.10716,101,85.10716
1,2000,121.450976,91,30.450976
2,2004,113.279343,101,12.279343
3,2008,125.108816,110,15.108816
4,2012,114.828185,103,11.828185
5,2016,123.017476,121,2.017476
6,2020,24.69597,0,24.69597


In [15]:
# #  Numbers for 1996 - predicting 2000
# error_data = output.Total_Medals_Predictions.values - output.Actual.values
# output['Error'] = error_data
# output_2000 = output
# output_2000

# # Accuracy before OLS: 0.809
# # Accuracy after OLS: 0.953

Unnamed: 0,Year,Total_Medals_Predictions,Actual,Error
0,2000,111.430139,91,20.430139
1,2004,96.189596,101,-4.810404
2,2008,113.718964,110,3.718964
3,2012,98.448751,103,-4.551249
4,2016,105.384901,121,-15.615099
5,2020,42.93765,0,42.93765


In [30]:
# #  Numbers for 2000 - predicting 2004
# error_data = output.Total_Medals_Predictions.values - output.Actual.values
# output['Error'] = error_data
# output_2004 = output
# output_2004

# # Accuracy before OLS: 0.695
# # Accuracy after OLS: 0.941

Unnamed: 0,Year,Total_Medals_Predictions,Actual,Error
0,2004,91.775925,101,-9.224075
1,2008,108.891171,110,-1.108829
2,2012,94.045177,103,-8.954823
3,2016,100.468488,121,-20.531512
4,2020,46.562219,0,46.562219


In [45]:
# #  Numbers for 2004 - predicting 2008
# error_data = output.Total_Medals_Predictions.values - output.Actual.values
# output['Error'] = error_data
# output_2008 = output
# output_2008

# # Accuracy before OLS: 0.694
# # Accuracy after OLS: 0.943

Unnamed: 0,Year,Total_Medals_Predictions,Actual,Error
0,2008,110.751167,110,0.751167
1,2012,96.149736,103,-6.850264
2,2016,102.961367,121,-18.038633
3,2020,44.57166,0,44.57166


In [60]:
#  Numbers for 2008 - predicting 2012
# Error = predicted total minus actual total, row by row
error_data = output['Total_Medals_Predictions'].values - output['Actual'].values
output['Error'] = error_data
# Alias (not a copy) of output, kept under a run-specific name
output_2012 = output
output_2012

# Accuracy before OLS: 0.697
# Accuracy after OLS: 0.946

Unnamed: 0,Year,Total_Medals_Predictions,Actual,Error
0,2012,96.018955,103,-6.981045
1,2016,102.800541,121,-18.199459
2,2020,44.703034,0,44.703034


In [92]:
#  Numbers for 2012 - predicting 2016
# Signed difference between the predicted and the hand-entered actual totals
predicted_totals = output['Total_Medals_Predictions'].values
actual_totals = output['Actual'].values
error_data = predicted_totals - actual_totals
output['Error'] = error_data
# Same DataFrame object as output, bound to a run-specific name
output_2016 = output
output_2016

# Accuracy before OLS: 0.697
# Accuracy after OLS: 0.948

Unnamed: 0,Year,Total_Medals_Predictions,Actual,Error
0,2016,104.274455,121,-16.725545
1,2020,43.497433,0,43.497433
