In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [2]:
# reading csv file
athlete_events = pd.read_csv('../CSV for ML models/athlete_events.csv')

In [3]:
athlete_events.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


In [4]:
filter_data = athlete_events[["Sex", "Age", "Height", "Weight", "Team", "Year", "Season", "Sport", "Event", "Medal"]]

In [5]:
# get the data for summer and winter Olympic
winter_data = filter_data[filter_data["Season"] == "Winter"]
summer_data = filter_data[(filter_data["Season"] == "Summer")]

In [6]:
summer_sports = summer_data["Sport"].unique()

In [7]:
# For loop to train the Random Forest model and get the testing score for male athletes in every Summer Olympic Sport
for sport in summer_sports:
    try:
        athlete_m  = summer_data[(summer_data["Sport"] == f'{sport}') & (summer_data["Sex"] == "M")]
        athlete_m  = athlete_m [["Age", "Height", "Weight", "Medal"]]
        athlete_m  = athlete_m .dropna(subset=['Height', 'Weight']).reset_index(drop = True)
        
        # One-hot encoding                

        athlete_m  = athlete_m .replace("Gold", 1)
        athlete_m  = athlete_m .replace("Silver", 1)
        athlete_m  = athlete_m .replace("Bronze", 1)
        athlete_m ["Medal"] = athlete_m ["Medal"].fillna(0)
        athlete_m  = athlete_m .dropna()
        athlete_m ["Medal"].unique()

        print(sport)
        
        # Assign X (data) and y (target)        

        target = athlete_m ["Medal"]
        target_names = ["Medals", "None"]
        data = athlete_m .drop("Medal", axis=1)
        feature_names = data.columns
        data.head()
        
        # Split our data into training and testing

        X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)
        clf = tree.DecisionTreeClassifier()
        clf = clf.fit(X_train, y_train)
        clf.score(X_test, y_test)
        
        # Create a random forest classifier

        rf = RandomForestClassifier(n_estimators=200)
        rf = rf.fit(X_train, y_train)
        print("Training Data Score:", rf.score(X_train, y_train)) 
        print("Testing Data Score:", rf.score(X_test, y_test))
        
        # Sort the features by their importance

        print(sorted(zip(rf.feature_importances_, feature_names), reverse=True))
        print("----------------------------------------------------")
    except:
        print("An exception occurred")
        print("----------------------------------------------------")

Basketball
Training Data Score: 0.9723577235772358
Testing Data Score: 0.6834415584415584
[(0.42143922527920546, 'Weight'), (0.35490832742816325, 'Height'), (0.22365244729263128, 'Age')]
----------------------------------------------------
Judo
Training Data Score: 0.9588963963963963
Testing Data Score: 0.8195615514333895
[(0.39519015913769856, 'Weight'), (0.3728713953486695, 'Height'), (0.231938445513632, 'Age')]
----------------------------------------------------
Football
Training Data Score: 0.9175019275250579
Testing Data Score: 0.7017341040462428
[(0.38671842766921266, 'Weight'), (0.38104477372215234, 'Height'), (0.23223679860863491, 'Age')]
----------------------------------------------------
Tug-Of-War
Training Data Score: 1.0
Testing Data Score: 0.4
[(0.4575656434064171, 'Weight'), (0.27764178612943496, 'Height'), (0.2647925704641479, 'Age')]
----------------------------------------------------
Athletics
Training Data Score: 0.9291903010658854
Testing Data Score: 0.85826477187

Training Data Score: 1.0
Testing Data Score: 0.8955223880597015
[(0.3819409447193126, 'Height'), (0.3273875824048605, 'Age'), (0.29067147287582695, 'Weight')]
----------------------------------------------------
Rugby
Training Data Score: 1.0
Testing Data Score: 1.0
[(0.0, 'Weight'), (0.0, 'Height'), (0.0, 'Age')]
----------------------------------------------------
Lacrosse
Training Data Score: 1.0
Testing Data Score: 1.0
[(0.0, 'Weight'), (0.0, 'Height'), (0.0, 'Age')]
----------------------------------------------------
Polo
An exception occurred
----------------------------------------------------
Cricket
An exception occurred
----------------------------------------------------
Ice Hockey
Training Data Score: 1.0
Testing Data Score: 1.0
[(0.0, 'Weight'), (0.0, 'Height'), (0.0, 'Age')]
----------------------------------------------------
Racquets
An exception occurred
----------------------------------------------------
Motorboating
An exception occurred
---------------------------

In [8]:
# For loop to train the Random Forest model and get the testing score for female athletes in every Summer Olympic Sport
for sport in summer_sports:
    try:
        athlete_f  = summer_data[(summer_data["Sport"] == f'{sport}') & (summer_data["Sex"] == "M")]
        athlete_f  = athlete_f [["Age", "Height", "Weight", "Medal"]]
        athlete_f  = athlete_f .dropna(subset=['Height', 'Weight']).reset_index(drop = True)
        
        # One-hot encoding                

        athlete_f  = athlete_f .replace("Gold", 1)
        athlete_f  = athlete_f .replace("Silver", 1)
        athlete_f  = athlete_f .replace("Bronze", 1)
        athlete_f ["Medal"] = athlete_f ["Medal"].fillna(0)
        athlete_f  = athlete_f .dropna()
        athlete_f ["Medal"].unique()

        print(sport)
        
        # Assign X (data) and y (target)        

        target = athlete_f ["Medal"]
        target_names = ["Medals", "None"]
        data = athlete_f .drop("Medal", axis=1)
        feature_names = data.columns
        data.head()
        
        # Split our data into training and testing

        X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)
        clf = tree.DecisionTreeClassifier()
        clf = clf.fit(X_train, y_train)
        clf.score(X_test, y_test)
        
        # Create a random forest classifier

        rf = RandomForestClassifier(n_estimators=200)
        rf = rf.fit(X_train, y_train)
        print("Training Data Score:", rf.score(X_train, y_train)) 
        print("Testing Data Score:", rf.score(X_test, y_test))
        
        # Sort the features by their importance
        
        print(sorted(zip(rf.feature_importances_, feature_names), reverse=True))
        print("----------------------------------------------------")
    except:
        print("An exception occurred")
        print("----------------------------------------------------")

Basketball
Training Data Score: 0.9723577235772358
Testing Data Score: 0.6818181818181818
[(0.41693851358630973, 'Weight'), (0.3484694346600359, 'Height'), (0.2345920517536544, 'Age')]
----------------------------------------------------
Judo
Training Data Score: 0.9588963963963963
Testing Data Score: 0.821247892074199
[(0.3885768787864628, 'Weight'), (0.3671280355203363, 'Height'), (0.24429508569320102, 'Age')]
----------------------------------------------------
Football
Training Data Score: 0.9175019275250579
Testing Data Score: 0.7109826589595376
[(0.3883071169269278, 'Height'), (0.37367025314955526, 'Weight'), (0.23802262992351697, 'Age')]
----------------------------------------------------
Tug-Of-War
Training Data Score: 1.0
Testing Data Score: 0.4
[(0.48102521243637325, 'Weight'), (0.2776424642321963, 'Age'), (0.24133232333143045, 'Height')]
----------------------------------------------------
Athletics
Training Data Score: 0.9291903010658854
Testing Data Score: 0.8578908002991

Training Data Score: 1.0
Testing Data Score: 0.9104477611940298
[(0.39656931191610895, 'Height'), (0.3181637458481218, 'Age'), (0.28526694223576937, 'Weight')]
----------------------------------------------------
Rugby
Training Data Score: 1.0
Testing Data Score: 1.0
[(0.0, 'Weight'), (0.0, 'Height'), (0.0, 'Age')]
----------------------------------------------------
Lacrosse
Training Data Score: 1.0
Testing Data Score: 1.0
[(0.0, 'Weight'), (0.0, 'Height'), (0.0, 'Age')]
----------------------------------------------------
Polo
An exception occurred
----------------------------------------------------
Cricket
An exception occurred
----------------------------------------------------
Ice Hockey
Training Data Score: 1.0
Testing Data Score: 1.0
[(0.0, 'Weight'), (0.0, 'Height'), (0.0, 'Age')]
----------------------------------------------------
Racquets
An exception occurred
----------------------------------------------------
Motorboating
An exception occurred
--------------------------