In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import pandas as pd
from pathlib import Path

In [2]:
# Read the batter statistics CSV into a DataFrame and display it
file_path = Path('../Resources/Revised_CSV/hall_batter.csv')
hall_batter_df = pd.read_csv(filepath_or_buffer=file_path)
hall_batter_df

Unnamed: 0,playerID,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO,HBP,SH,SF,AVG,OBP,SLG,inducted
0,aardsda01,331.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.000000,0.000000,0.000000,N
1,aaronha01,3298.0,12364.0,2174.0,3771.0,624.0,98.0,755.0,2297.0,240.0,1402.0,1383.0,32.0,21.0,121.0,0.304998,0.373949,0.554513,Y
2,aaronto01,437.0,944.0,102.0,216.0,42.0,6.0,13.0,94.0,9.0,86.0,145.0,0.0,9.0,6.0,0.228814,0.291506,0.327331,N
3,aasedo01,448.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,N
4,abadan01,15.0,21.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,4.0,5.0,0.0,0.0,0.0,0.095238,0.240000,0.095238,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17501,zupcibo01,319.0,795.0,99.0,199.0,47.0,4.0,7.0,80.0,7.0,57.0,137.0,6.0,20.0,8.0,0.250314,0.302540,0.345912,N
17502,zupofr01,16.0,18.0,3.0,3.0,1.0,0.0,0.0,0.0,0.0,2.0,6.0,0.0,0.0,0.0,0.166667,0.250000,0.222222,N
17503,zuvelpa01,209.0,491.0,41.0,109.0,17.0,2.0,2.0,20.0,2.0,34.0,50.0,2.0,18.0,0.0,0.221996,0.275142,0.276986,N
17504,zuverge01,266.0,142.0,5.0,21.0,2.0,1.0,0.0,7.0,0.0,9.0,39.0,0.0,16.0,0.0,0.147887,0.198675,0.176056,N


In [3]:
# RFC model data preprocessing
# Encode the inducted flag numerically: 'Y' -> 1, 'N' -> 0
b = {'Y': 1, 'N': 0}

# Any value without an entry in the mapping keeps its original value
inducted_raw = hall_batter_df['inducted']
hall_batter_df['inducted'] = inducted_raw.map(b).fillna(inducted_raw)
hall_batter_df

Unnamed: 0,playerID,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO,HBP,SH,SF,AVG,OBP,SLG,inducted
0,aardsda01,331.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.000000,0.000000,0.000000,0
1,aaronha01,3298.0,12364.0,2174.0,3771.0,624.0,98.0,755.0,2297.0,240.0,1402.0,1383.0,32.0,21.0,121.0,0.304998,0.373949,0.554513,1
2,aaronto01,437.0,944.0,102.0,216.0,42.0,6.0,13.0,94.0,9.0,86.0,145.0,0.0,9.0,6.0,0.228814,0.291506,0.327331,0
3,aasedo01,448.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0
4,abadan01,15.0,21.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,4.0,5.0,0.0,0.0,0.0,0.095238,0.240000,0.095238,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17501,zupcibo01,319.0,795.0,99.0,199.0,47.0,4.0,7.0,80.0,7.0,57.0,137.0,6.0,20.0,8.0,0.250314,0.302540,0.345912,0
17502,zupofr01,16.0,18.0,3.0,3.0,1.0,0.0,0.0,0.0,0.0,2.0,6.0,0.0,0.0,0.0,0.166667,0.250000,0.222222,0
17503,zuvelpa01,209.0,491.0,41.0,109.0,17.0,2.0,2.0,20.0,2.0,34.0,50.0,2.0,18.0,0.0,0.221996,0.275142,0.276986,0
17504,zuverge01,266.0,142.0,5.0,21.0,2.0,1.0,0.0,7.0,0.0,9.0,39.0,0.0,16.0,0.0,0.147887,0.198675,0.176056,0


In [4]:
# Target is the 0/1 induction flag; features are every other column except the player identifier
y = hall_batter_df["inducted"]
X = hall_batter_df.drop(columns=["playerID", "inducted"])

# Hold out a test split (seeded so the split is reproducible)
# NOTE(review): the positive class is rare (34 of 4377 test rows in the recorded run);
# consider stratify=y so both splits keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Standardise the features: the scaler is fitted on the training split only
# and then applied to both splits
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (X_scaler.transform(part) for part in (X_train, X_test))

In [5]:
# Build a 128-tree random forest (seeded for reproducibility) and train it
# on the scaled training split; fit() returns the fitted estimator itself
rf_model = RandomForestClassifier(random_state=42, n_estimators=128).fit(X_train_scaled, y_train)

# Predict the induction label for each row of the scaled test split
predictions = rf_model.predict(X_test_scaled)

predictions

array([0, 0, 0, ..., 0, 0, 0])

In [6]:
# Evaluate the model on the held-out test split

# Confusion matrix: rows are the actual classes, columns the predicted classes.
# labels=[0, 1] pins the matrix to 2x2 so it always matches the hard-coded
# row/column labels below, even if one class is absent from y_test or predictions.
cm = confusion_matrix(y_test, predictions, labels=[0, 1])

# Wrap the matrix in a labelled DataFrame for readable display
cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

# Fraction of test rows classified correctly.
# NOTE(review): with so few class-1 rows, accuracy is dominated by class 0;
# the class-1 precision/recall in the report below is the more telling figure.
acc_score = accuracy_score(y_test, predictions)

# Display results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confustion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,4335,8
Actual 1,14,20


Accuracy Score : 0.9949737262965501
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4343
           1       0.71      0.59      0.65        34

    accuracy                           0.99      4377
   macro avg       0.86      0.79      0.82      4377
weighted avg       0.99      0.99      0.99      4377



In [7]:
# Calculate feature importance in the RFC model
# feature_importances_ holds one score per feature column the forest was trained on
importances = rf_model.feature_importances_
importances

array([0.06865092, 0.09781351, 0.136507  , 0.12591326, 0.05139588,
       0.0511292 , 0.03085994, 0.07242977, 0.03589417, 0.04700466,
       0.04320209, 0.0312724 , 0.03431289, 0.0302142 , 0.06673822,
       0.04207216, 0.03458974])

In [8]:
# Pair each importance score with its feature name and rank the pairs, highest score first
ranked_features = list(zip(rf_model.feature_importances_, X.columns))
ranked_features.sort(reverse=True)
ranked_features

[(0.13650700024171408, 'R'),
 (0.12591325734095346, 'H'),
 (0.09781351207287703, 'AB'),
 (0.07242976834261967, 'RBI'),
 (0.0686509156005499, 'G'),
 (0.06673821520882903, 'AVG'),
 (0.05139587952870257, '2B'),
 (0.051129204747023344, '3B'),
 (0.04700466348661, 'BB'),
 (0.04320208751585259, 'SO'),
 (0.04207215983649316, 'OBP'),
 (0.03589416801126094, 'SB'),
 (0.03458973665357076, 'SLG'),
 (0.03431288510264603, 'SH'),
 (0.031272398208381076, 'HBP'),
 (0.030859944132410215, 'HR'),
 (0.0302142039695062, 'SF')]

In [9]:
# Whole DF: score every player in the data set (training and test rows alike)
# Select the feature columns by name (the same columns the scaler and model were
# fitted on) instead of dropping a fixed pair of columns, so this cell still works
# when re-run after extra columns have been added to hall_batter_df.
X_2 = hall_batter_df[X.columns]
X_2_scaled = X_scaler.transform(X_2)

# Make predictions for the whole DataFrame.
# NOTE: this rebinds `predictions`, which until now held the test-split predictions.
predictions = rf_model.predict(X_2_scaled)

predictions

array([0, 1, 0, ..., 0, 0, 0])

In [10]:
# Convert the ndarray of predicted labels to a plain Python list.
# Guarded so that re-running this cell does not fail with AttributeError
# once `predictions` has already been rebound to a list.
if not isinstance(predictions, list):
    predictions = predictions.tolist()

In [11]:
# Attach the predicted label for every player as a new "HOF Prediction" column,
# then display the updated DataFrame
hall_batter_df["HOF Prediction"] = predictions
hall_batter_df

Unnamed: 0,playerID,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO,HBP,SH,SF,AVG,OBP,SLG,inducted,HOF Prediction
0,aardsda01,331.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.000000,0.000000,0.000000,0,0
1,aaronha01,3298.0,12364.0,2174.0,3771.0,624.0,98.0,755.0,2297.0,240.0,1402.0,1383.0,32.0,21.0,121.0,0.304998,0.373949,0.554513,1,1
2,aaronto01,437.0,944.0,102.0,216.0,42.0,6.0,13.0,94.0,9.0,86.0,145.0,0.0,9.0,6.0,0.228814,0.291506,0.327331,0,0
3,aasedo01,448.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0,0
4,abadan01,15.0,21.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,4.0,5.0,0.0,0.0,0.0,0.095238,0.240000,0.095238,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17501,zupcibo01,319.0,795.0,99.0,199.0,47.0,4.0,7.0,80.0,7.0,57.0,137.0,6.0,20.0,8.0,0.250314,0.302540,0.345912,0,0
17502,zupofr01,16.0,18.0,3.0,3.0,1.0,0.0,0.0,0.0,0.0,2.0,6.0,0.0,0.0,0.0,0.166667,0.250000,0.222222,0,0
17503,zuvelpa01,209.0,491.0,41.0,109.0,17.0,2.0,2.0,20.0,2.0,34.0,50.0,2.0,18.0,0.0,0.221996,0.275142,0.276986,0,0
17504,zuverge01,266.0,142.0,5.0,21.0,2.0,1.0,0.0,7.0,0.0,9.0,39.0,0.0,16.0,0.0,0.147887,0.198675,0.176056,0,0


In [12]:
# Players who are inducted AND predicted as Hall of Famers by the model
is_inducted = hall_batter_df['inducted'].eq(1)
is_predicted_hof = hall_batter_df['HOF Prediction'].eq(1)
hall_batter_df.loc[is_inducted & is_predicted_hof]

Unnamed: 0,playerID,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO,HBP,SH,SF,AVG,OBP,SLG,inducted,HOF Prediction
1,aaronha01,3298.0,12364.0,2174.0,3771.0,624.0,98.0,755.0,2297.0,240.0,1402.0,1383.0,32.0,21.0,121.0,0.304998,0.373949,0.554513,1,1
239,alomaro01,2379.0,9073.0,1508.0,2724.0,504.0,80.0,210.0,1134.0,474.0,1032.0,1140.0,50.0,148.0,97.0,0.300231,0.371245,0.442852,1,1
376,ansonca01,2524.0,10281.0,1999.0,3435.0,582.0,142.0,97.0,2075.0,277.0,984.0,330.0,32.0,34.0,0.0,0.334111,0.393998,0.446649,1,1
384,aparilu01,2601.0,10230.0,1335.0,2677.0,394.0,92.0,83.0,791.0,506.0,736.0,742.0,27.0,161.0,76.0,0.261681,0.310778,0.342522,1,1
464,ashburi01,2189.0,8365.0,1322.0,2574.0,317.0,109.0,29.0,586.0,234.0,1198.0,571.0,43.0,112.0,18.0,0.307711,0.396405,0.382068,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17053,wilsoha01,1348.0,4760.0,884.0,1461.0,266.0,67.0,244.0,1063.0,52.0,674.0,713.0,20.0,102.0,0.0,0.306933,0.395123,0.544748,1,1
17106,winfida01,2973.0,11003.0,1669.0,3110.0,540.0,88.0,465.0,1833.0,223.0,1216.0,1686.0,25.0,19.0,95.0,0.282650,0.352622,0.474507,1,1
17335,yastrca01,3308.0,11988.0,1816.0,3419.0,646.0,59.0,452.0,1844.0,168.0,1845.0,1393.0,40.0,13.0,105.0,0.285202,0.379453,0.462045,1,1
17415,youngro01,1211.0,4627.0,812.0,1491.0,236.0,93.0,42.0,592.0,153.0,550.0,390.0,37.0,119.0,0.0,0.322239,0.398542,0.440674,1,1


In [13]:
# All players who are actually inducted, for comparison with the filter above
hall_batter_df[hall_batter_df['inducted'].eq(1)]

Unnamed: 0,playerID,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO,HBP,SH,SF,AVG,OBP,SLG,inducted,HOF Prediction
1,aaronha01,3298.0,12364.0,2174.0,3771.0,624.0,98.0,755.0,2297.0,240.0,1402.0,1383.0,32.0,21.0,121.0,0.304998,0.373949,0.554513,1,1
239,alomaro01,2379.0,9073.0,1508.0,2724.0,504.0,80.0,210.0,1134.0,474.0,1032.0,1140.0,50.0,148.0,97.0,0.300231,0.371245,0.442852,1,1
376,ansonca01,2524.0,10281.0,1999.0,3435.0,582.0,142.0,97.0,2075.0,277.0,984.0,330.0,32.0,34.0,0.0,0.334111,0.393998,0.446649,1,1
384,aparilu01,2601.0,10230.0,1335.0,2677.0,394.0,92.0,83.0,791.0,506.0,736.0,742.0,27.0,161.0,76.0,0.261681,0.310778,0.342522,1,1
464,ashburi01,2189.0,8365.0,1322.0,2574.0,317.0,109.0,29.0,586.0,234.0,1198.0,571.0,43.0,112.0,18.0,0.307711,0.396405,0.382068,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17053,wilsoha01,1348.0,4760.0,884.0,1461.0,266.0,67.0,244.0,1063.0,52.0,674.0,713.0,20.0,102.0,0.0,0.306933,0.395123,0.544748,1,1
17106,winfida01,2973.0,11003.0,1669.0,3110.0,540.0,88.0,465.0,1833.0,223.0,1216.0,1686.0,25.0,19.0,95.0,0.282650,0.352622,0.474507,1,1
17335,yastrca01,3308.0,11988.0,1816.0,3419.0,646.0,59.0,452.0,1844.0,168.0,1845.0,1393.0,40.0,13.0,105.0,0.285202,0.379453,0.462045,1,1
17415,youngro01,1211.0,4627.0,812.0,1491.0,236.0,93.0,42.0,592.0,153.0,550.0,390.0,37.0,119.0,0.0,0.322239,0.398542,0.440674,1,1


In [14]:
# Measure probability of predictions
# predict_proba returns one row per player with two columns; in the displayed
# output the first column is the probability of class 0 and the second of class 1
prediction_proba = rf_model.predict_proba(X_2_scaled)

prediction_proba

array([[1.      , 0.      ],
       [0.078125, 0.921875],
       [1.      , 0.      ],
       ...,
       [1.      , 0.      ],
       [1.      , 0.      ],
       [1.      , 0.      ]])

In [15]:
# Convert the probability ndarray to a list of [class-0, class-1] rows.
# Guarded so that re-running this cell does not fail with AttributeError
# once `prediction_proba` has already been rebound to a list.
if not isinstance(prediction_proba, list):
    prediction_proba = prediction_proba.tolist()

In [16]:
# Transpose the per-player [class-0, class-1] rows into two parallel lists:
# N_proba (first element of every row) and Y_proba (second element of every row)
N_proba, Y_proba = (list(column) for column in zip(*prediction_proba))

In [17]:
# Create a column with the probability for a Yes
# (Y_proba holds the second element of each predict_proba row)
hall_batter_df["Yes HOF Probability"] = Y_proba
hall_batter_df

Unnamed: 0,playerID,G,AB,R,H,2B,3B,HR,RBI,SB,...,SO,HBP,SH,SF,AVG,OBP,SLG,inducted,HOF Prediction,Yes HOF Probability
0,aardsda01,331.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,1.0,0.0,0.000000,0.000000,0.000000,0,0,0.000000
1,aaronha01,3298.0,12364.0,2174.0,3771.0,624.0,98.0,755.0,2297.0,240.0,...,1383.0,32.0,21.0,121.0,0.304998,0.373949,0.554513,1,1,0.921875
2,aaronto01,437.0,944.0,102.0,216.0,42.0,6.0,13.0,94.0,9.0,...,145.0,0.0,9.0,6.0,0.228814,0.291506,0.327331,0,0,0.000000
3,aasedo01,448.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0,0,0.000000
4,abadan01,15.0,21.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.095238,0.240000,0.095238,0,0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17501,zupcibo01,319.0,795.0,99.0,199.0,47.0,4.0,7.0,80.0,7.0,...,137.0,6.0,20.0,8.0,0.250314,0.302540,0.345912,0,0,0.000000
17502,zupofr01,16.0,18.0,3.0,3.0,1.0,0.0,0.0,0.0,0.0,...,6.0,0.0,0.0,0.0,0.166667,0.250000,0.222222,0,0,0.000000
17503,zuvelpa01,209.0,491.0,41.0,109.0,17.0,2.0,2.0,20.0,2.0,...,50.0,2.0,18.0,0.0,0.221996,0.275142,0.276986,0,0,0.000000
17504,zuverge01,266.0,142.0,5.0,21.0,2.0,1.0,0.0,7.0,0.0,...,39.0,0.0,16.0,0.0,0.147887,0.198675,0.176056,0,0,0.000000


In [18]:
# Ensure the "Yes HOF Probability" column has a numeric dtype
# NOTE(review): the column was assigned from a list of floats, so it is presumably
# already float64 and this conversion is a no-op — confirm with .dtypes
hall_batter_df["Yes HOF Probability"] = pd.to_numeric(hall_batter_df["Yes HOF Probability"])

In [19]:
# Express the "Yes" probability as a percentage (0-100).
# Derived from Y_proba rather than from the column itself so that re-running
# this cell does not multiply the stored values by 100 a second time.
hall_batter_df["Yes HOF Probability"] = [100 * proba for proba in Y_proba]

In [20]:
# Display the DataFrame with the probability column now expressed as a percentage
hall_batter_df

Unnamed: 0,playerID,G,AB,R,H,2B,3B,HR,RBI,SB,...,SO,HBP,SH,SF,AVG,OBP,SLG,inducted,HOF Prediction,Yes HOF Probability
0,aardsda01,331.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,1.0,0.0,0.000000,0.000000,0.000000,0,0,0.0000
1,aaronha01,3298.0,12364.0,2174.0,3771.0,624.0,98.0,755.0,2297.0,240.0,...,1383.0,32.0,21.0,121.0,0.304998,0.373949,0.554513,1,1,92.1875
2,aaronto01,437.0,944.0,102.0,216.0,42.0,6.0,13.0,94.0,9.0,...,145.0,0.0,9.0,6.0,0.228814,0.291506,0.327331,0,0,0.0000
3,aasedo01,448.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0,0,0.0000
4,abadan01,15.0,21.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.095238,0.240000,0.095238,0,0,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17501,zupcibo01,319.0,795.0,99.0,199.0,47.0,4.0,7.0,80.0,7.0,...,137.0,6.0,20.0,8.0,0.250314,0.302540,0.345912,0,0,0.0000
17502,zupofr01,16.0,18.0,3.0,3.0,1.0,0.0,0.0,0.0,0.0,...,6.0,0.0,0.0,0.0,0.166667,0.250000,0.222222,0,0,0.0000
17503,zuvelpa01,209.0,491.0,41.0,109.0,17.0,2.0,2.0,20.0,2.0,...,50.0,2.0,18.0,0.0,0.221996,0.275142,0.276986,0,0,0.0000
17504,zuverge01,266.0,142.0,5.0,21.0,2.0,1.0,0.0,7.0,0.0,...,39.0,0.0,16.0,0.0,0.147887,0.198675,0.176056,0,0,0.0000


In [21]:
# Players the model rates at 90 percent or higher for Hall of Fame induction
hall_batter_df[hall_batter_df['Yes HOF Probability'].ge(90)]

Unnamed: 0,playerID,G,AB,R,H,2B,3B,HR,RBI,SB,...,SO,HBP,SH,SF,AVG,OBP,SLG,inducted,HOF Prediction,Yes HOF Probability
1,aaronha01,3298.0,12364.0,2174.0,3771.0,624.0,98.0,755.0,2297.0,240.0,...,1383.0,32.0,21.0,121.0,0.304998,0.373949,0.554513,1,1,92.1875
376,ansonca01,2524.0,10281.0,1999.0,3435.0,582.0,142.0,97.0,2075.0,277.0,...,330.0,32.0,34.0,0.0,0.334111,0.393998,0.446649,1,1,91.40625
1829,broutda01,1676.0,6726.0,1529.0,2303.0,462.0,206.0,107.0,1301.0,257.0,...,238.0,105.0,20.0,0.0,0.342403,0.423413,0.520071,1,1,93.75
2056,burkeje01,2067.0,8426.0,1720.0,2850.0,320.0,182.0,75.0,952.0,389.0,...,613.0,75.0,90.0,0.0,0.338239,0.4149,0.446119,1,1,91.40625
2378,carewro01,2469.0,9315.0,1424.0,3053.0,445.0,112.0,92.0,1015.0,353.0,...,1028.0,25.0,128.0,64.0,0.327751,0.393015,0.4292,1,1,92.1875
3001,collied01,2826.0,9949.0,1821.0,3315.0,438.0,187.0,47.0,1300.0,741.0,...,400.0,77.0,512.0,0.0,0.333199,0.424382,0.428988,1,1,91.40625
3097,connoro01,1998.0,7797.0,1620.0,2467.0,441.0,233.0,138.0,1323.0,244.0,...,455.0,39.0,9.0,0.0,0.316404,0.396922,0.485828,1,1,90.625
3346,crawfsa01,2517.0,9570.0,1391.0,2961.0,458.0,309.0,97.0,1519.0,366.0,...,512.0,23.0,241.0,0.0,0.309404,0.361634,0.452247,1,1,93.75
3819,delahed01,1837.0,7510.0,1600.0,2597.0,522.0,186.0,101.0,1466.0,456.0,...,439.0,94.0,54.0,0.0,0.345806,0.411335,0.505193,1,1,96.875
5146,foxxji01,2317.0,8134.0,1751.0,2646.0,458.0,125.0,534.0,1922.0,87.0,...,1311.0,13.0,71.0,0.0,0.325301,0.428274,0.609294,1,1,96.875


In [None]:
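# Optional: format the "Yes HOF Probability" column as a percentage string.
# This turns the values into strings, so the column no longer keeps its float64 type.
# NOTE: "{:.2%}" multiplies by 100 itself; the column has already been scaled to 0-100 above,
# so "{:.2f}%" would be needed here to avoid values such as 9218.75%.
# hall_batter_df["Yes HOF Probability"] = hall_batter_df["Yes HOF Probability"].map("{:.2%}".format)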