In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import pandas as pd
from pathlib import Path

In [2]:
# Import CSV
file_path = Path('../Resources/Revised_CSV/hall_pitching.csv')
hall_pitcher_df = pd.read_csv(file_path)
hall_pitcher_df

Unnamed: 0,playerID,W,L,G,GS,CG,SHO,SV,H,ER,...,WP,HBP,BFP,R,IP,Win Percentage,Strikeout to Walk,WHIP,ERA_corrected,inducted
0,aardsda01,16.0,18.0,331.0,0.0,0.0,0.0,69.0,296.0,160.0,...,12.0,16.0,1475.0,169.0,337.000000,0.470588,1.857923,1.421365,4.272997,N
1,aasedo01,66.0,60.0,448.0,91.0,22.0,5.0,82.0,1085.0,468.0,...,22.0,7.0,4730.0,503.0,1109.333333,0.523810,1.402626,1.390024,3.796875,N
2,abadfe01,8.0,29.0,384.0,6.0,0.0,0.0,2.0,309.0,135.0,...,10.0,12.0,1399.0,143.0,330.666667,0.216216,2.413793,1.285282,3.674395,N
3,abbeybe01,22.0,40.0,79.0,65.0,52.0,0.0,1.0,686.0,285.0,...,18.0,26.0,2568.0,442.0,568.000000,0.354839,0.838542,1.545775,4.515845,N
4,abbotda01,0.0,2.0,3.0,1.0,1.0,0.0,1.0,19.0,9.0,...,3.0,1.0,67.0,14.0,13.000000,0.000000,0.125000,2.076923,6.230769,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9288,zuberbi01,43.0,42.0,224.0,65.0,23.0,3.0,6.0,767.0,374.0,...,28.0,4.0,3476.0,418.0,786.000000,0.505882,0.818376,1.571247,4.282443,N
9289,zuberty01,1.0,2.0,23.0,0.0,0.0,0.0,0.0,15.0,10.0,...,1.0,1.0,99.0,11.0,22.000000,0.333333,1.500000,1.590909,4.090909,N
9290,zumayjo01,13.0,12.0,171.0,0.0,0.0,0.0,5.0,169.0,71.0,...,16.0,4.0,911.0,80.0,209.666667,0.520000,1.842105,1.349762,3.047695,N
9291,zuverge01,32.0,36.0,265.0,31.0,9.0,2.0,40.0,660.0,253.0,...,10.0,27.0,2746.0,296.0,642.333333,0.470588,1.098522,1.343539,3.544888,N


In [3]:
# RFC model data preprocessing
# Convert inducted column to Y = 1, N = 0

p = {'Y': 1, 'N': 0}
hall_pitcher_df['inducted'] = hall_pitcher_df['inducted'].map(p).fillna(hall_pitcher_df['inducted'])
hall_pitcher_df

Unnamed: 0,playerID,W,L,G,GS,CG,SHO,SV,H,ER,...,WP,HBP,BFP,R,IP,Win Percentage,Strikeout to Walk,WHIP,ERA_corrected,inducted
0,aardsda01,16.0,18.0,331.0,0.0,0.0,0.0,69.0,296.0,160.0,...,12.0,16.0,1475.0,169.0,337.000000,0.470588,1.857923,1.421365,4.272997,0
1,aasedo01,66.0,60.0,448.0,91.0,22.0,5.0,82.0,1085.0,468.0,...,22.0,7.0,4730.0,503.0,1109.333333,0.523810,1.402626,1.390024,3.796875,0
2,abadfe01,8.0,29.0,384.0,6.0,0.0,0.0,2.0,309.0,135.0,...,10.0,12.0,1399.0,143.0,330.666667,0.216216,2.413793,1.285282,3.674395,0
3,abbeybe01,22.0,40.0,79.0,65.0,52.0,0.0,1.0,686.0,285.0,...,18.0,26.0,2568.0,442.0,568.000000,0.354839,0.838542,1.545775,4.515845,0
4,abbotda01,0.0,2.0,3.0,1.0,1.0,0.0,1.0,19.0,9.0,...,3.0,1.0,67.0,14.0,13.000000,0.000000,0.125000,2.076923,6.230769,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9288,zuberbi01,43.0,42.0,224.0,65.0,23.0,3.0,6.0,767.0,374.0,...,28.0,4.0,3476.0,418.0,786.000000,0.505882,0.818376,1.571247,4.282443,0
9289,zuberty01,1.0,2.0,23.0,0.0,0.0,0.0,0.0,15.0,10.0,...,1.0,1.0,99.0,11.0,22.000000,0.333333,1.500000,1.590909,4.090909,0
9290,zumayjo01,13.0,12.0,171.0,0.0,0.0,0.0,5.0,169.0,71.0,...,16.0,4.0,911.0,80.0,209.666667,0.520000,1.842105,1.349762,3.047695,0
9291,zuverge01,32.0,36.0,265.0,31.0,9.0,2.0,40.0,660.0,253.0,...,10.0,27.0,2746.0,296.0,642.333333,0.470588,1.098522,1.343539,3.544888,0


In [4]:
import numpy as np
hall_pitcher_df.loc[hall_pitcher_df["Strikeout to Walk"] == np.inf]

Unnamed: 0,playerID,W,L,G,GS,CG,SHO,SV,H,ER,...,WP,HBP,BFP,R,IP,Win Percentage,Strikeout to Walk,WHIP,ERA_corrected,inducted
59,adriaeh01,0.0,0.0,2.0,0.0,0.0,0.0,0.0,6.0,4.0,...,0.0,1.0,13.0,4.0,2.000000,0.0,inf,3.00,18.000000,0
126,altheaa01,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,1.0,...,0.0,0.0,5.0,1.0,1.000000,0.0,inf,2.00,9.000000,0
166,andermi01,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,5.0,0.0,1.000000,0.0,inf,2.00,0.000000,0
323,bakerer01,0.0,0.0,1.0,0.0,0.0,0.0,0.0,7.0,2.0,...,0.0,1.0,18.0,4.0,4.000000,0.0,inf,1.75,4.500000,0
387,barnebo01,0.0,0.0,2.0,0.0,0.0,0.0,0.0,14.0,10.0,...,0.0,0.0,29.0,11.0,4.666667,0.0,inf,3.00,19.285714,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9142,wordefr01,0.0,0.0,1.0,0.0,0.0,0.0,0.0,8.0,4.0,...,0.0,0.0,13.0,5.0,2.000000,0.0,inf,4.00,18.000000,0
9151,worthda01,0.0,0.0,2.0,0.0,0.0,0.0,0.0,4.0,1.0,...,0.0,0.0,10.0,1.0,2.000000,0.0,inf,2.00,4.500000,0
9154,wrighbo01,0.0,0.0,2.0,0.0,0.0,0.0,0.0,6.0,1.0,...,0.0,0.0,17.0,4.0,4.000000,0.0,inf,1.50,2.250000,0
9161,wrighge01,0.0,1.0,3.0,0.0,0.0,0.0,0.0,6.0,3.0,...,1.0,0.0,21.0,3.0,5.000000,0.0,inf,1.20,5.400000,1


In [5]:
indexNames = hall_pitcher_df[(hall_pitcher_df["Strikeout to Walk"] == np.inf)].index
hall_pitcher_df.drop(indexNames, inplace = True)
hall_pitcher_df

Unnamed: 0,playerID,W,L,G,GS,CG,SHO,SV,H,ER,...,WP,HBP,BFP,R,IP,Win Percentage,Strikeout to Walk,WHIP,ERA_corrected,inducted
0,aardsda01,16.0,18.0,331.0,0.0,0.0,0.0,69.0,296.0,160.0,...,12.0,16.0,1475.0,169.0,337.000000,0.470588,1.857923,1.421365,4.272997,0
1,aasedo01,66.0,60.0,448.0,91.0,22.0,5.0,82.0,1085.0,468.0,...,22.0,7.0,4730.0,503.0,1109.333333,0.523810,1.402626,1.390024,3.796875,0
2,abadfe01,8.0,29.0,384.0,6.0,0.0,0.0,2.0,309.0,135.0,...,10.0,12.0,1399.0,143.0,330.666667,0.216216,2.413793,1.285282,3.674395,0
3,abbeybe01,22.0,40.0,79.0,65.0,52.0,0.0,1.0,686.0,285.0,...,18.0,26.0,2568.0,442.0,568.000000,0.354839,0.838542,1.545775,4.515845,0
4,abbotda01,0.0,2.0,3.0,1.0,1.0,0.0,1.0,19.0,9.0,...,3.0,1.0,67.0,14.0,13.000000,0.000000,0.125000,2.076923,6.230769,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9288,zuberbi01,43.0,42.0,224.0,65.0,23.0,3.0,6.0,767.0,374.0,...,28.0,4.0,3476.0,418.0,786.000000,0.505882,0.818376,1.571247,4.282443,0
9289,zuberty01,1.0,2.0,23.0,0.0,0.0,0.0,0.0,15.0,10.0,...,1.0,1.0,99.0,11.0,22.000000,0.333333,1.500000,1.590909,4.090909,0
9290,zumayjo01,13.0,12.0,171.0,0.0,0.0,0.0,5.0,169.0,71.0,...,16.0,4.0,911.0,80.0,209.666667,0.520000,1.842105,1.349762,3.047695,0
9291,zuverge01,32.0,36.0,265.0,31.0,9.0,2.0,40.0,660.0,253.0,...,10.0,27.0,2746.0,296.0,642.333333,0.470588,1.098522,1.343539,3.544888,0


In [6]:
# Set target and features variables
y = hall_pitcher_df.inducted
X = hall_pitcher_df.drop(columns=["playerID", "inducted"])

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

#Scale the data
#Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [7]:
# Create a RFC model
rf_model = RandomForestClassifier(n_estimators=128, random_state = 42)

# Fit the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Make predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

predictions

array([1, 0, 0, ..., 0, 0, 0])

In [8]:
# Evaluate the model

# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(cm, index= ["Actual 0", "Actual 1"], columns = ["Predicted 0", "Predicted 1"])

# Calculate the accuracy score
acc_score = accuracy_score(y_test, predictions)

# Display results
print("Confustion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confustion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,2259,4
Actual 1,9,12


Accuracy Score : 0.9943082311733801
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2263
           1       0.75      0.57      0.65        21

    accuracy                           0.99      2284
   macro avg       0.87      0.78      0.82      2284
weighted avg       0.99      0.99      0.99      2284



In [9]:
# Calculate feature importance in the RFC model
importances = rf_model.feature_importances_
importances

array([0.12592681, 0.02849383, 0.03717076, 0.03581435, 0.05686457,
       0.05977233, 0.03501889, 0.05738268, 0.03910198, 0.0302367 ,
       0.0306396 , 0.0422412 , 0.02337516, 0.02178504, 0.07015806,
       0.03363978, 0.07383743, 0.05361194, 0.03662109, 0.0528137 ,
       0.05549411])

In [10]:
# Sort the features by their importance
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.1259268123483977, 'W'),
 (0.07383743391692504, 'IP'),
 (0.0701580616947161, 'BFP'),
 (0.05977233239380589, 'SHO'),
 (0.0573826759562148, 'H'),
 (0.056864571057256195, 'CG'),
 (0.05549410588801312, 'ERA_corrected'),
 (0.053611937171204126, 'Win Percentage'),
 (0.0528136971337546, 'WHIP'),
 (0.04224119602821397, 'SO'),
 (0.03910198479849475, 'ER'),
 (0.037170757738471775, 'G'),
 (0.03662108805111601, 'Strikeout to Walk'),
 (0.03581434861057217, 'GS'),
 (0.0350188886638928, 'SV'),
 (0.03363978098459251, 'R'),
 (0.030639603977249328, 'BB'),
 (0.030236697646636647, 'HR'),
 (0.028493826573214424, 'L'),
 (0.023375163582528325, 'WP'),
 (0.021785035784729668, 'HBP')]

In [11]:
# Import CSV
file_path = Path('../Resources/Revised_CSV/2022_HOF_Pitchers_Class.csv')
HOF_P_2022_df = pd.read_csv(file_path)
HOF_P_2022_df

Unnamed: 0,Name,W,L,G,GS,CG,ShO,SV,H,ER,...,SO,WP,HBP,TBF,R,IP,W %,K/BB,WHIP,ERA
0,Billy Wagner,47,40,853,0,0,0,422,601,232,...,1196,43,33,3600,262,903.0,0.54023,3.99,1.0,2.31
1,Jonathan Papelbon,41,36,689,3,0,0,368,572,197,...,808,14,34,2938,226,725.2,0.532468,4.37,1.04,2.44
2,Joe Nathan,64,34,787,29,0,0,377,690,294,...,976,47,23,3771,317,923.1,0.653061,2.84,1.12,2.87
3,Roger Clemens,354,184,709,707,118,46,0,4185,1707,...,4672,143,159,20240,1885,4916.2,0.657993,2.96,1.17,3.12
4,Matt Thornton,36,46,748,1,0,0,23,594,251,...,642,28,20,2799,273,662.2,0.439024,2.56,1.28,3.41
5,Curt Schilling,216,146,569,436,83,20,22,2998,1253,...,3116,72,52,13284,1318,3261.0,0.596685,4.38,1.14,3.46
6,Javier Lopez,30,17,839,0,0,0,14,484,206,...,358,16,27,2273,227,533.1,0.638298,1.52,1.35,3.48
7,Tim Hudson,222,133,482,479,26,13,0,2957,1213,...,2080,84,124,13005,1319,3126.2,0.625352,2.27,1.24,3.49
8,Jake Peavy,152,126,388,377,15,6,0,2134,960,...,2207,46,78,9838,1011,2377.0,0.546763,3.12,1.2,3.63
9,Tim Lincecum,110,89,278,270,10,7,1,1506,699,...,1736,107,44,7120,746,1682.0,0.552764,2.59,1.29,3.74


In [12]:
# Set target and features variables
X_p_2022 = HOF_P_2022_df.set_index("Name")

In [13]:
# Scale data
X_p_scaled = X_scaler.transform(X_p_2022)

In [14]:
# Make predictions using the testing data
predictions = rf_model.predict(X_p_scaled)

predictions

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [15]:
predictions = predictions.tolist()

In [16]:
HOF_P_2022_df["HOF Prediction"] = predictions
HOF_P_2022_df

Unnamed: 0,Name,W,L,G,GS,CG,ShO,SV,H,ER,...,WP,HBP,TBF,R,IP,W %,K/BB,WHIP,ERA,HOF Prediction
0,Billy Wagner,47,40,853,0,0,0,422,601,232,...,43,33,3600,262,903.0,0.54023,3.99,1.0,2.31,0
1,Jonathan Papelbon,41,36,689,3,0,0,368,572,197,...,14,34,2938,226,725.2,0.532468,4.37,1.04,2.44,0
2,Joe Nathan,64,34,787,29,0,0,377,690,294,...,47,23,3771,317,923.1,0.653061,2.84,1.12,2.87,0
3,Roger Clemens,354,184,709,707,118,46,0,4185,1707,...,143,159,20240,1885,4916.2,0.657993,2.96,1.17,3.12,1
4,Matt Thornton,36,46,748,1,0,0,23,594,251,...,28,20,2799,273,662.2,0.439024,2.56,1.28,3.41,0
5,Curt Schilling,216,146,569,436,83,20,22,2998,1253,...,72,52,13284,1318,3261.0,0.596685,4.38,1.14,3.46,0
6,Javier Lopez,30,17,839,0,0,0,14,484,206,...,16,27,2273,227,533.1,0.638298,1.52,1.35,3.48,0
7,Tim Hudson,222,133,482,479,26,13,0,2957,1213,...,84,124,13005,1319,3126.2,0.625352,2.27,1.24,3.49,0
8,Jake Peavy,152,126,388,377,15,6,0,2134,960,...,46,78,9838,1011,2377.0,0.546763,3.12,1.2,3.63,0
9,Tim Lincecum,110,89,278,270,10,7,1,1506,699,...,107,44,7120,746,1682.0,0.552764,2.59,1.29,3.74,0


In [17]:
# Measure probability of predictions
prediction_proba = rf_model.predict_proba(X_p_scaled)

prediction_proba

array([[1.       , 0.       ],
       [0.9921875, 0.0078125],
       [0.9765625, 0.0234375],
       [0.0859375, 0.9140625],
       [1.       , 0.       ],
       [0.859375 , 0.140625 ],
       [1.       , 0.       ],
       [0.9453125, 0.0546875],
       [1.       , 0.       ],
       [1.       , 0.       ],
       [0.953125 , 0.046875 ],
       [0.9453125, 0.0546875],
       [1.       , 0.       ],
       [1.       , 0.       ],
       [1.       , 0.       ],
       [1.       , 0.       ]])

In [18]:
prediction_proba = prediction_proba.tolist()

In [19]:
# Split the list into two lists
N_proba, Y_proba = map(list, zip(*prediction_proba))

In [20]:
# Create a column with the probability for a Yes
HOF_P_2022_df["Yes HOF Probability"] = Y_proba
HOF_P_2022_df

Unnamed: 0,Name,W,L,G,GS,CG,ShO,SV,H,ER,...,HBP,TBF,R,IP,W %,K/BB,WHIP,ERA,HOF Prediction,Yes HOF Probability
0,Billy Wagner,47,40,853,0,0,0,422,601,232,...,33,3600,262,903.0,0.54023,3.99,1.0,2.31,0,0.0
1,Jonathan Papelbon,41,36,689,3,0,0,368,572,197,...,34,2938,226,725.2,0.532468,4.37,1.04,2.44,0,0.007812
2,Joe Nathan,64,34,787,29,0,0,377,690,294,...,23,3771,317,923.1,0.653061,2.84,1.12,2.87,0,0.023438
3,Roger Clemens,354,184,709,707,118,46,0,4185,1707,...,159,20240,1885,4916.2,0.657993,2.96,1.17,3.12,1,0.914062
4,Matt Thornton,36,46,748,1,0,0,23,594,251,...,20,2799,273,662.2,0.439024,2.56,1.28,3.41,0,0.0
5,Curt Schilling,216,146,569,436,83,20,22,2998,1253,...,52,13284,1318,3261.0,0.596685,4.38,1.14,3.46,0,0.140625
6,Javier Lopez,30,17,839,0,0,0,14,484,206,...,27,2273,227,533.1,0.638298,1.52,1.35,3.48,0,0.0
7,Tim Hudson,222,133,482,479,26,13,0,2957,1213,...,124,13005,1319,3126.2,0.625352,2.27,1.24,3.49,0,0.054688
8,Jake Peavy,152,126,388,377,15,6,0,2134,960,...,78,9838,1011,2377.0,0.546763,3.12,1.2,3.63,0,0.0
9,Tim Lincecum,110,89,278,270,10,7,1,1506,699,...,44,7120,746,1682.0,0.552764,2.59,1.29,3.74,0,0.0


In [21]:
# Format the "Yes HOF Probability" column to a percentage
HOF_P_2022_df["Yes HOF Probability"] = HOF_P_2022_df["Yes HOF Probability"].map("{:.2%}".format)

In [22]:
HOF_P_2022_df

Unnamed: 0,Name,W,L,G,GS,CG,ShO,SV,H,ER,...,HBP,TBF,R,IP,W %,K/BB,WHIP,ERA,HOF Prediction,Yes HOF Probability
0,Billy Wagner,47,40,853,0,0,0,422,601,232,...,33,3600,262,903.0,0.54023,3.99,1.0,2.31,0,0.00%
1,Jonathan Papelbon,41,36,689,3,0,0,368,572,197,...,34,2938,226,725.2,0.532468,4.37,1.04,2.44,0,0.78%
2,Joe Nathan,64,34,787,29,0,0,377,690,294,...,23,3771,317,923.1,0.653061,2.84,1.12,2.87,0,2.34%
3,Roger Clemens,354,184,709,707,118,46,0,4185,1707,...,159,20240,1885,4916.2,0.657993,2.96,1.17,3.12,1,91.41%
4,Matt Thornton,36,46,748,1,0,0,23,594,251,...,20,2799,273,662.2,0.439024,2.56,1.28,3.41,0,0.00%
5,Curt Schilling,216,146,569,436,83,20,22,2998,1253,...,52,13284,1318,3261.0,0.596685,4.38,1.14,3.46,0,14.06%
6,Javier Lopez,30,17,839,0,0,0,14,484,206,...,27,2273,227,533.1,0.638298,1.52,1.35,3.48,0,0.00%
7,Tim Hudson,222,133,482,479,26,13,0,2957,1213,...,124,13005,1319,3126.2,0.625352,2.27,1.24,3.49,0,5.47%
8,Jake Peavy,152,126,388,377,15,6,0,2134,960,...,78,9838,1011,2377.0,0.546763,3.12,1.2,3.63,0,0.00%
9,Tim Lincecum,110,89,278,270,10,7,1,1506,699,...,44,7120,746,1682.0,0.552764,2.59,1.29,3.74,0,0.00%
