In [1]:
# Import our dependencies
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the Hall of Fame pitching table from the revised CSV export
file_path = Path('../Resources/Revised_CSV/hall_pitching.csv')
hall_pitcher_df = pd.read_csv(filepath_or_buffer=file_path)
hall_pitcher_df

Unnamed: 0,playerID,W,L,G,GS,CG,SHO,SV,H,ER,...,WP,HBP,BFP,R,IP,Win Percentage,Strikeout to Walk,WHIP,ERA_corrected,inducted
0,aardsda01,16.0,18.0,331.0,0.0,0.0,0.0,69.0,296.0,160.0,...,12.0,16.0,1475.0,169.0,337.000000,0.470588,1.857923,1.421365,4.272997,N
1,aasedo01,66.0,60.0,448.0,91.0,22.0,5.0,82.0,1085.0,468.0,...,22.0,7.0,4730.0,503.0,1109.333333,0.523810,1.402626,1.390024,3.796875,N
2,abadfe01,8.0,29.0,384.0,6.0,0.0,0.0,2.0,309.0,135.0,...,10.0,12.0,1399.0,143.0,330.666667,0.216216,2.413793,1.285282,3.674395,N
3,abbeybe01,22.0,40.0,79.0,65.0,52.0,0.0,1.0,686.0,285.0,...,18.0,26.0,2568.0,442.0,568.000000,0.354839,0.838542,1.545775,4.515845,N
4,abbotda01,0.0,2.0,3.0,1.0,1.0,0.0,1.0,19.0,9.0,...,3.0,1.0,67.0,14.0,13.000000,0.000000,0.125000,2.076923,6.230769,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9288,zuberbi01,43.0,42.0,224.0,65.0,23.0,3.0,6.0,767.0,374.0,...,28.0,4.0,3476.0,418.0,786.000000,0.505882,0.818376,1.571247,4.282443,N
9289,zuberty01,1.0,2.0,23.0,0.0,0.0,0.0,0.0,15.0,10.0,...,1.0,1.0,99.0,11.0,22.000000,0.333333,1.500000,1.590909,4.090909,N
9290,zumayjo01,13.0,12.0,171.0,0.0,0.0,0.0,5.0,169.0,71.0,...,16.0,4.0,911.0,80.0,209.666667,0.520000,1.842105,1.349762,3.047695,N
9291,zuverge01,32.0,36.0,265.0,31.0,9.0,2.0,40.0,660.0,253.0,...,10.0,27.0,2746.0,296.0,642.333333,0.470588,1.098522,1.343539,3.544888,N


In [3]:
# Preprocessing for the random-forest model:
# encode the target column numerically (Y -> 1, N -> 0).
b = {'Y': 1, 'N': 0}
inducted_encoded = hall_pitcher_df['inducted'].map(b)
# Values with no entry in the mapping keep their original value instead of becoming NaN.
hall_pitcher_df['inducted'] = inducted_encoded.fillna(hall_pitcher_df['inducted'])
hall_pitcher_df

Unnamed: 0,playerID,W,L,G,GS,CG,SHO,SV,H,ER,...,WP,HBP,BFP,R,IP,Win Percentage,Strikeout to Walk,WHIP,ERA_corrected,inducted
0,aardsda01,16.0,18.0,331.0,0.0,0.0,0.0,69.0,296.0,160.0,...,12.0,16.0,1475.0,169.0,337.000000,0.470588,1.857923,1.421365,4.272997,0
1,aasedo01,66.0,60.0,448.0,91.0,22.0,5.0,82.0,1085.0,468.0,...,22.0,7.0,4730.0,503.0,1109.333333,0.523810,1.402626,1.390024,3.796875,0
2,abadfe01,8.0,29.0,384.0,6.0,0.0,0.0,2.0,309.0,135.0,...,10.0,12.0,1399.0,143.0,330.666667,0.216216,2.413793,1.285282,3.674395,0
3,abbeybe01,22.0,40.0,79.0,65.0,52.0,0.0,1.0,686.0,285.0,...,18.0,26.0,2568.0,442.0,568.000000,0.354839,0.838542,1.545775,4.515845,0
4,abbotda01,0.0,2.0,3.0,1.0,1.0,0.0,1.0,19.0,9.0,...,3.0,1.0,67.0,14.0,13.000000,0.000000,0.125000,2.076923,6.230769,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9288,zuberbi01,43.0,42.0,224.0,65.0,23.0,3.0,6.0,767.0,374.0,...,28.0,4.0,3476.0,418.0,786.000000,0.505882,0.818376,1.571247,4.282443,0
9289,zuberty01,1.0,2.0,23.0,0.0,0.0,0.0,0.0,15.0,10.0,...,1.0,1.0,99.0,11.0,22.000000,0.333333,1.500000,1.590909,4.090909,0
9290,zumayjo01,13.0,12.0,171.0,0.0,0.0,0.0,5.0,169.0,71.0,...,16.0,4.0,911.0,80.0,209.666667,0.520000,1.842105,1.349762,3.047695,0
9291,zuverge01,32.0,36.0,265.0,31.0,9.0,2.0,40.0,660.0,253.0,...,10.0,27.0,2746.0,296.0,642.333333,0.470588,1.098522,1.343539,3.544888,0


In [4]:
# Inspect the rows whose strikeout-to-walk ratio is infinite.
# (numpy is imported with the other dependencies in the first cell, not here,
# so the notebook's requirements are visible in one place.)
hall_pitcher_df.loc[hall_pitcher_df["Strikeout to Walk"] == np.inf]

Unnamed: 0,playerID,W,L,G,GS,CG,SHO,SV,H,ER,...,WP,HBP,BFP,R,IP,Win Percentage,Strikeout to Walk,WHIP,ERA_corrected,inducted
59,adriaeh01,0.0,0.0,2.0,0.0,0.0,0.0,0.0,6.0,4.0,...,0.0,1.0,13.0,4.0,2.000000,0.0,inf,3.00,18.000000,0
126,altheaa01,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,1.0,...,0.0,0.0,5.0,1.0,1.000000,0.0,inf,2.00,9.000000,0
166,andermi01,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,5.0,0.0,1.000000,0.0,inf,2.00,0.000000,0
323,bakerer01,0.0,0.0,1.0,0.0,0.0,0.0,0.0,7.0,2.0,...,0.0,1.0,18.0,4.0,4.000000,0.0,inf,1.75,4.500000,0
387,barnebo01,0.0,0.0,2.0,0.0,0.0,0.0,0.0,14.0,10.0,...,0.0,0.0,29.0,11.0,4.666667,0.0,inf,3.00,19.285714,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9142,wordefr01,0.0,0.0,1.0,0.0,0.0,0.0,0.0,8.0,4.0,...,0.0,0.0,13.0,5.0,2.000000,0.0,inf,4.00,18.000000,0
9151,worthda01,0.0,0.0,2.0,0.0,0.0,0.0,0.0,4.0,1.0,...,0.0,0.0,10.0,1.0,2.000000,0.0,inf,2.00,4.500000,0
9154,wrighbo01,0.0,0.0,2.0,0.0,0.0,0.0,0.0,6.0,1.0,...,0.0,0.0,17.0,4.0,4.000000,0.0,inf,1.50,2.250000,0
9161,wrighge01,0.0,1.0,3.0,0.0,0.0,0.0,0.0,6.0,3.0,...,1.0,0.0,21.0,3.0,5.000000,0.0,inf,1.20,5.400000,1


In [5]:
# Collect the index labels of every row with an infinite strikeout-to-walk ratio ...
indexNames = hall_pitcher_df.loc[hall_pitcher_df["Strikeout to Walk"] == np.inf].index
# ... and remove those rows from the DataFrame in place.
hall_pitcher_df.drop(index=indexNames, inplace=True)
hall_pitcher_df

Unnamed: 0,playerID,W,L,G,GS,CG,SHO,SV,H,ER,...,WP,HBP,BFP,R,IP,Win Percentage,Strikeout to Walk,WHIP,ERA_corrected,inducted
0,aardsda01,16.0,18.0,331.0,0.0,0.0,0.0,69.0,296.0,160.0,...,12.0,16.0,1475.0,169.0,337.000000,0.470588,1.857923,1.421365,4.272997,0
1,aasedo01,66.0,60.0,448.0,91.0,22.0,5.0,82.0,1085.0,468.0,...,22.0,7.0,4730.0,503.0,1109.333333,0.523810,1.402626,1.390024,3.796875,0
2,abadfe01,8.0,29.0,384.0,6.0,0.0,0.0,2.0,309.0,135.0,...,10.0,12.0,1399.0,143.0,330.666667,0.216216,2.413793,1.285282,3.674395,0
3,abbeybe01,22.0,40.0,79.0,65.0,52.0,0.0,1.0,686.0,285.0,...,18.0,26.0,2568.0,442.0,568.000000,0.354839,0.838542,1.545775,4.515845,0
4,abbotda01,0.0,2.0,3.0,1.0,1.0,0.0,1.0,19.0,9.0,...,3.0,1.0,67.0,14.0,13.000000,0.000000,0.125000,2.076923,6.230769,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9288,zuberbi01,43.0,42.0,224.0,65.0,23.0,3.0,6.0,767.0,374.0,...,28.0,4.0,3476.0,418.0,786.000000,0.505882,0.818376,1.571247,4.282443,0
9289,zuberty01,1.0,2.0,23.0,0.0,0.0,0.0,0.0,15.0,10.0,...,1.0,1.0,99.0,11.0,22.000000,0.333333,1.500000,1.590909,4.090909,0
9290,zumayjo01,13.0,12.0,171.0,0.0,0.0,0.0,5.0,169.0,71.0,...,16.0,4.0,911.0,80.0,209.666667,0.520000,1.842105,1.349762,3.047695,0
9291,zuverge01,32.0,36.0,265.0,31.0,9.0,2.0,40.0,660.0,253.0,...,10.0,27.0,2746.0,296.0,642.333333,0.470588,1.098522,1.343539,3.544888,0


In [6]:
# Target vector: the encoded induction flag
y = hall_pitcher_df["inducted"]
# Feature matrix: every other column except the player identifier
X = hall_pitcher_df.drop(columns=["playerID", "inducted"])

# Hold out a test set using train_test_split's default proportions, seeded for reproducibility
# NOTE(review): the split is not stratified although inductees are rare — confirm this is intended
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Standardise the features; the scaler is fitted on the training rows only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both partitions
X_train_scaled, X_test_scaled = (X_scaler.transform(split) for split in (X_train, X_test))

In [7]:
# Build the random-forest classifier (128 trees, fixed seed) and train it
# on the scaled training set in a single step
rf_model = RandomForestClassifier(n_estimators=128, random_state=42).fit(X_train_scaled, y_train)

# Predict the held-out test rows
predictions = rf_model.predict(X_test_scaled)

predictions

array([1, 0, 0, ..., 0, 0, 0])

In [8]:
# Evaluate the model on the held-out test set
# Confusion matrix: labels are pinned to [0, 1] so the result is always 2x2 and
# lines up with the row/column names below, even if one class happens to be
# missing from the test split or from the predictions.
cm = confusion_matrix(y_test, predictions, labels=[0, 1])

# Wrap the matrix in a DataFrame so rows (actual) and columns (predicted) are labelled
cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

# Overall accuracy; the classification report below breaks the result down per class
acc_score = accuracy_score(y_test, predictions)

# Display results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confustion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,2259,4
Actual 1,9,12


Accuracy Score : 0.9943082311733801
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2263
           1       0.75      0.57      0.65        21

    accuracy                           0.99      2284
   macro avg       0.87      0.78      0.82      2284
weighted avg       0.99      0.99      0.99      2284



In [9]:
# Calculate feature importance in the RFC model
# (one value per input feature; they are paired with X.columns in the next cell)
importances = rf_model.feature_importances_
importances

array([0.12592681, 0.02849383, 0.03717076, 0.03581435, 0.05686457,
       0.05977233, 0.03501889, 0.05738268, 0.03910198, 0.0302367 ,
       0.0306396 , 0.0422412 , 0.02337516, 0.02178504, 0.07015806,
       0.03363978, 0.07383743, 0.05361194, 0.03662109, 0.0528137 ,
       0.05549411])

In [10]:
# Sort the features by their importance
# Each importance is paired with its column name; reverse=True puts the largest first.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.1259268123483977, 'W'),
 (0.07383743391692504, 'IP'),
 (0.0701580616947161, 'BFP'),
 (0.05977233239380589, 'SHO'),
 (0.0573826759562148, 'H'),
 (0.056864571057256195, 'CG'),
 (0.05549410588801312, 'ERA_corrected'),
 (0.053611937171204126, 'Win Percentage'),
 (0.0528136971337546, 'WHIP'),
 (0.04224119602821397, 'SO'),
 (0.03910198479849475, 'ER'),
 (0.037170757738471775, 'G'),
 (0.03662108805111601, 'Strikeout to Walk'),
 (0.03581434861057217, 'GS'),
 (0.0350188886638928, 'SV'),
 (0.03363978098459251, 'R'),
 (0.030639603977249328, 'BB'),
 (0.030236697646636647, 'HR'),
 (0.028493826573214424, 'L'),
 (0.023375163582528325, 'WP'),
 (0.021785035784729668, 'HBP')]

In [11]:
# Score every remaining player in the DataFrame (training and test rows alike).
# Select exactly the feature columns the scaler and model were fitted on, so this
# cell still works when re-run after extra columns (e.g. the prediction columns)
# have been added to hall_pitcher_df.
X_2 = hall_pitcher_df[X.columns]
X_2_scaled = X_scaler.transform(X_2)

# Predict induction for the whole dataset.
# NOTE: this replaces the test-set `predictions` used in the evaluation cell, and
# many of these rows were part of the training split, so agreement with
# `inducted` here is not a measure of how well the model generalises.
predictions = rf_model.predict(X_2_scaled)

predictions

array([0, 0, 0, ..., 0, 0, 0])

In [12]:
# Convert the predictions to a plain Python list.
# Going through np.asarray keeps the cell re-runnable: calling .tolist() directly
# raises AttributeError on a second run because `predictions` is already a list.
predictions = np.asarray(predictions).tolist()

In [13]:
# Store the model's predicted class for every row as a new column, then show the result
hall_pitcher_df["HOF Prediction"] = predictions
hall_pitcher_df

Unnamed: 0,playerID,W,L,G,GS,CG,SHO,SV,H,ER,...,HBP,BFP,R,IP,Win Percentage,Strikeout to Walk,WHIP,ERA_corrected,inducted,HOF Prediction
0,aardsda01,16.0,18.0,331.0,0.0,0.0,0.0,69.0,296.0,160.0,...,16.0,1475.0,169.0,337.000000,0.470588,1.857923,1.421365,4.272997,0,0
1,aasedo01,66.0,60.0,448.0,91.0,22.0,5.0,82.0,1085.0,468.0,...,7.0,4730.0,503.0,1109.333333,0.523810,1.402626,1.390024,3.796875,0,0
2,abadfe01,8.0,29.0,384.0,6.0,0.0,0.0,2.0,309.0,135.0,...,12.0,1399.0,143.0,330.666667,0.216216,2.413793,1.285282,3.674395,0,0
3,abbeybe01,22.0,40.0,79.0,65.0,52.0,0.0,1.0,686.0,285.0,...,26.0,2568.0,442.0,568.000000,0.354839,0.838542,1.545775,4.515845,0,0
4,abbotda01,0.0,2.0,3.0,1.0,1.0,0.0,1.0,19.0,9.0,...,1.0,67.0,14.0,13.000000,0.000000,0.125000,2.076923,6.230769,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9288,zuberbi01,43.0,42.0,224.0,65.0,23.0,3.0,6.0,767.0,374.0,...,4.0,3476.0,418.0,786.000000,0.505882,0.818376,1.571247,4.282443,0,0
9289,zuberty01,1.0,2.0,23.0,0.0,0.0,0.0,0.0,15.0,10.0,...,1.0,99.0,11.0,22.000000,0.333333,1.500000,1.590909,4.090909,0,0
9290,zumayjo01,13.0,12.0,171.0,0.0,0.0,0.0,5.0,169.0,71.0,...,4.0,911.0,80.0,209.666667,0.520000,1.842105,1.349762,3.047695,0,0
9291,zuverge01,32.0,36.0,265.0,31.0,9.0,2.0,40.0,660.0,253.0,...,27.0,2746.0,296.0,642.333333,0.470588,1.098522,1.343539,3.544888,0,0


In [14]:
# Players who are actually inducted AND predicted as inducted by the model
hall_pitcher_df[hall_pitcher_df['inducted'].eq(1) & hall_pitcher_df['HOF Prediction'].eq(1)]

Unnamed: 0,playerID,W,L,G,GS,CG,SHO,SV,H,ER,...,HBP,BFP,R,IP,Win Percentage,Strikeout to Walk,WHIP,ERA_corrected,inducted,HOF Prediction
96,alexape01,373.0,208.0,696.0,599.0,437.0,90.0,32.0,4868.0,1476.0,...,70.0,20928.0,1851.0,5190.000000,0.641997,2.311251,1.121195,2.559538,1,1
192,ansonca01,0.0,1.0,3.0,0.0,0.0,0.0,1.0,4.0,2.0,...,0.0,21.0,5.0,4.000000,0.000000,0.500000,1.500000,4.500000,1,1
503,becklja01,0.0,1.0,1.0,1.0,0.0,0.0,0.0,9.0,3.0,...,0.0,21.0,8.0,4.000000,0.000000,2.000000,2.500000,6.750000,1,1
556,bendech01,212.0,127.0,459.0,334.0,255.0,40.0,34.0,2645.0,823.0,...,102.0,11928.0,1110.0,3017.000000,0.625369,2.403090,1.112695,2.455088,1,1
725,blylebe01,287.0,250.0,692.0,685.0,242.0,60.0,0.0,4632.0,1830.0,...,155.0,20491.0,2029.0,4970.000000,0.534451,2.799546,1.197988,3.313883,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8941,wilheho01,143.0,122.0,1070.0,52.0,20.0,5.0,227.0,1757.0,632.0,...,62.0,9164.0,773.0,2254.333333,0.539623,2.069409,1.124501,2.523141,1,1
9005,willivi01,249.0,205.0,513.0,471.0,388.0,50.0,11.0,3621.0,1167.0,...,156.0,16263.0,1620.0,3996.000000,0.548458,1.362211,1.209459,2.628378,1,1
9163,wrighha01,4.0,4.0,36.0,8.0,0.0,0.0,14.0,149.0,41.0,...,0.0,496.0,103.0,100.333333,0.500000,0.428571,1.624585,3.677741,1,1
9185,wynnea01,300.0,244.0,691.0,612.0,290.0,49.0,15.0,4291.0,1796.0,...,64.0,19408.0,2037.0,4564.000000,0.551471,1.314930,1.329097,3.541630,1,1


In [15]:
# All players who are actually inducted, regardless of the model's prediction
hall_pitcher_df[hall_pitcher_df['inducted'].eq(1)]

Unnamed: 0,playerID,W,L,G,GS,CG,SHO,SV,H,ER,...,HBP,BFP,R,IP,Win Percentage,Strikeout to Walk,WHIP,ERA_corrected,inducted,HOF Prediction
96,alexape01,373.0,208.0,696.0,599.0,437.0,90.0,32.0,4868.0,1476.0,...,70.0,20928.0,1851.0,5190.000000,0.641997,2.311251,1.121195,2.559538,1,1
192,ansonca01,0.0,1.0,3.0,0.0,0.0,0.0,1.0,4.0,2.0,...,0.0,21.0,5.0,4.000000,0.000000,0.500000,1.500000,4.500000,1,1
503,becklja01,0.0,1.0,1.0,1.0,0.0,0.0,0.0,9.0,3.0,...,0.0,21.0,8.0,4.000000,0.000000,2.000000,2.500000,6.750000,1,1
556,bendech01,212.0,127.0,459.0,334.0,255.0,40.0,34.0,2645.0,823.0,...,102.0,11928.0,1110.0,3017.000000,0.625369,2.403090,1.112695,2.455088,1,1
725,blylebe01,287.0,250.0,692.0,685.0,242.0,60.0,0.0,4632.0,1830.0,...,155.0,20491.0,2029.0,4970.000000,0.534451,2.799546,1.197988,3.313883,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8941,wilheho01,143.0,122.0,1070.0,52.0,20.0,5.0,227.0,1757.0,632.0,...,62.0,9164.0,773.0,2254.333333,0.539623,2.069409,1.124501,2.523141,1,1
9005,willivi01,249.0,205.0,513.0,471.0,388.0,50.0,11.0,3621.0,1167.0,...,156.0,16263.0,1620.0,3996.000000,0.548458,1.362211,1.209459,2.628378,1,1
9163,wrighha01,4.0,4.0,36.0,8.0,0.0,0.0,14.0,149.0,41.0,...,0.0,496.0,103.0,100.333333,0.500000,0.428571,1.624585,3.677741,1,1
9185,wynnea01,300.0,244.0,691.0,612.0,290.0,49.0,15.0,4291.0,1796.0,...,64.0,19408.0,2037.0,4564.000000,0.551471,1.314930,1.329097,3.541630,1,1


In [16]:
# Measure probability of predictions
# predict_proba yields one row per player with one probability per class
# (two columns here, as the output below shows).
prediction_proba = rf_model.predict_proba(X_2_scaled)

prediction_proba

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [17]:
# Convert the probability array to a nested Python list (one [no, yes] pair per row).
# Going through np.asarray keeps the cell re-runnable: calling .tolist() directly
# raises AttributeError on a second run because `prediction_proba` is already a list.
prediction_proba = np.asarray(prediction_proba).tolist()

In [18]:
# Transpose the per-row probability pairs into one list per class:
# first element of each pair -> N_proba, second element -> Y_proba
N_proba, Y_proba = (list(class_column) for class_column in zip(*prediction_proba))

In [19]:
# Create a column with the probability for a Yes
# (Y_proba is the second of the two per-class lists produced by the split above)
hall_pitcher_df["Yes HOF Probability"] = Y_proba
hall_pitcher_df

Unnamed: 0,playerID,W,L,G,GS,CG,SHO,SV,H,ER,...,BFP,R,IP,Win Percentage,Strikeout to Walk,WHIP,ERA_corrected,inducted,HOF Prediction,Yes HOF Probability
0,aardsda01,16.0,18.0,331.0,0.0,0.0,0.0,69.0,296.0,160.0,...,1475.0,169.0,337.000000,0.470588,1.857923,1.421365,4.272997,0,0,0.0
1,aasedo01,66.0,60.0,448.0,91.0,22.0,5.0,82.0,1085.0,468.0,...,4730.0,503.0,1109.333333,0.523810,1.402626,1.390024,3.796875,0,0,0.0
2,abadfe01,8.0,29.0,384.0,6.0,0.0,0.0,2.0,309.0,135.0,...,1399.0,143.0,330.666667,0.216216,2.413793,1.285282,3.674395,0,0,0.0
3,abbeybe01,22.0,40.0,79.0,65.0,52.0,0.0,1.0,686.0,285.0,...,2568.0,442.0,568.000000,0.354839,0.838542,1.545775,4.515845,0,0,0.0
4,abbotda01,0.0,2.0,3.0,1.0,1.0,0.0,1.0,19.0,9.0,...,67.0,14.0,13.000000,0.000000,0.125000,2.076923,6.230769,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9288,zuberbi01,43.0,42.0,224.0,65.0,23.0,3.0,6.0,767.0,374.0,...,3476.0,418.0,786.000000,0.505882,0.818376,1.571247,4.282443,0,0,0.0
9289,zuberty01,1.0,2.0,23.0,0.0,0.0,0.0,0.0,15.0,10.0,...,99.0,11.0,22.000000,0.333333,1.500000,1.590909,4.090909,0,0,0.0
9290,zumayjo01,13.0,12.0,171.0,0.0,0.0,0.0,5.0,169.0,71.0,...,911.0,80.0,209.666667,0.520000,1.842105,1.349762,3.047695,0,0,0.0
9291,zuverge01,32.0,36.0,265.0,31.0,9.0,2.0,40.0,660.0,253.0,...,2746.0,296.0,642.333333,0.470588,1.098522,1.343539,3.544888,0,0,0.0


In [20]:
# Convert "Yes HOF Probability" column from object to float64
# NOTE(review): the column is assigned from a list of floats, so it may already be
# float64 and this conversion a no-op — confirm before relying on it.
hall_pitcher_df["Yes HOF Probability"] = pd.to_numeric(hall_pitcher_df["Yes HOF Probability"])

In [21]:
# Express the "yes" probability as a percentage (0-100).
# The value is recomputed from Y_proba rather than from the column itself, so
# re-running this cell does not multiply the stored values by 100 a second time
# (which would make the ">= 90" filter further down meaningless).
hall_pitcher_df["Yes HOF Probability"] = 100 * pd.Series(
    Y_proba, index=hall_pitcher_df.index, dtype="float64"
)

In [22]:
# Show the DataFrame again after the probability column was multiplied by 100
hall_pitcher_df

Unnamed: 0,playerID,W,L,G,GS,CG,SHO,SV,H,ER,...,BFP,R,IP,Win Percentage,Strikeout to Walk,WHIP,ERA_corrected,inducted,HOF Prediction,Yes HOF Probability
0,aardsda01,16.0,18.0,331.0,0.0,0.0,0.0,69.0,296.0,160.0,...,1475.0,169.0,337.000000,0.470588,1.857923,1.421365,4.272997,0,0,0.0
1,aasedo01,66.0,60.0,448.0,91.0,22.0,5.0,82.0,1085.0,468.0,...,4730.0,503.0,1109.333333,0.523810,1.402626,1.390024,3.796875,0,0,0.0
2,abadfe01,8.0,29.0,384.0,6.0,0.0,0.0,2.0,309.0,135.0,...,1399.0,143.0,330.666667,0.216216,2.413793,1.285282,3.674395,0,0,0.0
3,abbeybe01,22.0,40.0,79.0,65.0,52.0,0.0,1.0,686.0,285.0,...,2568.0,442.0,568.000000,0.354839,0.838542,1.545775,4.515845,0,0,0.0
4,abbotda01,0.0,2.0,3.0,1.0,1.0,0.0,1.0,19.0,9.0,...,67.0,14.0,13.000000,0.000000,0.125000,2.076923,6.230769,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9288,zuberbi01,43.0,42.0,224.0,65.0,23.0,3.0,6.0,767.0,374.0,...,3476.0,418.0,786.000000,0.505882,0.818376,1.571247,4.282443,0,0,0.0
9289,zuberty01,1.0,2.0,23.0,0.0,0.0,0.0,0.0,15.0,10.0,...,99.0,11.0,22.000000,0.333333,1.500000,1.590909,4.090909,0,0,0.0
9290,zumayjo01,13.0,12.0,171.0,0.0,0.0,0.0,5.0,169.0,71.0,...,911.0,80.0,209.666667,0.520000,1.842105,1.349762,3.047695,0,0,0.0
9291,zuverge01,32.0,36.0,265.0,31.0,9.0,2.0,40.0,660.0,253.0,...,2746.0,296.0,642.333333,0.470588,1.098522,1.343539,3.544888,0,0,0.0


In [23]:
# Players whose predicted "yes" probability is at least 90 (column is on a 0-100 scale)
hall_pitcher_df[hall_pitcher_df['Yes HOF Probability'].ge(90)]

Unnamed: 0,playerID,W,L,G,GS,CG,SHO,SV,H,ER,...,BFP,R,IP,Win Percentage,Strikeout to Walk,WHIP,ERA_corrected,inducted,HOF Prediction,Yes HOF Probability
96,alexape01,373.0,208.0,696.0,599.0,437.0,90.0,32.0,4868.0,1476.0,...,20928.0,1851.0,5190.0,0.641997,2.311251,1.121195,2.559538,1,1,94.53125
1276,carltst01,329.0,244.0,741.0,709.0,254.0,55.0,2.0,4672.0,1864.0,...,21683.0,2130.0,5217.333333,0.574171,2.25641,1.246806,3.215436,1,1,96.09375
1501,clarkjo01,328.0,178.0,531.0,518.0,485.0,37.0,5.0,4295.0,1417.0,...,19146.0,2384.0,4536.333333,0.648221,1.660789,1.209347,2.811301,1,1,96.09375
1533,clemero02,354.0,184.0,709.0,707.0,118.0,46.0,0.0,4185.0,1707.0,...,20240.0,1885.0,4916.666667,0.657993,2.956962,1.172542,3.124678,0,1,91.40625
2875,galvipu01,365.0,310.0,705.0,688.0,646.0,57.0,2.0,6405.0,1903.0,...,25415.0,3352.0,6003.333333,0.540741,2.425503,1.191005,2.852915,1,1,95.3125
4165,johnswa01,417.0,279.0,802.0,666.0,531.0,110.0,34.0,4913.0,1424.0,...,23642.0,1902.0,5914.666667,0.599138,2.574468,1.061091,2.166817,1,1,98.4375
4296,keefeti01,342.0,225.0,600.0,594.0,554.0,39.0,2.0,4438.0,1474.0,...,20941.0,2470.0,5049.666667,0.603175,2.079481,1.123044,2.627104,1,1,96.09375
5054,maddugr01,355.0,227.0,744.0,740.0,109.0,35.0,0.0,4726.0,1756.0,...,20421.0,1981.0,5008.333333,0.609966,3.374374,1.143095,3.155541,1,1,91.40625
5155,maricju01,243.0,142.0,471.0,457.0,244.0,52.0,2.0,3153.0,1126.0,...,14236.0,1329.0,3507.333333,0.631169,3.248237,1.101121,2.889375,1,1,93.75
5255,mathech01,373.0,188.0,636.0,552.0,435.0,79.0,29.0,4219.0,1135.0,...,18943.0,1620.0,4788.666667,0.664884,2.956368,1.058123,2.133162,1,1,96.875


In [24]:
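# Optional: format the "Yes HOF Probability" column as a percentage string. Left disabled because
# it turns the float64 column into strings, and because the column has already been multiplied
# by 100 above, so the "{:.2%}" format (which multiplies by 100 itself) would overstate the values.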
# hall_pitcher_df["Yes HOF Probability"] = hall_pitcher_df["Yes HOF Probability"].map("{:.2%}".format)