In [1]:
# Import our dependencies
import pandas as pd
import joblib
import dataframe_image as dfi
from sklearn.preprocessing import StandardScaler
from pathlib import Path

In [2]:
import psycopg2
from config import db_password

# Open a connection to the local PostgreSQL server hosting the "baseball_data"
# database. The server must already be running (started from the PostgreSQL app
# or a terminal shell). The password is read from the local config module.
conn = psycopg2.connect(
    host="localhost",
    port=5432,
    database="baseball_data",
    user="postgres",
    password=db_password,
)

In [3]:
# Create a cursor object on the open connection.
# NOTE(review): `cur` is not referenced by any other cell visible in this notebook
# (the table is loaded with pd.read_sql using `conn` directly) — confirm before removing.
cur = conn.cursor()

In [4]:
# Load every row and column of the Postgres hall_pitching table into a DataFrame
# and display it as the cell output.
hall_pitching = pd.read_sql(sql='SELECT * FROM hall_pitching', con=conn)
hall_pitching

Unnamed: 0,playerID,W,L,G,GS,CG,SHO,SV,H,ER,...,SO,WP,HBP,R,IP,Win Percentage,Strikeout to Walk,WHIP,ERA,inducted
0,aardsda01,16.0,18.0,331.0,0.0,0.0,0.0,69.0,296.0,160.0,...,340.0,12.0,16.0,169.0,337.000000,0.470588,1.857923,1.421365,4.272997,N
1,aasedo01,66.0,60.0,448.0,91.0,22.0,5.0,82.0,1085.0,468.0,...,641.0,22.0,7.0,503.0,1109.333333,0.523810,1.402626,1.390024,3.796875,N
2,abadfe01,8.0,29.0,384.0,6.0,0.0,0.0,2.0,309.0,135.0,...,280.0,10.0,12.0,143.0,330.666667,0.216216,2.413793,1.285282,3.674395,N
3,abbeybe01,22.0,40.0,79.0,65.0,52.0,0.0,1.0,686.0,285.0,...,161.0,18.0,26.0,442.0,568.000000,0.354839,0.838542,1.545775,4.515845,N
4,abbotda01,0.0,2.0,3.0,1.0,1.0,0.0,1.0,19.0,9.0,...,1.0,3.0,1.0,14.0,13.000000,0.000000,0.125000,2.076923,6.230769,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9113,zuberbi01,43.0,42.0,224.0,65.0,23.0,3.0,6.0,767.0,374.0,...,383.0,28.0,4.0,418.0,786.000000,0.505882,0.818376,1.571247,4.282443,N
9114,zuberty01,1.0,2.0,23.0,0.0,0.0,0.0,0.0,15.0,10.0,...,30.0,1.0,1.0,11.0,22.000000,0.333333,1.500000,1.590909,4.090909,N
9115,zumayjo01,13.0,12.0,171.0,0.0,0.0,0.0,5.0,169.0,71.0,...,210.0,16.0,4.0,80.0,209.666667,0.520000,1.842105,1.349762,3.047695,N
9116,zuverge01,32.0,36.0,265.0,31.0,9.0,2.0,40.0,660.0,253.0,...,223.0,10.0,27.0,296.0,642.333333,0.470588,1.098522,1.343539,3.544888,N


In [5]:
# XGB model data preprocessing
# Encode the inducted flag numerically ('Y' -> 1, 'N' -> 0). Any value that is
# not a key of the mapping is kept unchanged through fillna.
b = dict(Y=1, N=0)
inducted_raw = hall_pitching['inducted']
hall_pitching['inducted'] = inducted_raw.map(b).fillna(inducted_raw)
hall_pitching

Unnamed: 0,playerID,W,L,G,GS,CG,SHO,SV,H,ER,...,SO,WP,HBP,R,IP,Win Percentage,Strikeout to Walk,WHIP,ERA,inducted
0,aardsda01,16.0,18.0,331.0,0.0,0.0,0.0,69.0,296.0,160.0,...,340.0,12.0,16.0,169.0,337.000000,0.470588,1.857923,1.421365,4.272997,0
1,aasedo01,66.0,60.0,448.0,91.0,22.0,5.0,82.0,1085.0,468.0,...,641.0,22.0,7.0,503.0,1109.333333,0.523810,1.402626,1.390024,3.796875,0
2,abadfe01,8.0,29.0,384.0,6.0,0.0,0.0,2.0,309.0,135.0,...,280.0,10.0,12.0,143.0,330.666667,0.216216,2.413793,1.285282,3.674395,0
3,abbeybe01,22.0,40.0,79.0,65.0,52.0,0.0,1.0,686.0,285.0,...,161.0,18.0,26.0,442.0,568.000000,0.354839,0.838542,1.545775,4.515845,0
4,abbotda01,0.0,2.0,3.0,1.0,1.0,0.0,1.0,19.0,9.0,...,1.0,3.0,1.0,14.0,13.000000,0.000000,0.125000,2.076923,6.230769,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9113,zuberbi01,43.0,42.0,224.0,65.0,23.0,3.0,6.0,767.0,374.0,...,383.0,28.0,4.0,418.0,786.000000,0.505882,0.818376,1.571247,4.282443,0
9114,zuberty01,1.0,2.0,23.0,0.0,0.0,0.0,0.0,15.0,10.0,...,30.0,1.0,1.0,11.0,22.000000,0.333333,1.500000,1.590909,4.090909,0
9115,zumayjo01,13.0,12.0,171.0,0.0,0.0,0.0,5.0,169.0,71.0,...,210.0,16.0,4.0,80.0,209.666667,0.520000,1.842105,1.349762,3.047695,0
9116,zuverge01,32.0,36.0,265.0,31.0,9.0,2.0,40.0,660.0,253.0,...,223.0,10.0,27.0,296.0,642.333333,0.470588,1.098522,1.343539,3.544888,0


In [6]:
# Feature matrix: every column except the player identifier and the target label.
X = hall_pitching.drop(columns=["playerID", "inducted"])

# Standardize the features: learn the scaling from X and apply it in one step.
# NOTE(review): the scaler is fitted on the table loaded here; presumably this
# reproduces the scaling the saved model was trained with — confirm.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# The fitted scaler is reused under this name to transform the ballot data later.
X_scaler = scaler

In [7]:
# Load the saved Model
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
loaded_model = joblib.load('Final_XGB_Pitcher_Model.sav')

In [8]:
# Predict a class label for every row of the scaled hall_pitching features
# and keep the result as a plain Python list.
predictions = loaded_model.predict(X_scaled).tolist()

In [9]:
# Class probabilities for every row, converted to a list of per-row lists.
prediction_proba = loaded_model.predict_proba(X_scaled).tolist()

# Transpose the rows into one list per class.
# NOTE(review): assumes exactly two classes, ordered N (0) then Y (1) — confirm
# against the model's classes_.
N_proba, Y_proba = (list(column) for column in zip(*prediction_proba))

In [10]:
# DF post-processing for visualizations
# Attach the predicted label to every pitcher.
hall_pitching["HOF Prediction"] = predictions

# Attach the probability of the "Yes" class as a numeric column. The value is
# stored as returned by predict_proba (it is not multiplied by 100).
hall_pitching["Yes HOF Probability"] = pd.to_numeric(Y_proba)

hall_pitching

Unnamed: 0,playerID,W,L,G,GS,CG,SHO,SV,H,ER,...,HBP,R,IP,Win Percentage,Strikeout to Walk,WHIP,ERA,inducted,HOF Prediction,Yes HOF Probability
0,aardsda01,16.0,18.0,331.0,0.0,0.0,0.0,69.0,296.0,160.0,...,16.0,169.0,337.000000,0.470588,1.857923,1.421365,4.272997,0,0,0.000177
1,aasedo01,66.0,60.0,448.0,91.0,22.0,5.0,82.0,1085.0,468.0,...,7.0,503.0,1109.333333,0.523810,1.402626,1.390024,3.796875,0,0,0.000037
2,abadfe01,8.0,29.0,384.0,6.0,0.0,0.0,2.0,309.0,135.0,...,12.0,143.0,330.666667,0.216216,2.413793,1.285282,3.674395,0,0,0.000002
3,abbeybe01,22.0,40.0,79.0,65.0,52.0,0.0,1.0,686.0,285.0,...,26.0,442.0,568.000000,0.354839,0.838542,1.545775,4.515845,0,0,0.000001
4,abbotda01,0.0,2.0,3.0,1.0,1.0,0.0,1.0,19.0,9.0,...,1.0,14.0,13.000000,0.000000,0.125000,2.076923,6.230769,0,0,0.000002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9113,zuberbi01,43.0,42.0,224.0,65.0,23.0,3.0,6.0,767.0,374.0,...,4.0,418.0,786.000000,0.505882,0.818376,1.571247,4.282443,0,0,0.000002
9114,zuberty01,1.0,2.0,23.0,0.0,0.0,0.0,0.0,15.0,10.0,...,1.0,11.0,22.000000,0.333333,1.500000,1.590909,4.090909,0,0,0.000003
9115,zumayjo01,13.0,12.0,171.0,0.0,0.0,0.0,5.0,169.0,71.0,...,4.0,80.0,209.666667,0.520000,1.842105,1.349762,3.047695,0,0,0.000015
9116,zuverge01,32.0,36.0,265.0,31.0,9.0,2.0,40.0,660.0,253.0,...,27.0,296.0,642.333333,0.470588,1.098522,1.343539,3.544888,0,0,0.000272


In [11]:
# Read the player-name lookup CSV (playerID with first and last name) and display it.
file_path = Path('./Resources/Revised_CSV/Player_Names.csv')
Player_Names_df = pd.read_csv(filepath_or_buffer=file_path)
Player_Names_df

Unnamed: 0.1,Unnamed: 0,playerID,First Name,Last Name
0,0,aardsda01,David,Aardsma
1,1,aaronha01,Hank,Aaron
2,2,aaronto01,Tommie,Aaron
3,3,aasedo01,Don,Aase
4,4,abadan01,Andy,Abad
...,...,...,...,...
20088,20088,zupofr01,Frank,Zupo
20089,20089,zuvelpa01,Paul,Zuvella
20090,20090,zuverge01,George,Zuverink
20091,20091,zwilldu01,Dutch,Zwilling


In [12]:
# Merge with Name DF for deployment: join names to the pitching table on playerID
# (inner join), drop the leftover CSV index column and the join key, and
# capitalize the target column's label.
Names_Pitcher_df = (
    Player_Names_df
    .merge(hall_pitching, on='playerID')
    .drop(columns=["Unnamed: 0", "playerID"])
    .rename(columns={'inducted': 'Inducted'})
)

In [13]:
# Inducted pitchers that the model also predicts as Hall of Famers.
Names_Pitcher_df[
    Names_Pitcher_df['Inducted'].eq(1) & Names_Pitcher_df['HOF Prediction'].eq(1)
]

Unnamed: 0,First Name,Last Name,W,L,G,GS,CG,SHO,SV,H,...,HBP,R,IP,Win Percentage,Strikeout to Walk,WHIP,ERA,Inducted,HOF Prediction,Yes HOF Probability
95,Pete,Alexander,373.0,208.0,696.0,599.0,437.0,90.0,32.0,4868.0,...,70.0,1851.0,5190.000000,0.641997,2.311251,1.121195,2.559538,1,1,0.999999
544,Chief,Bender,212.0,127.0,459.0,334.0,255.0,40.0,34.0,2645.0,...,102.0,1110.0,3017.000000,0.625369,2.403090,1.112695,2.455088,1,1,0.999959
706,Bert,Blyleven,287.0,250.0,692.0,685.0,242.0,60.0,0.0,4632.0,...,155.0,2029.0,4970.000000,0.534451,2.799546,1.197988,3.313883,1,1,0.999535
978,Mordecai,Brown,239.0,130.0,481.0,332.0,271.0,55.0,49.0,2708.0,...,63.0,1044.0,3172.333333,0.647696,2.043091,1.065777,2.056846,1,1,0.999949
1049,Jim,Bunning,224.0,184.0,591.0,519.0,151.0,40.0,16.0,3433.0,...,160.0,1527.0,3760.333333,0.549020,2.855000,1.178885,3.269391,1,1,0.999889
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8659,Mickey,Welch,307.0,210.0,565.0,549.0,525.0,41.0,4.0,4588.0,...,52.0,2556.0,4802.000000,0.593810,1.426369,1.225531,2.711995,1,1,0.999798
8775,Hoyt,Wilhelm,143.0,122.0,1070.0,52.0,20.0,5.0,227.0,1757.0,...,62.0,773.0,2254.333333,0.539623,2.069409,1.124501,2.523141,1,1,0.999388
8837,Vic,Willis,249.0,205.0,513.0,471.0,388.0,50.0,11.0,3621.0,...,156.0,1620.0,3996.000000,0.548458,1.362211,1.209459,2.628378,1,1,0.999433
9011,Early,Wynn,300.0,244.0,691.0,612.0,290.0,49.0,15.0,4291.0,...,64.0,2037.0,4564.000000,0.551471,1.314930,1.329097,3.541630,1,1,0.931714


In [14]:
# Export every pitcher whose Inducted flag is 1.
Inducted_Pitchers = Names_Pitcher_df[Names_Pitcher_df['Inducted'].eq(1)]
Inducted_Pitchers.to_csv('Inducted_Pitchers.csv', index=False)

In [15]:
# Export pitchers who are not inducted but whom the model predicts as Hall of Famers.
Predicted_Pitchers = Names_Pitcher_df[
    Names_Pitcher_df['Inducted'].eq(0) & Names_Pitcher_df['HOF Prediction'].eq(1)
]
Predicted_Pitchers.to_csv('Predicted_Pitchers.csv', index=False)

In [16]:
# Inducted pitchers that the model predicts as NOT Hall of Famers.
Names_Pitcher_df[
    Names_Pitcher_df['Inducted'].eq(1) & Names_Pitcher_df['HOF Prediction'].eq(0)
]

Unnamed: 0,First Name,Last Name,W,L,G,GS,CG,SHO,SV,H,...,HBP,R,IP,Win Percentage,Strikeout to Walk,WHIP,ERA,Inducted,HOF Prediction,Yes HOF Probability
3000,Tom,Glavine,305.0,203.0,682.0,682.0,56.0,25.0,0.0,4298.0,...,66.0,1900.0,4413.333333,0.600394,1.738,1.313746,3.536103,1,0,0.313678
3035,Lefty,Gomez,189.0,102.0,368.0,320.0,173.0,28.0,9.0,2290.0,...,19.0,1091.0,2503.0,0.649485,1.340639,1.352377,3.343987,1,0,0.370245
3286,Jesse,Haines,210.0,158.0,555.0,388.0,208.0,24.0,10.0,3460.0,...,57.0,1556.0,3208.666667,0.570652,1.126292,1.349782,3.640765,1,0,0.142012
4929,Ted,Lyons,260.0,230.0,594.0,484.0,356.0,27.0,23.0,4489.0,...,31.0,2056.0,4161.0,0.530612,0.957181,1.348234,3.668349,1,0,0.089622
6909,Eppa,Rixey,266.0,251.0,692.0,552.0,290.0,37.0,14.0,4633.0,...,76.0,1986.0,4494.666667,0.514507,1.247689,1.271507,3.147731,1,0,0.023846


In [17]:
# Pitchers whose "Yes" probability lies strictly between 50% and 75%.
# "Yes HOF Probability" holds the model's predict_proba output, i.e. a fraction
# in [0, 1], so the cut-offs are written as fractions. Comparing against 50 and 75
# matched no rows and produced an empty CSV.
Low_Pitcher_Prob = Names_Pitcher_df.loc[
    (Names_Pitcher_df['Yes HOF Probability'] > 0.50)
    & (Names_Pitcher_df['Yes HOF Probability'] < 0.75)
]
Low_Pitcher_Prob.to_csv('Low_Pitcher_Probability.csv', index = False)

In [18]:
# Pitchers whose "Yes" probability lies strictly between 75% and 100%.
# "Yes HOF Probability" holds the model's predict_proba output, i.e. a fraction
# in [0, 1], so the cut-offs are written as fractions. Comparing against 75 and 100
# matched no rows and produced an empty CSV.
# Both bounds stay strict, as in the original comparison.
High_Pitcher_Prob = Names_Pitcher_df.loc[
    (Names_Pitcher_df['Yes HOF Probability'] > 0.75)
    & (Names_Pitcher_df['Yes HOF Probability'] < 1.00)
]
High_Pitcher_Prob.to_csv('High_Pitcher_Probability.csv', index = False)

In [19]:
# Pitchers whose "Yes" probability is above 90%.
# "Yes HOF Probability" holds the model's predict_proba output, i.e. a fraction
# in [0, 1], so the cut-off is written as a fraction. Comparing against 90
# matched no rows and produced an empty CSV.
Top_Pitcher_Prob = Names_Pitcher_df.loc[Names_Pitcher_df['Yes HOF Probability'] > 0.90]
Top_Pitcher_Prob.to_csv('Top_Pitcher_Probability.csv', index = False)

In [20]:
# ML model data post-processing
# Turn the numeric flags back into labels (1 -> 'Y', 0 -> 'N') for both the actual
# induction column and the model's prediction; unmapped values are left unchanged.
b = {1 : 'Y', 0 : 'N'}
for flag_column in ('Inducted', 'HOF Prediction'):
    Names_Pitcher_df[flag_column] = (
        Names_Pitcher_df[flag_column].map(b).fillna(Names_Pitcher_df[flag_column])
    )

# Render the probability as a percentage string with two decimals.
# After this step the column holds text, not float64.
Names_Pitcher_df["Yes HOF Probability"] = Names_Pitcher_df["Yes HOF Probability"].map("{:.2%}".format)

# Write Names_Pitcher_df as a JSON array of records for the JS files/deployment.
Names_Pitcher_df.to_json('Names_Pitcher.json', orient = 'records')

In [21]:
# Read the 2022 HOF ballot pitchers CSV and display it.
# NOTE(review): the IP values shown for this file end only in .0/.1/.2 (e.g. 725.2),
# while hall_pitching stores fractional innings (e.g. 1109.333333) — confirm whether
# IP needs converting before it is scaled and fed to the model.
file_path = Path('./Resources/Revised_CSV/2022_HOF_Pitchers_Class.csv')
HOF_2022_df = pd.read_csv(filepath_or_buffer=file_path)
HOF_2022_df

Unnamed: 0,Name,W,L,G,GS,CG,ShO,SV,H,ER,...,SO,WP,HBP,TBF,R,IP,W %,K/BB,WHIP,ERA
0,Billy Wagner,47,40,853,0,0,0,422,601,232,...,1196,43,33,3600,262,903.0,0.54023,3.99,1.0,2.31
1,Jonathan Papelbon,41,36,689,3,0,0,368,572,197,...,808,14,34,2938,226,725.2,0.532468,4.37,1.04,2.44
2,Joe Nathan,64,34,787,29,0,0,377,690,294,...,976,47,23,3771,317,923.1,0.653061,2.84,1.12,2.87
3,Roger Clemens,354,184,709,707,118,46,0,4185,1707,...,4672,143,159,20240,1885,4916.2,0.657993,2.96,1.17,3.12
4,Matt Thornton,36,46,748,1,0,0,23,594,251,...,642,28,20,2799,273,662.2,0.439024,2.56,1.28,3.41
5,Curt Schilling,216,146,569,436,83,20,22,2998,1253,...,3116,72,52,13284,1318,3261.0,0.596685,4.38,1.14,3.46
6,Javier Lopez,30,17,839,0,0,0,14,484,206,...,358,16,27,2273,227,533.1,0.638298,1.52,1.35,3.48
7,Tim Hudson,222,133,482,479,26,13,0,2957,1213,...,2080,84,124,13005,1319,3126.2,0.625352,2.27,1.24,3.49
8,Jake Peavy,152,126,388,377,15,6,0,2134,960,...,2207,46,78,9838,1011,2377.0,0.546763,3.12,1.2,3.63
9,Tim Lincecum,110,89,278,270,10,7,1,1506,699,...,1736,107,44,7120,746,1682.0,0.552764,2.59,1.29,3.74


In [22]:
# Build the 2022 feature frame: drop the TBF column and use the player name as the
# row index so that only the stat columns remain.
XPitcher_2022 = HOF_2022_df.drop(columns=['TBF']).set_index("Name")

In [23]:
# Scale the 2022 ballot features with the scaler fitted on the hall_pitching features.
# The ballot CSV labels some columns differently from the frame the scaler was fitted
# on (e.g. "ShO" vs "SHO", "W %" vs "Win Percentage", "K/BB" vs "Strikeout to Walk"),
# and scikit-learn validates DataFrame column names in transform(). The values are
# therefore relabelled positionally with the fitted column names; the DataFrame
# constructor raises if the number of columns differs.
# NOTE(review): assumes the ballot columns are in the same order as X — confirm.
XPitcher_2022_scaled = X_scaler.transform(
    pd.DataFrame(
        XPitcher_2022.to_numpy(),
        index=XPitcher_2022.index,
        columns=X.columns,
    )
)

In [24]:
# Predict a class label for each pitcher on the 2022 ballot and keep the result
# as a plain Python list.
predictions = loaded_model.predict(XPitcher_2022_scaled).tolist()

In [25]:
# Class probabilities for each 2022 ballot pitcher, as a list of per-row lists.
prediction_proba = loaded_model.predict_proba(XPitcher_2022_scaled).tolist()

# Transpose the rows into one list per class.
# NOTE(review): assumes exactly two classes, ordered N (0) then Y (1) — confirm.
N_proba, Y_proba = (list(column) for column in zip(*prediction_proba))

In [26]:
# DF post-processing for visualizations
# Predicted label per ballot pitcher, shown as 'Y'/'N' (1 -> 'Y', 0 -> 'N');
# values outside the mapping are kept unchanged.
b = {1 : 'Y', 0 : 'N'}
predicted_flags = pd.Series(predictions, index=HOF_2022_df.index)
HOF_2022_df["HOF Prediction"] = predicted_flags.map(b).fillna(predicted_flags)

# Probability of the "Yes" class, rendered as a percentage string with two decimals.
# The column holds text afterwards, not float64.
yes_probability = pd.Series(pd.to_numeric(Y_proba), index=HOF_2022_df.index)
HOF_2022_df["Yes HOF Probability"] = yes_probability.map("{:.2%}".format)

# Export the styled ballot table as a PNG image.
dfi.export(HOF_2022_df.style, 'Pitching_HOF_2022_Results.png')

In [27]:
# Read the 2021 HOF ballot pitchers CSV and display it.
# NOTE(review): the IP values shown for this file end only in .0/.1/.2 (e.g. 4916.2),
# while hall_pitching stores fractional innings (e.g. 1109.333333) — confirm whether
# IP needs converting before it is scaled and fed to the model.
file_path = Path('./Resources/Revised_CSV/2021_HOF_Class_Pitchers.csv')
HOF_2021_df = pd.read_csv(filepath_or_buffer=file_path)
HOF_2021_df

Unnamed: 0,Name,Team,W,L,G,GS,CG,ShO,SV,H,...,SO,WP,HBP,R,IP,W%,K/BB,WHIP,ERA,playerid
0,Billy Wagner,- - -,47,40,853,0,0,0,422,601,...,1196,43,33,262,903.0,0.54023,3.99,1.0,2.31,578
1,Roger Clemens,- - -,354,184,709,707,118,46,0,4185,...,4672,143,159,1885,4916.2,0.657993,2.96,1.17,3.12,815
2,Curt Schilling,- - -,216,146,569,436,83,20,22,2998,...,3116,72,52,1318,3261.0,0.596685,4.38,1.14,3.46,73
3,Tim Hudson,- - -,222,133,482,479,26,13,0,2957,...,2080,84,124,1319,3126.2,0.625352,2.27,1.24,3.49,921
4,Dan Haren,- - -,153,131,391,380,16,6,1,2357,...,2013,98,67,1105,2419.2,0.538732,4.03,1.18,3.75,1757
5,Mark Buehrle,- - -,214,160,518,493,33,10,0,3472,...,1870,27,79,1542,3283.1,0.572193,2.55,1.28,3.81,225
6,Andy Pettitte,- - -,256,153,531,521,26,4,0,3448,...,2448,69,55,1572,3316.0,0.625917,2.37,1.35,3.85,840
7,A.J. Burnett,- - -,164,157,435,430,24,10,0,2519,...,2513,161,143,1328,2731.1,0.510903,2.28,1.32,3.99,512
8,Barry Zito,- - -,165,143,433,421,12,5,0,2381,...,1885,50,97,1254,2576.2,0.535714,1.77,1.34,4.04,944
9,LaTroy Hawkins,- - -,75,94,1042,98,2,0,127,1607,...,983,70,21,763,1467.1,0.443787,2.16,1.41,4.31,729


In [28]:
# Remove the Team and playerid columns from the 2021 ballot frame.
HOF_2021_df = HOF_2021_df.drop(['Team', 'playerid'], axis="columns")

# Feature frame for the model: the remaining stat columns, indexed by player name.
XPitcher_2021 = HOF_2021_df.set_index("Name")

In [29]:
# Scale the 2021 ballot features with the scaler fitted on the hall_pitching features.
# The ballot CSV labels some columns differently from the frame the scaler was fitted
# on (e.g. "ShO" vs "SHO", "W%" vs "Win Percentage", "K/BB" vs "Strikeout to Walk"),
# and scikit-learn validates DataFrame column names in transform(). The values are
# therefore relabelled positionally with the fitted column names; the DataFrame
# constructor raises if the number of columns differs.
# NOTE(review): assumes the ballot columns are in the same order as X — confirm.
XPitcher_2021_scaled = X_scaler.transform(
    pd.DataFrame(
        XPitcher_2021.to_numpy(),
        index=XPitcher_2021.index,
        columns=X.columns,
    )
)

In [30]:
# Predict a class label for each pitcher on the 2021 ballot and keep the result
# as a plain Python list.
predictions = loaded_model.predict(XPitcher_2021_scaled).tolist()

In [31]:
# Class probabilities for each 2021 ballot pitcher, as a list of per-row lists.
prediction_proba = loaded_model.predict_proba(XPitcher_2021_scaled).tolist()

# Transpose the rows into one list per class.
# NOTE(review): assumes exactly two classes, ordered N (0) then Y (1) — confirm.
N_proba, Y_proba = (list(column) for column in zip(*prediction_proba))

In [32]:
# DF post-processing for visualizations
HOF_2021_df["HOF Prediction"] = predictions

# Probability of the "Yes" class as a numeric column. It is deliberately kept as a
# fraction: the "{:.2%}" format below multiplies by 100 itself, so scaling the
# column by 100 first rendered values such as "9999.99%".
HOF_2021_df["Yes HOF Probability"] = pd.to_numeric(Y_proba)

# Convert the prediction column to labels: 1 = Y, 0 = N (other values are kept as-is)
b = {1 : 'Y', 0 : 'N'}
HOF_2021_df["HOF Prediction"] = HOF_2021_df["HOF Prediction"].map(b).fillna(HOF_2021_df["HOF Prediction"])

# Format the "Yes HOF Probability" column as a percentage string; does not maintain float 64 type
HOF_2021_df["Yes HOF Probability"] = HOF_2021_df["Yes HOF Probability"].map("{:.2%}".format)

# export DF Ballot as png
dfi.export(HOF_2021_df.style, 'Pitching_HOF_2021_Results.png')

In [33]:
# Read the 2023 HOF ballot pitchers CSV and display it.
# NOTE(review): the IP values shown for this file end only in .0/.1/.2 (e.g. 725.2),
# while hall_pitching stores fractional innings (e.g. 1109.333333) — confirm whether
# IP needs converting before it is scaled and fed to the model.
file_path = Path('./Resources/Revised_CSV/2023_HOF_Class_Pitchers.csv')
HOF_2023_df = pd.read_csv(filepath_or_buffer=file_path)
HOF_2023_df

Unnamed: 0,Name,Team,W,L,G,GS,CG,ShO,SV,H,...,SO,WP,HBP,R,IP,W%,K/BB,WHIP,ERA,playerid
0,Billy Wagner,- - -,47,40,853,0,0,0,422,601,...,1196,43,33,262,903.0,0.54023,3.99,1.0,2.31,578
1,Jonathan Papelbon,- - -,41,36,689,3,0,0,368,572,...,808,14,34,226,725.2,0.532468,4.37,1.04,2.44,5975
2,Francisco Rodriguez,- - -,52,53,948,0,0,0,437,738,...,1142,68,17,336,976.0,0.495238,2.94,1.15,2.86,1642
3,Joe Nathan,- - -,64,34,787,29,0,0,377,690,...,976,47,23,317,923.1,0.653061,2.84,1.12,2.87,1122
4,Huston Street,- - -,42,34,668,0,0,0,324,542,...,665,20,8,237,680.0,0.552632,3.63,1.07,2.95,8258
5,Curt Schilling,- - -,216,146,569,436,83,20,22,2998,...,3116,72,52,1318,3261.0,0.596685,4.38,1.14,3.46,73
6,Tim Hudson,- - -,222,133,482,479,26,13,0,2957,...,2080,84,124,1319,3126.2,0.625352,2.27,1.24,3.49,921
7,Jered Weaver,- - -,150,98,331,331,14,8,0,1912,...,1621,37,55,879,2067.1,0.604839,2.94,1.19,3.63,4235
8,Jake Peavy,- - -,152,126,388,377,15,6,0,2134,...,2207,46,78,1011,2377.0,0.546763,3.12,1.2,3.63,1051
9,Matt Cain,SFG,104,118,342,331,15,6,0,1849,...,1694,70,62,910,2085.2,0.468468,2.38,1.23,3.68,4732


In [34]:
# Remove the Team and playerid columns from the 2023 ballot frame.
HOF_2023_df = HOF_2023_df.drop(['Team', 'playerid'], axis="columns")

# Feature frame for the model: the remaining stat columns, indexed by player name.
XPitcher_2023 = HOF_2023_df.set_index("Name")

In [35]:
# Scale the 2023 ballot features with the scaler fitted on the hall_pitching features.
# The ballot CSV labels some columns differently from the frame the scaler was fitted
# on (e.g. "ShO" vs "SHO", "W%" vs "Win Percentage", "K/BB" vs "Strikeout to Walk"),
# and scikit-learn validates DataFrame column names in transform(). The values are
# therefore relabelled positionally with the fitted column names; the DataFrame
# constructor raises if the number of columns differs.
# NOTE(review): assumes the ballot columns are in the same order as X — confirm.
XPitcher_2023_scaled = X_scaler.transform(
    pd.DataFrame(
        XPitcher_2023.to_numpy(),
        index=XPitcher_2023.index,
        columns=X.columns,
    )
)

In [36]:
# Predict a class label for each pitcher on the 2023 ballot and keep the result
# as a plain Python list.
predictions = loaded_model.predict(XPitcher_2023_scaled).tolist()

In [37]:
# Class probabilities for each 2023 ballot pitcher, as a list of per-row lists.
prediction_proba = loaded_model.predict_proba(XPitcher_2023_scaled).tolist()

# Transpose the rows into one list per class.
# NOTE(review): assumes exactly two classes, ordered N (0) then Y (1) — confirm.
N_proba, Y_proba = (list(column) for column in zip(*prediction_proba))

In [38]:
# DF post-processing for visualizations
HOF_2023_df["HOF Prediction"] = predictions

# Probability of the "Yes" class as a numeric column. It is deliberately kept as a
# fraction: the "{:.2%}" format below multiplies by 100 itself, so scaling the
# column by 100 first rendered values such as "9999.99%".
HOF_2023_df["Yes HOF Probability"] = pd.to_numeric(Y_proba)

# Convert the prediction column to labels: 1 = Y, 0 = N (other values are kept as-is)
b = {1 : 'Y', 0 : 'N'}
HOF_2023_df["HOF Prediction"] = HOF_2023_df["HOF Prediction"].map(b).fillna(HOF_2023_df["HOF Prediction"])

# Format the "Yes HOF Probability" column as a percentage string; does not maintain float 64 type
HOF_2023_df["Yes HOF Probability"] = HOF_2023_df["Yes HOF Probability"].map("{:.2%}".format)

# export DF Ballot as png
dfi.export(HOF_2023_df.style, 'Pitching_HOF_2023_Results.png')