In [1]:
# Import our dependencies
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler
from pathlib import Path

In [2]:
import psycopg2
from config import db_password

# Open a connection to the local "baseball_data" PostgreSQL database.
# The PostgreSQL server must be accessed through the PostgreSQL APP or Terminal Shell.
# The password is read from the local config module rather than written here.
conn = psycopg2.connect(
    host="localhost",
    port=5432,
    database="baseball_data",
    user="postgres",
    password=db_password,
)

In [3]:
# Create a cursor object on the open connection.
# NOTE(review): none of the cells shown here runs a query through `cur`
# (pd.read_sql is given `conn` directly); the cursor is only opened and later closed.
cur = conn.cursor()

In [4]:
# Pull every row and column of the hall_batter table from Postgres into a DataFrame
hall_batter_query = 'SELECT * FROM hall_batter'
hall_batter = pd.read_sql(hall_batter_query, conn)
hall_batter

Unnamed: 0,playerID,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO,HBP,SH,SF,AVG,OBP,SLG,inducted
0,aardsda01,331.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.000000,0.000000,0.000000,N
1,aaronha01,3298.0,12364.0,2174.0,3771.0,624.0,98.0,755.0,2297.0,240.0,1402.0,1383.0,32.0,21.0,121.0,0.304998,0.373949,0.554513,Y
2,aaronto01,437.0,944.0,102.0,216.0,42.0,6.0,13.0,94.0,9.0,86.0,145.0,0.0,9.0,6.0,0.228814,0.291506,0.327331,N
3,aasedo01,448.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,N
4,abadan01,15.0,21.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,4.0,5.0,0.0,0.0,0.0,0.095238,0.240000,0.095238,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17517,zupcibo01,319.0,795.0,99.0,199.0,47.0,4.0,7.0,80.0,7.0,57.0,137.0,6.0,20.0,8.0,0.250314,0.302540,0.345912,N
17518,zupofr01,16.0,18.0,3.0,3.0,1.0,0.0,0.0,0.0,0.0,2.0,6.0,0.0,0.0,0.0,0.166667,0.250000,0.222222,N
17519,zuvelpa01,209.0,491.0,41.0,109.0,17.0,2.0,2.0,20.0,2.0,34.0,50.0,2.0,18.0,0.0,0.221996,0.275142,0.276986,N
17520,zuverge01,266.0,142.0,5.0,21.0,2.0,1.0,0.0,7.0,0.0,9.0,39.0,0.0,16.0,0.0,0.147887,0.198675,0.176056,N


In [5]:
# hall_batter is now held in memory, so release the database resources:
# close the cursor first, then the connection it was created from.
cur.close()
conn.close()

In [6]:
# ML model data preprocessing
# Encode the inducted label numerically: 'Y' -> 1, 'N' -> 0.
# Any value that is not a key of the mapping is kept as it was (map() yields NaN
# for it, and fillna() puts the original value back).
b = {'Y': 1, 'N': 0}
inducted_raw = hall_batter['inducted']
hall_batter['inducted'] = inducted_raw.map(b).fillna(inducted_raw)
hall_batter

Unnamed: 0,playerID,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO,HBP,SH,SF,AVG,OBP,SLG,inducted
0,aardsda01,331.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.000000,0.000000,0.000000,0
1,aaronha01,3298.0,12364.0,2174.0,3771.0,624.0,98.0,755.0,2297.0,240.0,1402.0,1383.0,32.0,21.0,121.0,0.304998,0.373949,0.554513,1
2,aaronto01,437.0,944.0,102.0,216.0,42.0,6.0,13.0,94.0,9.0,86.0,145.0,0.0,9.0,6.0,0.228814,0.291506,0.327331,0
3,aasedo01,448.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0
4,abadan01,15.0,21.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,4.0,5.0,0.0,0.0,0.0,0.095238,0.240000,0.095238,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17517,zupcibo01,319.0,795.0,99.0,199.0,47.0,4.0,7.0,80.0,7.0,57.0,137.0,6.0,20.0,8.0,0.250314,0.302540,0.345912,0
17518,zupofr01,16.0,18.0,3.0,3.0,1.0,0.0,0.0,0.0,0.0,2.0,6.0,0.0,0.0,0.0,0.166667,0.250000,0.222222,0
17519,zuvelpa01,209.0,491.0,41.0,109.0,17.0,2.0,2.0,20.0,2.0,34.0,50.0,2.0,18.0,0.0,0.221996,0.275142,0.276986,0
17520,zuverge01,266.0,142.0,5.0,21.0,2.0,1.0,0.0,7.0,0.0,9.0,39.0,0.0,16.0,0.0,0.147887,0.198675,0.176056,0


In [7]:
# Feature matrix: every hall_batter column except the player identifier and the
# inducted label (no separate target variable is created in this cell)
non_feature_columns = ["playerID", "inducted"]
X = hall_batter.drop(columns=non_feature_columns)

In [8]:
# Scale the data
# Create a StandardScaler, learn the scaling from X and apply it in one step
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Keep the fitted scaler under the name the later cells use to transform the
# ballot DataFrames (fit() returns the scaler itself, so this is the same object)
X_scaler = scaler
# NOTE(review): the scaler is fitted here on the full hall_batter table — confirm
# this matches the scaling the saved model was trained with.

In [9]:
# Load the previously saved model from the notebook's working directory.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load model files that come from a trusted source.
model_file = 'Final_XGB_Batter_Model.sav'
loaded_model = joblib.load(model_file)

In [10]:
# Predict a class for every row of the scaled hall_batter features and keep the
# result as a plain Python list
predictions = loaded_model.predict(X_scaled).tolist()

In [11]:
# Class probabilities for every row of the scaled hall_batter features,
# as a list holding one list of probabilities per row
prediction_proba = loaded_model.predict_proba(X_scaled).tolist()

# Transpose the rows into one list per probability column; the unpacking
# requires exactly two columns (first -> N_proba, second -> Y_proba)
N_proba, Y_proba = (list(column) for column in zip(*prediction_proba))

In [12]:
# DF post-processing for visualizations
proba_col = "Yes HOF Probability"

# Predicted class for each row
hall_batter["HOF Prediction"] = predictions

# Second probability column (Y_proba), coerced to a numeric dtype and
# rescaled from a 0-1 fraction to a 0-100 percentage
hall_batter[proba_col] = Y_proba
hall_batter[proba_col] = pd.to_numeric(hall_batter[proba_col]) * 100

hall_batter

Unnamed: 0,playerID,G,AB,R,H,2B,3B,HR,RBI,SB,...,SO,HBP,SH,SF,AVG,OBP,SLG,inducted,HOF Prediction,Yes HOF Probability
0,aardsda01,331.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,1.0,0.0,0.000000,0.000000,0.000000,0,0,0.000121
1,aaronha01,3298.0,12364.0,2174.0,3771.0,624.0,98.0,755.0,2297.0,240.0,...,1383.0,32.0,21.0,121.0,0.304998,0.373949,0.554513,1,1,99.983084
2,aaronto01,437.0,944.0,102.0,216.0,42.0,6.0,13.0,94.0,9.0,...,145.0,0.0,9.0,6.0,0.228814,0.291506,0.327331,0,0,0.000149
3,aasedo01,448.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0,0,0.000121
4,abadan01,15.0,21.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.095238,0.240000,0.095238,0,0,0.000121
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17517,zupcibo01,319.0,795.0,99.0,199.0,47.0,4.0,7.0,80.0,7.0,...,137.0,6.0,20.0,8.0,0.250314,0.302540,0.345912,0,0,0.000200
17518,zupofr01,16.0,18.0,3.0,3.0,1.0,0.0,0.0,0.0,0.0,...,6.0,0.0,0.0,0.0,0.166667,0.250000,0.222222,0,0,0.000121
17519,zuvelpa01,209.0,491.0,41.0,109.0,17.0,2.0,2.0,20.0,2.0,...,50.0,2.0,18.0,0.0,0.221996,0.275142,0.276986,0,0,0.000184
17520,zuverge01,266.0,142.0,5.0,21.0,2.0,1.0,0.0,7.0,0.0,...,39.0,0.0,16.0,0.0,0.147887,0.198675,0.176056,0,0,0.000156


In [13]:
# Load the player-name lookup CSV (playerID with first and last name).
# The path is relative to the notebook's working directory.
file_path = Path('../Resources/Revised_CSV/Player_Names.csv')
Player_Names_df = pd.read_csv(file_path)
Player_Names_df

Unnamed: 0.1,Unnamed: 0,playerID,First Name,Last Name
0,0,aardsda01,David,Aardsma
1,1,aaronha01,Hank,Aaron
2,2,aaronto01,Tommie,Aaron
3,3,aasedo01,Don,Aase
4,4,abadan01,Andy,Abad
...,...,...,...,...
20088,20088,zupofr01,Frank,Zupo
20089,20089,zuvelpa01,Paul,Zuvella
20090,20090,zuverge01,George,Zuverink
20091,20091,zwilldu01,Dutch,Zwilling


In [14]:
# Merge with Name DF for deployment:
#   1. join the names onto the stats/predictions by playerID (merge's default
#      join keeps only playerIDs present in both frames)
#   2. drop the leftover "Unnamed: 0" column and the join key
#   3. capitalise the label column for display
Names_Batter_df = (
    Player_Names_df
    .merge(hall_batter, on='playerID')
    .drop(columns=["Unnamed: 0", "playerID"])
    .rename(columns={'inducted': 'Inducted'})
)

In [15]:
# Inducted players that the model also predicts as inducted
was_inducted = Names_Batter_df['Inducted'] == 1
predicted_in = Names_Batter_df['HOF Prediction'] == 1
Names_Batter_df.loc[was_inducted & predicted_in]

Unnamed: 0,First Name,Last Name,G,AB,R,H,2B,3B,HR,RBI,...,SO,HBP,SH,SF,AVG,OBP,SLG,Inducted,HOF Prediction,Yes HOF Probability
1,Hank,Aaron,3298.0,12364.0,2174.0,3771.0,624.0,98.0,755.0,2297.0,...,1383.0,32.0,21.0,121.0,0.304998,0.373949,0.554513,1,1,99.983084
239,Roberto,Alomar,2379.0,9073.0,1508.0,2724.0,504.0,80.0,210.0,1134.0,...,1140.0,50.0,148.0,97.0,0.300231,0.371245,0.442852,1,1,99.997354
378,Cap,Anson,2524.0,10281.0,1999.0,3435.0,582.0,142.0,97.0,2075.0,...,330.0,32.0,34.0,0.0,0.334111,0.393998,0.446649,1,1,99.996984
386,Luis,Aparicio,2601.0,10230.0,1335.0,2677.0,394.0,92.0,83.0,791.0,...,742.0,27.0,161.0,76.0,0.261681,0.310778,0.342522,1,1,99.893051
466,Richie,Ashburn,2189.0,8365.0,1322.0,2574.0,317.0,109.0,29.0,586.0,...,571.0,43.0,112.0,18.0,0.307711,0.396405,0.382068,1,1,89.595985
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17067,Hack,Wilson,1348.0,4760.0,884.0,1461.0,266.0,67.0,244.0,1063.0,...,713.0,20.0,102.0,0.0,0.306933,0.395123,0.544748,1,1,96.709019
17120,Dave,Winfield,2973.0,11003.0,1669.0,3110.0,540.0,88.0,465.0,1833.0,...,1686.0,25.0,19.0,95.0,0.282650,0.352622,0.474507,1,1,99.983644
17351,Carl,Yastrzemski,3308.0,11988.0,1816.0,3419.0,646.0,59.0,452.0,1844.0,...,1393.0,40.0,13.0,105.0,0.285202,0.379453,0.462045,1,1,99.940014
17431,Ross,Youngs,1211.0,4627.0,812.0,1491.0,236.0,93.0,42.0,592.0,...,390.0,37.0,119.0,0.0,0.322239,0.398542,0.440674,1,1,99.981493


In [16]:
# Every inducted player, whatever the model predicted
Names_Batter_df[Names_Batter_df['Inducted'].eq(1)]

Unnamed: 0,First Name,Last Name,G,AB,R,H,2B,3B,HR,RBI,...,SO,HBP,SH,SF,AVG,OBP,SLG,Inducted,HOF Prediction,Yes HOF Probability
1,Hank,Aaron,3298.0,12364.0,2174.0,3771.0,624.0,98.0,755.0,2297.0,...,1383.0,32.0,21.0,121.0,0.304998,0.373949,0.554513,1,1,99.983084
239,Roberto,Alomar,2379.0,9073.0,1508.0,2724.0,504.0,80.0,210.0,1134.0,...,1140.0,50.0,148.0,97.0,0.300231,0.371245,0.442852,1,1,99.997354
378,Cap,Anson,2524.0,10281.0,1999.0,3435.0,582.0,142.0,97.0,2075.0,...,330.0,32.0,34.0,0.0,0.334111,0.393998,0.446649,1,1,99.996984
386,Luis,Aparicio,2601.0,10230.0,1335.0,2677.0,394.0,92.0,83.0,791.0,...,742.0,27.0,161.0,76.0,0.261681,0.310778,0.342522,1,1,99.893051
466,Richie,Ashburn,2189.0,8365.0,1322.0,2574.0,317.0,109.0,29.0,586.0,...,571.0,43.0,112.0,18.0,0.307711,0.396405,0.382068,1,1,89.595985
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17067,Hack,Wilson,1348.0,4760.0,884.0,1461.0,266.0,67.0,244.0,1063.0,...,713.0,20.0,102.0,0.0,0.306933,0.395123,0.544748,1,1,96.709019
17120,Dave,Winfield,2973.0,11003.0,1669.0,3110.0,540.0,88.0,465.0,1833.0,...,1686.0,25.0,19.0,95.0,0.282650,0.352622,0.474507,1,1,99.983644
17351,Carl,Yastrzemski,3308.0,11988.0,1816.0,3419.0,646.0,59.0,452.0,1844.0,...,1393.0,40.0,13.0,105.0,0.285202,0.379453,0.462045,1,1,99.940014
17431,Ross,Youngs,1211.0,4627.0,812.0,1491.0,236.0,93.0,42.0,592.0,...,390.0,37.0,119.0,0.0,0.322239,0.398542,0.440674,1,1,99.981493


In [17]:
# Players who are not inducted but whom the model predicts as inducted
not_inducted = Names_Batter_df['Inducted'] == 0
predicted_in = Names_Batter_df['HOF Prediction'] == 1
Names_Batter_df.loc[not_inducted & predicted_in]

Unnamed: 0,First Name,Last Name,G,AB,R,H,2B,3B,HR,RBI,...,SO,HBP,SH,SF,AVG,OBP,SLG,Inducted,HOF Prediction,Yes HOF Probability
247,Moises,Alou,1942.0,7037.0,1109.0,2134.0,421.0,39.0,332.0,1287.0,...,894.0,48.0,9.0,82.0,0.303254,0.369307,0.515703,0,1,63.637656
602,Harold,Baines,2830.0,9908.0,1299.0,2866.0,488.0,49.0,384.0,1628.0,...,1441.0,14.0,9.0,99.0,0.289261,0.355680,0.464675,0,1,77.877462
938,Ginger,Beaumont,1463.0,5660.0,955.0,1759.0,182.0,82.0,39.0,617.0,...,314.0,30.0,166.0,0.0,0.310777,0.362061,0.392580,0,1,69.286424
1121,Wally,Berger,1350.0,5163.0,809.0,1550.0,299.0,59.0,242.0,898.0,...,694.0,38.0,27.0,0.0,0.300213,0.358943,0.521596,0,1,99.068403
1377,Ossie,Bluege,1867.0,6440.0,883.0,1751.0,276.0,67.0,43.0,848.0,...,515.0,71.0,218.0,0.0,0.271894,0.351811,0.355590,0,1,67.113680
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16731,Billy,Werber,1295.0,5024.0,875.0,1363.0,271.0,50.0,78.0,539.0,...,363.0,32.0,88.0,0.0,0.271298,0.364079,0.391720,0,1,77.524775
16762,Sam,West,1753.0,6148.0,934.0,1838.0,347.0,101.0,75.0,838.0,...,540.0,5.0,123.0,0.0,0.298959,0.370711,0.424854,0,1,64.531475
16940,Bernie,Williams,2076.0,7869.0,1366.0,2336.0,449.0,55.0,287.0,1257.0,...,1212.0,39.0,12.0,64.0,0.296861,0.380931,0.477316,0,1,57.110327
16947,Cy,Williams,2002.0,6780.0,1024.0,1981.0,306.0,74.0,251.0,1005.0,...,721.0,86.0,164.0,0.0,0.292183,0.364876,0.470206,0,1,58.962649


In [18]:
# Inducted players whom the model predicts as not inducted
was_inducted = Names_Batter_df['Inducted'] == 1
predicted_out = Names_Batter_df['HOF Prediction'] == 0
Names_Batter_df.loc[was_inducted & predicted_out]

Unnamed: 0,First Name,Last Name,G,AB,R,H,2B,3B,HR,RBI,...,SO,HBP,SH,SF,AVG,OBP,SLG,Inducted,HOF Prediction,Yes HOF Probability
676,Dave,Bancroft,1913.0,7182.0,1048.0,2004.0,320.0,77.0,32.0,591.0,...,487.0,23.0,212.0,0.0,0.279031,0.355329,0.358396,1,0,35.340378
1061,Johnny,Bench,2158.0,7658.0,1091.0,2048.0,381.0,24.0,389.0,1376.0,...,1278.0,19.0,11.0,90.0,0.267433,0.341649,0.475842,1,0,26.863223
4069,Larry,Doby,1533.0,5348.0,960.0,1515.0,243.0,52.0,253.0,970.0,...,1011.0,38.0,20.0,25.0,0.283283,0.385864,0.49009,1,0,2.46269
8341,Ralph,Kiner,1472.0,5205.0,971.0,1451.0,216.0,39.0,369.0,1015.0,...,749.0,24.0,9.0,7.0,0.27877,0.397951,0.547935,1,0,3.024101
9273,Ernie,Lombardi,1853.0,5855.0,601.0,1792.0,277.0,27.0,190.0,990.0,...,262.0,46.0,18.0,0.0,0.306063,0.358237,0.459949,1,0,4.123737
10031,Bill,Mazeroski,2163.0,7755.0,769.0,2016.0,294.0,62.0,138.0,853.0,...,706.0,20.0,87.0,70.0,0.259961,0.299445,0.367247,1,0,0.002418
12461,Mike,Piazza,1912.0,6911.0,1048.0,2127.0,344.0,8.0,427.0,1335.0,...,1113.0,30.0,0.0,45.0,0.30777,0.376501,0.545218,1,0,1.164138
14029,Ray,Schalk,1762.0,5306.0,579.0,1345.0,199.0,49.0,11.0,594.0,...,355.0,59.0,214.0,0.0,0.253487,0.340163,0.31568,1,0,24.526992


In [19]:
# Import 2022 Batting HOF Ballot CSV.
# `file_path` is reassigned by every CSV-loading cell, so after this cell it
# points at the 2022 file.
file_path = Path('../Resources/Revised_CSV/2022_HOF_Class_hitters.csv')
HOF_2022_df = pd.read_csv(file_path)
HOF_2022_df

Unnamed: 0,Name,Team,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO,HBP,SH,SF,AVG,OBP,SLG,playerid
0,Barry Bonds,- - -,2986,9847,2227,2935,601,77,762,1996,514,2558,1539,106,4,91,0.298,0.444,0.607,1109
1,Manny Ramirez,- - -,2302,8244,1544,2574,547,20,555,1831,38,1329,1813,109,2,90,0.312,0.411,0.585,210
2,Todd Helton,Rockies,2247,7962,1401,2519,592,37,369,1406,37,1335,1175,57,3,93,0.316,0.414,0.539,432
3,Alex Rodriguez,- - -,2784,10566,2021,3115,548,31,696,2086,329,1338,2287,176,16,111,0.295,0.38,0.55,1274
4,David Ortiz,- - -,2408,8640,1419,2472,632,19,541,1768,17,1319,1750,38,2,92,0.286,0.38,0.552,745
5,Gary Sheffield,- - -,2576,9217,1636,2689,467,27,509,1676,253,1475,1171,135,9,111,0.292,0.393,0.514,114
6,Bobby Abreu,- - -,2425,8480,1453,2470,574,59,288,1363,400,1476,1840,33,7,85,0.291,0.395,0.475,945
7,Prince Fielder,- - -,1611,5821,862,1645,321,10,319,1028,18,847,1155,124,0,61,0.283,0.382,0.506,4613
8,Mark Teixeira,- - -,1862,6936,1099,1862,408,18,409,1298,26,918,1441,111,0,64,0.268,0.36,0.509,1281
9,Sammy Sosa,- - -,2354,8813,1475,2408,379,45,609,1667,234,929,2306,59,17,78,0.273,0.344,0.534,302


In [20]:
# Feature matrix for the 2022 ballot, indexed by player name.
# Select exactly the columns the scaler was fitted on (X.columns) instead of
# dropping "Team" and "playerid": the selection keeps the fitted column order,
# and the cell stays re-runnable after the post-processing cell has removed
# those two columns from HOF_2022_df and added the prediction columns to it
# (a plain drop would then raise KeyError, and the added columns would
# otherwise end up among the features).
XBatter_2022 = HOF_2022_df.set_index('Name')[list(X.columns)]

In [21]:
# Scale the 2022 ballot features with the scaler that was already fitted on the
# hall_batter features (transform only — the scaler is not re-fitted here)
XBatter_2022_scaled = X_scaler.transform(XBatter_2022)

In [22]:
# Predict a class for every player on the 2022 ballot, as a plain Python list.
# This overwrites the `predictions` list produced by the earlier prediction cell.
predictions = loaded_model.predict(XBatter_2022_scaled).tolist()

In [23]:
# Class probabilities for every player on the 2022 ballot,
# as a list holding one list of probabilities per row
prediction_proba = loaded_model.predict_proba(XBatter_2022_scaled).tolist()

# Transpose the rows into one list per probability column; the unpacking
# requires exactly two columns (first -> N_proba, second -> Y_proba)
N_proba, Y_proba = (list(column) for column in zip(*prediction_proba))

In [24]:
# DF post-processing for visualizations (2022 ballot).
# `predictions` and `Y_proba` are shared names that every prediction cell
# overwrites, so run this cell right after the 2022 prediction cells.
HOF_2022_df["HOF Prediction"] = predictions

# Second probability column (Y_proba), coerced to a numeric dtype and
# rescaled from a 0-1 fraction to a 0-100 percentage
HOF_2022_df["Yes HOF Probability"] = Y_proba
HOF_2022_df["Yes HOF Probability"] = 100 * pd.to_numeric(HOF_2022_df["Yes HOF Probability"])

# Team / playerid are not shown in the final table. errors="ignore" keeps the
# cell re-runnable: on a second run the columns are already gone and a plain
# drop would raise KeyError.
HOF_2022_df = HOF_2022_df.drop(columns=["Team", "playerid"], errors="ignore")

HOF_2022_df

Unnamed: 0,Name,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO,HBP,SH,SF,AVG,OBP,SLG,HOF Prediction,Yes HOF Probability
0,Barry Bonds,2986,9847,2227,2935,601,77,762,1996,514,2558,1539,106,4,91,0.298,0.444,0.607,1,99.016374
1,Manny Ramirez,2302,8244,1544,2574,547,20,555,1831,38,1329,1813,109,2,90,0.312,0.411,0.585,1,63.031465
2,Todd Helton,2247,7962,1401,2519,592,37,369,1406,37,1335,1175,57,3,93,0.316,0.414,0.539,0,9.083164
3,Alex Rodriguez,2784,10566,2021,3115,548,31,696,2086,329,1338,2287,176,16,111,0.295,0.38,0.55,1,55.802155
4,David Ortiz,2408,8640,1419,2472,632,19,541,1768,17,1319,1750,38,2,92,0.286,0.38,0.552,0,1.164552
5,Gary Sheffield,2576,9217,1636,2689,467,27,509,1676,253,1475,1171,135,9,111,0.292,0.393,0.514,1,99.950743
6,Bobby Abreu,2425,8480,1453,2470,574,59,288,1363,400,1476,1840,33,7,85,0.291,0.395,0.475,0,2.879627
7,Prince Fielder,1611,5821,862,1645,321,10,319,1028,18,847,1155,124,0,61,0.283,0.382,0.506,0,0.007427
8,Mark Teixeira,1862,6936,1099,1862,408,18,409,1298,26,918,1441,111,0,64,0.268,0.36,0.509,0,0.232098
9,Sammy Sosa,2354,8813,1475,2408,379,45,609,1667,234,929,2306,59,17,78,0.273,0.344,0.534,1,57.197458


In [25]:
# Import 2021 Batting HOF Ballot CSV.
# `file_path` is reassigned by every CSV-loading cell, so after this cell it
# points at the 2021 file.
file_path = Path('../Resources/Revised_CSV/2021_HOF_Class_Batters.csv')
HOF_2021_df = pd.read_csv(file_path)
HOF_2021_df

Unnamed: 0,Name,Team,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO,HBP,SH,SF,AVG,OBP,SLG,playerid
0,Barry Bonds,- - -,2986,9847,2227,2935,601,77,762,1996,514,2558,1539,106,4,91,0.298,0.444,0.607,1109
1,Manny Ramirez,- - -,2302,8244,1544,2574,547,20,555,1831,38,1329,1813,109,2,90,0.312,0.411,0.585,210
2,Todd Helton,COL,2247,7962,1401,2519,592,37,369,1406,37,1335,1175,57,3,93,0.316,0.414,0.539,432
3,Gary Sheffield,- - -,2576,9217,1636,2689,467,27,509,1676,253,1475,1171,135,9,111,0.292,0.393,0.514,114
4,Bobby Abreu,- - -,2425,8480,1453,2470,574,59,288,1363,400,1476,1840,33,7,85,0.291,0.395,0.475,945
5,Sammy Sosa,- - -,2354,8813,1475,2408,379,45,609,1667,234,929,2306,59,17,78,0.273,0.344,0.534,302
6,Scott Rolen,- - -,2038,7398,1211,2077,517,43,316,1287,118,899,1410,127,1,93,0.281,0.364,0.49,970
7,Jeff Kent,- - -,2298,8498,1320,2461,560,47,377,1518,94,801,1522,125,10,103,0.29,0.356,0.5,1119
8,Aramis Ramirez,- - -,2194,8136,1098,2303,495,24,386,1417,29,633,1238,127,3,87,0.283,0.341,0.492,1002
9,Andruw Jones,- - -,2196,7599,1204,1933,383,36,434,1289,152,891,1748,97,6,71,0.254,0.337,0.486,96


In [26]:
# Feature matrix for the 2021 ballot, indexed by player name.
# Select exactly the columns the scaler was fitted on (X.columns) instead of
# dropping "Team" and "playerid": the selection keeps the fitted column order,
# and the cell stays re-runnable after the post-processing cell has removed
# those two columns from HOF_2021_df and added the prediction columns to it
# (a plain drop would then raise KeyError, and the added columns would
# otherwise end up among the features).
XBatter_2021 = HOF_2021_df.set_index('Name')[list(X.columns)]

In [27]:
# Scale the 2021 ballot features with the scaler that was already fitted on the
# hall_batter features (transform only — the scaler is not re-fitted here)
XBatter_2021_scaled = X_scaler.transform(XBatter_2021)

In [28]:
# Predict a class for every player on the 2021 ballot, as a plain Python list.
# This overwrites the `predictions` list produced by the earlier prediction cells.
predictions = loaded_model.predict(XBatter_2021_scaled).tolist()

In [29]:
# Class probabilities for every player on the 2021 ballot,
# as a list holding one list of probabilities per row
prediction_proba = loaded_model.predict_proba(XBatter_2021_scaled).tolist()

# Transpose the rows into one list per probability column; the unpacking
# requires exactly two columns (first -> N_proba, second -> Y_proba)
N_proba, Y_proba = (list(column) for column in zip(*prediction_proba))

In [30]:
# DF post-processing for visualizations (2021 ballot).
# `predictions` and `Y_proba` are shared names that every prediction cell
# overwrites, so run this cell right after the 2021 prediction cells.
HOF_2021_df["HOF Prediction"] = predictions

# Second probability column (Y_proba), coerced to a numeric dtype and
# rescaled from a 0-1 fraction to a 0-100 percentage
HOF_2021_df["Yes HOF Probability"] = Y_proba
HOF_2021_df["Yes HOF Probability"] = 100 * pd.to_numeric(HOF_2021_df["Yes HOF Probability"])

# Team / playerid are not shown in the final table. errors="ignore" keeps the
# cell re-runnable: on a second run the columns are already gone and a plain
# drop would raise KeyError.
HOF_2021_df = HOF_2021_df.drop(columns=["Team", "playerid"], errors="ignore")

HOF_2021_df

Unnamed: 0,Name,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO,HBP,SH,SF,AVG,OBP,SLG,HOF Prediction,Yes HOF Probability
0,Barry Bonds,2986,9847,2227,2935,601,77,762,1996,514,2558,1539,106,4,91,0.298,0.444,0.607,1,99.016374
1,Manny Ramirez,2302,8244,1544,2574,547,20,555,1831,38,1329,1813,109,2,90,0.312,0.411,0.585,1,63.031465
2,Todd Helton,2247,7962,1401,2519,592,37,369,1406,37,1335,1175,57,3,93,0.316,0.414,0.539,0,9.083164
3,Gary Sheffield,2576,9217,1636,2689,467,27,509,1676,253,1475,1171,135,9,111,0.292,0.393,0.514,1,99.950743
4,Bobby Abreu,2425,8480,1453,2470,574,59,288,1363,400,1476,1840,33,7,85,0.291,0.395,0.475,0,2.879627
5,Sammy Sosa,2354,8813,1475,2408,379,45,609,1667,234,929,2306,59,17,78,0.273,0.344,0.534,1,57.197458
6,Scott Rolen,2038,7398,1211,2077,517,43,316,1287,118,899,1410,127,1,93,0.281,0.364,0.49,0,0.171231
7,Jeff Kent,2298,8498,1320,2461,560,47,377,1518,94,801,1522,125,10,103,0.29,0.356,0.5,0,33.142805
8,Aramis Ramirez,2194,8136,1098,2303,495,24,386,1417,29,633,1238,127,3,87,0.283,0.341,0.492,0,2.61706
9,Andruw Jones,2196,7599,1204,1933,383,36,434,1289,152,891,1748,97,6,71,0.254,0.337,0.486,0,1.175741


In [31]:
# Import 2023 Batting HOF Ballot CSV.
# `file_path` is reassigned by every CSV-loading cell, so after this cell it
# points at the 2023 file.
file_path = Path('../Resources/Revised_CSV/2023_HOF_Class_Batters.csv')
HOF_2023_df = pd.read_csv(file_path)
HOF_2023_df

Unnamed: 0,Name,Team,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO,HBP,SH,SF,AVG,OBP,SLG,playerid
0,Todd Helton,COL,2247,7962,1401,2519,592,37,369,1406,37,1335,1175,57,3,93,0.316,0.414,0.539,432
1,Alex Rodriguez,- - -,2784,10566,2021,3115,548,31,696,2086,329,1338,2287,176,16,111,0.295,0.38,0.55,1274
2,David Ortiz,- - -,2408,8640,1419,2472,632,19,541,1768,17,1319,1750,38,2,92,0.286,0.38,0.552,745
3,Bobby Abreu,- - -,2425,8480,1453,2470,574,59,288,1363,400,1476,1840,33,7,85,0.291,0.395,0.475,945
4,Prince Fielder,- - -,1611,5821,862,1645,321,10,319,1028,18,847,1155,124,0,61,0.283,0.382,0.506,4613
5,Mark Teixeira,- - -,1862,6936,1099,1862,408,18,409,1298,26,918,1441,111,0,64,0.268,0.36,0.509,1281
6,Scott Rolen,- - -,2038,7398,1211,2077,517,43,316,1287,118,899,1410,127,1,93,0.281,0.364,0.49,970
7,Jeff Kent,- - -,2298,8498,1320,2461,560,47,377,1518,94,801,1522,125,10,103,0.29,0.356,0.5,1119
8,Ryan Howard,PHI,1572,5707,848,1475,277,21,382,1194,12,709,1843,59,0,55,0.258,0.343,0.515,2154
9,Carlos Beltran,- - -,2586,9768,1582,2725,565,78,435,1587,312,1084,1795,51,18,110,0.279,0.35,0.486,589


In [32]:
# Feature matrix for the 2023 ballot, indexed by player name.
# Select exactly the columns the scaler was fitted on (X.columns) instead of
# dropping "Team" and "playerid": the selection keeps the fitted column order,
# and the cell stays re-runnable after the post-processing cell has removed
# those two columns from HOF_2023_df and added the prediction columns to it
# (a plain drop would then raise KeyError, and the added columns would
# otherwise end up among the features).
XBatter_2023 = HOF_2023_df.set_index('Name')[list(X.columns)]

In [33]:
# Scale the 2023 ballot features with the scaler that was already fitted on the
# hall_batter features (transform only — the scaler is not re-fitted here)
XBatter_2023_scaled = X_scaler.transform(XBatter_2023)

In [34]:
# Predict a class for every player on the 2023 ballot, as a plain Python list.
# This overwrites the `predictions` list produced by the earlier prediction cells.
predictions = loaded_model.predict(XBatter_2023_scaled).tolist()

In [35]:
# Class probabilities for every player on the 2023 ballot,
# as a list holding one list of probabilities per row
prediction_proba = loaded_model.predict_proba(XBatter_2023_scaled).tolist()

# Transpose the rows into one list per probability column; the unpacking
# requires exactly two columns (first -> N_proba, second -> Y_proba)
N_proba, Y_proba = (list(column) for column in zip(*prediction_proba))

In [36]:
# DF post-processing for visualizations (2023 ballot).
# `predictions` and `Y_proba` are shared names that every prediction cell
# overwrites, so run this cell right after the 2023 prediction cells.
HOF_2023_df["HOF Prediction"] = predictions

# Second probability column (Y_proba), coerced to a numeric dtype and
# rescaled from a 0-1 fraction to a 0-100 percentage
HOF_2023_df["Yes HOF Probability"] = Y_proba
HOF_2023_df["Yes HOF Probability"] = 100 * pd.to_numeric(HOF_2023_df["Yes HOF Probability"])

# Team / playerid are not shown in the final table. errors="ignore" keeps the
# cell re-runnable: on a second run the columns are already gone and a plain
# drop would raise KeyError.
HOF_2023_df = HOF_2023_df.drop(columns=["Team", "playerid"], errors="ignore")

HOF_2023_df

Unnamed: 0,Name,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO,HBP,SH,SF,AVG,OBP,SLG,HOF Prediction,Yes HOF Probability
0,Todd Helton,2247,7962,1401,2519,592,37,369,1406,37,1335,1175,57,3,93,0.316,0.414,0.539,0,9.083164
1,Alex Rodriguez,2784,10566,2021,3115,548,31,696,2086,329,1338,2287,176,16,111,0.295,0.38,0.55,1,55.802155
2,David Ortiz,2408,8640,1419,2472,632,19,541,1768,17,1319,1750,38,2,92,0.286,0.38,0.552,0,1.164552
3,Bobby Abreu,2425,8480,1453,2470,574,59,288,1363,400,1476,1840,33,7,85,0.291,0.395,0.475,0,2.879627
4,Prince Fielder,1611,5821,862,1645,321,10,319,1028,18,847,1155,124,0,61,0.283,0.382,0.506,0,0.007427
5,Mark Teixeira,1862,6936,1099,1862,408,18,409,1298,26,918,1441,111,0,64,0.268,0.36,0.509,0,0.232098
6,Scott Rolen,2038,7398,1211,2077,517,43,316,1287,118,899,1410,127,1,93,0.281,0.364,0.49,0,0.171231
7,Jeff Kent,2298,8498,1320,2461,560,47,377,1518,94,801,1522,125,10,103,0.29,0.356,0.5,0,33.142805
8,Ryan Howard,1572,5707,848,1475,277,21,382,1194,12,709,1843,59,0,55,0.258,0.343,0.515,0,0.024013
9,Carlos Beltran,2586,9768,1582,2725,565,78,435,1587,312,1084,1795,51,18,110,0.279,0.35,0.486,0,49.936971
