In [1]:
# Import our dependencies

import pandas as pd
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from pathlib import Path
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE

In [None]:
import psycopg2
from config import db_password

# Establish a connection to the database by creating a cursor object
# The PostgreSQL server must be accessed through the PostgreSQL APP or Terminal Shell

conn = psycopg2.connect(host="localhost", port = 5432, database="baseball_data", user="postgres", password=db_password)


In [None]:
# Create a cursor object
cur = conn.cursor()

In [None]:
# A sample query of all data from the "career_batter" table in the "baseball_data" database
cur.execute("""SELECT * FROM career_batter LIMIT 5""")
query_results = cur.fetchall()
print(query_results)

In [None]:
# import entire hall_pitching table from postgres to dataframe
hall_pitching = pd.read_sql('SELECT * FROM hall_pitching', conn)
hall_pitching

In [None]:
# import entire hall_pitching table from postgres to dataframe
career_pitching = pd.read_sql('SELECT * FROM career_pitching', conn)
career_pitching

In [None]:
cur.close()
conn.close()

In [2]:
# Import CSV
file_path = Path('../Resources/Revised_CSV/hall_pitching.csv')
hall_pitching = pd.read_csv(file_path)
hall_pitching

Unnamed: 0,playerID,W,L,G,GS,CG,SHO,SV,H,ER,...,SO,WP,HBP,R,IP,Win Percentage,Strikeout to Walk,WHIP,ERA_corrected,inducted
0,aardsda01,16.0,18.0,331.0,0.0,0.0,0.0,69.0,296.0,160.0,...,340.0,12.0,16.0,169.0,337.000000,0.470588,1.857923,1.421365,4.272997,N
1,aasedo01,66.0,60.0,448.0,91.0,22.0,5.0,82.0,1085.0,468.0,...,641.0,22.0,7.0,503.0,1109.333333,0.523810,1.402626,1.390024,3.796875,N
2,abadfe01,8.0,29.0,384.0,6.0,0.0,0.0,2.0,309.0,135.0,...,280.0,10.0,12.0,143.0,330.666667,0.216216,2.413793,1.285282,3.674395,N
3,abbeybe01,22.0,40.0,79.0,65.0,52.0,0.0,1.0,686.0,285.0,...,161.0,18.0,26.0,442.0,568.000000,0.354839,0.838542,1.545775,4.515845,N
4,abbotda01,0.0,2.0,3.0,1.0,1.0,0.0,1.0,19.0,9.0,...,1.0,3.0,1.0,14.0,13.000000,0.000000,0.125000,2.076923,6.230769,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9130,zuberbi01,43.0,42.0,224.0,65.0,23.0,3.0,6.0,767.0,374.0,...,383.0,28.0,4.0,418.0,786.000000,0.505882,0.818376,1.571247,4.282443,N
9131,zuberty01,1.0,2.0,23.0,0.0,0.0,0.0,0.0,15.0,10.0,...,30.0,1.0,1.0,11.0,22.000000,0.333333,1.500000,1.590909,4.090909,N
9132,zumayjo01,13.0,12.0,171.0,0.0,0.0,0.0,5.0,169.0,71.0,...,210.0,16.0,4.0,80.0,209.666667,0.520000,1.842105,1.349762,3.047695,N
9133,zuverge01,32.0,36.0,265.0,31.0,9.0,2.0,40.0,660.0,253.0,...,223.0,10.0,27.0,296.0,642.333333,0.470588,1.098522,1.343539,3.544888,N


In [3]:
# RFC model data preprocessing
# Convert inducted column to Y = 1, N = 0

b = {'Y': 1, 'N': 0}
hall_pitching['inducted'] = hall_pitching['inducted'].map(b).fillna(hall_pitching['inducted'])
hall_pitching

Unnamed: 0,playerID,W,L,G,GS,CG,SHO,SV,H,ER,...,SO,WP,HBP,R,IP,Win Percentage,Strikeout to Walk,WHIP,ERA_corrected,inducted
0,aardsda01,16.0,18.0,331.0,0.0,0.0,0.0,69.0,296.0,160.0,...,340.0,12.0,16.0,169.0,337.000000,0.470588,1.857923,1.421365,4.272997,0
1,aasedo01,66.0,60.0,448.0,91.0,22.0,5.0,82.0,1085.0,468.0,...,641.0,22.0,7.0,503.0,1109.333333,0.523810,1.402626,1.390024,3.796875,0
2,abadfe01,8.0,29.0,384.0,6.0,0.0,0.0,2.0,309.0,135.0,...,280.0,10.0,12.0,143.0,330.666667,0.216216,2.413793,1.285282,3.674395,0
3,abbeybe01,22.0,40.0,79.0,65.0,52.0,0.0,1.0,686.0,285.0,...,161.0,18.0,26.0,442.0,568.000000,0.354839,0.838542,1.545775,4.515845,0
4,abbotda01,0.0,2.0,3.0,1.0,1.0,0.0,1.0,19.0,9.0,...,1.0,3.0,1.0,14.0,13.000000,0.000000,0.125000,2.076923,6.230769,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9130,zuberbi01,43.0,42.0,224.0,65.0,23.0,3.0,6.0,767.0,374.0,...,383.0,28.0,4.0,418.0,786.000000,0.505882,0.818376,1.571247,4.282443,0
9131,zuberty01,1.0,2.0,23.0,0.0,0.0,0.0,0.0,15.0,10.0,...,30.0,1.0,1.0,11.0,22.000000,0.333333,1.500000,1.590909,4.090909,0
9132,zumayjo01,13.0,12.0,171.0,0.0,0.0,0.0,5.0,169.0,71.0,...,210.0,16.0,4.0,80.0,209.666667,0.520000,1.842105,1.349762,3.047695,0
9133,zuverge01,32.0,36.0,265.0,31.0,9.0,2.0,40.0,660.0,253.0,...,223.0,10.0,27.0,296.0,642.333333,0.470588,1.098522,1.343539,3.544888,0


In [4]:
# Set target and features variables
y = hall_pitching.inducted
X = hall_pitching.drop(columns=["playerID", "inducted"])

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

#Scale the data
#Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [5]:
# Use SMOTE to addres imbalanced dataset
X_resampled, y_resampled = SMOTE(random_state=42,
sampling_strategy='auto').fit_resample(
   X_train_scaled, y_train)

In [6]:
# Create a XGB model
XGB_model = XGBClassifier(max_depth = 5, min_child_weight = 1, gamma = 0, subsample = 0.8, colsample_bytree = 0.8, scale_pos_weight = 1, use_label_encoder = False)

# Fit the model
XGB_model = XGB_model.fit(X_resampled, y_resampled)



In [7]:
print(XGB_model)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.8,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)


In [8]:
predictions = XGB_model.predict(X_test_scaled)

predictions

array([1, 0, 0, ..., 0, 0, 0])

In [9]:
# Evaluate the model

# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(cm, index= ["Actual 0", "Actual 1"], columns = ["Predicted 0", "Predicted 1"])

# Calculate the accuracy score
acc_score = accuracy_score(y_test, predictions)

# Display results
print("Confustion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confustion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,2251,12
Actual 1,5,16


Accuracy Score : 0.9925569176882661
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.99      1.00      2263
           1       0.57      0.76      0.65        21

    accuracy                           0.99      2284
   macro avg       0.78      0.88      0.82      2284
weighted avg       0.99      0.99      0.99      2284



In [10]:
# Calculate feature importance in the RFC model
importances = XGB_model.feature_importances_
importances

array([0.47458616, 0.01553954, 0.04010931, 0.04815817, 0.07176445,
       0.06199738, 0.04456316, 0.01231529, 0.01381674, 0.03951965,
       0.00763004, 0.03835852, 0.01386172, 0.01216148, 0.00565775,
       0.00936862, 0.02027355, 0.01126593, 0.02477773, 0.03427481],
      dtype=float32)

In [11]:
# Sort the features by their importance
sorted(zip(XGB_model.feature_importances_, X.columns), reverse=True)

[(0.47458616, 'W'),
 (0.071764454, 'CG'),
 (0.061997376, 'SHO'),
 (0.048158173, 'GS'),
 (0.044563156, 'SV'),
 (0.040109314, 'G'),
 (0.039519653, 'HR'),
 (0.03835852, 'SO'),
 (0.03427481, 'ERA_corrected'),
 (0.02477773, 'WHIP'),
 (0.020273546, 'Win Percentage'),
 (0.015539536, 'L'),
 (0.0138617195, 'WP'),
 (0.013816736, 'ER'),
 (0.012315285, 'H'),
 (0.01216148, 'HBP'),
 (0.011265931, 'Strikeout to Walk'),
 (0.009368624, 'IP'),
 (0.0076300413, 'BB'),
 (0.0056577474, 'R')]

In [12]:
# Whole DF
X_2 = hall_pitching.drop(columns=["playerID", "inducted"])
X_2_scaled = X_scaler.transform(X_2)

# Make predictions using the testing data
predictions = XGB_model.predict(X_2_scaled)

predictions

array([0, 0, 0, ..., 0, 0, 0])

In [13]:
predictions = predictions.tolist()

In [14]:
hall_pitching["HOF Prediction"] = predictions
hall_pitching

Unnamed: 0,playerID,W,L,G,GS,CG,SHO,SV,H,ER,...,WP,HBP,R,IP,Win Percentage,Strikeout to Walk,WHIP,ERA_corrected,inducted,HOF Prediction
0,aardsda01,16.0,18.0,331.0,0.0,0.0,0.0,69.0,296.0,160.0,...,12.0,16.0,169.0,337.000000,0.470588,1.857923,1.421365,4.272997,0,0
1,aasedo01,66.0,60.0,448.0,91.0,22.0,5.0,82.0,1085.0,468.0,...,22.0,7.0,503.0,1109.333333,0.523810,1.402626,1.390024,3.796875,0,0
2,abadfe01,8.0,29.0,384.0,6.0,0.0,0.0,2.0,309.0,135.0,...,10.0,12.0,143.0,330.666667,0.216216,2.413793,1.285282,3.674395,0,0
3,abbeybe01,22.0,40.0,79.0,65.0,52.0,0.0,1.0,686.0,285.0,...,18.0,26.0,442.0,568.000000,0.354839,0.838542,1.545775,4.515845,0,0
4,abbotda01,0.0,2.0,3.0,1.0,1.0,0.0,1.0,19.0,9.0,...,3.0,1.0,14.0,13.000000,0.000000,0.125000,2.076923,6.230769,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9130,zuberbi01,43.0,42.0,224.0,65.0,23.0,3.0,6.0,767.0,374.0,...,28.0,4.0,418.0,786.000000,0.505882,0.818376,1.571247,4.282443,0,0
9131,zuberty01,1.0,2.0,23.0,0.0,0.0,0.0,0.0,15.0,10.0,...,1.0,1.0,11.0,22.000000,0.333333,1.500000,1.590909,4.090909,0,0
9132,zumayjo01,13.0,12.0,171.0,0.0,0.0,0.0,5.0,169.0,71.0,...,16.0,4.0,80.0,209.666667,0.520000,1.842105,1.349762,3.047695,0,0
9133,zuverge01,32.0,36.0,265.0,31.0,9.0,2.0,40.0,660.0,253.0,...,10.0,27.0,296.0,642.333333,0.470588,1.098522,1.343539,3.544888,0,0


In [15]:
# Measure probability of predictions
prediction_proba = XGB_model.predict_proba(X_2_scaled)

prediction_proba

array([[9.99973834e-01, 2.61460718e-05],
       [9.98019338e-01, 1.98067329e-03],
       [9.99998093e-01, 1.90846117e-06],
       ...,
       [9.99892294e-01, 1.07703614e-04],
       [9.95591521e-01, 4.40849969e-03],
       [9.99965191e-01, 3.48108442e-05]], dtype=float32)

In [16]:
prediction_proba = prediction_proba.tolist()

In [17]:
# Split the list into two lists
N_proba, Y_proba = map(list, zip(*prediction_proba))

In [18]:
# Create a column with the probability for a Yes
hall_pitching["Yes HOF Probability"] = Y_proba
hall_pitching

Unnamed: 0,playerID,W,L,G,GS,CG,SHO,SV,H,ER,...,HBP,R,IP,Win Percentage,Strikeout to Walk,WHIP,ERA_corrected,inducted,HOF Prediction,Yes HOF Probability
0,aardsda01,16.0,18.0,331.0,0.0,0.0,0.0,69.0,296.0,160.0,...,16.0,169.0,337.000000,0.470588,1.857923,1.421365,4.272997,0,0,0.000026
1,aasedo01,66.0,60.0,448.0,91.0,22.0,5.0,82.0,1085.0,468.0,...,7.0,503.0,1109.333333,0.523810,1.402626,1.390024,3.796875,0,0,0.001981
2,abadfe01,8.0,29.0,384.0,6.0,0.0,0.0,2.0,309.0,135.0,...,12.0,143.0,330.666667,0.216216,2.413793,1.285282,3.674395,0,0,0.000002
3,abbeybe01,22.0,40.0,79.0,65.0,52.0,0.0,1.0,686.0,285.0,...,26.0,442.0,568.000000,0.354839,0.838542,1.545775,4.515845,0,0,0.000119
4,abbotda01,0.0,2.0,3.0,1.0,1.0,0.0,1.0,19.0,9.0,...,1.0,14.0,13.000000,0.000000,0.125000,2.076923,6.230769,0,0,0.003873
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9130,zuberbi01,43.0,42.0,224.0,65.0,23.0,3.0,6.0,767.0,374.0,...,4.0,418.0,786.000000,0.505882,0.818376,1.571247,4.282443,0,0,0.000080
9131,zuberty01,1.0,2.0,23.0,0.0,0.0,0.0,0.0,15.0,10.0,...,1.0,11.0,22.000000,0.333333,1.500000,1.590909,4.090909,0,0,0.000012
9132,zumayjo01,13.0,12.0,171.0,0.0,0.0,0.0,5.0,169.0,71.0,...,4.0,80.0,209.666667,0.520000,1.842105,1.349762,3.047695,0,0,0.000108
9133,zuverge01,32.0,36.0,265.0,31.0,9.0,2.0,40.0,660.0,253.0,...,27.0,296.0,642.333333,0.470588,1.098522,1.343539,3.544888,0,0,0.004408


In [19]:
# Import CSV
file_path = Path('../Resources/Revised_CSV/Player_Names.csv')
Player_Names_df = pd.read_csv(file_path)
Player_Names_df

Unnamed: 0.1,Unnamed: 0,playerID,First Name,Last Name
0,0,aardsda01,David,Aardsma
1,1,aaronha01,Hank,Aaron
2,2,aaronto01,Tommie,Aaron
3,3,aasedo01,Don,Aase
4,4,abadan01,Andy,Abad
...,...,...,...,...
20088,20088,zupofr01,Frank,Zupo
20089,20089,zuvelpa01,Paul,Zuvella
20090,20090,zuverge01,George,Zuverink
20091,20091,zwilldu01,Dutch,Zwilling


In [20]:
Names_Pitching_df = Player_Names_df.merge(hall_pitching, left_on = 'playerID', right_on = 'playerID')

In [21]:
Names_Pitching_df = Names_Pitching_df.drop(columns = ["Unnamed: 0", "playerID"])
Names_Pitching_df

Unnamed: 0,First Name,Last Name,W,L,G,GS,CG,SHO,SV,H,...,HBP,R,IP,Win Percentage,Strikeout to Walk,WHIP,ERA_corrected,inducted,HOF Prediction,Yes HOF Probability
0,David,Aardsma,16.0,18.0,331.0,0.0,0.0,0.0,69.0,296.0,...,16.0,169.0,337.000000,0.470588,1.857923,1.421365,4.272997,0,0,0.000026
1,Don,Aase,66.0,60.0,448.0,91.0,22.0,5.0,82.0,1085.0,...,7.0,503.0,1109.333333,0.523810,1.402626,1.390024,3.796875,0,0,0.001981
2,Fernando,Abad,8.0,29.0,384.0,6.0,0.0,0.0,2.0,309.0,...,12.0,143.0,330.666667,0.216216,2.413793,1.285282,3.674395,0,0,0.000002
3,Bert,Abbey,22.0,40.0,79.0,65.0,52.0,0.0,1.0,686.0,...,26.0,442.0,568.000000,0.354839,0.838542,1.545775,4.515845,0,0,0.000119
4,Dan,Abbott,0.0,2.0,3.0,1.0,1.0,0.0,1.0,19.0,...,1.0,14.0,13.000000,0.000000,0.125000,2.076923,6.230769,0,0,0.003873
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9130,Bill,Zuber,43.0,42.0,224.0,65.0,23.0,3.0,6.0,767.0,...,4.0,418.0,786.000000,0.505882,0.818376,1.571247,4.282443,0,0,0.000080
9131,Tyler,Zuber,1.0,2.0,23.0,0.0,0.0,0.0,0.0,15.0,...,1.0,11.0,22.000000,0.333333,1.500000,1.590909,4.090909,0,0,0.000012
9132,Joel,Zumaya,13.0,12.0,171.0,0.0,0.0,0.0,5.0,169.0,...,4.0,80.0,209.666667,0.520000,1.842105,1.349762,3.047695,0,0,0.000108
9133,George,Zuverink,32.0,36.0,265.0,31.0,9.0,2.0,40.0,660.0,...,27.0,296.0,642.333333,0.470588,1.098522,1.343539,3.544888,0,0,0.004408


In [22]:
Names_Pitching_df = Names_Pitching_df.rename(columns = {'inducted' : 'Inducted'})

In [23]:
# Convert "Yes HOF Probability" column from object to float64
Names_Pitching_df["Yes HOF Probability"] = pd.to_numeric(Names_Pitching_df["Yes HOF Probability"])

In [24]:
Names_Pitching_df["Yes HOF Probability"] = 100 * Names_Pitching_df["Yes HOF Probability"]
Names_Pitching_df

Unnamed: 0,First Name,Last Name,W,L,G,GS,CG,SHO,SV,H,...,HBP,R,IP,Win Percentage,Strikeout to Walk,WHIP,ERA_corrected,Inducted,HOF Prediction,Yes HOF Probability
0,David,Aardsma,16.0,18.0,331.0,0.0,0.0,0.0,69.0,296.0,...,16.0,169.0,337.000000,0.470588,1.857923,1.421365,4.272997,0,0,0.002615
1,Don,Aase,66.0,60.0,448.0,91.0,22.0,5.0,82.0,1085.0,...,7.0,503.0,1109.333333,0.523810,1.402626,1.390024,3.796875,0,0,0.198067
2,Fernando,Abad,8.0,29.0,384.0,6.0,0.0,0.0,2.0,309.0,...,12.0,143.0,330.666667,0.216216,2.413793,1.285282,3.674395,0,0,0.000191
3,Bert,Abbey,22.0,40.0,79.0,65.0,52.0,0.0,1.0,686.0,...,26.0,442.0,568.000000,0.354839,0.838542,1.545775,4.515845,0,0,0.011942
4,Dan,Abbott,0.0,2.0,3.0,1.0,1.0,0.0,1.0,19.0,...,1.0,14.0,13.000000,0.000000,0.125000,2.076923,6.230769,0,0,0.387298
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9130,Bill,Zuber,43.0,42.0,224.0,65.0,23.0,3.0,6.0,767.0,...,4.0,418.0,786.000000,0.505882,0.818376,1.571247,4.282443,0,0,0.007954
9131,Tyler,Zuber,1.0,2.0,23.0,0.0,0.0,0.0,0.0,15.0,...,1.0,11.0,22.000000,0.333333,1.500000,1.590909,4.090909,0,0,0.001200
9132,Joel,Zumaya,13.0,12.0,171.0,0.0,0.0,0.0,5.0,169.0,...,4.0,80.0,209.666667,0.520000,1.842105,1.349762,3.047695,0,0,0.010770
9133,George,Zuverink,32.0,36.0,265.0,31.0,9.0,2.0,40.0,660.0,...,27.0,296.0,642.333333,0.470588,1.098522,1.343539,3.544888,0,0,0.440850


In [25]:
Names_Pitching_df.loc[(Names_Pitching_df['Inducted'] == 1) & (Names_Pitching_df['HOF Prediction'] == 1) ]

Unnamed: 0,First Name,Last Name,W,L,G,GS,CG,SHO,SV,H,...,HBP,R,IP,Win Percentage,Strikeout to Walk,WHIP,ERA_corrected,Inducted,HOF Prediction,Yes HOF Probability
95,Pete,Alexander,373.0,208.0,696.0,599.0,437.0,90.0,32.0,4868.0,...,70.0,1851.0,5190.000000,0.641997,2.311251,1.121195,2.559538,1,1,99.993217
189,Cap,Anson,0.0,1.0,3.0,0.0,0.0,0.0,1.0,4.0,...,0.0,5.0,4.000000,0.000000,0.500000,1.500000,4.500000,1,1,90.041989
493,Jake,Beckley,0.0,1.0,1.0,1.0,0.0,0.0,0.0,9.0,...,0.0,8.0,4.000000,0.000000,2.000000,2.500000,6.750000,1,1,66.216177
546,Chief,Bender,212.0,127.0,459.0,334.0,255.0,40.0,34.0,2645.0,...,102.0,1110.0,3017.000000,0.625369,2.403090,1.112695,2.455088,1,1,99.976796
708,Bert,Blyleven,287.0,250.0,692.0,685.0,242.0,60.0,0.0,4632.0,...,155.0,2029.0,4970.000000,0.534451,2.799546,1.197988,3.313883,1,1,99.277544
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8675,Mickey,Welch,307.0,210.0,565.0,549.0,525.0,41.0,4.0,4588.0,...,52.0,2556.0,4802.000000,0.593810,1.426369,1.225531,2.711995,1,1,99.990201
8792,Hoyt,Wilhelm,143.0,122.0,1070.0,52.0,20.0,5.0,227.0,1757.0,...,62.0,773.0,2254.333333,0.539623,2.069409,1.124501,2.523141,1,1,99.898225
8854,Vic,Willis,249.0,205.0,513.0,471.0,388.0,50.0,11.0,3621.0,...,156.0,1620.0,3996.000000,0.548458,1.362211,1.209459,2.628378,1,1,99.865556
9028,Early,Wynn,300.0,244.0,691.0,612.0,290.0,49.0,15.0,4291.0,...,64.0,2037.0,4564.000000,0.551471,1.314930,1.329097,3.541630,1,1,95.448738


In [26]:
Names_Pitching_df.loc[(Names_Pitching_df['Inducted'] == 1)]

Unnamed: 0,First Name,Last Name,W,L,G,GS,CG,SHO,SV,H,...,HBP,R,IP,Win Percentage,Strikeout to Walk,WHIP,ERA_corrected,Inducted,HOF Prediction,Yes HOF Probability
95,Pete,Alexander,373.0,208.0,696.0,599.0,437.0,90.0,32.0,4868.0,...,70.0,1851.0,5190.000000,0.641997,2.311251,1.121195,2.559538,1,1,99.993217
189,Cap,Anson,0.0,1.0,3.0,0.0,0.0,0.0,1.0,4.0,...,0.0,5.0,4.000000,0.000000,0.500000,1.500000,4.500000,1,1,90.041989
493,Jake,Beckley,0.0,1.0,1.0,1.0,0.0,0.0,0.0,9.0,...,0.0,8.0,4.000000,0.000000,2.000000,2.500000,6.750000,1,1,66.216177
546,Chief,Bender,212.0,127.0,459.0,334.0,255.0,40.0,34.0,2645.0,...,102.0,1110.0,3017.000000,0.625369,2.403090,1.112695,2.455088,1,1,99.976796
708,Bert,Blyleven,287.0,250.0,692.0,685.0,242.0,60.0,0.0,4632.0,...,155.0,2029.0,4970.000000,0.534451,2.799546,1.197988,3.313883,1,1,99.277544
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8737,Deacon,White,0.0,0.0,2.0,0.0,0.0,0.0,1.0,19.0,...,0.0,15.0,10.000000,0.000000,1.500000,2.100000,7.200000,1,0,1.396417
8792,Hoyt,Wilhelm,143.0,122.0,1070.0,52.0,20.0,5.0,227.0,1757.0,...,62.0,773.0,2254.333333,0.539623,2.069409,1.124501,2.523141,1,1,99.898225
8854,Vic,Willis,249.0,205.0,513.0,471.0,388.0,50.0,11.0,3621.0,...,156.0,1620.0,3996.000000,0.548458,1.362211,1.209459,2.628378,1,1,99.865556
9028,Early,Wynn,300.0,244.0,691.0,612.0,290.0,49.0,15.0,4291.0,...,64.0,2037.0,4564.000000,0.551471,1.314930,1.329097,3.541630,1,1,95.448738


In [27]:
Names_Pitching_df.loc[(Names_Pitching_df['Inducted'] == 0) & (Names_Pitching_df['HOF Prediction'] == 1) ]

Unnamed: 0,First Name,Last Name,W,L,G,GS,CG,SHO,SV,H,...,HBP,R,IP,Win Percentage,Strikeout to Walk,WHIP,ERA_corrected,Inducted,HOF Prediction,Yes HOF Probability
37,Babe,Adams,194.0,140.0,482.0,355.0,206.0,44.0,15.0,2841.0,...,47.0,1129.0,2995.333333,0.580838,2.409302,1.092032,2.755286,0,1,99.945146
288,King,Bader,5.0,3.0,22.0,6.0,3.0,1.0,1.0,83.0,...,5.0,29.0,75.333333,0.625,0.75,1.579646,2.50885,0,1,73.786259
744,Tommy,Bond,234.0,163.0,417.0,408.0,386.0,42.0,0.0,3765.0,...,2.0,1931.0,3628.666667,0.589421,5.036269,1.090759,2.137975,0,1,65.69674
1038,Charlie,Buffinton,233.0,152.0,414.0,396.0,351.0,30.0,3.0,3344.0,...,31.0,1824.0,3404.0,0.605195,1.985981,1.233843,2.961222,0,1,99.691391
1509,Roger,Clemens,354.0,184.0,709.0,707.0,118.0,46.0,0.0,4185.0,...,159.0,1885.0,4916.666667,0.657993,2.956962,1.172542,3.124678,0,1,99.99367
1656,Larry,Corcoran,177.0,89.0,277.0,268.0,256.0,22.0,2.0,2147.0,...,2.0,1235.0,2392.333333,0.665414,2.22379,1.104779,2.355023,0,1,88.461822
2603,Freddie,Fitzsimmons,217.0,146.0,513.0,426.0,186.0,29.0,13.0,3335.0,...,33.0,1505.0,3223.666667,0.597796,1.028369,1.29697,3.509358,0,1,69.921279
4836,Mickey,Lolich,217.0,191.0,586.0,496.0,195.0,41.0,11.0,3366.0,...,92.0,1537.0,3638.333333,0.531863,2.576888,1.22721,3.438388,0,1,96.43805
5171,Bobby,Mathews,297.0,248.0,578.0,568.0,525.0,20.0,3.0,5601.0,...,48.0,3497.0,4956.0,0.544954,2.87218,1.23749,2.856538,0,1,98.595965
5799,Tony,Mullane,284.0,220.0,555.0,504.0,468.0,30.0,15.0,4195.0,...,185.0,2523.0,4531.333333,0.563492,1.28054,1.236501,3.052744,0,1,99.912494


In [28]:
Names_Pitching_df.loc[(Names_Pitching_df['Inducted'] == 1) & (Names_Pitching_df['HOF Prediction'] == 0) ]

Unnamed: 0,First Name,Last Name,W,L,G,GS,CG,SHO,SV,H,...,HBP,R,IP,Win Percentage,Strikeout to Walk,WHIP,ERA_corrected,Inducted,HOF Prediction,Yes HOF Probability
724,Wade,Boggs,0.0,0.0,2.0,0.0,0.0,0.0,0.0,3.0,...,0.0,1.0,2.333333,0.0,2.0,1.714286,3.857143,1,0,0.005779
3044,Lefty,Gomez,189.0,102.0,368.0,320.0,173.0,28.0,9.0,2290.0,...,19.0,1091.0,2503.0,0.649485,1.340639,1.352377,3.343987,1,0,30.707464
3091,Rich,Gossage,124.0,107.0,1002.0,37.0,16.0,0.0,310.0,1497.0,...,47.0,670.0,1809.333333,0.536797,2.051913,1.231945,3.009396,1,0,33.417326
6129,Jim,O'Rourke,0.0,1.0,6.0,0.0,0.0,0.0,2.0,17.0,...,0.0,14.0,19.666667,0.0,2.0,0.966102,4.118644,1,0,0.446685
8737,Deacon,White,0.0,0.0,2.0,0.0,0.0,0.0,1.0,19.0,...,0.0,15.0,10.0,0.0,1.5,2.1,7.2,1,0,1.396417


In [29]:
# Import CSV
file_path = Path('../Resources/Revised_CSV/2022_HOF_Pitchers_Class.csv')
HOF_Pitchers_2022 = pd.read_csv(file_path)
HOF_Pitchers_2022

Unnamed: 0,Name,W,L,G,GS,CG,ShO,SV,H,ER,...,SO,WP,HBP,TBF,R,IP,W %,K/BB,WHIP,ERA
0,Billy Wagner,47,40,853,0,0,0,422,601,232,...,1196,43,33,3600,262,903.0,0.54023,3.99,1.0,2.31
1,Jonathan Papelbon,41,36,689,3,0,0,368,572,197,...,808,14,34,2938,226,725.2,0.532468,4.37,1.04,2.44
2,Joe Nathan,64,34,787,29,0,0,377,690,294,...,976,47,23,3771,317,923.1,0.653061,2.84,1.12,2.87
3,Roger Clemens,354,184,709,707,118,46,0,4185,1707,...,4672,143,159,20240,1885,4916.2,0.657993,2.96,1.17,3.12
4,Matt Thornton,36,46,748,1,0,0,23,594,251,...,642,28,20,2799,273,662.2,0.439024,2.56,1.28,3.41
5,Curt Schilling,216,146,569,436,83,20,22,2998,1253,...,3116,72,52,13284,1318,3261.0,0.596685,4.38,1.14,3.46
6,Javier Lopez,30,17,839,0,0,0,14,484,206,...,358,16,27,2273,227,533.1,0.638298,1.52,1.35,3.48
7,Tim Hudson,222,133,482,479,26,13,0,2957,1213,...,2080,84,124,13005,1319,3126.2,0.625352,2.27,1.24,3.49
8,Jake Peavy,152,126,388,377,15,6,0,2134,960,...,2207,46,78,9838,1011,2377.0,0.546763,3.12,1.2,3.63
9,Tim Lincecum,110,89,278,270,10,7,1,1506,699,...,1736,107,44,7120,746,1682.0,0.552764,2.59,1.29,3.74


In [30]:
# Set target and features variables
XP_2022 = HOF_Pitchers_2022.drop(columns = ['TBF'])

XP_2022 = XP_2022.set_index('Name')

In [31]:
# Scale data
X_scaled = X_scaler.transform(XP_2022)

In [32]:
# Make predictions using the testing data
predictions = XGB_model.predict(X_scaled)

predictions

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [33]:
predictions = predictions.tolist()

In [34]:
HOF_Pitchers_2022["HOF Prediction"] = predictions
HOF_Pitchers_2022

Unnamed: 0,Name,W,L,G,GS,CG,ShO,SV,H,ER,...,WP,HBP,TBF,R,IP,W %,K/BB,WHIP,ERA,HOF Prediction
0,Billy Wagner,47,40,853,0,0,0,422,601,232,...,43,33,3600,262,903.0,0.54023,3.99,1.0,2.31,0
1,Jonathan Papelbon,41,36,689,3,0,0,368,572,197,...,14,34,2938,226,725.2,0.532468,4.37,1.04,2.44,0
2,Joe Nathan,64,34,787,29,0,0,377,690,294,...,47,23,3771,317,923.1,0.653061,2.84,1.12,2.87,0
3,Roger Clemens,354,184,709,707,118,46,0,4185,1707,...,143,159,20240,1885,4916.2,0.657993,2.96,1.17,3.12,1
4,Matt Thornton,36,46,748,1,0,0,23,594,251,...,28,20,2799,273,662.2,0.439024,2.56,1.28,3.41,0
5,Curt Schilling,216,146,569,436,83,20,22,2998,1253,...,72,52,13284,1318,3261.0,0.596685,4.38,1.14,3.46,0
6,Javier Lopez,30,17,839,0,0,0,14,484,206,...,16,27,2273,227,533.1,0.638298,1.52,1.35,3.48,0
7,Tim Hudson,222,133,482,479,26,13,0,2957,1213,...,84,124,13005,1319,3126.2,0.625352,2.27,1.24,3.49,0
8,Jake Peavy,152,126,388,377,15,6,0,2134,960,...,46,78,9838,1011,2377.0,0.546763,3.12,1.2,3.63,0
9,Tim Lincecum,110,89,278,270,10,7,1,1506,699,...,107,44,7120,746,1682.0,0.552764,2.59,1.29,3.74,0


In [35]:
# Measure probability of predictions
prediction_proba = XGB_model.predict_proba(X_scaled)

prediction_proba

array([[9.8123747e-01, 1.8762542e-02],
       [9.9453938e-01, 5.4606469e-03],
       [9.6631098e-01, 3.3689015e-02],
       [6.3300133e-05, 9.9993670e-01],
       [9.9997848e-01, 2.1487898e-05],
       [7.5097597e-01, 2.4902402e-01],
       [9.9992967e-01, 7.0336857e-05],
       [9.9993861e-01, 6.1400016e-05],
       [9.9999732e-01, 2.6603695e-06],
       [9.9999332e-01, 6.6940761e-06],
       [9.9997640e-01, 2.3592296e-05],
       [9.9977142e-01, 2.2860015e-04],
       [9.9999946e-01, 5.2915073e-07],
       [9.9999833e-01, 1.6500522e-06],
       [9.9999988e-01, 1.1279088e-07],
       [9.9999917e-01, 8.4313950e-07]], dtype=float32)

In [36]:
prediction_proba = prediction_proba.tolist()

In [37]:
# Split the list into two lists
N_proba, Y_proba = map(list, zip(*prediction_proba))

In [38]:
# Create a column with the probability for a Yes
HOF_Pitchers_2022["Yes HOF Probability"] = Y_proba
HOF_Pitchers_2022

Unnamed: 0,Name,W,L,G,GS,CG,ShO,SV,H,ER,...,HBP,TBF,R,IP,W %,K/BB,WHIP,ERA,HOF Prediction,Yes HOF Probability
0,Billy Wagner,47,40,853,0,0,0,422,601,232,...,33,3600,262,903.0,0.54023,3.99,1.0,2.31,0,0.01876254
1,Jonathan Papelbon,41,36,689,3,0,0,368,572,197,...,34,2938,226,725.2,0.532468,4.37,1.04,2.44,0,0.005460647
2,Joe Nathan,64,34,787,29,0,0,377,690,294,...,23,3771,317,923.1,0.653061,2.84,1.12,2.87,0,0.03368901
3,Roger Clemens,354,184,709,707,118,46,0,4185,1707,...,159,20240,1885,4916.2,0.657993,2.96,1.17,3.12,1,0.9999367
4,Matt Thornton,36,46,748,1,0,0,23,594,251,...,20,2799,273,662.2,0.439024,2.56,1.28,3.41,0,2.14879e-05
5,Curt Schilling,216,146,569,436,83,20,22,2998,1253,...,52,13284,1318,3261.0,0.596685,4.38,1.14,3.46,0,0.249024
6,Javier Lopez,30,17,839,0,0,0,14,484,206,...,27,2273,227,533.1,0.638298,1.52,1.35,3.48,0,7.033686e-05
7,Tim Hudson,222,133,482,479,26,13,0,2957,1213,...,124,13005,1319,3126.2,0.625352,2.27,1.24,3.49,0,6.140002e-05
8,Jake Peavy,152,126,388,377,15,6,0,2134,960,...,78,9838,1011,2377.0,0.546763,3.12,1.2,3.63,0,2.66037e-06
9,Tim Lincecum,110,89,278,270,10,7,1,1506,699,...,44,7120,746,1682.0,0.552764,2.59,1.29,3.74,0,6.694076e-06


In [39]:
# Convert "Yes HOF Probability" column from object to float64
HOF_Pitchers_2022["Yes HOF Probability"] = pd.to_numeric(HOF_Pitchers_2022["Yes HOF Probability"])

In [40]:
HOF_Pitchers_2022["Yes HOF Probability"] = 100 * HOF_Pitchers_2022["Yes HOF Probability"]

In [41]:
HOF_Pitchers_2022

Unnamed: 0,Name,W,L,G,GS,CG,ShO,SV,H,ER,...,HBP,TBF,R,IP,W %,K/BB,WHIP,ERA,HOF Prediction,Yes HOF Probability
0,Billy Wagner,47,40,853,0,0,0,422,601,232,...,33,3600,262,903.0,0.54023,3.99,1.0,2.31,0,1.876254
1,Jonathan Papelbon,41,36,689,3,0,0,368,572,197,...,34,2938,226,725.2,0.532468,4.37,1.04,2.44,0,0.546065
2,Joe Nathan,64,34,787,29,0,0,377,690,294,...,23,3771,317,923.1,0.653061,2.84,1.12,2.87,0,3.368901
3,Roger Clemens,354,184,709,707,118,46,0,4185,1707,...,159,20240,1885,4916.2,0.657993,2.96,1.17,3.12,1,99.99367
4,Matt Thornton,36,46,748,1,0,0,23,594,251,...,20,2799,273,662.2,0.439024,2.56,1.28,3.41,0,0.002149
5,Curt Schilling,216,146,569,436,83,20,22,2998,1253,...,52,13284,1318,3261.0,0.596685,4.38,1.14,3.46,0,24.902402
6,Javier Lopez,30,17,839,0,0,0,14,484,206,...,27,2273,227,533.1,0.638298,1.52,1.35,3.48,0,0.007034
7,Tim Hudson,222,133,482,479,26,13,0,2957,1213,...,124,13005,1319,3126.2,0.625352,2.27,1.24,3.49,0,0.00614
8,Jake Peavy,152,126,388,377,15,6,0,2134,960,...,78,9838,1011,2377.0,0.546763,3.12,1.2,3.63,0,0.000266
9,Tim Lincecum,110,89,278,270,10,7,1,1506,699,...,44,7120,746,1682.0,0.552764,2.59,1.29,3.74,0,0.000669


In [None]:
# Format the "Yes HOF Probability" column to a percentage, does not maintain float 64 type
# Names_Pitching_df["Yes HOF Probability"] = Names_Pitching_df["Yes HOF Probability"].map("{:.2%}".format)

In [None]:
# Names_Pitcher_df.to_json('Names_Batter.json', orient = 'records')