In [1]:
# Import our dependencies
import pandas as pd
import xgboost as xgb
import joblib
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from pathlib import Path
from imblearn.over_sampling import SMOTE

In [2]:
# Import CSV
# Read the 2022 Hall of Fame class hitters file into a DataFrame. The path is
# relative, so it resolves against the directory the notebook kernel runs from.
file_path = Path('../Resources/Revised_CSV/2022_HOF_Class_hitters.csv')
HOF_2022_df = pd.read_csv(file_path)
# Display the frame. NOTE(review): the rendered output includes an "Unnamed: 0"
# column, which looks like a row index that was written into the CSV - confirm.
HOF_2022_df

Unnamed: 0,Name,Team,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO,HBP,SH,SF,AVG,OBP,SLG,playerid
0,Barry Bonds,- - -,2986,9847,2227,2935,601,77,762,1996,514,2558,1539,106,4,91,0.298,0.444,0.607,1109
1,Manny Ramirez,- - -,2302,8244,1544,2574,547,20,555,1831,38,1329,1813,109,2,90,0.312,0.411,0.585,210
2,Todd Helton,Rockies,2247,7962,1401,2519,592,37,369,1406,37,1335,1175,57,3,93,0.316,0.414,0.539,432
3,Alex Rodriguez,- - -,2784,10566,2021,3115,548,31,696,2086,329,1338,2287,176,16,111,0.295,0.38,0.55,1274
4,David Ortiz,- - -,2408,8640,1419,2472,632,19,541,1768,17,1319,1750,38,2,92,0.286,0.38,0.552,745
5,Gary Sheffield,- - -,2576,9217,1636,2689,467,27,509,1676,253,1475,1171,135,9,111,0.292,0.393,0.514,114
6,Bobby Abreu,- - -,2425,8480,1453,2470,574,59,288,1363,400,1476,1840,33,7,85,0.291,0.395,0.475,945
7,Prince Fielder,- - -,1611,5821,862,1645,321,10,319,1028,18,847,1155,124,0,61,0.283,0.382,0.506,4613
8,Mark Teixeira,- - -,1862,6936,1099,1862,408,18,409,1298,26,918,1441,111,0,64,0.268,0.36,0.509,1281
9,Sammy Sosa,- - -,2354,8813,1475,2408,379,45,609,1667,234,929,2306,59,17,78,0.273,0.344,0.534,302


In [3]:
# Build the feature frame used for prediction (no target is defined here):
# remove the non-numeric "Team" column and the "playerid" identifier, then use
# the player name as the row index so it is not treated as a feature.
# NOTE(review): "Unnamed: 0" is not dropped, so it stays in the feature set -
# confirm the saved model was trained with that column.
XBatter_2022 = (
    HOF_2022_df
    .drop(columns=["Team", "playerid"])
    .set_index('Name')
)

In [7]:
# Standardize the features with a freshly created StandardScaler.
# NOTE(review): the scaler is fitted on the prediction rows themselves, so the
# mean/variance come from this small frame rather than from whatever data the
# saved model was trained on - presumably a scaler persisted at training time
# should be reused instead; confirm.
scaler = StandardScaler()

# Fit and transform in a single step
X_scaled = scaler.fit_transform(XBatter_2022)

# fit() returns the estimator itself, so X_scaler is the same fitted object
X_scaler = scaler

In [8]:
# Load the saved Model
# The path is relative to the kernel's working directory.
# NOTE(security): joblib.load unpickles the file and can execute arbitrary code,
# so only load model files that come from a trusted source.
loaded_model = joblib.load('Final_XGB_Batter_Model.sav')
# Leftover evaluation snippet: X_test_scaled and y_test are not defined in the
# visible cells, so it would fail if uncommented here.
# result = loaded_model.score(X_test_scaled, y_test)
# print(result)

In [9]:
# Predict a class label for every row of the scaled feature matrix and keep the
# result as a plain Python list (the model is fed X_scaled, not the raw frame).
predictions = loaded_model.predict(X_scaled).tolist()

In [10]:
# Per-class probabilities for every row of the scaled features, as nested lists
# (one inner list per player).
prediction_proba = loaded_model.predict_proba(X_scaled).tolist()

# Transpose the rows into one list per class column; the unpacking requires
# exactly two columns (first -> N_proba, second -> Y_proba).
N_proba, Y_proba = [list(column) for column in zip(*prediction_proba)]

In [11]:
# DF post-processing for visualizations
# Attach the model output to the player frame. Both lists are assigned by
# position, so they must have one entry per row of HOF_2022_df.
HOF_2022_df["HOF Prediction"] = predictions

# Probability of the "Yes" class expressed as a percentage (0-100).
# pd.to_numeric only guarantees a numeric dtype before scaling; the values
# coming from Y_proba are already floats.
HOF_2022_df["Yes HOF Probability"] = 100 * pd.to_numeric(
    pd.Series(Y_proba, index=HOF_2022_df.index)
)

# errors="ignore" keeps this cell re-runnable: on a second execution "Team" and
# "playerid" are already gone and a plain drop would raise KeyError.
HOF_2022_df = HOF_2022_df.drop(columns=["Team", "playerid"], errors="ignore")

# The column is kept as float64 for plotting. If a text rendering is ever
# needed, format with "{:.2f}%" - the values are already multiplied by 100, so
# the "%" format spec would scale them a second time.
HOF_2022_df

Unnamed: 0,Name,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO,HBP,SH,SF,AVG,OBP,SLG,HOF Prediction,Yes HOF Probability
0,Barry Bonds,2986,9847,2227,2935,601,77,762,1996,514,2558,1539,106,4,91,0.298,0.444,0.607,0,0.030299
1,Manny Ramirez,2302,8244,1544,2574,547,20,555,1831,38,1329,1813,109,2,90,0.312,0.411,0.585,0,0.002751
2,Todd Helton,2247,7962,1401,2519,592,37,369,1406,37,1335,1175,57,3,93,0.316,0.414,0.539,0,0.000459
3,Alex Rodriguez,2784,10566,2021,3115,548,31,696,2086,329,1338,2287,176,16,111,0.295,0.38,0.55,0,0.039094
4,David Ortiz,2408,8640,1419,2472,632,19,541,1768,17,1319,1750,38,2,92,0.286,0.38,0.552,0,0.000375
5,Gary Sheffield,2576,9217,1636,2689,467,27,509,1676,253,1475,1171,135,9,111,0.292,0.393,0.514,0,0.001862
6,Bobby Abreu,2425,8480,1453,2470,574,59,288,1363,400,1476,1840,33,7,85,0.291,0.395,0.475,0,0.001089
7,Prince Fielder,1611,5821,862,1645,321,10,319,1028,18,847,1155,124,0,61,0.283,0.382,0.506,0,0.000214
8,Mark Teixeira,1862,6936,1099,1862,408,18,409,1298,26,918,1441,111,0,64,0.268,0.36,0.509,0,0.000181
9,Sammy Sosa,2354,8813,1475,2408,379,45,609,1667,234,929,2306,59,17,78,0.273,0.344,0.534,0,0.000234
