In [1]:
# Import our dependencies
import pandas as pd
import xgboost as xgb
import joblib
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from pathlib import Path
from imblearn.over_sampling import SMOTE

In [2]:
import psycopg2
from config import db_password

# Open a connection to the local PostgreSQL "baseball_data" database.
# The PostgreSQL server must already be running (started via the PostgreSQL
# app or a terminal shell) before this cell is executed.
# The password is read from the local config module rather than hardcoded here.
conn = psycopg2.connect(
    host="localhost",
    port=5432,
    database="baseball_data",
    user="postgres",
    password=db_password,
)

In [3]:
# Create a cursor object on the open connection.
# NOTE(review): none of the visible cells runs a query through `cur` — both table
# loads pass `conn` directly to pd.read_sql, and `cur` is only closed later.
# Confirm whether this cursor is actually needed.
cur = conn.cursor()

In [4]:
# Load every row and column of the hall_pitching table from Postgres into a
# DataFrame, then show it as the cell output.
hall_pitching = pd.read_sql(sql='SELECT * FROM hall_pitching', con=conn)
hall_pitching

Unnamed: 0,playerID,W,L,G,GS,CG,SHO,SV,H,ER,...,SO,WP,HBP,R,IP,Win Percentage,Strikeout to Walk,WHIP,ERA,inducted
0,aardsda01,16.0,18.0,331.0,0.0,0.0,0.0,69.0,296.0,160.0,...,340.0,12.0,16.0,169.0,337.000000,0.470588,1.857923,1.421365,4.272997,N
1,aasedo01,66.0,60.0,448.0,91.0,22.0,5.0,82.0,1085.0,468.0,...,641.0,22.0,7.0,503.0,1109.333333,0.523810,1.402626,1.390024,3.796875,N
2,abadfe01,8.0,29.0,384.0,6.0,0.0,0.0,2.0,309.0,135.0,...,280.0,10.0,12.0,143.0,330.666667,0.216216,2.413793,1.285282,3.674395,N
3,abbeybe01,22.0,40.0,79.0,65.0,52.0,0.0,1.0,686.0,285.0,...,161.0,18.0,26.0,442.0,568.000000,0.354839,0.838542,1.545775,4.515845,N
4,abbotda01,0.0,2.0,3.0,1.0,1.0,0.0,1.0,19.0,9.0,...,1.0,3.0,1.0,14.0,13.000000,0.000000,0.125000,2.076923,6.230769,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9113,zuberbi01,43.0,42.0,224.0,65.0,23.0,3.0,6.0,767.0,374.0,...,383.0,28.0,4.0,418.0,786.000000,0.505882,0.818376,1.571247,4.282443,N
9114,zuberty01,1.0,2.0,23.0,0.0,0.0,0.0,0.0,15.0,10.0,...,30.0,1.0,1.0,11.0,22.000000,0.333333,1.500000,1.590909,4.090909,N
9115,zumayjo01,13.0,12.0,171.0,0.0,0.0,0.0,5.0,169.0,71.0,...,210.0,16.0,4.0,80.0,209.666667,0.520000,1.842105,1.349762,3.047695,N
9116,zuverge01,32.0,36.0,265.0,31.0,9.0,2.0,40.0,660.0,253.0,...,223.0,10.0,27.0,296.0,642.333333,0.470588,1.098522,1.343539,3.544888,N


In [5]:
# Load every row and column of the career_pitching table from Postgres into a
# DataFrame, then show it as the cell output.
career_pitching = pd.read_sql(sql='SELECT * FROM career_pitching', con=conn)
career_pitching

Unnamed: 0,playerID,W,L,G,GS,CG,SHO,SV,H,ER,...,BB,SO,WP,HBP,R,IP,Win Percentage,Strikeout to Walk,WHIP,ERA
0,aardsda01,16,18,331,0,0,0,69,296,160,...,183,340,12,16.0,169,337.000000,0.470588,1.857923,1.421365,4.272997
1,aasedo01,66,60,448,91,22,5,82,1085,468,...,457,641,22,7.0,503,1109.333333,0.523810,1.402626,1.390024,3.796875
2,abadfe01,8,29,384,6,0,0,2,309,135,...,116,280,10,12.0,143,330.666667,0.216216,2.413793,1.285282,3.674395
3,abbeybe01,22,40,79,65,52,0,1,686,285,...,192,161,18,26.0,442,568.000000,0.354839,0.838542,1.545775,4.515845
4,abbotgl01,62,83,248,206,37,5,0,1405,627,...,352,484,18,32.0,707,1286.000000,0.427586,1.375000,1.366252,4.388025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5936,zuberbi01,43,42,224,65,23,3,6,767,374,...,468,383,28,4.0,418,786.000000,0.505882,0.818376,1.571247,4.282443
5937,zuberty01,1,2,23,0,0,0,0,15,10,...,20,30,1,1.0,11,22.000000,0.333333,1.500000,1.590909,4.090909
5938,zumayjo01,13,12,171,0,0,0,5,169,71,...,114,210,16,4.0,80,209.666667,0.520000,1.842105,1.349762,3.047695
5939,zuverge01,32,36,265,31,9,2,40,660,253,...,203,223,10,27.0,296,642.333333,0.470588,1.098522,1.343539,3.544888


In [6]:
# Both tables are now held in DataFrames, so release the database resources:
# close the cursor first, then the connection it was created from.
cur.close()
conn.close()

In [7]:
# XGBoost model data preprocessing
# Encode the inducted label numerically: 'Y' -> 1, 'N' -> 0.
# Any value that is not a key of the mapping comes back from .map() as NaN and
# is then restored to its original value by .fillna().
b = dict(Y=1, N=0)
inducted_raw = hall_pitching['inducted']
hall_pitching['inducted'] = inducted_raw.map(b).fillna(inducted_raw)
hall_pitching

Unnamed: 0,playerID,W,L,G,GS,CG,SHO,SV,H,ER,...,SO,WP,HBP,R,IP,Win Percentage,Strikeout to Walk,WHIP,ERA,inducted
0,aardsda01,16.0,18.0,331.0,0.0,0.0,0.0,69.0,296.0,160.0,...,340.0,12.0,16.0,169.0,337.000000,0.470588,1.857923,1.421365,4.272997,0
1,aasedo01,66.0,60.0,448.0,91.0,22.0,5.0,82.0,1085.0,468.0,...,641.0,22.0,7.0,503.0,1109.333333,0.523810,1.402626,1.390024,3.796875,0
2,abadfe01,8.0,29.0,384.0,6.0,0.0,0.0,2.0,309.0,135.0,...,280.0,10.0,12.0,143.0,330.666667,0.216216,2.413793,1.285282,3.674395,0
3,abbeybe01,22.0,40.0,79.0,65.0,52.0,0.0,1.0,686.0,285.0,...,161.0,18.0,26.0,442.0,568.000000,0.354839,0.838542,1.545775,4.515845,0
4,abbotda01,0.0,2.0,3.0,1.0,1.0,0.0,1.0,19.0,9.0,...,1.0,3.0,1.0,14.0,13.000000,0.000000,0.125000,2.076923,6.230769,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9113,zuberbi01,43.0,42.0,224.0,65.0,23.0,3.0,6.0,767.0,374.0,...,383.0,28.0,4.0,418.0,786.000000,0.505882,0.818376,1.571247,4.282443,0
9114,zuberty01,1.0,2.0,23.0,0.0,0.0,0.0,0.0,15.0,10.0,...,30.0,1.0,1.0,11.0,22.000000,0.333333,1.500000,1.590909,4.090909,0
9115,zumayjo01,13.0,12.0,171.0,0.0,0.0,0.0,5.0,169.0,71.0,...,210.0,16.0,4.0,80.0,209.666667,0.520000,1.842105,1.349762,3.047695,0
9116,zuverge01,32.0,36.0,265.0,31.0,9.0,2.0,40.0,660.0,253.0,...,223.0,10.0,27.0,296.0,642.333333,0.470588,1.098522,1.343539,3.544888,0


In [8]:
# Build the feature matrix (everything except the player identifier and the
# label) and the target vector (the encoded inducted flag).
X = hall_pitching.drop(columns=["playerID", "inducted"])
y = hall_pitching.inducted

# Hold out a test set; random_state fixes the shuffle so the split is repeatable.
# NOTE(review): the split is not stratified on y. With a heavily imbalanced
# label, consider stratify=y — this would change the split and all downstream
# results, so confirm before adopting.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Standardise the features. The scaler is fitted on the training rows only and
# the same fitted transform is then applied to both partitions.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

In [9]:
# Use SMOTE to address the imbalanced dataset.
# Only the scaled training partition is resampled; the test partition is left
# untouched.
smote_sampler = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote_sampler.fit_resample(X_train_scaled, y_train)

In [10]:
# Hyperparameters for the XGBoost classifier, gathered in one place
xgb_params = {
    "max_depth": 5,
    "min_child_weight": 1,
    "gamma": 0,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "scale_pos_weight": 1,
    "use_label_encoder": False,
}

# Build the classifier and train it on the SMOTE-resampled training data
XGB_model = XGBClassifier(**xgb_params).fit(X_resampled, y_resampled)

# Print the fitted estimator to show the full parameter set in effect
print(XGB_model)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.8,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)


In [11]:
# Use trained XGB_model to make predictions on the scaled held-out test features
predictions = XGB_model.predict(X_test_scaled)

# Echo the predicted labels as the cell output
predictions

array([0, 0, 0, ..., 0, 0, 0])

In [12]:
# Evaluate the model on the held-out test set

# Calculating the confusion matrix.
# labels=[0, 1] pins the matrix to a fixed 2x2 layout (row/column 0 = class 0,
# row/column 1 = class 1). Without it the matrix shape is derived from the
# labels present in the data, so a test split or prediction vector that happens
# to contain a single class would no longer fit the hard-coded 2x2 index and
# column labels below.
cm = confusion_matrix(y_test, predictions, labels=[0, 1])

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(cm, index= ["Actual 0", "Actual 1"], columns = ["Predicted 0", "Predicted 1"])

# Calculate the accuracy score
acc_score = accuracy_score(y_test, predictions)

# Display results: confusion matrix table, overall accuracy, then the
# per-class precision/recall/f1 report.
print("Confustion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confustion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,2245,11
Actual 1,7,17


Accuracy Score : 0.9921052631578947
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2256
           1       0.61      0.71      0.65        24

    accuracy                           0.99      2280
   macro avg       0.80      0.85      0.82      2280
weighted avg       0.99      0.99      0.99      2280



In [13]:
# Calculate feature importance in the XGB model.
# The scores come back as a bare array; the following cell pairs them with
# X.columns to attach feature names.
importances = XGB_model.feature_importances_
importances

array([0.7349225 , 0.011985  , 0.01937745, 0.00904639, 0.00396811,
       0.01658899, 0.02829536, 0.0035745 , 0.01898309, 0.00587483,
       0.00269092, 0.0427653 , 0.01378527, 0.00390495, 0.00225415,
       0.03018437, 0.00955933, 0.00849205, 0.00936323, 0.02438406],
      dtype=float32)

In [14]:
# Pair each importance score with its feature name and rank them, highest first
ranked_features = list(zip(XGB_model.feature_importances_, X.columns))
ranked_features.sort(reverse=True)
ranked_features

[(0.7349225, 'W'),
 (0.042765304, 'SO'),
 (0.030184368, 'IP'),
 (0.028295364, 'SV'),
 (0.024384055, 'ERA'),
 (0.019377448, 'G'),
 (0.018983092, 'ER'),
 (0.016588992, 'SHO'),
 (0.01378527, 'WP'),
 (0.0119849965, 'L'),
 (0.009559331, 'Win Percentage'),
 (0.009363227, 'WHIP'),
 (0.009046385, 'GS'),
 (0.008492049, 'Strikeout to Walk'),
 (0.0058748284, 'HR'),
 (0.003968109, 'CG'),
 (0.00390495, 'HBP'),
 (0.0035745006, 'H'),
 (0.002690919, 'BB'),
 (0.002254146, 'R')]

In [15]:
# Save the fitted scaler next to the model.
# The classifier was trained and evaluated on features transformed by X_scaler,
# so anyone loading the model must apply the same fitted transform to new data
# before calling predict. Persisting only the classifier would lose it.
scaler_filename = 'Final_XGB_Pitcher_Scaler.sav'
joblib.dump(X_scaler, scaler_filename)

# Save the Model (kept as the last expression so the cell output is unchanged)
filename = 'Final_XGB_Pitcher_Model.sav'
joblib.dump(XGB_model, filename)

['Final_XGB_Pitcher_Model.sav']