In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import pandas as pd
from pathlib import Path

In [2]:
# Load the pitching statistics CSV (the relative path is resolved against the
# notebook's working directory) and show the resulting frame
file_path = Path("hall_pitching.csv")
hall_pitcher_df = pd.read_csv(filepath_or_buffer=file_path)
hall_pitcher_df

Unnamed: 0,playerID,W,L,G,GS,CG,SHO,SV,IPouts,H,...,BK,BFP,GF,R,IP,Win %,(K/BB),WHIP,ERA_corrected,inducted
0,abbotji01,87,108,263,254,31,6,0,5022,1779,...,11,7211.0,5,880,1674.000000,0.446154,1.432258,1.433094,4.252688,N
1,adamsba01,194,140,482,355,206,44,15,8986,2841,...,2,11947.0,89,1129,2995.333333,0.580838,2.409302,1.092032,2.755286,N
2,aguilri01,86,81,732,89,10,0,318,3874,1233,...,10,5391.0,557,568,1291.333333,0.514970,2.934473,1.226639,3.568405,N
3,akerja01,47,45,495,0,0,0,123,2238,679,...,0,3144.0,321,312,746.000000,0.510870,1.474453,1.277480,3.281501,N
4,alexado01,194,174,561,464,98,18,3,10103,3376,...,10,14162.0,56,1541,3367.666667,0.527174,1.562372,1.292883,3.757498,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404,youngcy01,511,315,906,815,749,76,18,22068,7092,...,3,29579.0,84,3167,7356.000000,0.618644,2.303205,1.129554,2.626835,Y
405,zachato01,186,191,533,409,186,24,22,9379,3580,...,15,13551.0,84,1552,3126.333333,0.493369,0.787746,1.437467,3.728009,N
406,zachrpa01,69,67,293,154,29,7,3,3532,1147,...,10,5050.0,44,529,1177.333333,0.507353,1.351515,1.394677,3.524066,N
407,zahnge01,111,109,304,270,79,20,1,5547,1978,...,4,7798.0,13,889,1849.000000,0.504545,1.340304,1.354246,3.743104,N


In [3]:
# RFC model data preprocessing
# Encode the 'inducted' target column as integers: Y = 1, N = 0

b = {'Y': 1, 'N': 0}

# Values that are not keys of `b` keep their current value, so re-running this
# cell on an already-encoded column does not wipe it out.
inducted_raw = hall_pitcher_df['inducted']
inducted_encoded = inducted_raw.map(b).fillna(inducted_raw)

# Fail loudly if anything other than the two known classes survived the
# mapping (missing values, unexpected spellings, ...) instead of silently
# handing a mixed-type target to the classifier.
unexpected = ~inducted_encoded.isin(list(b.values()))
if unexpected.any():
    raise ValueError(
        "Unexpected values in 'inducted': "
        f"{inducted_encoded[unexpected].unique().tolist()}"
    )

# Normalise to an integer dtype; without this a re-run would leave floats
# (map() yields all-NaN floats for an already-encoded column before fillna).
hall_pitcher_df['inducted'] = inducted_encoded.astype(int)
hall_pitcher_df

Unnamed: 0,playerID,W,L,G,GS,CG,SHO,SV,IPouts,H,...,BK,BFP,GF,R,IP,Win %,(K/BB),WHIP,ERA_corrected,inducted
0,abbotji01,87,108,263,254,31,6,0,5022,1779,...,11,7211.0,5,880,1674.000000,0.446154,1.432258,1.433094,4.252688,0
1,adamsba01,194,140,482,355,206,44,15,8986,2841,...,2,11947.0,89,1129,2995.333333,0.580838,2.409302,1.092032,2.755286,0
2,aguilri01,86,81,732,89,10,0,318,3874,1233,...,10,5391.0,557,568,1291.333333,0.514970,2.934473,1.226639,3.568405,0
3,akerja01,47,45,495,0,0,0,123,2238,679,...,0,3144.0,321,312,746.000000,0.510870,1.474453,1.277480,3.281501,0
4,alexado01,194,174,561,464,98,18,3,10103,3376,...,10,14162.0,56,1541,3367.666667,0.527174,1.562372,1.292883,3.757498,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404,youngcy01,511,315,906,815,749,76,18,22068,7092,...,3,29579.0,84,3167,7356.000000,0.618644,2.303205,1.129554,2.626835,1
405,zachato01,186,191,533,409,186,24,22,9379,3580,...,15,13551.0,84,1552,3126.333333,0.493369,0.787746,1.437467,3.728009,0
406,zachrpa01,69,67,293,154,29,7,3,3532,1147,...,10,5050.0,44,529,1177.333333,0.507353,1.351515,1.394677,3.524066,0
407,zahnge01,111,109,304,270,79,20,1,5547,1978,...,4,7798.0,13,889,1849.000000,0.504545,1.340304,1.354246,3.743104,0


In [4]:
# Target vector: the encoded induction flag
y = hall_pitcher_df["inducted"]
# Feature matrix: every column except the player identifier and the target
X = hall_pitcher_df.drop(["playerID", "inducted"], axis=1)

# Hold out a test set; stratifying on y keeps the class proportions
# comparable between the training and test splits
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

# Standardise the features: the scaler is fitted on the training split only
# and then applied to both splits
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [5]:
# Build a random forest of 128 trees with a fixed seed for repeatable results
rf_model = RandomForestClassifier(random_state=42, n_estimators=128)

# Train on the scaled training split (fit() returns the estimator itself,
# so rf_model refers to the fitted model afterwards)
rf_model.fit(X_train_scaled, y_train)

# Predict induction labels for the held-out test split
predictions = rf_model.predict(X_test_scaled)

predictions

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0])

In [6]:
# Evaluate the model
# Confusion matrix: the label order is pinned to the fitted model's classes so
# the row/column captions below are guaranteed to line up with the counts
class_labels = rf_model.classes_
cm = confusion_matrix(y_test, predictions, labels=class_labels)

# Wrap the matrix in a DataFrame (rows = actual class, columns = predicted class)
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)

# Overall fraction of test samples predicted correctly
acc_score = accuracy_score(y_test, predictions)

# Display results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confustion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,78,6
Actual 1,9,10


Accuracy Score : 0.8543689320388349
Classification Report
              precision    recall  f1-score   support

           0       0.90      0.93      0.91        84
           1       0.62      0.53      0.57        19

    accuracy                           0.85       103
   macro avg       0.76      0.73      0.74       103
weighted avg       0.85      0.85      0.85       103



In [7]:
# Read the per-feature importance scores from the fitted RFC model.
# The values are later paired positionally with X.columns, i.e. one score per
# feature column in the order the columns appear in X.
importances = rf_model.feature_importances_
importances

array([0.13412929, 0.02330586, 0.01059343, 0.0258935 , 0.07168094,
       0.088373  , 0.0122843 , 0.07820059, 0.03859075, 0.01907013,
       0.01301043, 0.02302566, 0.04355891, 0.01572083, 0.02217052,
       0.01404885, 0.02238459, 0.01146593, 0.07350763, 0.01812958,
       0.01514446, 0.05730099, 0.0415839 , 0.02832406, 0.04791412,
       0.05058774])

In [8]:
# Pair each importance score with its feature name, then rank from most to
# least important (ties fall back to comparing the feature names)
feature_ranking = list(zip(rf_model.feature_importances_, X.columns))
feature_ranking.sort(reverse=True)
feature_ranking

[(0.13412929410412164, 'W'),
 (0.08837299679024964, 'SHO'),
 (0.07820058590543419, 'IPouts'),
 (0.07350762839920195, 'BFP'),
 (0.07168094492539209, 'CG'),
 (0.057300991496485244, 'IP'),
 (0.050587741906875795, 'ERA_corrected'),
 (0.04791412120681047, 'WHIP'),
 (0.04355891332816745, 'SO'),
 (0.041583900428795284, 'Win %'),
 (0.038590746962072715, 'H'),
 (0.02832406299763819, '(K/BB)'),
 (0.025893500550260778, 'GS'),
 (0.023305863771144585, 'L'),
 (0.023025662267877334, 'BB'),
 (0.022384586808894883, 'HBP'),
 (0.02217051708843294, 'ERA'),
 (0.01907012639288142, 'ER'),
 (0.01812957821758074, 'GF'),
 (0.015720827878757438, 'BAOpp'),
 (0.015144459124882714, 'R'),
 (0.014048853690705652, 'WP'),
 (0.013010432104751581, 'HR'),
 (0.012284301047227126, 'SV'),
 (0.011465934331532335, 'BK'),
 (0.010593428273825785, 'G')]