In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import pandas as pd
from pathlib import Path

In [2]:
# Read the batter CSV from the Resources folder into a DataFrame and display it
file_path = Path('./Resources/hall_batter.csv')
hall_batter_df = pd.read_csv(filepath_or_buffer=file_path)
hall_batter_df

Unnamed: 0,playerID,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO,HBP,SH,SF,AVG,OBP,SLG,inducted
0,aaronha01,3298.0,12364.0,2174.0,3771.0,624.0,98.0,755.0,2297.0,240.0,1402.0,1383.0,32.0,21.0,121.0,0.304998,0.373949,0.554513,Y
1,aaronto01,437.0,944.0,102.0,216.0,42.0,6.0,13.0,94.0,9.0,86.0,145.0,0.0,9.0,6.0,0.228814,0.291506,0.327331,N
2,abbated01,855.0,3044.0,355.0,772.0,99.0,43.0,11.0,324.0,142.0,289.0,283.0,33.0,93.0,0.0,0.253614,0.325015,0.325230,N
3,abbeych01,452.0,1756.0,307.0,493.0,67.0,46.0,19.0,280.0,93.0,167.0,122.0,23.0,19.0,0.0,0.280752,0.350976,0.403759,N
4,abbotfr01,160.0,513.0,48.0,107.0,21.0,6.0,1.0,49.0,14.0,19.0,75.0,8.0,20.0,0.0,0.208577,0.248148,0.278752,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5344,zitzmbi01,406.0,1004.0,197.0,268.0,38.0,11.0,3.0,89.0,42.0,83.0,85.0,16.0,39.0,0.0,0.266932,0.332729,0.335657,N
5345,zobribe01,1651.0,5880.0,884.0,1566.0,349.0,44.0,167.0,768.0,116.0,832.0,994.0,31.0,26.0,67.0,0.266327,0.356681,0.425850,N
5346,zuninmi01,705.0,2226.0,244.0,446.0,100.0,3.0,108.0,283.0,2.0,164.0,849.0,51.0,8.0,11.0,0.200359,0.269576,0.393531,N
5347,zupcibo01,319.0,795.0,99.0,199.0,47.0,4.0,7.0,80.0,7.0,57.0,137.0,6.0,20.0,8.0,0.250314,0.302540,0.345912,N


In [3]:
# RFC model data preprocessing
# Encode the 'inducted' target column as integers: Y -> 1, N -> 0

b = {'Y': 1, 'N': 0}

# Values that are not keys of `b` (e.g. 0/1 left behind by a previous run of this
# cell) map to NaN and are restored from the original column, so re-running the
# cell is safe.
inducted_encoded = hall_batter_df['inducted'].map(b).fillna(hall_batter_df['inducted'])

# Fail loudly on anything that is neither Y/N nor already 0/1 (including missing
# values) instead of silently handing a mixed str/int target to the model.
unexpected_labels = inducted_encoded[~inducted_encoded.isin(list(b.values()))]
if not unexpected_labels.empty:
    raise ValueError(
        f"Unexpected 'inducted' values: {unexpected_labels.unique().tolist()}"
    )

# Pin the dtype: on a re-run the map/fillna round trip yields 1.0/0.0 floats.
hall_batter_df['inducted'] = inducted_encoded.astype('int64')
hall_batter_df

Unnamed: 0,playerID,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO,HBP,SH,SF,AVG,OBP,SLG,inducted
0,aaronha01,3298.0,12364.0,2174.0,3771.0,624.0,98.0,755.0,2297.0,240.0,1402.0,1383.0,32.0,21.0,121.0,0.304998,0.373949,0.554513,1
1,aaronto01,437.0,944.0,102.0,216.0,42.0,6.0,13.0,94.0,9.0,86.0,145.0,0.0,9.0,6.0,0.228814,0.291506,0.327331,0
2,abbated01,855.0,3044.0,355.0,772.0,99.0,43.0,11.0,324.0,142.0,289.0,283.0,33.0,93.0,0.0,0.253614,0.325015,0.325230,0
3,abbeych01,452.0,1756.0,307.0,493.0,67.0,46.0,19.0,280.0,93.0,167.0,122.0,23.0,19.0,0.0,0.280752,0.350976,0.403759,0
4,abbotfr01,160.0,513.0,48.0,107.0,21.0,6.0,1.0,49.0,14.0,19.0,75.0,8.0,20.0,0.0,0.208577,0.248148,0.278752,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5344,zitzmbi01,406.0,1004.0,197.0,268.0,38.0,11.0,3.0,89.0,42.0,83.0,85.0,16.0,39.0,0.0,0.266932,0.332729,0.335657,0
5345,zobribe01,1651.0,5880.0,884.0,1566.0,349.0,44.0,167.0,768.0,116.0,832.0,994.0,31.0,26.0,67.0,0.266327,0.356681,0.425850,0
5346,zuninmi01,705.0,2226.0,244.0,446.0,100.0,3.0,108.0,283.0,2.0,164.0,849.0,51.0,8.0,11.0,0.200359,0.269576,0.393531,0
5347,zupcibo01,319.0,795.0,99.0,199.0,47.0,4.0,7.0,80.0,7.0,57.0,137.0,6.0,20.0,8.0,0.250314,0.302540,0.345912,0


In [4]:
# Separate the target (the encoded 'inducted' column) from the feature columns;
# playerID is an identifier and is dropped along with the target
y = hall_batter_df["inducted"]
X = hall_batter_df.drop(["playerID", "inducted"], axis=1)

# Train/test split, stratified on the target and seeded for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

# Standardize the features: the scaler is fitted on the training split only,
# then the same fitted transformation is applied to both splits
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [5]:
# Build a 128-tree random forest classifier with a fixed seed and train it
# on the scaled training split
rf_model = RandomForestClassifier(
    random_state=42,
    n_estimators=128,
).fit(X_train_scaled, y_train)

# Predict labels for the scaled test split
predictions = rf_model.predict(X_test_scaled)

predictions

array([0, 0, 0, ..., 0, 0, 0])

In [6]:
# Evaluate the model

# Class labels, in the order used for the confusion-matrix rows and columns
class_labels = [0, 1]

# Calculate the confusion matrix. Pinning `labels` keeps the matrix 2x2 and in
# 0, 1 order, so it always lines up with the row/column names built below.
cm = confusion_matrix(y_test, predictions, labels=class_labels)

# Create a DataFrame from the confusion matrix (rows = actual, columns = predicted)
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)

# Calculate the accuracy score
acc_score = accuracy_score(y_test, predictions)

# Display results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confustion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1268,11
Actual 1,35,24


Accuracy Score : 0.9656203288490284
Classification Report
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1279
           1       0.69      0.41      0.51        59

    accuracy                           0.97      1338
   macro avg       0.83      0.70      0.75      1338
weighted avg       0.96      0.97      0.96      1338



In [7]:
# Calculate feature importance in the RFC model
# Reads the `feature_importances_` array from the fitted `rf_model`. The array
# holds one score per feature column and is positionally aligned with the
# training columns (it is later zipped with `X.columns`).
importances = rf_model.feature_importances_
# Show the raw, unsorted importance array
importances

array([0.06746546, 0.08908997, 0.11695237, 0.11113178, 0.03679682,
       0.05091145, 0.03163592, 0.06548757, 0.03525189, 0.04401107,
       0.04851476, 0.02923039, 0.04702414, 0.02316697, 0.08682003,
       0.0628991 , 0.05361034])

In [8]:
# Pair each importance score with its feature name, then rank the pairs from
# most to least important
feature_ranking = list(zip(rf_model.feature_importances_, X.columns))
feature_ranking.sort(reverse=True)
feature_ranking

[(0.1169523696428618, 'R'),
 (0.11113178283060492, 'H'),
 (0.08908996810659284, 'AB'),
 (0.08682002600838046, 'AVG'),
 (0.06746545646700405, 'G'),
 (0.06548756607183005, 'RBI'),
 (0.06289909512688004, 'OBP'),
 (0.05361033597337505, 'SLG'),
 (0.05091144571187617, '3B'),
 (0.0485147590398928, 'SO'),
 (0.047024144447216115, 'SH'),
 (0.04401106844033163, 'BB'),
 (0.036796816145029804, '2B'),
 (0.03525188776867097, 'SB'),
 (0.031635922263010756, 'HR'),
 (0.029230387820923985, 'HBP'),
 (0.023166968135518527, 'SF')]