In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import pandas as pd
from pathlib import Path

In [2]:
# Import CSV
file_path = Path('./Resources/hall_batter.csv')
hall_batter_df = pd.read_csv(file_path)
hall_batter_df

Unnamed: 0,playerID,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO,HBP,SH,SF,AVG,OBP,SLG,inducted
0,aardsda01,331.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.000000,0.000000,0.000000,N
1,aaronha01,3298.0,12364.0,2174.0,3771.0,624.0,98.0,755.0,2297.0,240.0,1402.0,1383.0,32.0,21.0,121.0,0.304998,0.373949,0.554513,Y
2,aaronto01,437.0,944.0,102.0,216.0,42.0,6.0,13.0,94.0,9.0,86.0,145.0,0.0,9.0,6.0,0.228814,0.291506,0.327331,N
3,aasedo01,448.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,N
4,abadan01,15.0,21.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,4.0,5.0,0.0,0.0,0.0,0.095238,0.240000,0.095238,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17501,zupcibo01,319.0,795.0,99.0,199.0,47.0,4.0,7.0,80.0,7.0,57.0,137.0,6.0,20.0,8.0,0.250314,0.302540,0.345912,N
17502,zupofr01,16.0,18.0,3.0,3.0,1.0,0.0,0.0,0.0,0.0,2.0,6.0,0.0,0.0,0.0,0.166667,0.250000,0.222222,N
17503,zuvelpa01,209.0,491.0,41.0,109.0,17.0,2.0,2.0,20.0,2.0,34.0,50.0,2.0,18.0,0.0,0.221996,0.275142,0.276986,N
17504,zuverge01,266.0,142.0,5.0,21.0,2.0,1.0,0.0,7.0,0.0,9.0,39.0,0.0,16.0,0.0,0.147887,0.198675,0.176056,N


In [3]:
# RFC model data preprocessing
# Convert inducted column to Y = 1, N = 0

b = {'Y': 1, 'N': 0}
hall_batter_df['inducted'] = hall_batter_df['inducted'].map(b).fillna(hall_batter_df['inducted'])
hall_batter_df

Unnamed: 0,playerID,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO,HBP,SH,SF,AVG,OBP,SLG,inducted
0,aardsda01,331.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.000000,0.000000,0.000000,0
1,aaronha01,3298.0,12364.0,2174.0,3771.0,624.0,98.0,755.0,2297.0,240.0,1402.0,1383.0,32.0,21.0,121.0,0.304998,0.373949,0.554513,1
2,aaronto01,437.0,944.0,102.0,216.0,42.0,6.0,13.0,94.0,9.0,86.0,145.0,0.0,9.0,6.0,0.228814,0.291506,0.327331,0
3,aasedo01,448.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0
4,abadan01,15.0,21.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,4.0,5.0,0.0,0.0,0.0,0.095238,0.240000,0.095238,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17501,zupcibo01,319.0,795.0,99.0,199.0,47.0,4.0,7.0,80.0,7.0,57.0,137.0,6.0,20.0,8.0,0.250314,0.302540,0.345912,0
17502,zupofr01,16.0,18.0,3.0,3.0,1.0,0.0,0.0,0.0,0.0,2.0,6.0,0.0,0.0,0.0,0.166667,0.250000,0.222222,0
17503,zuvelpa01,209.0,491.0,41.0,109.0,17.0,2.0,2.0,20.0,2.0,34.0,50.0,2.0,18.0,0.0,0.221996,0.275142,0.276986,0
17504,zuverge01,266.0,142.0,5.0,21.0,2.0,1.0,0.0,7.0,0.0,9.0,39.0,0.0,16.0,0.0,0.147887,0.198675,0.176056,0


In [4]:
# Set target and features variables
y = hall_batter_df.inducted
X = hall_batter_df.drop(columns=["playerID", "inducted"])

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

#Scale the data
#Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [5]:
# Create logisitic regression model
from sklearn.linear_model import LogisticRegression
import numpy as np

classifier = LogisticRegression(solver='lbfgs', random_state=1)

# Fit the model
classifier.fit(X_train_scaled, y_train)

THRESHOLD = 0.75

predictions = np.where(classifier.predict_proba(X_test_scaled)[:,1] >= THRESHOLD, 1, 0)

predictions

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([0, 0, 0, ..., 0, 0, 0])

In [6]:
# Evaluate the model

# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(cm, index= ["Actual 0", "Actual 1"], columns = ["Predicted 0", "Predicted 1"])

# Calculate the accuracy score
acc_score = accuracy_score(y_test, predictions)

# Display results
print("Confustion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confustion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,4338,5
Actual 1,22,12


Accuracy Score : 0.9938313913639479
Classification Report
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      4343
           1       0.71      0.35      0.47        34

    accuracy                           0.99      4377
   macro avg       0.85      0.68      0.73      4377
weighted avg       0.99      0.99      0.99      4377



In [None]:
# Calculate feature importance in the RFC model
#importances = classifier.feature_importances_
#importances

In [None]:
# Sort the features by their importance
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

In [None]:
#predictions = predictions.tolist()

In [None]:
#hall_batter_df['HOF Prediction'] = predictions
#hall_batter_df

In [7]:
# Import CSV
file_path = Path('./Resources/2022_HOF_Class_hitters.csv')
HOF_2022_df = pd.read_csv(file_path)
HOF_2022_df

Unnamed: 0,Name,Team,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO,HBP,SH,SF,AVG,OBP,SLG,playerid
0,Barry Bonds,- - -,2986,9847,2227,2935,601,77,762,1996,514,2558,1539,106,4,91,0.298,0.444,0.607,1109
1,Manny Ramirez,- - -,2302,8244,1544,2574,547,20,555,1831,38,1329,1813,109,2,90,0.312,0.411,0.585,210
2,Todd Helton,Rockies,2247,7962,1401,2519,592,37,369,1406,37,1335,1175,57,3,93,0.316,0.414,0.539,432
3,Alex Rodriguez,- - -,2784,10566,2021,3115,548,31,696,2086,329,1338,2287,176,16,111,0.295,0.38,0.55,1274
4,David Ortiz,- - -,2408,8640,1419,2472,632,19,541,1768,17,1319,1750,38,2,92,0.286,0.38,0.552,745
5,Gary Sheffield,- - -,2576,9217,1636,2689,467,27,509,1676,253,1475,1171,135,9,111,0.292,0.393,0.514,114
6,Bobby Abreu,- - -,2425,8480,1453,2470,574,59,288,1363,400,1476,1840,33,7,85,0.291,0.395,0.475,945
7,Prince Fielder,- - -,1611,5821,862,1645,321,10,319,1028,18,847,1155,124,0,61,0.283,0.382,0.506,4613
8,Mark Teixeira,- - -,1862,6936,1099,1862,408,18,409,1298,26,918,1441,111,0,64,0.268,0.36,0.509,1281
9,Sammy Sosa,- - -,2354,8813,1475,2408,379,45,609,1667,234,929,2306,59,17,78,0.273,0.344,0.534,302


In [8]:
# Set target and features variables
X_2022 = HOF_2022_df.drop(columns=["Team", "playerid"])

X_2022 = X_2022.set_index('Name')

In [9]:
# Scale data
X_scaled = X_scaler.transform(X_2022)

In [10]:
# Make predictions using the testing data
predictions = classifier.predict(X_scaled)

predictions

array([1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0])

In [11]:
predictions = predictions.tolist()

In [12]:
HOF_2022_df["HOF Prediction"] = predictions
HOF_2022_df

Unnamed: 0,Name,Team,G,AB,R,H,2B,3B,HR,RBI,...,BB,SO,HBP,SH,SF,AVG,OBP,SLG,playerid,HOF Prediction
0,Barry Bonds,- - -,2986,9847,2227,2935,601,77,762,1996,...,2558,1539,106,4,91,0.298,0.444,0.607,1109,1
1,Manny Ramirez,- - -,2302,8244,1544,2574,547,20,555,1831,...,1329,1813,109,2,90,0.312,0.411,0.585,210,1
2,Todd Helton,Rockies,2247,7962,1401,2519,592,37,369,1406,...,1335,1175,57,3,93,0.316,0.414,0.539,432,0
3,Alex Rodriguez,- - -,2784,10566,2021,3115,548,31,696,2086,...,1338,2287,176,16,111,0.295,0.38,0.55,1274,1
4,David Ortiz,- - -,2408,8640,1419,2472,632,19,541,1768,...,1319,1750,38,2,92,0.286,0.38,0.552,745,0
5,Gary Sheffield,- - -,2576,9217,1636,2689,467,27,509,1676,...,1475,1171,135,9,111,0.292,0.393,0.514,114,1
6,Bobby Abreu,- - -,2425,8480,1453,2470,574,59,288,1363,...,1476,1840,33,7,85,0.291,0.395,0.475,945,0
7,Prince Fielder,- - -,1611,5821,862,1645,321,10,319,1028,...,847,1155,124,0,61,0.283,0.382,0.506,4613,0
8,Mark Teixeira,- - -,1862,6936,1099,1862,408,18,409,1298,...,918,1441,111,0,64,0.268,0.36,0.509,1281,0
9,Sammy Sosa,- - -,2354,8813,1475,2408,379,45,609,1667,...,929,2306,59,17,78,0.273,0.344,0.534,302,0


In [13]:
# Whole DF
X_2 = hall_batter_df.drop(columns=["playerID", "inducted"])
X_2_scaled = X_scaler.transform(X_2)

# Make predictions using the testing data
predictions = classifier.predict(X_2_scaled)

predictions

array([0, 1, 0, ..., 0, 0, 0])

In [14]:
predictions = predictions.tolist()

In [15]:
hall_batter_df["HOF Prediction"] = predictions
hall_batter_df

Unnamed: 0,playerID,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO,HBP,SH,SF,AVG,OBP,SLG,inducted,HOF Prediction
0,aardsda01,331.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.000000,0.000000,0.000000,0,0
1,aaronha01,3298.0,12364.0,2174.0,3771.0,624.0,98.0,755.0,2297.0,240.0,1402.0,1383.0,32.0,21.0,121.0,0.304998,0.373949,0.554513,1,1
2,aaronto01,437.0,944.0,102.0,216.0,42.0,6.0,13.0,94.0,9.0,86.0,145.0,0.0,9.0,6.0,0.228814,0.291506,0.327331,0,0
3,aasedo01,448.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0,0
4,abadan01,15.0,21.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,4.0,5.0,0.0,0.0,0.0,0.095238,0.240000,0.095238,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17501,zupcibo01,319.0,795.0,99.0,199.0,47.0,4.0,7.0,80.0,7.0,57.0,137.0,6.0,20.0,8.0,0.250314,0.302540,0.345912,0,0
17502,zupofr01,16.0,18.0,3.0,3.0,1.0,0.0,0.0,0.0,0.0,2.0,6.0,0.0,0.0,0.0,0.166667,0.250000,0.222222,0,0
17503,zuvelpa01,209.0,491.0,41.0,109.0,17.0,2.0,2.0,20.0,2.0,34.0,50.0,2.0,18.0,0.0,0.221996,0.275142,0.276986,0,0
17504,zuverge01,266.0,142.0,5.0,21.0,2.0,1.0,0.0,7.0,0.0,9.0,39.0,0.0,16.0,0.0,0.147887,0.198675,0.176056,0,0


In [17]:
hall_batter_df.loc[(hall_batter_df['inducted'] == 1) & (hall_batter_df['HOF Prediction'] == 1) ]

Unnamed: 0,playerID,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO,HBP,SH,SF,AVG,OBP,SLG,inducted,HOF Prediction
1,aaronha01,3298.0,12364.0,2174.0,3771.0,624.0,98.0,755.0,2297.0,240.0,1402.0,1383.0,32.0,21.0,121.0,0.304998,0.373949,0.554513,1,1
376,ansonca01,2524.0,10281.0,1999.0,3435.0,582.0,142.0,97.0,2075.0,277.0,984.0,330.0,32.0,34.0,0.0,0.334111,0.393998,0.446649,1,1
464,ashburi01,2189.0,8365.0,1322.0,2574.0,317.0,109.0,29.0,586.0,234.0,1198.0,571.0,43.0,112.0,18.0,0.307711,0.396405,0.382068,1,1
523,averiea01,1668.0,6353.0,1224.0,2019.0,401.0,128.0,238.0,1164.0,70.0,774.0,518.0,33.0,55.0,0.0,0.317803,0.394693,0.533606,1,1
685,bankser01,2528.0,9421.0,1305.0,2583.0,407.0,90.0,512.0,1636.0,50.0,763.0,1236.0,70.0,45.0,96.0,0.274175,0.330048,0.499522,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16765,wheatza01,2410.0,9106.0,1289.0,2884.0,476.0,172.0,132.0,1248.0,205.0,650.0,572.0,77.0,163.0,0.0,0.316714,0.367233,0.450253,1,1
16928,willibi01,2488.0,9350.0,1410.0,2711.0,434.0,88.0,426.0,1475.0,90.0,1045.0,1046.0,43.0,8.0,73.0,0.289947,0.361431,0.491872,1,1
17006,willite01,2292.0,7706.0,1798.0,2654.0,525.0,71.0,521.0,1839.0,24.0,2021.0,709.0,39.0,5.0,20.0,0.344407,0.481709,0.633792,1,1
17106,winfida01,2973.0,11003.0,1669.0,3110.0,540.0,88.0,465.0,1833.0,223.0,1216.0,1686.0,25.0,19.0,95.0,0.282650,0.352622,0.474507,1,1


In [18]:
hall_batter_df.loc[(hall_batter_df['inducted'] == 1)]

Unnamed: 0,playerID,G,AB,R,H,2B,3B,HR,RBI,SB,BB,SO,HBP,SH,SF,AVG,OBP,SLG,inducted,HOF Prediction
1,aaronha01,3298.0,12364.0,2174.0,3771.0,624.0,98.0,755.0,2297.0,240.0,1402.0,1383.0,32.0,21.0,121.0,0.304998,0.373949,0.554513,1,1
239,alomaro01,2379.0,9073.0,1508.0,2724.0,504.0,80.0,210.0,1134.0,474.0,1032.0,1140.0,50.0,148.0,97.0,0.300231,0.371245,0.442852,1,0
376,ansonca01,2524.0,10281.0,1999.0,3435.0,582.0,142.0,97.0,2075.0,277.0,984.0,330.0,32.0,34.0,0.0,0.334111,0.393998,0.446649,1,1
384,aparilu01,2601.0,10230.0,1335.0,2677.0,394.0,92.0,83.0,791.0,506.0,736.0,742.0,27.0,161.0,76.0,0.261681,0.310778,0.342522,1,0
464,ashburi01,2189.0,8365.0,1322.0,2574.0,317.0,109.0,29.0,586.0,234.0,1198.0,571.0,43.0,112.0,18.0,0.307711,0.396405,0.382068,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17053,wilsoha01,1348.0,4760.0,884.0,1461.0,266.0,67.0,244.0,1063.0,52.0,674.0,713.0,20.0,102.0,0.0,0.306933,0.395123,0.544748,1,0
17106,winfida01,2973.0,11003.0,1669.0,3110.0,540.0,88.0,465.0,1833.0,223.0,1216.0,1686.0,25.0,19.0,95.0,0.282650,0.352622,0.474507,1,1
17335,yastrca01,3308.0,11988.0,1816.0,3419.0,646.0,59.0,452.0,1844.0,168.0,1845.0,1393.0,40.0,13.0,105.0,0.285202,0.379453,0.462045,1,1
17415,youngro01,1211.0,4627.0,812.0,1491.0,236.0,93.0,42.0,592.0,153.0,550.0,390.0,37.0,119.0,0.0,0.322239,0.398542,0.440674,1,0
