In [None]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import pandas as pd
from pathlib import Path

In [None]:
# Load the pitcher Hall of Fame dataset from the local Resources folder
file_path = Path('./Resources/hall_pitcher.csv')
hall_pitcher_df = pd.read_csv(filepath_or_buffer=file_path)

# Show the loaded frame as the cell output
hall_pitcher_df

In [None]:
# RFC model data preprocessing: encode the 'inducted' target column.
# 'Y' becomes 1 and 'N' becomes 0. Any value that is not a key of the mapping
# comes back from .map() as NaN and is then refilled from the original column,
# so it passes through unchanged.
b = dict(Y=1, N=0)

inducted_original = hall_pitcher_df['inducted']
inducted_encoded = inducted_original.map(b)
hall_pitcher_df['inducted'] = inducted_encoded.fillna(inducted_original)

# Show the frame with the encoded column as the cell output
hall_pitcher_df

In [None]:
# Target vector: the encoded 'inducted' column
y = hall_pitcher_df["inducted"]

# Feature matrix: everything except the player identifier and the target
X = hall_pitcher_df.drop(["playerID", "inducted"], axis=1)

# Hold out a test set using train_test_split's default proportions.
# Stratifying on y keeps the class balance in both partitions; the seed is
# fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

# Standardize the features: the scaler is fitted on the training partition
# only and then applied to both partitions.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

In [None]:
# Build a 128-tree random forest (seeded for repeatability) and train it on
# the scaled training features; fit() hands back the fitted estimator.
rf_model = RandomForestClassifier(
    random_state=42,
    n_estimators=128,
).fit(X_train_scaled, y_train)

# Predict a class for every row of the scaled test set
predictions = rf_model.predict(X_test_scaled)

# Show the predicted labels as the cell output
predictions

In [None]:
# Evaluate the random forest on the held-out test set.

# Confusion matrix with the class order pinned to [0, 1]. Pinning the labels
# guarantees a 2x2 result whose rows/columns line up with the hard-coded
# "Actual"/"Predicted" names below, even when one class is absent from both
# y_test and predictions (otherwise the DataFrame construction would fail on a
# shape mismatch).
cm = confusion_matrix(y_test, predictions, labels=[0, 1])

# Wrap the matrix in a labelled DataFrame: rows are actual classes, columns
# are predicted classes.
cm_df = pd.DataFrame(
    cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

# Fraction of test rows whose predicted class matches the actual class
acc_score = accuracy_score(y_test, predictions)

# Display results (heading typo "Confustion" corrected)
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

In [None]:
# Read the per-feature importance scores from the fitted random forest
# (rf_model); the values are in the same order as the columns of X that the
# model was trained on.
importances = rf_model.feature_importances_
# Echo the importance array as the cell output
importances

In [None]:
# Pair each importance score with its feature name, then order the pairs from
# most to least important (ties fall back to comparing the column names).
ranked_features = list(zip(rf_model.feature_importances_, X.columns))
ranked_features.sort(reverse=True)
ranked_features