<a href="https://colab.research.google.com/github/frankwillard/StatGPT/blob/main/HOF_Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [6]:
player_stats = pd.read_csv('players_stats.csv')
hof = pd.read_csv('hof_binary.csv')

In [7]:
# Select, Player, Points, Rebounds, Assists, Steals, Blocks

In [8]:
player_stats.columns

Index(['Player', 'Points', 'Rebounds', 'Assists', 'Steals', 'Blocks'], dtype='object')

In [9]:
train_data_with_player, test_data_with_player, train_labels_with_player, test_labels_with_player = train_test_split(player_stats, hof, test_size=0.25, random_state=42)

In [10]:
train_data = train_data_with_player.iloc[:, 1:]
test_data = test_data_with_player.iloc[:, 1:]

train_labels = train_labels_with_player.iloc[:, 1]
test_labels = test_labels_with_player.iloc[:, 1]

In [11]:
# Initialize the logistic regression model
logreg_model = LogisticRegression()
decision_tree_model = DecisionTreeClassifier()
random_forest_model = RandomForestClassifier()
xgboost_model = XGBClassifier()

In [12]:
# Train the model
logreg_model.fit(train_data, train_labels)
decision_tree_model.fit(train_data, train_labels)
random_forest_model.fit(train_data, train_labels)
xgboost_model.fit(train_data, train_labels)

In [13]:
# Make predictions on the test set
logreg_predictions = logreg_model.predict(test_data)
decision_tree_predictions = decision_tree_model.predict(test_data)
random_forest_predictions = random_forest_model.predict(test_data)
xgboost_predictions = xgboost_model.predict(test_data)

In [14]:
def evaluate(predictions, test_labels):
  # Print the evaluation metrics
  accuracy = accuracy_score(test_labels, predictions)
  conf_matrix = confusion_matrix(test_labels, predictions)
  classification_rep = classification_report(test_labels, predictions)

  precision = precision_score(test_labels, predictions)
  recall = recall_score(test_labels, predictions)
  f1 = f1_score(test_labels, predictions)

  print(f'Accuracy: {accuracy}')
  print(f'Precision: {precision}')
  print(f'Recall: {recall}')
  print(f'F1: {f1}')
  print(f'Confusion Matrix:\n{conf_matrix}')
  # print(f'Classification Report:\n{classification_rep}')

In [15]:
evaluate(logreg_predictions, test_labels)

Accuracy: 0.9651162790697675
Precision: 0.5333333333333333
Recall: 0.2857142857142857
F1: 0.37209302325581395
Confusion Matrix:
[[739   7]
 [ 20   8]]


In [16]:
evaluate(decision_tree_predictions, test_labels)

Accuracy: 0.962532299741602
Precision: 0.4827586206896552
Recall: 0.5
F1: 0.49122807017543857
Confusion Matrix:
[[731  15]
 [ 14  14]]


In [17]:
evaluate(random_forest_predictions, test_labels)

Accuracy: 0.9702842377260982
Precision: 0.6470588235294118
Recall: 0.39285714285714285
F1: 0.4888888888888888
Confusion Matrix:
[[740   6]
 [ 17  11]]


In [18]:
evaluate(xgboost_predictions, test_labels)

Accuracy: 0.9664082687338501
Precision: 0.55
Recall: 0.39285714285714285
F1: 0.45833333333333337
Confusion Matrix:
[[737   9]
 [ 17  11]]
