In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

In [2]:
# Read the data
df = pd.read_csv('allGames.csv')

# Cut off the first 21,294 rows, which include limited stats from before things like rebounds and steals were recorded
# All stats tracked in 1985, so we start our data here
df = df[21294:]
print(df.shape)

(41073, 54)


In [3]:
# Select only the numerical attributes
# Removed - 'fgm_home', 'fg3m_home', 'ftm_home', 'fgm_away', 'fg3m_away', 'ftm_away', 'pts_away'
df_X = df[['reb_home', 'ast_home', 'stl_home', 'blk_home', 'tov_home', 'reb_away', 'ast_away', 'stl_away', 'blk_away', 'tov_away']]
df_y = df['wl_home']

print(df_X.shape)
print(df_y.shape)

(41073, 10)
(41073,)


In [4]:
# Check the current state of df_y: A column of W and L
print("df_y of 1 and 0")
df_y

df_y of 1 and 0


21294    L
21295    W
21296    L
21297    L
21298    W
        ..
62362    W
62363    L
62364    W
62365    L
62366    L
Name: wl_home, Length: 41073, dtype: object

In [5]:
# Convert the W/L strings of df_y to 0's and 1's, where W = 1 and L = 0
df_y = (df_y == 'W').astype(int)

# Sanity check: A column of 1 and 0 
print("\ndf_y of 1 and 0")
df_y


df_y of 1 and 0


21294    0
21295    1
21296    0
21297    0
21298    1
        ..
62362    1
62363    0
62364    1
62365    0
62366    0
Name: wl_home, Length: 41073, dtype: int32

In [6]:
# Normalize the data using Z-score
df_X = (df_X - df_X.mean()) / df_X.std()

In [7]:
# Create the training and testing sets 
# Splitting between 1985-2015, and 2015-2023
X_train = df_X.iloc[:32248] 
X_test = df_X.iloc[32249:]

y_train = df_y.iloc[:32248]
y_test = df_y.iloc[32249:]

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(32248, 10)
(8824, 10)
(32248,)
(8824,)


In [8]:
# Convert to numpy arrays
X_train = X_train.to_numpy()
y_train = y_train.to_numpy()
X_test = X_test.to_numpy()
y_test = y_test.to_numpy()

# Verify the data
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(32248, 10)
(32248,)
(8824, 10)
(8824,)


In [11]:
# Import the logistic regression functionality from scikit-learn
from sklearn.neural_network import MLPClassifier

# Create a logistic regression model:
model = MLPClassifier(hidden_layer_sizes=(50, 50), activation='logistic', solver='sgd', max_iter=1000)

# Train the model on the traing data:
model.fit(X_train, y_train)

# Output the weight vector
W_direct = model.coefs_
#print("Weight vector W:", W_direct) # 10 values

# Predict the output values for the testing data:
y_pred = model.predict(X_test)

# Evaluate the model:
print("\nClassification Evaluation Metrics")
print("Training accuracy: ", model.score(X_train, y_train) * 100)
print("Testing accuracy: ", model.score(X_test, y_test) * 100)

# Print the first 10 test and predicted values:
print()
print("Y_test: \n", y_test[0:10])
print()
print("Y_pred: \n", y_pred[0:10])


Classification Evaluation Metrics
Training accuracy:  79.23902257504342
Testing accuracy:  78.85312783318224

Y_test: 
 [1 0 0 0 0 0 0 1 1 0]

Y_pred: 
 [1 0 0 0 0 1 0 1 1 0]
