In [2]:
#Step 0: Preprocessing the Data
#Importing libraries
from nba_api.stats.endpoints import leaguegamefinder
import pandas as pd

#Initializing and indicating the years we are pulling the data from 
seasons = ["2023-24", "2024-25"]
#The empty list for the NBA data to be added in
dataset_2023_2025 = []

#For Loop to find each teams performance per game from 2023 to 2025
for season in seasons:
    gamefinder = leaguegamefinder.LeagueGameFinder(
        season_nullable=season,
        season_type_nullable="Regular Season"
    )
    df = gamefinder.get_data_frames()[0]

    dataset_2023_2025.append(df)

league_games = pd.concat(dataset_2023_2025, ignore_index = True)

#Removing Duplicates in the Dataset
league_games = league_games.drop_duplicates(subset=["GAME_ID"])
#Removing Not a Number (NAN) in the Dataset
DroppedNAN = league_games.dropna(subset=['PTS', 'AST', 'REB', 'TOV', 'WL', 'FG_PCT'])
#Creating the dataset from a DataFrame to a Excel CSV file
DroppedNAN.to_csv("NBA_Dataset.csv", index = False)



In [6]:
#Importing libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

#Asking the users to select to pick two NBA teams to see who would win 
print("Make you sure write the full name of the National Basketball Association (NBA) teams with a space below")
user_input_first_team = input ("Select a NBA Team: ")
print("The first NBA team selected is ", user_input_first_team,".", sep="")
user_input_second_team = input("Select a NBA Team: ")
print("The second NBA team selected is ", user_input_second_team,".", sep="")

# Step 1: Loading the Data from the csv file called NBA_Dataset and Prepare Data
df = pd.read_csv("NBA_Dataset.csv")
df = pd.DataFrame(df)
#Mapping the Wins as 1 and Loses as 0 for classification
df["WL"]= df["WL"].map({'W': 1, 'L': 0})
# Define the features and target variables
Features = df[['PTS', 'AST', 'REB', 'TOV', 'FG_PCT']]
Target_Variables = df['WL']

# Step 2: Split the dataset into training and testing sets
#80% Training and 20% Testing
X_train, X_test, y_train, y_test = train_test_split(Features, Target_Variables, test_size=0.2, random_state=42)

# Step 3: Build and Train the Logistic Regression Model
#LogisticRegression is a built in-function in SKLearn, mapping the Sigmoid Function
logreg = LogisticRegression()
# Fitting the model to the training data
logreg.fit(X_train, y_train) 

# Step 4: Making Predictions and Evaluates Them
#Each feature is tested in the testing data
predictions = logreg.predict(X_test)
#Prediction Probability gets you the probability value that the team will win
prediction_probabilities = logreg.predict_proba(X_test) # Get win probabilities
#Determining the model accuracy as a percentage
print(f"Model Accuracy: {accuracy_score(y_test, predictions) * 100:.2f}%")

#Filters out the dataset for the two teams selected by the users 
team_1_stats = df[df['TEAM_NAME'] == user_input_first_team][['PTS', 'AST', 'REB', 'TOV', 'FG_PCT']].mean()
team_2_stats = df[df['TEAM_NAME'] == user_input_second_team][['PTS', 'AST', 'REB', 'TOV', 'FG_PCT']].mean()

# Using the filtered data, this will predict the probability values to determine who would win among the two teams selected by the user
home_prob = logreg.predict_proba(pd.DataFrame([team_1_stats]))[0][1]
away_prob = logreg.predict_proba(pd.DataFrame([team_2_stats]))[0][1]

#To determine what the team would be the winner based on the teams provided
if home_prob > away_prob:
    winner = user_input_first_team
else:
    winner = user_input_second_team

total = home_prob + away_prob
#Normalizing the probability to be equal to 1
home_prob_normalization = home_prob / total
away_prob_normalization = away_prob / total

# Print probabilities and winners
print(f"Predicted Winner: {winner}")
print(f"{user_input_first_team}: {home_prob_normalization * 100:.2f}%")
print(f"{user_input_second_team}: {away_prob_normalization * 100 :.2f}%")

Make you sure write the full name of the National Basketball Association (NBA) teams with a space below
The first NBA team selected is .
The second NBA team selected is .
Model Accuracy: 80.28%


ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [10]:
from sklearn.metrics import log_loss, roc_auc_score, accuracy_score, brier_score_loss

def eval_set(name, model, Xs, ys):
    p = model.predict_proba(Xs)[:, 1]
    pred = (p >= 0.5).astype(int)

    print(f"\n{name}")
    print("  logloss :", log_loss(ys, p))
    print("  brier   :", brier_score_loss(ys, p))
    print("  auc     :", roc_auc_score(ys, p))
    print("  acc     :", accuracy_score(ys, pred))

eval_set("TEST", logreg, X_test, y_test)


TEST
  logloss : 0.4693178782104013
  brier   : 0.1520597910900921
  auc     : 0.8645812677638972
  acc     : 0.8028455284552846
